In [2]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [3]:
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

### Step 1
- Load the train and test sets
- Check the schema, the variables have their right types?
- If not, how to correctly load the datasets?

In [22]:
train_df = sqlc.read.format('com.databricks.spark.csv')\
                .option('header', 'true')\
                .option('inferschema', 'true')\
                .option('mode', 'DROPMALFORMED')\
                .load('train.csv')

In [25]:
test_df = sqlc.read.format('com.databricks.spark.csv')\
                .option('header', 'true')\
                .option('inferschema', 'true')\
                .option('mode', 'DROPMALFORMED')\
                .load('test.csv')

In [23]:
train_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [26]:
test_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



### Step 2
- Explore the features of your dataset
- You can use DataFrame's ***describe*** method to get summary statistics
    - hint: ***toPandas*** may be useful to ease the manipulation of small dataframes
- Are there any ***NaN*** values in your dataset?
- If so, define value/values to fill these ***NaN*** values
    - hint: ***na*** property of DataFrames provide several methods of handling NA values

In [38]:
train_df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [31]:
train_df.summary().toPandas()

Unnamed: 0,summary,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
1,mean,446.0,0.3838383838383838,2.308641975308642,,,29.69911764705882,0.5230078563411896,0.3815937149270482,260318.54916792738,32.2042079685746,,
2,stddev,257.3538420152301,0.4865924542648575,0.8360712409770491,,,14.526497332334037,1.1027434322934315,0.8060572211299488,471609.26868834975,49.69342859718089,,
3,min,1.0,0.0,1.0,"""Andersson, Mr. August Edvard (""""Wennerstrom"""")""",female,0.42,0.0,0.0,110152,0.0,A10,C
4,25%,223.0,0.0,2.0,,,20.0,0.0,0.0,19996.0,7.8958,,
5,50%,446.0,0.0,3.0,,,28.0,0.0,0.0,236171.0,14.4542,,
6,75%,669.0,1.0,3.0,,,38.0,1.0,0.0,347743.0,31.0,,
7,max,891.0,1.0,3.0,"van Melkebeke, Mr. Philemon",male,80.0,8.0,6.0,WE/P 5735,512.3292,T,S


In [41]:
# Fill age with the mean = 30
train_df = train_df.na.fill({'Age': 30})
train_df.summary().toPandas()

Unnamed: 0,summary,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,count,891.0,891.0,891.0,891,891,891.0,891.0,891.0,891,891.0,204,889
1,mean,446.0,0.3838383838383838,2.308641975308642,,,29.758888888888887,0.5230078563411896,0.3815937149270482,260318.54916792738,32.2042079685746,,
2,stddev,257.3538420152301,0.4865924542648575,0.8360712409770491,,,13.002570039820949,1.1027434322934315,0.8060572211299488,471609.26868834975,49.69342859718089,,
3,min,1.0,0.0,1.0,"""Andersson, Mr. August Edvard (""""Wennerstrom"""")""",female,0.42,0.0,0.0,110152,0.0,A10,C
4,25%,223.0,0.0,2.0,,,22.0,0.0,0.0,19996.0,7.8958,,
5,50%,446.0,0.0,3.0,,,30.0,0.0,0.0,236171.0,14.4542,,
6,75%,669.0,1.0,3.0,,,35.0,1.0,0.0,347743.0,31.0,,
7,max,891.0,1.0,3.0,"van Melkebeke, Mr. Philemon",male,80.0,8.0,6.0,WE/P 5735,512.3292,T,S


In [43]:
# Drop rows with embarked = null
train_df = train_df.na.drop(how='all',subset=['Embarked'])
train_df.summary().toPandas()

Unnamed: 0,summary,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,count,889.0,889.0,889.0,889,889,889.0,889.0,889.0,889,889.0,202,889
1,mean,446.0,0.3824521934758155,2.3115860517435323,,,29.713352080989875,0.5241844769403825,0.3824521934758155,260763.9104704097,32.09668087739029,,
2,stddev,256.9981727771832,0.4862596883147733,0.8346997785705753,,,12.969134703869637,1.103704875596923,0.8067607445174785,472255.95121695305,49.69750431670795,,
3,min,1.0,0.0,1.0,"""Andersson, Mr. August Edvard (""""Wennerstrom"""")""",female,0.42,0.0,0.0,110152,0.0,A10,C
4,25%,224.0,0.0,2.0,,,22.0,0.0,0.0,19996.0,7.8958,,
5,50%,446.0,0.0,3.0,,,30.0,0.0,0.0,236852.0,14.4542,,
6,75%,668.0,1.0,3.0,,,35.0,1.0,0.0,348121.0,31.0,,
7,max,891.0,1.0,3.0,"van Melkebeke, Mr. Philemon",male,80.0,8.0,6.0,WE/P 5735,512.3292,T,S


In [46]:
test_df.summary().toPandas()

Unnamed: 0,summary,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,count,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
1,mean,1100.5,2.2655502392344498,,,30.272590361445783,0.4473684210526316,0.3923444976076555,223850.98986486485,35.6271884892086,,
2,stddev,120.81045760473994,0.8418375519640503,,,14.181209235624424,0.8967595611217135,0.9814288785371694,369523.7764694362,55.90757617997384,,
3,min,892.0,1.0,"""Assaf Khalil, Mrs. Mariana (Miriam"""")""""""",female,0.17,0.0,0.0,110469,0.0,A11,C
4,25%,996.0,1.0,,,21.0,0.0,0.0,17464.0,7.8958,,
5,50%,1100.0,3.0,,,27.0,0.0,0.0,230136.0,14.4542,,
6,75%,1205.0,3.0,,,39.0,1.0,0.0,347080.0,31.5,,
7,max,1309.0,3.0,"van Billiard, Master. Walter John",male,76.0,8.0,9.0,W.E.P. 5734,512.3292,G6,S


### Step 3
- How to handle categorical features?
    - hint: check the Estimators and Transformers
- Assemble all desired features into a Vector using the VectorAssembler Transformer
- Make sure to end up with a DataFrame with two columns: ***Survived*** and ***vFeatures***

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.util import MLUtils

### INSERT YOUR CODE HERE

### Step 4
- Apply a normalization Estimator of your choice to the ***features*** vector obtained in Step 3

In [None]:
from pyspark.ml.feature import StandardScaler

### INSERT YOUR CODE HERE

### Step 5
- Instead of doing transformations on separate steps, put everything together with a Pipeline

In [None]:
from pyspark.ml.pipeline import Pipeline

### INSERT YOUR CODE HERE

### Step 6
- Train a classifier of your choice (for instance, Random Forest) using your dataset of LabeledPoints
- Make predictions for the training data
- Use the evaluators to find the Area Under ROC and Accuracy of your model
- How is your model performing? Try to tune its parameters

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

### INSERT YOUR CODE HERE

### Step 7
- Take a look at the test data - use DataFrame's ***createOrReplaceTempView*** method to perform SQL queries over the data
    - hint: check if there are any NULL values in the dataset - if so, handle them
- Apply the transformations to the test data
    - hint: include the model to the pipeline
- Make predictions using the model previously trained and the transformed test data

In [None]:
### INSERT YOUR CODE HERE

### Step 8

- Load the answers for the ***test*** data
- Combine it with your predictions into a single DataFrame
- Use the evaluator you created on ***Step 6***
- What was your score?

In [None]:
### INSERT YOUR CODE HERE