In [4]:
import pandas as pd
import numpy as np
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [6]:
# Location of the Titanic CSV.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
filepath = 'C:/Users/Tim/Desktop/lighthouse/w7/d3/'
dataset = 'titanic_dataset.csv'

# getOrCreate() hands back the already-running context when this cell is
# executed again; a bare SparkContext() raises in that case because only one
# context may be active per process.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# df = pd.read_csv(filepath+dataset)

In [90]:
# Read the CSV using its header row for column names and letting Spark infer
# the column types, then show the resulting schema.
csv_reader = sqlContext.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv(filepath + dataset)
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [91]:
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import mean

In [92]:
# def info(x):
#     n_missing = x.isnull().sum().sort_values(ascending=False)
#     p_missing = (x.isnull().sum()/x.isnull().count()).sort_values(ascending=False)
#     dtype = x.dtypes
#     count = x.count()
#     missing_ = pd.concat([n_missing, p_missing, dtype, count],axis=1, keys = [
#         'number_missing',
#         'percent_missing',
#         'type',
#         'count'
#     ])
#     return missing_
# info(df)

# One aggregate per column: count the rows where the value is NaN or null.
missing_per_column = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(missing_per_column).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [93]:
# Keep every column except the ones listed here; the original column order is preserved.
excluded_columns = {'PassengerId', 'Cabin', 'Ticket', 'Name'}
kept_columns = [name for name in df.columns if name not in excluded_columns]
df = df.select(kept_columns)

In [94]:
# Average of the Age column; collect() returns a list of rows, so the value
# itself lives at mean_val[0][0].
age_mean_expr = mean(df["Age"])
mean_val = df.select(age_mean_expr).collect()
mean_val[0][0]

29.69911764705882

In [95]:
# Fill missing ages with the mean computed above, then discard any row that
# still has a missing value in some other column.
age_fill_value = mean_val[0][0]
df = df.na.fill(age_fill_value, subset=['Age']).na.drop()

In [96]:
# Re-run the per-column NaN/null count on the cleaned frame.
remaining_missing = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(remaining_missing).show()

+--------+------+---+---+-----+-----+----+--------+
|Survived|Pclass|Sex|Age|SibSp|Parch|Fare|Embarked|
+--------+------+---+---+-----+-----+----+--------+
|       0|     0|  0|  0|    0|    0|   0|       0|
+--------+------+---+---+-----+-----+----+--------+



In [97]:
# Summary statistics for the cleaned columns.
summary_stats = df.describe()
summary_stats.show()

+-------+-------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|summary|           Survived|            Pclass|   Sex|               Age|             SibSp|              Parch|             Fare|Embarked|
+-------+-------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|  count|                889|               889|   889|               889|               889|                889|              889|     889|
|   mean|0.38245219347581555|2.3115860517435323|  null|29.653446370674192|0.5241844769403825|0.38245219347581555|32.09668087739029|    null|
| stddev|0.48625968831477334|0.8346997785705753|  null|12.968366309252314| 1.103704875596923| 0.8067607445174785|49.69750431670795|    null|
|    min|                  0|                 1|female|              0.42|                 0|                  0|              0.0|       C|
|    max|    

In [98]:
# X = df.select([c for c in df.columns if c not in {'Survived'}])
# y = df.select('Survived')

In [99]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler

In [141]:
# Feature columns, grouped by how they are fed to the model.
# Columns passed to the assembler unchanged.
other = [
    'Pclass',
    'Parch',
    'SibSp'
]
# String columns that are indexed and then one-hot encoded below.
categoricals = [
    'Sex',
    'Embarked'
]
# Continuous columns (currently used unscaled; see the commented block below).
continuous = [
    'Age',
    'Fare'
]

stages = [] # stages in our Pipeline
# The loop variable is deliberately NOT called `col`: that name is the
# pyspark.sql.functions.col helper imported earlier, and rebinding it to a
# string breaks any cell that calls col(...) afterwards.
for cat_col in categoricals:
    stringIndexer = StringIndexer(inputCol=cat_col, outputCol=cat_col + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],
                            outputCols=[cat_col + "classVec"])
    stages += [stringIndexer, encoder]

# for concol in continuous:
#     standardscaler = StandardScaler(inputCol=concol, outputCol = concol+'scaled', withStd = True)
#     stages += [standardscaler]

In [142]:
# Convert label into label indices using the StringIndexer.
# The ordering is pinned to alphabetical so that Survived=0 always becomes 0.0
# and Survived=1 always becomes 1.0; the default ordering is by frequency,
# which would swap the two labels whenever survivors are the majority class.
label_stringIdx = StringIndexer(inputCol="Survived",
                                outputCol="newlabel",
                                stringOrderType="alphabetAsc")
stages += [label_stringIdx]

In [143]:
# Assembler inputs, in order: pass-through columns, the one-hot vectors of the
# categorical columns, then the continuous columns (unscaled).
encoded_categoricals = [name + "classVec" for name in categoricals]
assemblerInput = [*other, *encoded_categoricals, *continuous]  # [d + "scaled" for d in continuous]

In [144]:
# Combine all input columns into a single "features" vector column and add
# that step to the pipeline stages.
assembler = VectorAssembler(outputCol="features", inputCols=assemblerInput)
stages.append(assembler)

In [145]:
# Peek at the first five cleaned rows.
df.head(n=5)

[Row(Survived=0, Pclass=3, Sex='male', Age=22.0, SibSp=1, Parch=0, Fare=7.25, Embarked='S'),
 Row(Survived=1, Pclass=1, Sex='female', Age=38.0, SibSp=1, Parch=0, Fare=71.2833, Embarked='C'),
 Row(Survived=1, Pclass=3, Sex='female', Age=26.0, SibSp=0, Parch=0, Fare=7.925, Embarked='S'),
 Row(Survived=1, Pclass=1, Sex='female', Age=35.0, SibSp=1, Parch=0, Fare=53.1, Embarked='S'),
 Row(Survived=0, Pclass=3, Sex='male', Age=35.0, SibSp=0, Parch=0, Fare=8.05, Embarked='S')]

In [146]:
# Fit every stage on the cleaned frame, then apply the fitted stages to the
# same frame. Note that `model` holds the transformed DataFrame.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset=df)
model = pipelineModel.transform(dataset=df)

In [147]:
from pyspark.ml.linalg import DenseVector


def _to_label_and_features(row):
    """Return the (newlabel, dense feature vector) pair for one transformed row."""
    return (row["newlabel"], DenseVector(row["features"]))


input_data = model.rdd.map(_to_label_and_features)

In [148]:
# Turn the (label, features) pairs back into a two-column DataFrame.
train_columns = ["label", "features"]
df_train = sqlContext.createDataFrame(input_data, train_columns)

In [149]:
# Display the first five label/feature rows.
df_train.show(n=5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[3.0,0.0,1.0,1.0,...|
|  1.0|[1.0,0.0,1.0,0.0,...|
|  1.0|[3.0,0.0,0.0,0.0,...|
|  1.0|[1.0,0.0,1.0,0.0,...|
|  0.0|[3.0,0.0,0.0,1.0,...|
+-----+--------------------+
only showing top 5 rows



In [151]:
# Repartition by label, then make a seeded 70/30 train/test split.
df_train = df_train.repartition(100, 'label')
split_frames = df_train.randomSplit([0.7, 0.3], seed=123)
train_data, test_data = split_frames

In [152]:
# Rows per label in the training split.
train_by_label = train_data.groupby('label')
train_by_label.agg({'label': 'count'}).show()

+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         405|
|  1.0|         234|
+-----+------------+



In [153]:
# Rows per label in the test split.
test_by_label = test_data.groupby('label')
test_by_label.agg({'label': 'count'}).show()

+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         144|
|  1.0|         106|
+-----+------------+



In [154]:
from pyspark.ml.classification import LogisticRegression

In [155]:
# Logistic regression estimator (regParam=0.3, at most 10 iterations),
# fitted on the training split.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    regParam=0.3,
    maxIter=10,
)
linearModel = lr.fit(train_data)

In [156]:
# Show the fitted coefficients and the intercept of the logistic regression.
print(f"Coefficients: {linearModel.coefficients!s}")
print(f"Intercept: {linearModel.intercept!s}")

Coefficients: [-0.30750717196605837,-0.017126025781781736,-0.052491181669684604,-1.022726360142806,-0.1667580499016104,0.22723265682069965,-0.009912410346669365,0.0024472705182523578]
Intercept: 1.1340869380909364


In [157]:
# Score the held-out split with the fitted model via transform().
predictions = linearModel.transform(dataset=test_data)

In [158]:
# Keep only the true label and the predicted label.
cm = predictions.select(["label", "prediction"])

In [159]:
# Rows per predicted class; compare with
# test_data.groupby('label').agg({'label': 'count'}).show()
by_prediction = cm.groupby('prediction')
by_prediction.agg({'prediction': 'count'}).show()

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              192|
|       1.0|               58|
+----------+-----------------+



In [160]:
# cm.filter(cm.label == cm.prediction).count() / cm.count()

def accuracy_m(model, data=None):
    """Print the share of rows whose prediction equals the label.

    Parameters
    ----------
    model
        Object with a ``transform(data)`` method whose result has ``label``
        and ``prediction`` columns.
    data
        Frame to score. When omitted, the notebook-level ``test_data`` is
        used, which is what this function always did before.

    Returns
    -------
    None
        The accuracy is printed as a percentage with three decimals. If there
        are no rows to score, a notice is printed instead of dividing by zero.
    """
    if data is None:
        data = test_data
    cm = model.transform(data).select("label", "prediction")
    total = cm.count()
    if total == 0:
        # Nothing to score: avoid ZeroDivisionError below.
        print("Model accuracy: n/a (no rows to evaluate)")
        return
    correct = cm.filter(cm.label == cm.prediction).count()
    acc = correct / total
    print("Model accuracy: %.3f%%" % (acc * 100))
# Accuracy of the baseline logistic regression.
accuracy_m(linearModel)

Model accuracy: 76.000%


In [161]:
### Use ROC 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the test-set predictions; print the score first, then the name of
# the metric the evaluator used.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
metric_value = evaluator.evaluate(predictions)
print(metric_value)
print(evaluator.getMetricName())

0.8215408805031448
areaUnderROC


In [162]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Candidate regularisation strengths to try during cross-validation.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(lr.regParam, [0.01, 0.03, 0.1, 0.3, 1])
paramGrid = grid_builder.build()

In [163]:
# Import only the name that is used; a wildcard import would dump every
# public name of the time module into the notebook namespace.
from time import time

start_time = time()

# Create 5-fold CrossValidator.
# The seed fixes the fold assignment so repeated runs pick the same folds;
# 123 matches the seed used for the train/test split.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=5,
                    seed=123)

# Run cross validations
cvModel = cv.fit(train_data)
# likely take a fair amount of time
end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

Time to train model: 182.580 seconds


In [164]:
# Take the best model found by cross-validation and list its parameters.
bestModel = cvModel.bestModel
best_params = bestModel.extractParamMap()
best_params

{Param(parent='LogisticRegression_56a5fc111f46', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='LogisticRegression_56a5fc111f46', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
 Param(parent='LogisticRegression_56a5fc111f46', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'): 'auto',
 Param(parent='LogisticRegression_56a5fc111f46', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LogisticRegression_56a5fc111f46', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LogisticRegression_56a5fc111f46', name='labelCol', doc='label column name.'): 'label',
 Param(parent='LogisticRegression_56a5fc111f46', name='maxIter', doc='max number of iterations (>= 0).'):

In [165]:
# Accuracy of the cross-validated model.
accuracy_m(cvModel)

Model accuracy: 76.000%


In [166]:
# Accuracy of the best model taken from the cross-validation run.
accuracy_m(bestModel)

Model accuracy: 76.000%
