## Titanic Linear Model

In [1]:
%%datalabframework getfilename

In [2]:
import datalabframework as dlf
# Notebook-wide logger created through the datalab framework.
# NOTE(review): the kafka_* arguments presumably route log records to that
# Kafka topic/broker as well -- confirm against the datalabframework docs.
logger = dlf.log.initLogger(
    __name__,
    kafka_topic="datalab",
    kafka_servers="kafka:9092",
)

2017-04-30 14:28:12,715 - jovyan - __main__ - INFO - init - {'datalab': {'framework': '0.1'}, 'notebook': {'filepath': '/home/jovyan/work/notebooks/models', 'filename': 'regression.ipynb'}, 'project': {'main': 'main.ipynb', 'rootpath': '/home/jovyan/work/notebooks'}}


In [3]:
# EXPORT

from pyspark.ml.classification import LogisticRegression

In [4]:
# EXPORT

def lr_model(df, max_iter=10, reg_param=0.01):
    """Fit a logistic regression classifier on ``df`` and return the fitted model.

    The estimator is left on Spark's default column names, so ``df`` must
    expose its inputs in a ``features`` column and its target in a ``label``
    column.

    Args:
        df: Spark DataFrame to train on.
        max_iter: Maximum number of optimizer iterations (Spark ``maxIter``).
        reg_param: Regularization parameter (Spark ``regParam``).

    Returns:
        The fitted model returned by ``LogisticRegression.fit``.
    """
    estimator = LogisticRegression(maxIter=max_iter, regParam=reg_param)
    return estimator.fit(df)

In [5]:
# Base directory that every input/output path below is built from.
DATA_ROOT = '/home/jovyan/work/data'

# Notebook parameters, collected in a plain dict before being wrapped.
d = {}
# NOTE(review): presumably a sampling fraction (1.0 = use everything); it is
# not read by any of the visible cells -- confirm before relying on it.
d['input_sample'] = 1.0
# Parquet dataset read as the training input.
d['input_source'] = DATA_ROOT + '/titanic/data/set/train/features.parquet'
# Destination the fitted model is saved to.
d['output_model'] = DATA_ROOT + '/titanic/models/lr.model'

# Wrap the dict so later cells can read entries as attributes (p.input_source, ...).
p = dlf.params.config_fromdict(d)

In [6]:
from pyspark.sql import SparkSession
# Obtain the Spark session (getOrCreate reuses a running one if present),
# using the name reported by dlf.project.filename() as the application name.
spark = (
    SparkSession.builder
    .appName(dlf.project.filename())
    .getOrCreate()
)

# Last expression of the cell: displays the Spark version in use.
spark.version

'2.1.0'

In [7]:
# Read the training set from the configured parquet path, then print its
# schema and the first ten rows (untruncated) as a quick sanity check.
df = spark.read.parquet(p.input_source)
df.printSchema()
df.show(n=10, truncate=False)

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Title: string (nullable = true)



+-----------+--------+------+------+------------------+-----+-----+-------+--------+------+
|PassengerId|Survived|Pclass|Sex   |Age               |SibSp|Parch|Fare   |Embarked|Title |
+-----------+--------+------+------+------------------+-----+-----+-------+--------+------+
|1          |0       |3     |male  |22.0              |1    |0    |7.25   |S       |Mr    |
|2          |1       |1     |female|38.0              |1    |0    |71.2833|C       |Mrs   |
|3          |1       |3     |female|26.0              |0    |0    |7.925  |S       |Miss  |
|4          |1       |1     |female|35.0              |1    |0    |53.1   |S       |Mrs   |
|5          |0       |3     |male  |35.0              |0    |0    |8.05   |S       |Mr    |
|6          |0       |3     |male  |28.467886947074096|0    |0    |8.4583 |Q       |Mr    |
|7          |0       |1     |male  |54.0              |0    |0    |51.8625|S       |Mr    |
|8          |0       |3     |male  |2.0               |3    |1    |21.075 |S    

In [8]:
from features import features
# Turn the raw passenger columns into a model-ready frame. The result (shown
# in a later cell) carries PassengerId, a `features` vector and a `label`.
# NOTE(review): the positional arguments look like (frame, id column, label
# column, input columns) -- confirm against features.ipynb.
df_features = features.feature_vector(
    df,
    'PassengerId',
    'Survived',
    ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title'],
)

importing Jupyter notebook from /home/jovyan/work/notebooks/features/features.ipynb


In [9]:
# Train the logistic regression on the full feature frame.
model = lr_model(df=df_features)

In [10]:
# model.save() refuses to write to a path that already exists, so clear any
# model left over from a previous run first.
import shutil
try:
    shutil.rmtree(p.output_model)
except FileNotFoundError:
    # Nothing to clean up: no model has been saved at this path yet.
    pass

In [11]:
# Persist the fitted model. Going through the writer with overwrite() replaces
# any model already stored at this path, so re-running this cell does not fail
# on existing output.
model.write().overwrite().save(p.output_model)

In [12]:
# Peek at the first five assembled rows without truncating the vectors.
df_features.show(5, truncate=False)

+-----------+----------------------------------------------------+-----+
|PassengerId|features                                            |label|
+-----------+----------------------------------------------------+-----+
|1          |(24,[0,1,2,3,5,6,8],[3.0,1.0,22.0,1.0,7.25,1.0,1.0])|0.0  |
|2          |(24,[0,2,3,5,7,10],[1.0,38.0,1.0,71.2833,1.0,1.0])  |1.0  |
|3          |(24,[0,2,5,6,9],[3.0,26.0,7.925,1.0,1.0])           |1.0  |
|4          |(24,[0,2,3,5,6,10],[1.0,35.0,1.0,53.1,1.0,1.0])     |1.0  |
|5          |(24,[0,1,2,5,6,8],[3.0,1.0,35.0,8.05,1.0,1.0])      |0.0  |
+-----------+----------------------------------------------------+-----+
only showing top 5 rows



In [13]:
# Score the same frame the model was fitted on; transform() appends the
# rawPrediction, probability and prediction columns.
prediction = model.transform(df_features)
prediction.show(n=5)

+-----------+--------------------+-----+--------------------+--------------------+----------+
|PassengerId|            features|label|       rawPrediction|         probability|prediction|
+-----------+--------------------+-----+--------------------+--------------------+----------+
|          1|(24,[0,1,2,3,5,6,...|  0.0|[2.56148487671463...|[0.92834130012375...|       0.0|
|          2|(24,[0,2,3,5,7,10...|  1.0|[-2.4922684671590...|[0.07640196973048...|       1.0|
|          3|(24,[0,2,5,6,9],[...|  1.0|[-0.8402052862241...|[0.30149155010755...|       1.0|
|          4|(24,[0,2,3,5,6,10...|  1.0|[-1.9965592496936...|[0.11956465232543...|       1.0|
|          5|(24,[0,1,2,5,6,8]...|  0.0|[2.13531689262064...|[0.89428870306582...|       0.0|
+-----------+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [14]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate with the evaluator's default metric. `prediction` was produced
# from the training frame itself, so this is a training-set score, not a
# held-out estimate.
evaluator = BinaryClassificationEvaluator()
metric_name = evaluator.getMetricName()
metric = evaluator.evaluate(prediction)

print("Evaluation {} : {}".format(metric_name, metric))

Evaluation areaUnderROC : 0.8742077567933189
