In [3]:
import datalabframework as dlf

In [4]:
# Look up the engine registered under the name 'spark' and keep a handle on
# the context object it exposes; both are reused by the cells below.
engine = dlf.engines.get('spark')
spark = engine.context()

In [5]:
# Display "<context name>:<Spark version>"; as the last expression of the
# cell, the formatted string is rendered as the cell output.
'{}:{}'.format(engine.info['context'], spark.sparkSession.version)

'spark:2.3.1'

In [6]:
# Read the extracted train and test datasets, keyed by dataset name
df = {
    t: engine.read('.etl.extract.{}'.format(t))
    for t in ['train', 'test']
}

In [7]:
from pyspark.sql.functions import isnan, when, count, col

for t in ['train', 'test']:
    # one counter per column: number of rows whose value is NaN or null
    na_counters = [
        count(when(isnan(c) | col(c).isNull(), c)).alias(c)
        for c in df[t].columns
    ]
    df[t].select(na_counters).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+

+-----------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|     0|   0|  0| 86|    0|    0|     0|   1|  327|       0|
+-----------+------+----+---+---+-----+-----+------+----+-----+--------+



In [8]:
# columns removed from both datasets
dropped_columns = ['Ticket', 'Cabin']

df.update({t: df[t].drop(*dropped_columns) for t in ['train', 'test']})

In [9]:
# Row count per Embarked value in the training set, collected to pandas for
# display; rows with a missing Embarked show up as their own group.
df['train'].groupBy('Embarked').count().toPandas()

Unnamed: 0,Embarked,count
0,Q,77
1,,2
2,C,168
3,S,644


In [10]:
# simple fill helpers: most frequent value (mode) and mean; applied below to Embarked and Fare
def fill_with_mode(df, colname):
    """Fill the missing values of ``colname`` with its most frequent value.

    Parameters
    ----------
    df : Spark DataFrame
        Frame whose column should be filled.
    colname : str
        Name of the column to fill.

    Returns
    -------
    Spark DataFrame
        A frame where the nulls of ``colname`` are replaced by the mode.
        ``df`` itself is returned when the column holds no non-null value,
        since there is nothing to fill with.
    """
    # which value is occurring most often? (per-value counts, collected to pandas)
    counts = df.groupBy(colname).count().toPandas()

    # Missing values form a group of their own; they must never be elected as
    # the fill value, even when they are the most frequent "value".
    counts = counts.dropna(subset=[colname])
    if counts.empty:
        print('Column {} has no non-null values, nothing to fill'.format(colname))
        return df

    # idxmax has to be *called*: handing the bound method to .loc only worked
    # by accident through pandas' callable-indexer support.
    fill_value = counts.loc[counts['count'].idxmax(), colname]

    # pandas hands back numpy scalars for numeric columns; convert them to
    # plain Python values before passing them to Spark's fillna.
    if hasattr(fill_value, 'item'):
        fill_value = fill_value.item()
    print('Filling column {} with value: {}'.format(colname, fill_value))

    # fill the na
    return df.fillna(fill_value, colname)


from pyspark.sql.functions import avg
def fill_with_mean(df, colname):
    """Fill the missing values of ``colname`` with the column average.

    Parameters
    ----------
    df : Spark DataFrame
        Frame whose column should be filled.
    colname : str
        Name of the (numeric) column to fill.

    Returns
    -------
    Spark DataFrame
        A frame where the nulls of ``colname`` are replaced by the mean.
        ``df`` itself is returned when no mean can be computed.
    """
    # which is the average / mean value? (single-row, single-column result)
    fill_value = df.select(avg(colname)).collect()[0][0]

    # avg() yields NULL when the column has no non-null value; fillna(None)
    # would raise, so leave the frame untouched in that case.
    if fill_value is None:
        print('Column {} has no non-null values, nothing to fill'.format(colname))
        return df
    print('Filling column {} with value: {}'.format(colname, fill_value))

    # fill the na
    return df.fillna(fill_value, colname)

In [11]:
for t in ['train', 'test']:
    print('-- {} -----'.format(t))
    # Embarked gets its most frequent value first, then Fare gets its mean
    embarked_filled = fill_with_mode(df[t], 'Embarked')
    df[t] = fill_with_mean(embarked_filled, 'Fare')

-- train -----
Filling column Embarked with value: S
Filling column Fare with value: 32.2042079685746
-- test -----
Filling column Embarked with value: S
Filling column Fare with value: 35.6271884892086


In [12]:
# Exploratory only: prints the mean Age and displays the returned frame, but
# the result is not assigned, so df['train'] keeps its missing Age values.
fill_with_mean(df['train'], 'Age')

Filling column Age with value: 29.69911764705882


DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Fare: double, Embarked: string]

In [13]:
from pyspark.sql.functions import isnan, when, count, col

for t in ['train', 'test']:
    # same per-column NaN/null counters as before, now on the partly filled data
    na_counters = [
        count(when(isnan(c) | col(c).isNull(), c)).alias(c)
        for c in df[t].columns
    ]
    df[t].select(na_counters).show()

+-----------+--------+------+----+---+---+-----+-----+----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Fare|Embarked|
+-----------+--------+------+----+---+---+-----+-----+----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|   0|       0|
+-----------+--------+------+----+---+---+-----+-----+----+--------+

+-----------+------+----+---+---+-----+-----+----+--------+
|PassengerId|Pclass|Name|Sex|Age|SibSp|Parch|Fare|Embarked|
+-----------+------+----+---+---+-----+-----+----+--------+
|          0|     0|   0|  0| 86|    0|    0|   0|       0|
+-----------+------+----+---+---+-----+-----+----+--------+



In [14]:
import sys
import datalabframework as dlf

# NOTE(review): the return value is discarded; presumably called for a side
# effect needed before the `etl.features` import below -- TODO confirm.
dlf.project.rootpath()
from etl.features.features import featurize

importing Jupyter notebook from /home/natbusa/Projects/dsp-titanic/src/etl/features/features.ipynb


In [15]:
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import coalesce

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def contains_na(df, columns=None):
    """Tell whether any of the inspected columns holds a NaN or null value.

    When ``columns`` is empty or not given, every column of ``df`` is
    inspected. Returns ``True`` as soon as one column has at least one
    NaN/null row, ``False`` otherwise.
    """
    targets = columns if columns else df.columns

    # one aggregate per column: how many rows are NaN or null
    na_counters = [
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in targets
    ]
    totals = df.select(na_counters).collect()[0]
    return any(total > 0 for total in totals)

def learn_imputation(df, maxIter=10, regParam=0.3, elasticNetParam=0.8):
    """Fit a linear regression used to impute a missing dependent variable.

    Parameters
    ----------
    df : Spark DataFrame
        Featurized frame with a ``label`` column (rows with a null label are
        left out of the fit). The estimator is created without explicit column
        names, so it relies on LinearRegression's default feature/label columns.
    maxIter, regParam, elasticNetParam
        Hyperparameters forwarded to ``LinearRegression``; the defaults are
        the values this notebook has always used.

    Returns
    -------
    The fitted linear regression model.
    """
    # create lr estimator
    lr = LinearRegression(
        maxIter=maxIter,
        regParam=regParam,
        elasticNetParam=elasticNetParam,
    )

    # Fit the model, where label is not null
    model = lr.fit(df.where(df.label.isNotNull()))

    # Summarize the model over the training set and print out some metrics
    trainingSummary = model.summary

    print("Learning Linear Regression Model:")
    print(" - RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print(" - r2: %f" % trainingSummary.r2)

    return model

def impute(df, df_features, model, idCol, labelCol):
    """Replace the nulls of ``labelCol`` in ``df`` with model predictions.

    ``df_features`` is scored with ``model``; its ``id`` column is matched
    against ``idCol`` of ``df`` through a left outer join. Existing values of
    ``labelCol`` are kept, nulls take the prediction, and the helper
    ``prediction`` column is dropped from the returned frame.
    """
    # score the featurized rows and expose the id under the caller's key name
    predictions = model.transform(df_features).select(
        col('id').alias(idCol),
        'prediction',
    )

    # attach the prediction to every row of the original frame
    joined = df.join(predictions, idCol, "leftouter")

    # original value when present, prediction otherwise
    imputed = coalesce(joined[labelCol], joined.prediction)
    return joined.withColumn(labelCol, imputed).drop('prediction')

In [16]:
# Learn from both train and test: only the columns the two sets share are kept
cols = set(df['train'].columns) & set(df['test'].columns)
train_common = df['train'].select(*cols)
test_common = df['test'].select(*cols)
df_union = train_common.union(test_common)

# select regressors and featurize
pipeline = featurize(['Fare'], ['Pclass','SibSp','Parch'], ['Sex', 'Embarked'])
model = pipeline.fit(df_union)

# learn linear regression model: Age is the label, PassengerId the row id
featurized = model.transform(df_union)
d = featurized.select(
    col('PassengerId').alias('id'),
    col('Age').alias('label'),
    'features',
)
impute_model = learn_imputation(d)

# replace the missing Age of each dataset with the regression prediction
for t in ['train', 'test']:
    print('-- {} -----'.format(t))
    df[t] = impute(df[t], d, impute_model, 'PassengerId', 'Age')
    df[t].show()

Learning Linear Regression Model:
 - RMSE: 12.036436
 - r2: 0.301972
-- train -----
+-----------+--------+------+--------------------+------+------------------+-----+-----+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|               Age|SibSp|Parch|   Fare|Embarked|
+-----------+--------+------+--------------------+------+------------------+-----+-----+-------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|              22.0|    1|    0|   7.25|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|              38.0|    1|    0|71.2833|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|              26.0|    0|    0|  7.925|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|              35.0|    1|    0|   53.1|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|              35.0|    0|    0|   8.05|       S|
|          6|       0|     3|    Moran, Mr. James|  male|27.84446500

### No NA beyond this point

In [17]:
# neither dataset may still hold a NaN or null value at this point
assert not any(contains_na(df[t]) for t in ['train', 'test'])

### Write the output

In [18]:
for t in ['train', 'test']:
    # each cleaned dataset overwrites its '.etl.clean.<name>' target
    target = '.etl.clean.{}'.format(t)
    engine.write(df[t], target, mode='overwrite')