In [1]:
import datalabframework as dlf

In [2]:
# Look up the engine registered under the name 'spark' in datalabframework and
# keep its context object around for direct calls (used below to read the version).
engine = dlf.engines.get('spark')
spark = engine.context()

In [3]:
#print out name and version
# (bare last expression of the cell, so the notebook displays the formatted
#  '<context>:<version>' string, e.g. 'spark:2.3.1')
'{}:{}'.format(engine.info['context'], spark.sparkSession.version)

'spark:2.3.1'

In [4]:
# Read the extracted train and test splits through the engine, keyed by split name.
df = {
    split: engine.read('.etl.extract.{}'.format(split))
    for split in ['train', 'test']
}

In [5]:
from pyspark.sql.functions import isnan, when, count, col

# For each split, show one row holding the number of NaN-or-null values per column.
for t in ['train', 'test']:
    missing_per_column = [
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in df[t].columns
    ]
    df[t].select(missing_per_column).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+

+-----------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|     0|   0|  0| 86|    0|    0|     0|   1|  327|       0|
+-----------+------+----+---+---+-----+-----+------+----+-----+--------+



In [6]:
# dropped columns
# (Cabin is missing in 687 train rows and 327 test rows per the counts shown earlier)
dropped_columns = ['Ticket', 'Cabin']

for t in ['train', 'test']:
    # drop() returns a new DataFrame, so the dict entry is rebound to the result
    df[t] = df[t].drop(*dropped_columns)

In [7]:
# Row count per Embarked value in the training split (the null group included),
# converted to pandas so the notebook renders it as a table.
df['train'].groupBy('Embarked').count().toPandas()

Unnamed: 0,Embarked,count
0,Q,77
1,,2
2,C,168
3,S,644


In [8]:
# simple mode-based fill helper (used below for the Embarked column)
def fill_with_mode(df, colname):
    """Fill missing values of ``colname`` with the column's most frequent value.

    Parameters
    ----------
    df : Spark DataFrame
        Frame to fill; it is not modified, a new frame is returned.
    colname : str
        Name of the column whose nulls are replaced.

    Returns
    -------
    Spark DataFrame
        ``df`` with the nulls of ``colname`` replaced by the most frequent
        non-null value, or ``df`` unchanged when the column holds no
        non-null value at all.
    """
    # which value is occurring most often?
    d = df.groupBy(colname).count().toPandas()

    # The null group is what we want to replace, so it must never win the vote
    # (otherwise the fill value would be None and fillna would reject it).
    d = d.dropna(subset=[colname])
    if d.empty:
        print('Column {} has no non-null value to fill with'.format(colname))
        return df

    # idxmax must be *called*: passing the bound method only worked by accident
    # through .loc's callable-indexer support.
    fill_value = d.loc[d['count'].idxmax(), colname]

    # pandas hands back numpy scalars for numeric columns; fillna expects plain
    # Python int/float/str/bool values, so unwrap them.
    if hasattr(fill_value, 'item'):
        fill_value = fill_value.item()
    print('Filling column {} with value: {}'.format(colname, fill_value))

    #fill the na
    return df.fillna(fill_value, subset=[colname])


from pyspark.sql.functions import avg
def fill_with_mean(df, colname):
    """Fill missing values of ``colname`` with the column's average.

    Parameters
    ----------
    df : Spark DataFrame
        Frame to fill; it is not modified, a new frame is returned.
    colname : str
        Name of the column whose nulls are replaced.

    Returns
    -------
    Spark DataFrame
        ``df`` with the nulls of ``colname`` replaced by the column average,
        or ``df`` unchanged when no average can be computed (the aggregate
        comes back as None, e.g. for a column without any value).
    """
    # which is the average / mean value?
    # The dict form of agg computes avg(colname) and keeps this helper
    # independent of a module-level `avg` import.
    d = df.agg({colname: 'avg'}).collect()
    fill_value = d[0][0]

    # fillna rejects None, so there is nothing sensible to do in that case
    if fill_value is None:
        print('Column {} has no value to average, nothing filled'.format(colname))
        return df
    print('Filling column {} with value: {}'.format(colname, fill_value))

    #fill the na
    return df.fillna(fill_value, subset=[colname])

In [9]:
# Per split: fill Embarked with its most frequent value, then Fare with its mean.
# Each split is filled from its own statistics.
for t in ['train', 'test']:
    print('-- {} -----'.format(t))
    embarked_filled = fill_with_mode(df[t], 'Embarked')
    df[t] = fill_with_mean(embarked_filled, 'Fare')

-- train -----
Filling column Embarked with value: S
Filling column Fare with value: 32.2042079685746
-- test -----
Filling column Embarked with value: S
Filling column Fare with value: 35.6271884892086


In [10]:
# Preview only: the filled frame is displayed but not assigned back, so
# df['train'] still carries its missing Age values after this cell.
fill_with_mean(df['train'], 'Age')

Filling column Age with value: 29.69911764705882


DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Fare: double, Embarked: string]

In [11]:
from pyspark.sql.functions import isnan, when, count, col

# Re-check after the simple fills: NaN-or-null count per column, for each split.
for t in ['train', 'test']:
    missing_per_column = [
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in df[t].columns
    ]
    df[t].select(missing_per_column).show()

+-----------+--------+------+----+---+---+-----+-----+----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Fare|Embarked|
+-----------+--------+------+----+---+---+-----+-----+----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|   0|       0|
+-----------+--------+------+----+---+---+-----+-----+----+--------+

+-----------+------+----+---+---+-----+-----+----+--------+
|PassengerId|Pclass|Name|Sex|Age|SibSp|Parch|Fare|Embarked|
+-----------+------+----+---+---+-----+-----+----+--------+
|          0|     0|   0|  0| 86|    0|    0|   0|       0|
+-----------+------+----+---+---+-----+-----+----+--------+



In [12]:
# #EXPORT 
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer
# from pyspark.sql.functions import col

# def string_indexer(features):
#     for c in features:
#         yield StringIndexer(inputCol=c, outputCol=c+'_I')

# def onehot(features):
#     #todo: reproducible mapping, 
#     #      here the mapping depends on the data provided
#     for c in features:
#         yield OneHotEncoder(inputCol=c, outputCol=c+'_C')

# def vectorize(stringCols, numericDiscreteCols, numericContinuosCols):
#     #todo: automatically classify columns
#     reg_all_discrete_cols = numericDiscreteCols + [c+'_I' for c in stringCols]
#     reg_all_cols = numericContinuosCols + [c+'_C' for c in reg_all_discrete_cols]

#     print(reg_all_cols)

#     si = list(string_indexer(stringCols))
#     oh = list(onehot(reg_all_discrete_cols))
#     ar = [VectorAssembler(inputCols=reg_all_cols, outputCol="features")]
    
#     stages=si+oh+ar
#     return stages

# def featurize(df, idCol, labelCol, numericContinuosCols, numericDiscreteCols, stringCols):

#     #set the pipeline
#     stages = vectorize(stringCols, numericDiscreteCols, numericContinuosCols)
#     pipeline = Pipeline(stages=stages)

#     #fit
#     model = pipeline.fit(df)

#     #transform
#     features_df = model.transform(df).select(col(idCol).alias('id'), col(labelCol).alias('label'),'features')
    
#     return features_df

In [13]:
import sys
import datalabframework as dlf

# NOTE(review): the return value is discarded; presumably this call is what makes
# the project-local `etl` package imported below resolvable — confirm.
dlf.project.rootpath()
from etl.features.features import featurize

importing Jupyter notebook from /home/natbusa/Projects/dsp-titanic/src/etl/features/features.ipynb


In [16]:
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import coalesce

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def contains_na(df, columns):
    """Tell whether any of ``columns`` holds at least one NaN or null in ``df``.

    Returns True if the summed per-column count of missing values is positive.
    """
    na_counters = []
    for name in columns:
        is_missing = isnan(name) | col(name).isNull()
        na_counters.append(count(when(is_missing, name)).alias(name))

    # first row of the aggregate: one missing-value count per requested column
    counts_row = df.select(na_counters).collect()[0]
    return sum(counts_row) > 0

def learn_imputation(df):
    """Fit a linear regression on the rows of ``df`` whose ``label`` is not null.

    Prints the RMSE and r2 of the fit over the training rows and returns the
    fitted model.
    """
    # only rows with a known label can be learned from
    labelled_rows = df.where(df.label.isNotNull())

    estimator = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    model = estimator.fit(labelled_rows)

    # report the fit quality measured on the training rows themselves
    summary = model.summary
    print("Learning Linear Regression Model:")
    print(" - RMSE: %f" % summary.rootMeanSquaredError)
    print(" - r2: %f" % summary.r2)

    return model

def impute(df, df_features, model, idCol, labelCol):
    """Replace the nulls of ``labelCol`` in ``df`` with model predictions.

    ``df_features`` is scored with ``model``; its ``id`` column is renamed to
    ``idCol`` and left-joined onto ``df``. Existing ``labelCol`` values are
    kept, nulls take the prediction, and the helper ``prediction`` column is
    dropped from the result.
    """
    # score the feature frame and keep just the join key and the prediction
    predictions = model.transform(df_features).select(
        col('id').alias(idCol), 'prediction'
    )

    # left outer join keeps every row of df, with or without a prediction
    joined = df.join(predictions, idCol, "leftouter")

    # original value wins; the prediction only fills the gaps
    imputed_label = coalesce(joined[labelCol], joined.prediction)
    return joined.withColumn(labelCol, imputed_label).drop('prediction')

In [17]:
# Learn from both train and test
# (restricted to the columns the two splits share)
cols = set(df['train'].columns) & set(df['test'].columns)
# the same `cols` object feeds both selects, so the two sides of the union
# list their columns in the same order
df_union = df['train'].select(*cols).union(df['test'].select(*cols))

#select regressors and featurize
# NOTE(review): featurize comes from etl.features.features; the three lists are
# presumably continuous, discrete and string columns — confirm against that module.
pipeline = featurize(['Fare'], ['Pclass','SibSp','Parch'], ['Sex', 'Embarked'])
model = pipeline.fit(df_union)

# learn linear regression model
# Age is the regression label; PassengerId is carried along as 'id' so the
# predictions can be joined back onto each split.
d = model.transform(df_union).select(col('PassengerId').alias('id'), col('Age').alias('label'), 'features')
impute_model = learn_imputation(d)

for t in ['train', 'test']:
    print('-- {} -----'.format(t))
    # `d` holds the features of both splits; impute left-joins them onto df[t]
    df[t] = impute(df[t], d, impute_model, 'PassengerId', 'Age')
    df[t].show()

Learning Linear Regression Model:
 - RMSE: 10.769170
 - r2: 0.334880
-- train -----
+-----------+--------+------+--------------------+------+------------------+-----+-----+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|               Age|SibSp|Parch|   Fare|Embarked|
+-----------+--------+------+--------------------+------+------------------+-----+-----+-------+--------+
|        148|       0|     3|"Ford, Miss. Robi...|female|               9.0|    2|    2| 34.375|       S|
|        463|       0|     1|   Gee, Mr. Arthur H|  male|              47.0|    0|    0|   38.5|       S|
|        471|       0|     3|   Keefe, Mr. Arthur|  male|27.844465003907715|    0|    0|   7.25|       S|
|        496|       0|     3|Yousseff, Mr. Ger...|  male|26.552053955241124|    0|    0|14.4583|       C|
|        833|       0|     3|      Saad, Mr. Amin|  male|26.552053955241124|    0|    0| 7.2292|       C|
|        243|       0|     2|Coleridge, Mr. Re...|  male|           

### No NA beyond this point

In [18]:
# Guard: stop here if any column of either split still holds a NaN or null.
for t in ['train', 'test']:
    assert not contains_na(df[t], df[t].columns)

### Write the output

In [19]:
# Write each cleaned split through the engine under '.etl.clean.<split>',
# using mode='overwrite'.
for t in ['train', 'test']:
    engine.write(df[t], '.etl.clean.{}'.format(t), mode='overwrite')