In [3]:
# (Re)load the sparkmagic magics extension so the %manage_spark line magic is available.
%reload_ext sparkmagic.magics
# Open the sparkmagic management widget. In the saved output it adds a Livy endpoint and
# tries to start a Spark application.
# NOTE(review): the saved output for this cell ends in a LivyClientTimeoutException.
%manage_spark

MagicsControllerWidget(children=(Tab(children=(ManageSessionWidget(children=(HTML(value='<br/>'), HTML(value='…

Added endpoint http://10.34.14.129:8999
Starting Spark application


LivyClientTimeoutException: Session 164 did not start up in 60 seconds.

In [67]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, FloatType, IntegerType
import pyspark.sql.functions as F
from pyspark.sql.functions import when
from pyspark.sql.functions import col
from time import time

In [68]:
# Load the joined parquet output (all part* files) for 2016 and 2017, in that order.
df1, df2 = (
    spark.read.parquet(parquet_glob)
    for parquet_glob in (
        "/Fannie-Mae/2016/FNMA_2016_Join_result_test.parquet/part*",
        "/Fannie-Mae/2017/FNMA_2017_Join_result_test.parquet/part*",
    )
)

In [69]:
# In both yearly frames, rename the ForeclosureDate column to Default.
df1, df2 = (
    df1.withColumnRenamed('ForeclosureDate', 'Default'),
    df2.withColumnRenamed('ForeclosureDate', 'Default'),
)

In [70]:
# Turn Default into a 0/1 label in both frames: a null value becomes 0, anything else 1.
df1, df2 = (
    frame.withColumn("Default", when(col("Default").isNull(), 0).otherwise(1))
    for frame in (df1, df2)
)

In [71]:
# Remove the columns that are not wanted as model inputs from the 2016 frame.
# Whatever is left (minus the label) is later handed to the VectorAssembler.
df1 = df1.drop(
    'LoanID', 'Channel', 'SellerName', 'OrDate', 'FirstPayment', 'FTHomeBuyer',
    'LoanPurpose', 'PropertyType', 'OccStatus', 'PropertyState', 'ProductType', 'RelMortInd',
    'Servicer', 'MaturityDate', 'CurDelStatus', 'ModFlag', 'ZeroBalEffDate', 'LastInstallDate',
    'DispositionDate', 'PricipleForgiven', 'RMWPF', 'FPWA', 'ServicingIndicator', 'OrLTV',
    'Zip', 'MortInsPerc', 'CoCreditScore', 'MortInsType', 'CurrInterestRate', 'CAUPB',
    'MSA', 'ForeclosureCost', 'RepairCost', 'AssetRecCost', 'MiscCostsPF', 'ATFHP',
    'NetSaleProceeds', 'CreditEnhProceeds', 'RPMWP', 'OtherForePro', 'NonInterestUPB', 'ReportingDate',
)

In [72]:
# Remove the same set of unwanted columns from the 2017 frame.
df2 = df2.drop(
    'LoanID', 'Channel', 'SellerName', 'OrDate', 'FirstPayment', 'FTHomeBuyer', 'LoanPurpose',
    'PropertyType', 'OccStatus', 'PropertyState', 'ProductType', 'RelMortInd', 'Servicer', 'MaturityDate',
    'CurDelStatus', 'ModFlag', 'ZeroBalEffDate', 'LastInstallDate', 'DispositionDate', 'PricipleForgiven', 'RMWPF',
    'FPWA', 'ServicingIndicator', 'OrLTV', 'Zip', 'MortInsPerc', 'CoCreditScore', 'MortInsType',
    'CurrInterestRate', 'CAUPB', 'MSA', 'ForeclosureCost', 'RepairCost', 'AssetRecCost', 'MiscCostsPF',
    'ATFHP', 'NetSaleProceeds', 'CreditEnhProceeds', 'RPMWP', 'OtherForePro', 'NonInterestUPB', 'ReportingDate',
)

In [73]:
# Fill missing values with 0 in both yearly frames, giving them their final names.
df_2016, df_2017 = df1.na.fill(0), df2.na.fill(0)

In [74]:
# Foreclosures are rare, so the data gets stratified. First count the 2016 rows
# labelled Default == 1; this count sizes the down-sample of the other class.
positive_count_2016 = (
    df_2016
    .filter(df_2016.Default == 1.0)
    .count()
)

In [75]:
# Display the number of 2016 rows with Default == 1.
positive_count_2016

149

In [76]:
# Count the 2017 rows labelled Default == 1 (used to size the 2017 down-sample).
positive_count_2017 = (
    df_2017
    .filter(df_2017.Default == 1.0)
    .count()
)

In [77]:
# Display the number of 2017 rows with Default == 1.
positive_count_2017

115

In [78]:
# Balance the 2016 classes by down-sampling: keep every Default == 1 row and sample
# Default == 0 rows at a rate of (number of positives) / (total rows).
# A fixed seed makes the sample, and every model trained on it, repeatable across
# kernel restarts. Without it, sampleBy draws a different random sample on each run.
data_size_2016 = df_2016.count()
strat_data_2016 = df_2016.sampleBy(
    'Default',
    fractions={0: float(positive_count_2016) / data_size_2016, 1: 1.0},
    seed=42,
)

In [79]:
# Persist the balanced 2016 sample. The call returns the DataFrame itself, which is
# why the cell output is the frame's schema.
strat_data_2016.persist()

DataFrame[OrInterestRate: double, OrUnpaidPrinc: int, OrLoanTerm: int, OrCLTV: double, NumBorrow: double, DTIRat: double, CreditScore: double, NumUnits: int, LoanAge: int, MonthsToMaturity: double, AdMonthsToMaturity: double, ZeroBalCode: double, Default: int]

In [80]:
# Sanity check: row count per Default value in the balanced 2016 sample.
print(
    strat_data_2016
    .groupby('Default')
    .count()
    .toPandas()
)

   Default  count
0        1    149
1        0    147

In [81]:
# Balance the 2017 classes by down-sampling: keep every Default == 1 row and sample
# Default == 0 rows at a rate of (number of positives) / (total rows).
# A fixed seed makes the evaluation sample repeatable across kernel restarts.
# Without it, sampleBy draws a different random sample on each run.
data_size_2017 = df_2017.count()
strat_data_2017 = df_2017.sampleBy(
    'Default',
    fractions={0: float(positive_count_2017) / data_size_2017, 1: 1.0},
    seed=42,
)

In [82]:
# Persist the balanced 2017 sample. The call returns the DataFrame itself, which is
# why the cell output is the frame's schema.
strat_data_2017.persist()

DataFrame[LoanAge: int, MonthsToMaturity: float, AdMonthsToMaturity: float, ZeroBalCode: float, Default: int, OrInterestRate: float, OrUnpaidPrinc: int, OrLoanTerm: int, OrCLTV: float, NumBorrow: float, DTIRat: float, CreditScore: float, NumUnits: int]

In [83]:
# Sanity check: row count per Default value in the balanced 2017 sample.
print(
    strat_data_2017
    .groupby('Default')
    .count()
    .toPandas()
)

   Default  count
0        1    115
1        0    116

In [84]:
# The balanced 2016 sample is the training set (this is an alias, not a copy).
train_data = strat_data_2016

In [85]:
# The balanced 2017 sample is the held-out test set (this is an alias, not a copy).
test_data = strat_data_2017

In [86]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics

In [87]:
# train_data is the same DataFrame object as strat_data_2016, which an earlier cell
# already persisted.
# NOTE(review): this second persist() therefore looks redundant — confirm and remove.
train_data.persist()

DataFrame[OrInterestRate: double, OrUnpaidPrinc: int, OrLoanTerm: int, OrCLTV: double, NumBorrow: double, DTIRat: double, CreditScore: double, NumUnits: int, LoanAge: int, MonthsToMaturity: double, AdMonthsToMaturity: double, ZeroBalCode: double, Default: int]

In [88]:
# Every column of the 2016 frame except the label is used as a model input.
# NOTE(review): no 'id' column appears in the schema printed for this data, so
# dropping it looks like a no-op — confirm.
# NOTE(review): ZeroBalCode stays in the feature set; if it encodes how a loan
# terminated it could leak the Default label — confirm before trusting the scores.
feature_cols_2016 = df_2016.drop('Default', 'id').columns
assembler_2016 = VectorAssembler(
    outputCol='features',
    inputCols=feature_cols_2016,
)

In [89]:
# Logistic-regression classifier: reads the assembled 'features' vector and
# predicts the 'Default' label.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='Default',
)

In [90]:
# Two-stage pipeline: assemble the feature vector, then fit the classifier on it.
pipeline = Pipeline(
    stages=[
        assembler_2016,
        lr,
    ]
)

In [91]:
# Hyper-parameter grid for the search: 3 values of maxIter x 2 values of regParam.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [1, 10, 100])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

In [92]:
# 3-fold cross-validated grid search over the pipeline.
# - metricName is spelled out (f1 is the evaluator's default) so the metric being
#   optimised, and later reported through getEvaluator(), is explicit.
# - seed fixes the random fold assignment so the selected model is repeatable.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(
        labelCol='Default',
        predictionCol='prediction',
        metricName='f1',
    ),
    numFolds=3,
    seed=42,
)


In [93]:
# Run the cross-validated grid search on the training set and report the
# wall-clock time the fit took, in seconds.
time_s = time()
cv_model = crossval.fit(train_data)
time_e = time()
print('Total training time: %f' % (time_e - time_s))

Total training time: 69.308086

In [94]:
def print_metrics(predictions_and_labels):
    """Print per-class precision/recall, an overall score and the confusion matrix.

    Args:
        predictions_and_labels: the (prediction, label) pairs handed straight to
            ``MulticlassMetrics``; the caller in this notebook passes an RDD of
            float pairs.

    Returns:
        None. Everything is written to stdout.
    """
    metrics = MulticlassMetrics(predictions_and_labels)
    print('Precision of True ', metrics.precision(1))
    print('Precision of False', metrics.precision(0))
    print('Recall of True    ', metrics.recall(1))
    print('Recall of False   ', metrics.recall(0))
    # The no-argument fMeasure() was deprecated in Spark 2.0 in favour of `accuracy`
    # and no longer exists in Spark 3.x. It returned the same number as `accuracy`,
    # so the printed value is unchanged while the call works on both versions.
    print('F-1 Score         ', metrics.accuracy)
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())

In [95]:
# test_data is the same DataFrame object as strat_data_2017, which an earlier cell
# already persisted.
# NOTE(review): this second persist() therefore looks redundant — confirm and remove.
test_data.persist()

DataFrame[LoanAge: int, MonthsToMaturity: float, AdMonthsToMaturity: float, ZeroBalCode: float, Default: int, OrInterestRate: float, OrUnpaidPrinc: int, OrLoanTerm: int, OrCLTV: float, NumBorrow: float, DTIRat: float, CreditScore: float, NumUnits: int]

In [96]:
# Score the held-out test set with the model produced by the cross-validated fit.
predictions = cv_model.transform(test_data)
# Evaluate with the same evaluator the CrossValidator was built with.
# NOTE(review): despite its name, `accuracy` holds whatever metric that evaluator
# computes; the message below labels it F1 — confirm against the evaluator's metricName.
accuracy = cv_model.getEvaluator().evaluate(predictions)
print('F1 Accuracy: %f' % accuracy)

F1 Accuracy: 0.969680

In [97]:
# Reduce the predictions to (prediction, label) pairs of floats, the form that
# print_metrics hands to MulticlassMetrics.
predictions_and_labels = (
    predictions
    .select("prediction", "Default")
    .rdd
    .map(lambda row: (float(row[0]), float(row[1])))
)

In [98]:
# Report precision/recall per class, the overall score and the confusion matrix
# for the test-set predictions.
print_metrics(predictions_and_labels)

Precision of True  0.990909090909091
Precision of False 0.9504132231404959
Recall of True     0.9478260869565217
Recall of False    0.9913793103448276
F-1 Score          0.9696969696969697
Confusion Matrix
 [[115.   1.]
 [  6. 109.]]