In [None]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DateType, FloatType, IntegerType
import pyspark.sql.functions as F

In [None]:
# Run as the "hdfs" Hadoop user so that the files stored in HDFS can be accessed.
os.environ.update({"HADOOP_USER_NAME": "hdfs"})

In [None]:
# Spark settings for this notebook, kept as (key, value) pairs and applied in this order.
spark_settings = [
    ("spark.task.maxFailures", "10"),
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
    ("spark.sql.execution.arrow.enabled", "true"),
    ("spark.shuffle.service.enabled", "true"),
    ("spark.driver.memory", "12g"),
    ("spark.dynamicAllocation.enabled", "true"),
]
conf = SparkConf().setAll(spark_settings)

In [None]:
# Name the application and target YARN in client deploy mode.
# The combined "yarn-client" master URL is deprecated since Spark 2.0 and rejected by
# Spark 3.x, so the cluster manager and the deploy mode are configured separately.
conf.setAppName("prediction_2016").setMaster("yarn").set("spark.submit.deployMode", "client")

In [None]:
# Create (or reuse) the SparkSession for this notebook from the SparkConf defined earlier.
# NOTE(review): `conf` also sets an application name ("prediction_2016"); confirm which of
# the two names actually ends up on the running application.
session_builder = SparkSession.builder.appName("FNMA Spark -  Notebook 3")
spark = session_builder.config(conf=conf).getOrCreate()


In [None]:
# Display the SparkSession as the cell output to confirm the session was created.
spark

In [None]:
# Read the Parquet part files matched by the glob below into a single DataFrame.
joined_parts_glob = "/Fannie-Mae/2016/FNMA_2016_Join_result_test.parquet/part*"
df = spark.read.parquet(joined_parts_glob)

In [None]:
# Show how many partitions the loaded DataFrame is split into.
df.rdd.getNumPartitions()

In [None]:
# ForeclosureDate is used as the label source from here on, so it is renamed to Default.
df = df.withColumnRenamed(existing='ForeclosureDate', new='Default')

In [None]:
# Turn Default into a binary label: 0 when the original value is null, 1 otherwise.
# `when` and `col` are never imported by name in this notebook; they are only reachable
# through the `F` alias (pyspark.sql.functions), so they are qualified with it here.
df1 = df.withColumn("Default", F.when(F.col("Default").isNull(), 0).otherwise(1))

In [None]:
# Columns removed before modelling; whatever remains (minus the label) feeds the model.
columns_to_drop = [
    'LoanID', 'Channel', 'SellerName', 'OrDate', 'FirstPayment', 'FTHomeBuyer',
    'LoanPurpose', 'PropertyType', 'OccStatus', 'PropertyState', 'ProductType',
    'RelMortInd', 'Servicer', 'MaturityDate', 'CurDelStatus', 'ModFlag',
    'ZeroBalEffDate', 'LastInstallDate', 'DispositionDate', 'PricipleForgiven',
    'RMWPF', 'FPWA', 'ServicingIndicator', 'OrLTV', 'Zip', 'MortInsPerc',
    'CoCreditScore', 'MortInsType', 'CurrInterestRate', 'CAUPB', 'MSA',
    'ForeclosureCost', 'RepairCost', 'AssetRecCost', 'MiscCostsPF', 'ATFHP',
    'NetSaleProceeds', 'CreditEnhProceeds', 'RPMWP', 'OtherForePro',
    'NonInterestUPB', 'ReportingDate',
]
df1 = df1.drop(*columns_to_drop)

In [None]:
# Replace nulls with 0 in the reduced DataFrame; from here on `df` is the modelling dataset.
df = df1.fillna(0)

In [None]:
# Inspect the column names and types that remain after dropping columns and filling nulls.
df.schema

In [None]:
# Keep every row with Default == 1 and sample the Default == 0 rows at a rate of
# positive_count / data_size.
data_size = df.count()
# `positive_count` was never defined anywhere in the notebook; it is the number of rows
# whose label is 1.
positive_count = df.filter(F.col('Default') == 1).count()
# Guard against an empty DataFrame so the fraction cannot divide by zero.
negative_fraction = float(positive_count) / data_size if data_size else 0.0
# A fixed seed keeps the sample identical across Restart & Run All.
strat_data = df.sampleBy('Default', fractions={0: negative_fraction, 1: 1.0}, seed=777)

In [None]:
# Persist the sampled DataFrame: it is reused by the class count and the train/test split below.
strat_data.persist()

In [None]:
# Show how many rows each Default class has after sampling.
class_counts = strat_data.groupby('Default').count()
print(class_counts.toPandas())

In [None]:
# Seed handed to randomSplit for the 80/20 train/test split.
splitSeed = 777
train_data, test_data = strat_data.randomSplit(weights=[0.8, 0.2], seed=splitSeed)

In [None]:
# Persist the training split; it is the input to the cross-validated fit further down.
train_data.persist()

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
from time import time

In [None]:
# Every column except the label ('Default') and 'id' is assembled into the 'features' vector.
feature_frame = df.drop('Default', 'id')
feature_cols = feature_frame.columns
assembler = VectorAssembler(outputCol='features', inputCols=feature_cols)

In [None]:
# Logistic regression that reads the assembled 'features' vector and the 'Default' label column.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='Default',
)

In [None]:
# Two-stage pipeline: assemble the feature vector, then fit the logistic regression on it.
pipeline_stages = [assembler, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Hyper-parameter grid for cross-validation: 3 maxIter values x 2 regParam values.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(lr.maxIter, [1, 10, 100])
grid_builder = grid_builder.addGrid(lr.regParam, [0.1, 0.01])
paramGrid = grid_builder.build()

In [None]:
# 3-fold cross-validation over the parameter grid, scored with a multiclass evaluator
# that compares the 'prediction' column against the 'Default' label.
cv_evaluator = MulticlassClassificationEvaluator(labelCol='Default', predictionCol='prediction')
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=cv_evaluator,
    numFolds=3
)


In [None]:
# Fit the cross-validated pipeline on the training split and report the wall-clock time taken.
time_s = time()
cv_model = crossval.fit(train_data)
time_e = time()

elapsed_seconds = time_e - time_s
print('Total training time: %f' % elapsed_seconds)

In [None]:
def print_metrics(predictions_and_labels):
    """Print precision/recall per class, an F-score and the confusion matrix.

    Args:
        predictions_and_labels: RDD of (prediction, label) float pairs, passed
            unchanged to ``MulticlassMetrics``.

    Returns:
        None. All results are written to standard output.
    """
    metrics = MulticlassMetrics(predictions_and_labels)
    print('Precision of True ', metrics.precision(1))
    print('Precision of False', metrics.precision(0))
    print('Recall of True    ', metrics.recall(1))
    print('Recall of False   ', metrics.recall(0))
    # The no-argument fMeasure() is deprecated in Spark 2.x (it returns accuracy, not an
    # F-score) and raises a TypeError on Spark 3.x, where a label is required.
    # weightedFMeasure() exists in both lines and reports the label-weighted F1.
    print('F-1 Score         ', metrics.weightedFMeasure())
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())

In [None]:
# Persist the held-out split before it is scored by the fitted model in the next cell.
test_data.persist()

In [None]:
# Score the held-out split with the fitted model, then evaluate it with the same
# evaluator object that cross-validation used.
# NOTE(review): the printed label says "F1 Accuracy"; the number is whichever metric that
# evaluator is configured with - confirm the label matches.
predictions = cv_model.transform(test_data)
fitted_evaluator = cv_model.getEvaluator()
accuracy = fitted_evaluator.evaluate(predictions)
print('F1 Accuracy: %f' % accuracy)

In [None]:
# Build an RDD of (prediction, label) float pairs, the input that print_metrics hands to
# MulticlassMetrics.
prediction_label_rows = predictions.select("prediction", "Default").rdd
predictions_and_labels = prediction_label_rows.map(
    lambda row: (float(row["prediction"]), float(row["Default"]))
)

In [None]:
# Print precision/recall per class, the F-score and the confusion matrix for the test predictions.
print_metrics(predictions_and_labels)