## HMEQ Data Mining - PySpark

### Connect to Spark
Read in Data from Hadoop

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from matplotlib import pyplot as plt
import pandas as pd

#Create Spark Session
spark=SparkSession.builder.appName("simple").getOrCreate()

#Read in text file from HDFS
HMEQ = spark.read.csv('hdfs://localhost:54310/user/andrew/data/HMEQ.csv', header=True)

In [2]:
HMEQ.show(10)
#HMEQ.select('BAD').show() one column

+---+----+-------+------+-------+------+----+-----+------+-----------+----+----+-----------+
|BAD|LOAN|MORTDUE| VALUE| REASON|   JOB| YOJ|DEROG|DELINQ|      CLAGE|NINQ|CLNO|    DEBTINC|
+---+----+-------+------+-------+------+----+-----+------+-----------+----+----+-----------+
|  1|1100|  25860| 39025|HomeImp| Other|10.5|    0|     0|94.36666667|   1|   9|       null|
|  1|1300|  70053| 68400|HomeImp| Other|   7|    0|     2|121.8333333|   0|  14|       null|
|  1|1500|  13500| 16700|HomeImp| Other|   4|    0|     0|149.4666667|   1|  10|       null|
|  1|1500|   null|  null|   null|  null|null| null|  null|       null|null|null|       null|
|  0|1700|  97800|112000|HomeImp|Office|   3|    0|     0|93.33333333|   0|  14|       null|
|  1|1700|  30548| 40320|HomeImp| Other|   9|    0|     0|101.4660019|   1|   8|37.11361356|
|  1|1800|  48649| 57037|HomeImp| Other|   5|    3|     2|       77.1|   1|  17|       null|
|  1|1800|  28502| 43034|HomeImp| Other|  11|    0|     0|88.76602988|

### Summary Statistics

In [3]:
HMEQ.describe(HMEQ.columns[0:4]).show()

+-------+-----------------+------------------+------------------+------------------+
|summary|              BAD|              LOAN|           MORTDUE|             VALUE|
+-------+-----------------+------------------+------------------+------------------+
|  count|             5960|              5960|              5442|              5848|
|   mean|0.199496644295302| 18607.96979865772| 73760.81719955898|101776.04874145007|
| stddev|0.399655517450269|11207.480416693992|44457.609458415885|  57385.7753337027|
|    min|                0|             10000|             10000|            100000|
|    max|                1|              9900|              9999|             99997|
+-------+-----------------+------------------+------------------+------------------+



In [4]:
HMEQ.describe(HMEQ.columns[4:9]).show()

+-------+-------+----+-----------------+------------------+------------------+
|summary| REASON| JOB|              YOJ|             DEROG|            DELINQ|
+-------+-------+----+-----------------+------------------+------------------+
|  count|   5708|5681|             5445|              5252|              5380|
|   mean|   null|null|8.922268135904508|0.2545696877380046|0.4494423791821561|
| stddev|   null|null|7.573982248898881| 0.846046777086318|1.1272659176049744|
|    min|DebtCon| Mgr|                0|                 0|                 0|
|    max|HomeImp|Self|              9.9|                 9|                 8|
+-------+-------+----+-----------------+------------------+------------------+



In [5]:
HMEQ.describe(HMEQ.columns[9:len(HMEQ.columns)]).show()

+-------+-----------------+------------------+------------------+-----------------+
|summary|            CLAGE|              NINQ|              CLNO|          DEBTINC|
+-------+-----------------+------------------+------------------+-----------------+
|  count|             5652|              5450|              5738|             4693|
|   mean|179.7662751868336|1.1860550458715597| 21.29609620076682|33.77991534872161|
| stddev|85.81009176264283|1.7286749712080534|10.138933192458637|8.601746186462469|
|    min|                0|                 0|                 0|      0.524499215|
|    max|      99.99378982|                 9|                 9|      91.61259998|
+-------+-----------------+------------------+------------------+-----------------+



### Count Missing

In [6]:
def null_count(column):
    return sum(col(column).isNull().cast("integer")).alias(column)

null_ct = [null_count(column) for column in HMEQ.columns]
HMEQ.agg(*null_ct).show()

+---+----+-------+-----+------+---+---+-----+------+-----+----+----+-------+
|BAD|LOAN|MORTDUE|VALUE|REASON|JOB|YOJ|DEROG|DELINQ|CLAGE|NINQ|CLNO|DEBTINC|
+---+----+-------+-----+------+---+---+-----+------+-----+----+----+-------+
|  0|   0|    518|  112|   252|279|515|  708|   580|  308| 510| 222|   1267|
+---+----+-------+-----+------+---+---+-----+------+-----+----+----+-------+



### Plot missing values

In [7]:
#Bring to pandas
df = HMEQ.agg(*null_ct).toPandas()

#Plot
tbl_foreplot = pd.Series(list(df.values[0]), index=list(df.columns))
missing_val = tbl_foreplot.plot(kind='bar', title='% Missing', color='c', figsize=(10,6))
missing_val.set_ylabel("Percent Missing")
missing_val.set_xlabel("Variable Names")
plt.show()

### Convert Numeric into Double

In [10]:
character_cols=['REASON','JOB']
numeric_cols= [column for column in HMEQ.columns if column not in character_cols + ['BAD']]
for col in numeric_cols:
    HMEQ = HMEQ.withColumn(col, HMEQ[col].cast("Double"))

### Treat Character Variables

In [11]:
for a in character_cols:
    print a+"index"

REASONindex
JOBindex


In [12]:
#Drop nulls temporarily - will add imputer
HMEQ = HMEQ.na.drop()

stages=[]
#Create dummy variables
for char_col in character_cols:
    stringIndexer = StringIndexer(inputCol=char_col, outputCol=char_col+"Index")
    encoder = OneHotEncoder(inputCol=char_col+"Index", outputCol=char_col+"classVec")
    stages += [stringIndexer, encoder]

### Prepare Pipeline

In [13]:
### Add label for target outcome
label_target = StringIndexer(inputCol='BAD', outputCol='label')
stages += [label_target]

#Create a input vector
vectorInputs=[column+'classVec' for column in character_cols] + numeric_cols
vectorInputs=numeric_cols
assembler = VectorAssembler(inputCols=vectorInputs, outputCol="features_indep_var")
stages += [assembler]

#Create  a pipeline
pipeline = Pipeline(stages=stages) 
model = pipeline.fit(HMEQ) #Compute transformations
model_dataset = model.transform(HMEQ) #Apply transformations to new dataset

#Drop columns not needed
keepCols=['label', 'features_indep_var'] + HMEQ.columns
HMEQ_prepped = model_dataset.select(keepCols)



### Split into Training and Validation

In [35]:
HMEQ.columns

['BAD', 'LOAN', 'MORTDUE', 'VALUE', 'REASON', 'JOB', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']

In [36]:
(HMEQ_train, HMEQ_valid) = HMEQ_prepped.randomSplit([.7,.3], seed=240)
HMEQ_train[HMEQ_train.columns[0:8]].show()
HMEQ_train[HMEQ_train.columns[8:len(HMEQ_train.columns)]].show()
print HMEQ_train.count(), '- Training Row Count'
print HMEQ_valid.count(), '- Validation Row Count'
print HMEQ.count(), '- Total Row Count'

+-----+--------------------+---+------+--------+--------+-------+-------+
|label|  features_indep_var|BAD|  LOAN| MORTDUE|   VALUE| REASON|    JOB|
+-----+--------------------+---+------+--------+--------+-------+-------+
|  0.0|[2400.0,98449.0,1...|  0|2400.0| 98449.0|117195.0|HomeImp| Office|
|  0.0|[2900.0,103949.0,...|  0|2900.0|103949.0|112505.0|HomeImp| Office|
|  0.0|[2900.0,104373.0,...|  0|2900.0|104373.0|120702.0|HomeImp| Office|
|  0.0|[3000.0,104570.0,...|  0|3000.0|104570.0|121729.0|HomeImp| Office|
|  0.0|[3600.0,52337.0,6...|  0|3600.0| 52337.0| 63989.0|HomeImp| Office|
|  0.0|[3600.0,100693.0,...|  0|3600.0|100693.0|114743.0|HomeImp| Office|
|  0.0|[3800.0,51180.0,6...|  0|3800.0| 51180.0| 63459.0|HomeImp| Office|
|  0.0|[3900.0,102143.0,...|  0|3900.0|102143.0|118742.0|HomeImp| Office|
|  0.0|[4200.0,50216.0,5...|  0|4200.0| 50216.0| 58541.0|HomeImp| Office|
|  0.0|[4200.0,56544.0,5...|  0|4200.0| 56544.0| 59218.0|HomeImp| Office|
|  0.0|[4300.0,68815.0,7...|  0|4300.0

### Random Forest Model

In [29]:
#label is target
#create Model
forest = RandomForestClassifier(labelCol="label", featuresCol='features_indep_var', maxDepth=7)

#Train Data
forestModel = forest.fit(HMEQ_train)

#Predict on training and validation
train_pred = forestModel.transform(HMEQ_train)
valid_pred = forestModel.transform(HMEQ_valid)

#AUC
AUC = BinaryClassificationEvaluator()
print AUC.evaluate(train_pred), ' Train AUC'
print AUC.evaluate(valid_pred), ' Validation AUC'

0.924544127025
0.885934250697
