## HMEQ Data Mining - PySpark

### Connect to Spark
Read in Data from Hadoop

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from matplotlib import pyplot as plt
import pandas as pd

#Create Spark Session
spark=SparkSession.builder.appName("simple").getOrCreate()

#Read in text file from HDFS
HMEQ = spark.read.csv('hdfs://localhost:54310/user/andrew/data/HMEQ.csv', header=True)

### See Head of the Data

In [3]:
HMEQ.show(10)
#HMEQ.select('BAD').show() one column

+---+----+-------+------+-------+------+----+-----+------+-----------+----+----+-----------+
|BAD|LOAN|MORTDUE| VALUE| REASON|   JOB| YOJ|DEROG|DELINQ|      CLAGE|NINQ|CLNO|    DEBTINC|
+---+----+-------+------+-------+------+----+-----+------+-----------+----+----+-----------+
|  1|1100|  25860| 39025|HomeImp| Other|10.5|    0|     0|94.36666667|   1|   9|       null|
|  1|1300|  70053| 68400|HomeImp| Other|   7|    0|     2|121.8333333|   0|  14|       null|
|  1|1500|  13500| 16700|HomeImp| Other|   4|    0|     0|149.4666667|   1|  10|       null|
|  1|1500|   null|  null|   null|  null|null| null|  null|       null|null|null|       null|
|  0|1700|  97800|112000|HomeImp|Office|   3|    0|     0|93.33333333|   0|  14|       null|
|  1|1700|  30548| 40320|HomeImp| Other|   9|    0|     0|101.4660019|   1|   8|37.11361356|
|  1|1800|  48649| 57037|HomeImp| Other|   5|    3|     2|       77.1|   1|  17|       null|
|  1|1800|  28502| 43034|HomeImp| Other|  11|    0|     0|88.76602988|

### Summary Statistics

In [4]:
HMEQ.describe(HMEQ.columns[0:7]).show()

+-------+-----------------+------------------+------------------+------------------+-------+----+-----------------+
|summary|              BAD|              LOAN|           MORTDUE|             VALUE| REASON| JOB|              YOJ|
+-------+-----------------+------------------+------------------+------------------+-------+----+-----------------+
|  count|             5960|              5960|              5442|              5848|   5708|5681|             5445|
|   mean|0.199496644295302| 18607.96979865772| 73760.81719955898|101776.04874145007|   null|null|8.922268135904508|
| stddev|0.399655517450269|11207.480416693992|44457.609458415885|  57385.7753337027|   null|null|7.573982248898881|
|    min|                0|             10000|             10000|            100000|DebtCon| Mgr|                0|
|    max|                1|              9900|              9999|             99997|HomeImp|Self|              9.9|
+-------+-----------------+------------------+------------------+-------

In [5]:
HMEQ.describe(HMEQ.columns[7:len(HMEQ.columns)]).show()

+-------+------------------+------------------+-----------------+------------------+------------------+-----------------+
|summary|             DEROG|            DELINQ|            CLAGE|              NINQ|              CLNO|          DEBTINC|
+-------+------------------+------------------+-----------------+------------------+------------------+-----------------+
|  count|              5252|              5380|             5652|              5450|              5738|             4693|
|   mean|0.2545696877380046|0.4494423791821561|179.7662751868336|1.1860550458715597| 21.29609620076682|33.77991534872161|
| stddev| 0.846046777086318|1.1272659176049744|85.81009176264283|1.7286749712080534|10.138933192458637|8.601746186462469|
|    min|                 0|                 0|                0|                 0|                 0|      0.524499215|
|    max|                 9|                 8|      99.99378982|                 9|                 9|      91.61259998|
+-------+---------------

### Count Missing

In [6]:
def null_count(column):
    return sum(col(column).isNull().cast("integer")).alias(column)

null_ct = [null_count(column) for column in HMEQ.columns]
HMEQ.agg(*null_ct).show()

+---+----+-------+-----+------+---+---+-----+------+-----+----+----+-------+
|BAD|LOAN|MORTDUE|VALUE|REASON|JOB|YOJ|DEROG|DELINQ|CLAGE|NINQ|CLNO|DEBTINC|
+---+----+-------+-----+------+---+---+-----+------+-----+----+----+-------+
|  0|   0|    518|  112|   252|279|515|  708|   580|  308| 510| 222|   1267|
+---+----+-------+-----+------+---+---+-----+------+-----+----+----+-------+



### Plot missing values

In [7]:
#Bring to pandas
df = HMEQ.agg(*null_ct).toPandas()

#Plot
tbl_foreplot = pd.Series(list(df.values[0]), index=list(df.columns))
missing_val = tbl_foreplot.plot(kind='bar', title='% Missing', color='c', figsize=(10,6))
missing_val.set_ylabel("Percent Missing")
missing_val.set_xlabel("Variable Names")
plt.show()

### Split into Training and Validation

In [14]:
(HMEQ_train, HMEQ_valid) = HMEQ.randomSplit([.7,.3], seed=240)
HMEQ_train.show()
print(HMEQ_train.count(), HMEQ_valid.count(), HMEQ.count())

+---+-----+-------+------+-------+-------+----+-----+------+-----------+----+----+-----------+
|BAD| LOAN|MORTDUE| VALUE| REASON|    JOB| YOJ|DEROG|DELINQ|      CLAGE|NINQ|CLNO|    DEBTINC|
+---+-----+-------+------+-------+-------+----+-----+------+-----------+----+----+-----------+
|  0|10000|   null| 42497|HomeImp|    Mgr|   6| null|  null|165.4333333|null|   9|       null|
|  0|10000|   null| 49266|   null|   null|null| null|  null|       null|null|null|15.59729023|
|  0|10000|   null| 56475|HomeImp|  Other|   9|    2|     0|309.2644975|   2|  12|34.48718804|
|  0|10000|   null| 69502|   null|   null|null| null|  null|       null|null|null|25.77278284|
|  0|10000| 114643|130127|DebtCon|   Self|   5|    0|     1|160.6678931|   0|  19|33.92004439|
|  0|10000| 124000|192000|HomeImp| Office|   0|    0|     1|346.9666667|   0|  16|       null|
|  0|10000| 127319|194992|HomeImp| Office|   1|    0|     1|337.6899717|   0|  16|41.44750595|
|  0|10000| 131589|156000|HomeImp|ProfExe|   5|   

4137