In [7]:
# import context manager: SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import SQLContext
# set up the session
spark = SparkSession \
        .builder \
        .master('YARN')\
        .appName("DS5110_Final_Project_ED_SQUAD") \
        .config("spark.executor.memory", "256g")\
        .getOrCreate()
sqlContext = SQLContext(spark)

In [8]:
spark

In [5]:
import os
os.listdir()
os.getcwd()

'/sfs/qumulo/qhome/smn7ba/ds5110/project'

In [2]:
#import pandas too for visualizations
import pandas as pd
pd.set_option('display.max_rows', 200000)

In [3]:
%%time
#import mlLib libraries for classification
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder,TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import PCA
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

CPU times: user 4.19 ms, sys: 214 µs, total: 4.41 ms
Wall time: 3.96 ms


### Read Data; Create a binary flag; rename columns; Drop if necessary

In [4]:
%%time
#import whole data from the census
data = spark.read.csv('/project/ds5559/ds5110_project_snoo/acs_15_19_south.csv', inferSchema="true", header="true")

CPU times: user 3.04 ms, sys: 1.13 ms, total: 4.17 ms
Wall time: 24.1 s


In [5]:
%%time
#writing a user defined function to create a Educated or Not Flag - if EDUC>6 then it is 1 and if not 0
#https://towardsdatascience.com/5-ways-to-add-a-new-column-in-a-pyspark-dataframe-4e75c2fd8c08
def EDUCFunc(value):
  if   value > 6: 
      return 1
  else:
      return 0

#create the function to be applied and create a new column EDUC_FLAG
udfsomefunc = F.udf(EDUCFunc, IntegerType())
data = data.withColumn("EDUC_FLAG", udfsomefunc("EDUC"))
#see sample data
data.select('EDUC_FLAG').show(5)

+---------+
|EDUC_FLAG|
+---------+
|        0|
|        1|
|        0|
|        1|
|        0|
+---------+
only showing top 5 rows

CPU times: user 1.95 ms, sys: 2.98 ms, total: 4.93 ms
Wall time: 938 ms


In [6]:
%%time
#check the count for EDUC>6 or verify if flag was populated correctly
data.filter(data.EDUC>6).count()

CPU times: user 815 µs, sys: 2.73 ms, total: 3.54 ms
Wall time: 5.93 s


2470127

In [7]:
%%time
#Verify the flag count. Should match number above
data.filter(data.EDUC_FLAG!=0).count()

CPU times: user 1.74 ms, sys: 2.07 ms, total: 3.82 ms
Wall time: 7.69 s


2470127

### About data
AMERICAN COMMUNITY SURVEY 2015-2019 5-YEAR SAMPLE <br>
5-in-100 national random sample of the population <br>
Contains all households and persons from the 1% ACS samples for 2015, 2016, 2017, 2018, and 2019 identifiable by year. <br>
The data include persons in group quarters. <br>
This is a weighted sample. <br>
The smallest identifiable geographic unit is the PUMA, containing at least 100,000 persons. PUMAs do not cross state boundaries. <br>
Users should read the FAQ on the multi-year data. <br>


WHERE CAN I GET BETTER GEOGRAPHIC IDENTIFIERS? <br>
The lowest unit of geography in the microdata files is still the PUMA. PUMAs contain at least 100,000 people. <br>
Aggregate data (but not microdata) is currently available from the Census Bureau for geographic areas as small as block groups, but only for the entire 2005-2009 period. <br>


PERNUM numbers all persons within each household consecutively in the order in which they appear on the original census or survey form. <br>
When combined with SAMPLE and SERIAL, PERNUM uniquely identifies each person within the IPUMS. <br>

MULTYEAR identifies the actual year of survey in multi-year ACS/PRCS samples. <br>

<br>
For example, the 3-year ACS and PRCS data files each include cases from three single-year files. <br>
For these multi-year samples, the YEAR variable identifies the last year of data (2007 for the 2005-2007 3-year data; 2008 for the 2006-2008 data; and so on). <br>
MULTYEAR gives the single-year sample from which the case was drawn (2005, 2006, or 2007 for the 2005-2007 3-year data; 2006, 2007, or 2008 for the 2006-2008 3-year data; and so on). <br>

https://usa.ipums.org/usa/acs_multyr.shtml


In [8]:
%%time
#renaming dependent variable to label because the classfier is not recognizing other names. Skip thsi if you are trying other classifiers

df = data.withColumn("label",data.EDUC_FLAG) \
      .drop("EDUC_FLAG")

CPU times: user 252 µs, sys: 1.2 ms, total: 1.45 ms
Wall time: 43 ms


In [9]:
#saving col names in case if we can use it later ot iterate or use the list for labels etc.
cols = df.columns
#spark.createDataFrame(cols,StringType()).toPandas()

### EDA

In [10]:
#displaying number of rows and columns in the data
print((df.count(), len(df.columns)))

(5965249, 206)


In [11]:
%%time
#number of years in the data set
df.select('MULTYEAR').distinct().show()

+--------+
|MULTYEAR|
+--------+
|    2018|
|    2015|
|    2019|
|    2016|
|    2017|
+--------+

CPU times: user 1.16 ms, sys: 1.23 ms, total: 2.4 ms
Wall time: 6.26 s


In [12]:
#sampling data to use more effeciently; seed = 42
#https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.sampleBy.html
#https://towardsdatascience.com/exploratory-data-analysis-eda-with-pyspark-on-databricks-e8d6529626b1
#https://www.kaggle.com/tientd95/advanced-pyspark-for-exploratory-data-analysis
sampled = df.sampleBy("MULTYEAR", fractions={2015: 0.25, 2016: 0.25, 2017:0.25, 2018:0.25, 2019:0.25}, seed=42)
sampled.groupBy("MULTYEAR").count().orderBy("MULTYEAR").show()

+--------+------+
|MULTYEAR| count|
+--------+------+
|    2015|293037|
|    2016|295407|
|    2017|298937|
|    2018|300273|
|    2019|304384|
+--------+------+



In [13]:
#map to create meanign ful table # wip
'''
hhtype_dict = {'0':'N/A',\
            '1': 'Married-couple family household',\
            '2': 'Male householder, no wife present',\
            '3': 'Female householder, no husband present',\
            '4': 'Male householder, living alone',\
            '5': 'Male householder, not living alone',\
            '6': 'Female householder, living alone',\
            '7': 'Female householder, not living alone',\
            '9': 'HHTYPE could not be determined'}
'''

"\nhhtype_dict = {'0':'N/A',            '1': 'Married-couple family household',            '2': 'Male householder, no wife present',            '3': 'Female householder, no husband present',            '4': 'Male householder, living alone',            '5': 'Male householder, not living alone',            '6': 'Female householder, living alone',            '7': 'Female householder, not living alone',            '9': 'HHTYPE could not be determined'}\n"

In [14]:
#sampled.select('HHTYPE').rdd.map(lambda x: hhtype_dict.get(x) ).take(5)

In [15]:
%%time
##aggregating counts by Year and HHtype for sampple dataset - add labels - convert to visualization
#sampled_n = sampled.select('HHTYPE').rdd.map(lambda x: hhtype_dict.get(x) )
#sampled.filter((sampled.HHTYPE!=0) & (sampled.HHTYPE!=9)).groupBy('MULTYEAR','HHTYPE').count()\
    #.orderBy('MULTYEAR','count', ascending=False).show(100,truncate=False)
sampled.filter((sampled.HHTYPE!=0) & (sampled.HHTYPE!=9)).groupBy('MULTYEAR','HHTYPE').count()\
    .orderBy('MULTYEAR','count', ascending=False).show(100,truncate=False)

+--------+------+------+
|MULTYEAR|HHTYPE|count |
+--------+------+------+
|2019    |1     |183494|
|2019    |3     |37065 |
|2019    |6     |20111 |
|2019    |4     |14338 |
|2019    |2     |10785 |
|2019    |5     |3884  |
|2019    |7     |3201  |
|2018    |1     |180873|
|2018    |3     |37453 |
|2018    |6     |19439 |
|2018    |4     |14157 |
|2018    |2     |10990 |
|2018    |5     |3924  |
|2018    |7     |3197  |
|2017    |1     |178903|
|2017    |3     |38048 |
|2017    |6     |19292 |
|2017    |4     |14011 |
|2017    |2     |10345 |
|2017    |5     |3998  |
|2017    |7     |3160  |
|2016    |1     |176609|
|2016    |3     |37926 |
|2016    |6     |19519 |
|2016    |4     |13590 |
|2016    |2     |10355 |
|2016    |5     |3757  |
|2016    |7     |3223  |
|2015    |1     |174786|
|2015    |3     |38279 |
|2015    |6     |19423 |
|2015    |4     |13368 |
|2015    |2     |10081 |
|2015    |5     |3818  |
|2015    |7     |3045  |
+--------+------+------+

CPU times: user 3.38 ms,

In [None]:
sampled.printSchema()

### Transform Data; Scale; PCA; RF Classification - seed 42

In [17]:
%%time
#pass all the features into vector assembler to create a vector format to pass tto the classification model
assembler = VectorAssembler(inputCols=[cols for cols in cols if cols!='label'], outputCol="features") 
transformed = assembler.transform(sampled)
#register table as sql table and keep only columns fo interest and save in a new dataframe. This can be done without using SQl as well.
transformed.registerTempTable('transformed_tbl')
transformed_df = sqlContext.sql('select label,features from transformed_tbl')
transformed_df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|(205,[0,1,2,3,4,5...|
|    0|(205,[0,1,2,3,4,5...|
|    1|(205,[0,1,2,3,4,5...|
|    1|(205,[0,1,2,3,4,5...|
|    1|(205,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows

CPU times: user 8.94 ms, sys: 2.91 ms, total: 11.9 ms
Wall time: 2.82 s


In [18]:
%%time
#scale the data
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(transformed_df)
scaledData = scalerModel.transform(transformed_df)

CPU times: user 6.27 ms, sys: 767 µs, total: 7.04 ms
Wall time: 19.1 s


In [19]:
%%time
#check sample scaled data
scaledData.select("label","scaledFeatures").show(5)

+-----+--------------------+
|label|      scaledFeatures|
+-----+--------------------+
|    1|(205,[0,1,2,3,4,5...|
|    0|(205,[0,1,2,3,4,5...|
|    1|(205,[0,1,2,3,4,5...|
|    1|(205,[0,1,2,3,4,5...|
|    1|(205,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows

CPU times: user 1.29 ms, sys: 815 µs, total: 2.1 ms
Wall time: 2.27 s


In [20]:
%%time
#randomly split data
training_data, test_data = scaledData.randomSplit([0.7, 0.3], seed=42)
cached_tr = training_data.cache()

CPU times: user 833 µs, sys: 833 µs, total: 1.67 ms
Wall time: 92.8 ms


In [None]:
%%time
#pca to reduce 200 odd features into principal components - on training data only because that is our model
#this takes a while to run. imagine it is running at least 9 combinations models with 3 folds and picking the best. Reduce parameters or folds if you want it to run faster
pca_model = PCA(inputCol = "scaledFeatures", outputCol = "pca_features_cv")

#create a randomforest classifier model to pass into pipeline
rf = RandomForestClassifier(labelCol = "label", featuresCol = "pca_features_cv")

#creating a pipeline with the pca and model to use in the cross validator
ppl_cv = Pipeline(stages = [pca_model, rf])


#create a param grid to pass to cross validator 
#k --> number of principal components
#number of treess in rf
#need to add more later
paramGrid = ParamGridBuilder() \
  .addGrid(pca_model.k, [10, 20, 30]) \
  .addGrid(rf.numTrees, [20, 30, 50]) \
  .build()

#passs the model with variosu combinations of the parameters and it will pick the best one. Using 3 folds to save time. Check seed=42.
crossval = CrossValidator(estimator = ppl_cv,\
                                        estimatorParamMaps=paramGrid,\
                                        evaluator = MulticlassClassificationEvaluator(),\
                                        numFolds= 3,seed=42)


#this is our best model - fit the training data
cv_model = crossval.fit(cached_tr)

In [22]:
#all the 9 model accuracies. The max one was picked as best
avgMetricsGrid = cv_model.avgMetrics
print(avgMetricsGrid)

#https://tsmatz.github.io/azure-databricks-exercise/exercise04-hyperparams-tuning.html
#https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.tuning.CrossValidator.html
# View all results (accuracy) by each params - these can be converted to pretty tables in pandas later
list(zip(cv_model.getEstimatorParamMaps()))

NameError: name 'cv_model' is not defined

In [None]:
%%time
#predict and evaluate the model for accuracy
predictions = cv_model.transform(test_data)
evaluator= MulticlassClassificationEvaluator(labelCol = "label", metricName= "accuracy")
accuracy = evaluator.evaluate(predictions)

In [None]:
#increased accuracy with binary flag
print(accuracy)

### confusion matrix, threshold, roc and other fun stuff

In [None]:
##more to come

### Model without PCA but selective features and QQ variables dropped

In [None]:
#more to come

In [None]:
#References
#https://awesomeopensource.com/project/adornes/spark_python_ml_examples
#https://spark.apache.org/docs/latest/ml-tuning.html
#https://sparkbyexamples.com/pyspark/pyspark-rename-dataframe-column/
#https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier
#https://people.stat.sc.edu/haigang/sparkCaseStudy.html