In [1]:
# import context manager: SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import SQLContext
# set up the session
spark = SparkSession.builder.getOrCreate()

sqlContext = SQLContext(spark)

In [2]:
#import pandas too for visualizations
import pandas as pd
pd.set_option('display.max_rows', 200000)

In [3]:
#import mlLib libraries for classification
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder,TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import PCA
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

### Read Data; Create a binary flag; rename columns; Drop if necessary

In [4]:
#import whole data from the census
data = spark.read.csv('/project/ds5559/ds5110_project_snoo/acs_15_19_south.csv', inferSchema="true", header="true")

In [5]:
#writing a user defined function to create a Educated or Not Flag - if EDUC>6 then it is 1 and if not 0
#https://towardsdatascience.com/5-ways-to-add-a-new-column-in-a-pyspark-dataframe-4e75c2fd8c08
def EDUCFunc(value):
  if   value > 6: 
      return 1
  else:
      return 0

#create the function to be applied and create a new column EDUC_FLAG
udfsomefunc = F.udf(EDUCFunc, IntegerType())
data = data.withColumn("EDUC_FLAG", udfsomefunc("EDUC"))
#see sample data
data.select('EDUC_FLAG').show(5)

+---------+
|EDUC_FLAG|
+---------+
|        0|
|        1|
|        0|
|        1|
|        0|
+---------+
only showing top 5 rows



In [6]:
#check the count for EDUC>6 or verify if flag was populated correctly
data.filter(data.EDUC>6).count()

2470127

In [7]:
#Verify the flag count. Should match number above
data.filter(data.EDUC_FLAG!=0).count()

2470127

### About data
AMERICAN COMMUNITY SURVEY 2015-2019 5-YEAR SAMPLE <br>
5-in-100 national random sample of the population <br>
Contains all households and persons from the 1% ACS samples for 2015, 2016, 2017, 2018, and 2019 identifiable by year. <br>
The data include persons in group quarters. <br>
This is a weighted sample. <br>
The smallest identifiable geographic unit is the PUMA, containing at least 100,000 persons. PUMAs do not cross state boundaries. <br>
Users should read the FAQ on the multi-year data. <br>


WHERE CAN I GET BETTER GEOGRAPHIC IDENTIFIERS? <br>
The lowest unit of geography in the microdata files is still the PUMA. PUMAs contain at least 100,000 people. <br>
Aggregate data (but not microdata) is currently available from the Census Bureau for geographic areas as small as block groups, but only for the entire 2005-2009 period. <br>


PERNUM numbers all persons within each household consecutively in the order in which they appear on the original census or survey form. <br>
When combined with SAMPLE and SERIAL, PERNUM uniquely identifies each person within the IPUMS. <br>

MULTYEAR identifies the actual year of survey in multi-year ACS/PRCS samples. <br>

<br>
For example, the 3-year ACS and PRCS data files each include cases from three single-year files. <br>
For these multi-year samples, the YEAR variable identifies the last year of data (2007 for the 2005-2007 3-year data; 2008 for the 2006-2008 data; and so on). <br>
MULTYEAR gives the single-year sample from which the case was drawn (2005, 2006, or 2007 for the 2005-2007 3-year data; 2006, 2007, or 2008 for the 2006-2008 3-year data; and so on). <br>

https://usa.ipums.org/usa/acs_multyr.shtml


In [7]:
#renaming dependent variable to label because the classfier is not recognizing other names. Skip thsi if you are trying other classifiers
from pyspark.sql.functions import *
df = data.withColumn("label",col("EDUC_FLAG")) \
      .drop("EDUC_FLAG")

In [8]:
#saving col names in case if we can use it later ot iterate or use the list for labels etc.
cols = df.columns
#spark.createDataFrame(cols,StringType()).toPandas()

### EDA

In [11]:
#displaying number of rows and columns in the data
print((df.count(), len(df.columns)))

(5965249, 206)


In [12]:
#number of years in the data set
df.select('MULTYEAR').distinct().show()

+--------+
|MULTYEAR|
+--------+
|    2018|
|    2015|
|    2019|
|    2016|
|    2017|
+--------+



In [9]:
#sampling data to use more effeciently; seed = 42
#https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.sampleBy.html
#https://towardsdatascience.com/exploratory-data-analysis-eda-with-pyspark-on-databricks-e8d6529626b1
#https://www.kaggle.com/tientd95/advanced-pyspark-for-exploratory-data-analysis
sampled = df.sampleBy("MULTYEAR", fractions={2015: 0.03, 2016: 0.03, 2017:0.03, 2018:0.03, 2019:0.03}, seed=42)
sampled.groupBy("MULTYEAR").count().orderBy("MULTYEAR").show()

+--------+-----+
|MULTYEAR|count|
+--------+-----+
|    2015|34940|
|    2016|35487|
|    2017|35961|
|    2018|36031|
|    2019|36488|
+--------+-----+



In [None]:
#map to create meanign ful table # wip
'''
hhtype_dict = {'0':'N/A',\
            '1': 'Married-couple family household',\
            '2': 'Male householder, no wife present',\
            '3': 'Female householder, no husband present',\
            '4': 'Male householder, living alone',\
            '5': 'Male householder, not living alone',\
            '6': 'Female householder, living alone',\
            '7': 'Female householder, not living alone',\
            '9': 'HHTYPE could not be determined'}
'''

In [None]:
#sampled.select('HHTYPE').rdd.map(lambda x: hhtype_dict.get(x) ).take(5)

In [19]:
##aggregating counts by Year and HHtype for sampple dataset - add labels - convert to visualization
#sampled_n = sampled.select('HHTYPE').rdd.map(lambda x: hhtype_dict.get(x) )
sampled.filter((sampled.HHTYPE!=0) & (sampled.HHTYPE!=9)).groupBy('MULTYEAR','HHTYPE').count()\
    .orderBy('MULTYEAR','count', ascending=False).show(100,truncate=False)

+--------+------+-----+
|MULTYEAR|HHTYPE|count|
+--------+------+-----+
|2019    |1     |22462|
|2019    |3     |4466 |
|2019    |6     |2421 |
|2019    |4     |1733 |
|2019    |2     |1333 |
|2019    |5     |481  |
|2019    |7     |375  |
|2018    |1     |21907|
|2018    |3     |4512 |
|2018    |6     |2394 |
|2018    |4     |1706 |
|2018    |2     |1342 |
|2018    |5     |463  |
|2018    |7     |410  |
|2017    |1     |21676|
|2017    |3     |4582 |
|2017    |6     |2397 |
|2017    |4     |1669 |
|2017    |2     |1247 |
|2017    |5     |510  |
|2017    |7     |391  |
|2016    |1     |21697|
|2016    |3     |4694 |
|2016    |6     |2361 |
|2016    |4     |1668 |
|2016    |2     |1268 |
|2016    |5     |450  |
|2016    |7     |432  |
|2015    |1     |21096|
|2015    |3     |4747 |
|2015    |6     |2367 |
|2015    |4     |1640 |
|2015    |2     |1272 |
|2015    |5     |476  |
|2015    |7     |335  |
+--------+------+-----+



In [20]:
sampled.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- MULTYEAR: integer (nullable = true)
 |-- SAMPLE: integer (nullable = true)
 |-- SERIAL: integer (nullable = true)
 |-- CBSERIAL: long (nullable = true)
 |-- HHWT: double (nullable = true)
 |-- HHTYPE: integer (nullable = true)
 |-- CLUSTER: long (nullable = true)
 |-- REGION: integer (nullable = true)
 |-- STATEFIP: integer (nullable = true)
 |-- COUNTYFIP: integer (nullable = true)
 |-- METRO: integer (nullable = true)
 |-- STRATA: integer (nullable = true)
 |-- GQ: integer (nullable = true)
 |-- OWNERSHP: integer (nullable = true)
 |-- OWNERSHPD: integer (nullable = true)
 |-- MORTGAGE: integer (nullable = true)
 |-- TAXINCL: integer (nullable = true)
 |-- INSINCL: integer (nullable = true)
 |-- PROPINSR: integer (nullable = true)
 |-- COSTELEC: integer (nullable = true)
 |-- COSTGAS: integer (nullable = true)
 |-- COSTWATR: integer (nullable = true)
 |-- COSTFUEL: integer (nullable = true)
 |-- FOODST

### Transform Data; Scale; PCA; RF Classification - seed 42

In [11]:
#pass all the features into vector assembler to create a vector format to pass tto the classification model
assembler = VectorAssembler(inputCols=[cols for cols in sampled.columns if cols!='label'], outputCol="features") 
transformed = assembler.transform(sampled)
#register table as sql table and keep only columns fo interest and save in a new dataframe. This can be done without using SQl as well.
transformed.registerTempTable('transformed_tbl')
transformed_df = sqlContext.sql('select label,features from transformed_tbl')
transformed_df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(205,[0,1,2,3,4,5...|
|    1|[105.0,2019.0,201...|
|    0|(205,[0,1,2,3,4,5...|
|    0|(205,[0,1,2,3,4,5...|
|    0|(205,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows



In [12]:
#scale the data
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(transformed_df)
scaledData = scalerModel.transform(transformed_df)

In [13]:
#check sample scaled data
scaledData.select("label","scaledFeatures").show(5)

+-----+--------------------+
|label|      scaledFeatures|
+-----+--------------------+
|    0|(205,[0,1,2,3,4,5...|
|    1|[2.18691054270554...|
|    0|(205,[0,1,2,3,4,5...|
|    0|(205,[0,1,2,3,4,5...|
|    0|(205,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows



In [14]:
#randomly split data
training_data, test_data = scaledData.randomSplit([0.7, 0.3], seed=42)

In [15]:
#pca to reduce 200 odd features into principal components - on training data only because that is our model
#this takes a while to run. imagine it is running at least 9 combinations models with 3 folds and picking the best. Reduce parameters or folds if you want it to run faster
pca_model = PCA(inputCol = "scaledFeatures", outputCol = "pca_features_cv")

#create a randomforest classifier model to pass into pipeline
rf = RandomForestClassifier(labelCol = "label", featuresCol = "pca_features_cv")

#creating a pipeline with the pca and model to use in the cross validator
ppl_cv = Pipeline(stages = [pca_model, rf])


#create a param grid to pass to cross validator 
#k --> number of principal components
#number of treess in rf
#need to add more later
paramGrid = ParamGridBuilder() \
  .addGrid(pca_model.k, [10, 20, 30]) \
  .addGrid(rf.numTrees, [20, 30, 50]) \
  .build()

#passs the model with variosu combinations of the parameters and it will pick the best one. Using 3 folds to save time. Check seed=42.
crossval = CrossValidator(estimator = ppl_cv,\
                                        estimatorParamMaps=paramGrid,\
                                        evaluator = MulticlassClassificationEvaluator(),\
                                        numFolds= 3,seed=42)


#this is our best model - fit the training data
cv_model = crossval.fit(training_data)

In [29]:
#all the 9 model accuracies. The max one was picked as best
avgMetricsGrid = cv_model.avgMetrics
print(avgMetricsGrid)

#https://tsmatz.github.io/azure-databricks-exercise/exercise04-hyperparams-tuning.html
# View all results (accuracy) by each params - these can be converted to pretty tables in pandas later
list(zip(cv_model.getEstimatorParamMaps()))

[0.7723960143219148, 0.7733325749792399, 0.7727217338656844, 0.7974568204660167, 0.8025227590273747, 0.8040922137339593, 0.8017722452652558, 0.8052520576819326, 0.8029655061557093]


[({Param(parent='PCA_f75115576b9f', name='k', doc='the number of principal components'): 10,
   Param(parent='RandomForestClassifier_46a92114fd56', name='numTrees', doc='Number of trees to train (>= 1).'): 20},),
 ({Param(parent='PCA_f75115576b9f', name='k', doc='the number of principal components'): 10,
   Param(parent='RandomForestClassifier_46a92114fd56', name='numTrees', doc='Number of trees to train (>= 1).'): 30},),
 ({Param(parent='PCA_f75115576b9f', name='k', doc='the number of principal components'): 10,
   Param(parent='RandomForestClassifier_46a92114fd56', name='numTrees', doc='Number of trees to train (>= 1).'): 50},),
 ({Param(parent='PCA_f75115576b9f', name='k', doc='the number of principal components'): 20,
   Param(parent='RandomForestClassifier_46a92114fd56', name='numTrees', doc='Number of trees to train (>= 1).'): 20},),
 ({Param(parent='PCA_f75115576b9f', name='k', doc='the number of principal components'): 20,
   Param(parent='RandomForestClassifier_46a92114fd56', 

In [18]:
#predict and evaluate the model for accuracy
predictions = cv_model.transform(test_data)
evaluator= MulticlassClassificationEvaluator(labelCol = "label", metricName= "accuracy")
accuracy = evaluator.evaluate(predictions)

In [19]:
#increased accuracy with binary flag
print(accuracy)

0.8103416571659741


### Model without PCA but selective features and QQ variables dropped

In [None]:
#more to come

In [None]:
#References
#https://awesomeopensource.com/project/adornes/spark_python_ml_examples
#https://spark.apache.org/docs/latest/ml-tuning.html
#https://sparkbyexamples.com/pyspark/pyspark-rename-dataframe-column/
#https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier
#https://people.stat.sc.edu/haigang/sparkCaseStudy.html