In [1]:
# import context manager: SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

# set up the session
spark = SparkSession.builder.getOrCreate()


In [2]:
import pandas as pd
pd.set_option('display.max_rows', 200000)

In [3]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder,TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import PCA
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics


In [4]:
data = spark.read.csv('/project/ds5559/ds5110_project_snoo/acs_15_19_south.csv', inferSchema="true", header="true")

In [5]:
from pyspark.sql.functions import *
df = data.withColumn("label",col("EDUC")) \
      .drop("EDUC")

In [6]:
#running into space memory issues
#df = data.drop('_c0').collect()

In [7]:
cols = df.columns
#spark.createDataFrame(cols,StringType()).toPandas()

In [8]:
print((df.count(), len(df.columns)))

(5965249, 205)


In [9]:
df.select('MULTYEAR').distinct().show()

+--------+
|MULTYEAR|
+--------+
|    2018|
|    2015|
|    2019|
|    2016|
|    2017|
+--------+



In [None]:
#https://towardsdatascience.com/exploratory-data-analysis-eda-with-pyspark-on-databricks-e8d6529626b1
#https://www.kaggle.com/tientd95/advanced-pyspark-for-exploratory-data-analysis
data.filter((df.HHTYPE!=0) & (df.HHTYPE!=9)).groupBy('MULTYEAR','HHTYPE').count()\
    .orderBy('MULTYEAR','count', ascending=True).show(100,truncate=False)

In [11]:
#https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.sampleBy.html
sampled = df.sampleBy("MULTYEAR", fractions={2015: 0.03, 2016: 0.03, 2017:0.03, 2018:0.03, 2019:0.03}, seed=42)
sampled.groupBy("MULTYEAR").count().orderBy("MULTYEAR").show()

+--------+-----+
|MULTYEAR|count|
+--------+-----+
|    2015|34940|
|    2016|35487|
|    2017|35961|
|    2018|36031|
|    2019|36488|
+--------+-----+



In [12]:
sampled.filter((sampled.HHTYPE!=0) & (sampled.HHTYPE!=9)).groupBy('MULTYEAR','HHTYPE').count()\
    .orderBy('MULTYEAR','count', ascending=False).show(100,truncate=False)

+--------+------+-----+
|MULTYEAR|HHTYPE|count|
+--------+------+-----+
|2019    |1     |22462|
|2019    |3     |4466 |
|2019    |6     |2421 |
|2019    |4     |1733 |
|2019    |2     |1333 |
|2019    |5     |481  |
|2019    |7     |375  |
|2018    |1     |21907|
|2018    |3     |4512 |
|2018    |6     |2394 |
|2018    |4     |1706 |
|2018    |2     |1342 |
|2018    |5     |463  |
|2018    |7     |410  |
|2017    |1     |21676|
|2017    |3     |4582 |
|2017    |6     |2397 |
|2017    |4     |1669 |
|2017    |2     |1247 |
|2017    |5     |510  |
|2017    |7     |391  |
|2016    |1     |21697|
|2016    |3     |4694 |
|2016    |6     |2361 |
|2016    |4     |1668 |
|2016    |2     |1268 |
|2016    |5     |450  |
|2016    |7     |432  |
|2015    |1     |21096|
|2015    |3     |4747 |
|2015    |6     |2367 |
|2015    |4     |1640 |
|2015    |2     |1272 |
|2015    |5     |476  |
|2015    |7     |335  |
+--------+------+-----+



In [None]:
sampled.printSchema()

In [13]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[cols for cols in sampled.columns if cols!='label'], outputCol="features") 
transformed = assembler.transform(sampled)
transformed.select("label","features").show(2, truncate=False)

+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                   

In [14]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(transformed)
scaledData = scalerModel.transform(transformed)

scaledData.select("label","scaledFeatures").show(2, truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
training_data, test_data = scaledData.randomSplit([0.7, 0.3], seed=42)

In [16]:
pca_model = PCA(inputCol = "scaledFeatures", outputCol = "pca_features_cv")
rf = RandomForestClassifier(labelCol = "label", featuresCol = "pca_features_cv")
ppl_cv = Pipeline(stages = [pca_model, rf])

paramGrid = ParamGridBuilder() \
  .addGrid(pca_model.k, [10, 20, 30, 40, 50]) \
  .addGrid(rf.numTrees, [20, 30, 50]) \
  .build()

crossval = CrossValidator(estimator = ppl_cv,\
                                        estimatorParamMaps=paramGrid,\
                                        evaluator = MulticlassClassificationEvaluator(),\
                                        numFolds= 3,seed=42)



cv_model = crossval.fit(training_data)

In [19]:
predictions = cv_model.transform(test_data)
evaluator= MulticlassClassificationEvaluator(labelCol = "label", metricName= "accuracy")
accuracy = evaluator.evaluate(predictions)

In [20]:
print(accuracy)

0.5069876061932257


In [None]:
#https://awesomeopensource.com/project/adornes/spark_python_ml_examples
#https://spark.apache.org/docs/latest/ml-tuning.html
#https://sparkbyexamples.com/pyspark/pyspark-rename-dataframe-column/
#https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier
#https://people.stat.sc.edu/haigang/sparkCaseStudy.html