In [1]:
variables = """age: continuous. 
workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. 
fnlwgt: continuous. 
education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. 
education-num: continuous. 
marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. 
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. 
relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 
race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. 
sex: Female, Male. 
capital-gain: continuous. 
capital-loss: continuous. 
hours-per-week: continuous. 
native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
income:>50K, <=50K"""

In [2]:
variable_names = [l.split(':')[0] for l in variables.split('\n')]
variable_names

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

In [3]:
df = spark.read.csv('../../adult.data', inferSchema=True)

In [6]:
from pyspark.sql.functions import col
df.printSchema()

income_df = df.select(*[col(c).alias(a) for c,a in zip(df.columns, variable_names)])
income_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: double (nullable = true)
 |-- _c11: double (nullable = true)
 |-- _c12: double (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capit

In [24]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import StringType


from pyspark.ml.feature import Tokenizer


categorical_variables = [ f.name for f in income_df.schema.fields if f.dataType == StringType()]
print(categorical_variables)
indexerPipeline = Pipeline(stages = [StringIndexer(inputCol=c, outputCol= "cat_%s"%c) for c in categorical_variables])

pipelineModel = indexerPipeline.fit(income_df)
indexed_df = pipelineModel.transform(income_df)
indexed_df.printSchema()

['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']
root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- cat_workclass: double (nullable = true)
 |-- cat_education: double (nullable = true)
 |-- cat_marital-status: double (nullable = true)
 |-- cat_occupation: double (nullable = true)
 |-- cat_relationship: double (nullable = true)
 |-- cat_race: double

In [40]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer

categorical_assembler = Pipeline(stages = [
    VectorAssembler(inputCols = ["cat_%s"%c for c in categorical_variables if c !='income'], outputCol='cat_vector'),                    
    VectorIndexer(inputCol='cat_vector', outputCol='cat_features')
])

categorical_assembler_model = categorical_assembler.fit(indexed_df)
cat_df = categorical_assembler_model.transform(indexed_df)
cat_df.printSchema()
cat_df.select('cat_features').show(5)

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- cat_workclass: double (nullable = true)
 |-- cat_education: double (nullable = true)
 |-- cat_marital-status: double (nullable = true)
 |-- cat_occupation: double (nullable = true)
 |-- cat_relationship: double (nullable = true)
 |-- cat_race: double (nullable = true)
 |-- cat_sex: double (nullable = true)
 |-- cat_native-country: double (nullable = true)
 |-- cat_i

In [41]:
feature_assembler = VectorAssembler(inputCols = ['age', 'fnlwgt', 'hours-per-week', 'cat_features'] , outputCol='features')
feature_df = feature_assembler.transform(cat_df)

feature_df.printSchema()
feature_df.select('features').show()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- cat_workclass: double (nullable = true)
 |-- cat_education: double (nullable = true)
 |-- cat_marital-status: double (nullable = true)
 |-- cat_occupation: double (nullable = true)
 |-- cat_relationship: double (nullable = true)
 |-- cat_race: double (nullable = true)
 |-- cat_sex: double (nullable = true)
 |-- cat_native-country: double (nullable = true)
 |-- cat_i

In [47]:
from pyspark.ml.classification import RandomForestClassifier

rf_classifier = RandomForestClassifier(featuresCol='features', labelCol='cat_income', maxBins=50)

rf_model = rf_classifier.fit(feature_df)



In [48]:
display(rf_model.transform(feature_df))



Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,cat_race,cat_sex,cat_native-country,cat_income,cat_vector,cat_features,features,rawPrediction,probability,prediction
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,...,0.0,0.0,0.0,0.0,"[4.0, 2.0, 1.0, 3.0, 1.0, 0.0, 0.0, 0.0]","[4.0, 2.0, 1.0, 3.0, 1.0, 0.0, 0.0, 0.0]","[39.0, 77516.0, 40.0, 4.0, 2.0, 1.0, 3.0, 1.0,...","[18.3459910316, 1.65400896836]","[0.917299551582, 0.0827004484181]",0.0
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0.0,0.0,0.0,0.0,"(1.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0)","(50.0, 83311.0, 13.0, 1.0, 2.0, 0.0, 2.0, 0.0,...","[8.06622349184, 11.9337765082]","[0.403311174592, 0.596688825408]",1.0
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0.0,0.0,0.0,0.0,"(0.0, 0.0, 2.0, 9.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 2.0, 9.0, 1.0, 0.0, 0.0, 0.0)","(38.0, 215646.0, 40.0, 0.0, 0.0, 2.0, 9.0, 1.0...","[19.1698439358, 0.83015606421]","[0.95849219679, 0.0415078032105]",0.0
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,1.0,0.0,0.0,0.0,"(0.0, 5.0, 0.0, 9.0, 0.0, 1.0, 0.0, 0.0)","(0.0, 5.0, 0.0, 9.0, 0.0, 1.0, 0.0, 0.0)","(53.0, 234721.0, 40.0, 0.0, 5.0, 0.0, 9.0, 0.0...","[16.3338202607, 3.66617973931]","[0.816691013035, 0.183308986965]",0.0
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,1.0,1.0,9.0,0.0,"[0.0, 2.0, 0.0, 0.0, 4.0, 1.0, 1.0, 9.0]","[0.0, 2.0, 0.0, 0.0, 4.0, 1.0, 1.0, 9.0]","[28.0, 338409.0, 40.0, 0.0, 2.0, 0.0, 0.0, 4.0...","[8.29019254189, 11.7098074581]","[0.414509627095, 0.585490372905]",1.0
5,37,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,...,0.0,1.0,0.0,0.0,"(0.0, 3.0, 0.0, 2.0, 4.0, 0.0, 1.0, 0.0)","(0.0, 3.0, 0.0, 2.0, 4.0, 0.0, 1.0, 0.0)","[37.0, 284582.0, 40.0, 0.0, 3.0, 0.0, 2.0, 4.0...","[6.44597969255, 13.5540203075]","[0.322298984627, 0.677701015373]",1.0
6,49,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,...,1.0,1.0,11.0,0.0,"[0.0, 10.0, 5.0, 5.0, 1.0, 1.0, 1.0, 11.0]","[0.0, 10.0, 5.0, 5.0, 1.0, 1.0, 1.0, 11.0]","[49.0, 160187.0, 16.0, 0.0, 10.0, 5.0, 5.0, 1....","[19.3485458894, 0.651454110626]","[0.967427294469, 0.0325727055313]",0.0
7,52,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0.0,0.0,0.0,1.0,"(1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0)","(52.0, 209642.0, 45.0, 1.0, 0.0, 0.0, 2.0, 0.0...","[7.68207472805, 12.317925272]","[0.384103736402, 0.615896263598]",1.0
8,31,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,...,0.0,1.0,0.0,1.0,"(0.0, 3.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0)","(0.0, 3.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0)","[31.0, 45781.0, 50.0, 0.0, 3.0, 1.0, 0.0, 1.0,...","[14.475703915, 5.524296085]","[0.72378519575, 0.27621480425]",0.0
9,42,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0.0,0.0,0.0,1.0,"(0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0)","(42.0, 159449.0, 40.0, 0.0, 2.0, 0.0, 2.0, 0.0...","[5.75992406811, 14.2400759319]","[0.287996203405, 0.712003796595]",1.0


In [49]:
rf_model.featureImportances

SparseVector(11, {0: 0.0939, 1: 0.0018, 2: 0.054, 3: 0.0025, 4: 0.0885, 5: 0.2783, 6: 0.184, 7: 0.2875, 8: 0.0004, 9: 0.0042, 10: 0.0048})

In [43]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

(trainData, testData) = df_data_tf_idf.randomSplit([0.5, 0.5])

In [44]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'idf')
model = rf.fit(testData)

In [45]:
test_predictions = model.transform(testData)
test_predictions.show()

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|            filename|label|                text|                  tf|                 idf|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|0002.1999-12-13.f...|  0.0|[Subject:, vastar...|(5,[0,1,2,3,4],[1...|(5,[0,1,2,3,4],[1...|[12.5720286448161...|[0.62860143224080...|       0.0|
|0003.1999-12-14.f...|  0.0|[Subject:, calpin...|(5,[0,1,2,4],[2.0...|(5,[0,1,2,4],[0.0...|[15.3284586218421...|[0.76642293109210...|       0.0|
|0005.1999-12-14.f...|  0.0|[Subject:, meter,...|(5,[0,1,2,3,4],[4...|(5,[0,1,2,3,4],[0...|[15.8818467289343...|[0.79409233644671...|       0.0|
|0007.1999-12-14.f...|  0.0|[Subject:, mcmull...|(5,[0,1,2,3,4],[3...|(5,[0,1,2,3,4],[0...|[13.4509241509410...|[0.67254620754705.

In [46]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(test_predictions)

0.7344332232268187