# Dataset "Adults" zawiera informacje o obywatelach US. Celem eksperymentu jest przewidzenie czy dana osoba zarabia więcej niż 50,000$

In [1]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Predicting the amount of money \  made by an individual US citizen") \
    .getOrCreate()
        
rawData = spark.read \
            .format('csv')\
            .option('header','false') \
            .option('ignoreLeadingWhiteSpace','True')\
            .load('../datasets/adult.csv')

In [2]:
dataset = rawData.toDF('Age', 'WorkClass', 'FnlWgt', 'Education', 'EducationNum', 'MartialStatus', 'Occupation', 'Relationship', 'Race', 'Gender','CapitalGain', 'capitalLoss', 'HoursPerWeek', 'NativeCountry', 'Label')


In [3]:
dataset.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MartialStatus,Occupation,Relationship,Race,Gender,CapitalGain,capitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Usuwamy kolumnę 'FnlWgt' ze względu na jej nieprzydatność w analizowanym eksperymencie
### następnie zastępujemy wszystkie puste rekordy ('?") na wartość None oraz usuwamy wszystkie wiersze z brakującymi rekordami

In [4]:
dataset.drop('FnlWgt')

DataFrame[Age: string, WorkClass: string, Education: string, EducationNum: string, MartialStatus: string, Occupation: string, Relationship: string, Race: string, Gender: string, CapitalGain: string, capitalLoss: string, HoursPerWeek: string, NativeCountry: string, Label: string]

In [5]:
dataset.toPandas()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MartialStatus,Occupation,Relationship,Race,Gender,CapitalGain,capitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [6]:
dataset = dataset.replace('?', None)
dataset.toPandas()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MartialStatus,Occupation,Relationship,Race,Gender,CapitalGain,capitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [7]:
dataset = dataset.dropna(how = 'any')

In [8]:
dataset.count()

30162

In [9]:
dataset.describe()

DataFrame[summary: string, Age: string, WorkClass: string, FnlWgt: string, Education: string, EducationNum: string, MartialStatus: string, Occupation: string, Relationship: string, Race: string, Gender: string, CapitalGain: string, capitalLoss: string, HoursPerWeek: string, NativeCountry: string, Label: string]

## Zamiana wszystkich zmiennych w danej kolumnie z typu string na  float

In [10]:
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col

dataset = dataset.withColumn('Age', dataset['Age'].cast(FloatType()))
dataset = dataset.withColumn('EducationNum', dataset['EducationNum'].cast(FloatType()))
dataset = dataset.withColumn('CapitalGain', dataset['CapitalGain'].cast(FloatType()))
dataset = dataset.withColumn('CapitalLoss', dataset['CapitalLoss'].cast(FloatType()))
dataset = dataset.withColumn('HoursPerWeek', dataset['HoursPerWeek'].cast(FloatType()))

In [11]:
dataset.describe()

DataFrame[summary: string, Age: string, WorkClass: string, FnlWgt: string, Education: string, EducationNum: string, MartialStatus: string, Occupation: string, Relationship: string, Race: string, Gender: string, CapitalGain: string, CapitalLoss: string, HoursPerWeek: string, NativeCountry: string, Label: string]

### Zamieniamy wszystkie zmienne kategoryczne na numeryczne (np Male/Female -> 0,1 itd)

In [12]:
from pyspark.ml.feature import StringIndexer

indexedDF = StringIndexer(
    inputCol = 'WorkClass',
    outputCol = 'WorkClass_index').fit(dataset).transform(dataset)

In [13]:
indexedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MartialStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label,WorkClass_index
0,39.0,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,3.0
1,50.0,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,1.0
2,38.0,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,0.0
3,53.0,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,0.0
4,28.0,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,0.0


#### zmiana typu zmiennej kategorycznej WorkClass z indeksu (0,1,2,3..) na typ OneHotEncoder gdzie wartość opisana jest poprzez indeks jedynki w liście

In [14]:
from pyspark.ml.feature import OneHotEncoder

encodedDF = OneHotEncoder(
    inputCol ="WorkClass_index",
    outputCol="WorkClass_encoded").transform(indexedDF)

In [15]:
encodedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MartialStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label,WorkClass_index,WorkClass_encoded
0,39.0,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
1,50.0,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
2,38.0,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,53.0,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,28.0,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


In [16]:
encodedDF.select('WorkClass', 'WorkClass_index', 'WorkClass_Encoded').toPandas().head(15)

Unnamed: 0,WorkClass,WorkClass_index,WorkClass_Encoded
0,State-gov,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
1,Self-emp-not-inc,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
2,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
5,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
6,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
7,Self-emp-not-inc,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
8,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
9,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


In [17]:
(trainingData, testData) = dataset.randomSplit([0.8,0.2])

### Enkodowanie do danych kategorycznych w postaci OneHotEncoding pozostałych kolumn

In [18]:
categoricalFeatures = [
            'WorkClass',
            'Education',
            'MartialStatus',
            'Occupation',
            'Relationship',
            'Race',
            'Gender',
            'NativeCountry'
]

### tworzymy listę obiektów stringindexer- parametr handleInvalid mówi że w przypadku napotkania nowej instancji w danych testowych zostanie nadany nowy indeks

In [55]:
indexers = [StringIndexer(
        inputCol = column,
        outputCol = column +'_index',
        handleInvalid = 'keep') for column in categoricalFeatures]

In [56]:
encoders = [OneHotEncoder(inputCol = column + '_index', outputCol = column +'_encoded') for column in categoricalFeatures]

In [57]:
labelIndexer = [StringIndexer(inputCol = 'Label', outputCol = 'Label_indexed')]

In [58]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = indexers + encoders + labelIndexer)

In [59]:
transformedDF = pipeline.fit(trainingData).transform(trainingData)
transformedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MartialStatus,Occupation,Relationship,Race,Gender,...,NativeCountry_index,WorkClass_encoded,Education_encoded,MartialStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_indexed
0,17.0,Federal-gov,99893,11th,7.0,Never-married,Adm-clerical,Not-in-family,Black,Female,...,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
1,17.0,Local-gov,148194,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
2,17.0,Local-gov,170916,10th,6.0,Never-married,Protective-serv,Own-child,White,Female,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
3,17.0,Local-gov,173497,11th,7.0,Never-married,Prof-specialty,Own-child,Black,Male,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
4,17.0,Local-gov,175587,11th,7.0,Never-married,Protective-serv,Own-child,White,Male,...,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


In [60]:
requiredFeatures = ['Age',
                    'EducationNum',
                    'CapitalGain',
                    'CapitalLoss',
                    'HoursPerWeek',
                    'WorkClass_encoded',
                    'Education_encoded',
                    'MartialStatus_encoded',
                    'Occupation_encoded',
                    'Relationship_encoded',
                    'Race_encoded',
                    'Gender_encoded',
                    'NativeCountry_encoded'

    
]

In [61]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = requiredFeatures, outputCol = 'features')

In [63]:
transformedDF = assembler.transform(transformedDF)
transformedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MartialStatus,Occupation,Relationship,Race,Gender,...,WorkClass_encoded,Education_encoded,MartialStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_indexed,features
0,17.0,Federal-gov,99893,11th,7.0,Never-married,Adm-clerical,Not-in-family,Black,Female,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 1602.0, 40.0, 0.0, 0.0, 0.0, ..."
1,17.0,Local-gov,148194,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 12.0, 0.0, 0.0, 1.0, 0.0..."
2,17.0,Local-gov,170916,10th,6.0,Never-married,Protective-serv,Own-child,White,Female,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 1602.0, 40.0, 0.0, 0.0, 1.0, ..."
3,17.0,Local-gov,173497,11th,7.0,Never-married,Prof-specialty,Own-child,Black,Male,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 15.0, 0.0, 0.0, 1.0, 0.0..."
4,17.0,Local-gov,175587,11th,7.0,Never-married,Protective-serv,Own-child,White,Male,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 30.0, 0.0, 0.0, 1.0, 0.0..."


In [86]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='Label_indexed',
                            featuresCol = 'features',
                           maxDepth=10)

In [87]:
pipeline = Pipeline(stages = indexers + encoders + labelIndexer + [assembler, rf])
model = pipeline.fit(trainingData)

In [88]:
predictions = model.transform(testData)
predictionsDF = predictions.toPandas()
predictionsDF.head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MartialStatus,Occupation,Relationship,Race,Gender,...,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_indexed,features,rawPrediction,probability,prediction
0,17.0,Local-gov,192387,9th,5.0,Never-married,Other-service,Own-child,White,Male,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 5.0, 0.0, 0.0, 45.0, 0.0, 0.0, 1.0, 0.0...","[19.72479680007645, 0.275203199923547]","[0.9862398400038227, 0.013760159996177353]",0.0
1,17.0,Local-gov,244856,11th,7.0,Never-married,Prof-specialty,Own-child,White,Female,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 40.0, 0.0, 0.0, 1.0, 0.0...","[19.60088954424792, 0.39911045575207754]","[0.9800444772123962, 0.01995552278760388]",0.0
2,17.0,Local-gov,308901,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 15.0, 0.0, 0.0, 1.0, 0.0...","[19.903090207703098, 0.09690979229689775]","[0.9951545103851551, 0.004845489614844888]",0.0
3,17.0,Private,100828,11th,7.0,Never-married,Other-service,Not-in-family,White,Male,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 20.0, 1.0, 0.0, 0.0, 0.0...","[19.62258596153539, 0.3774140384646135]","[0.9811292980767693, 0.018870701923230673]",0.0
4,17.0,Private,103851,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 1055.0, 0.0, 20.0, 1.0, 0.0, 0.0, ...","[19.9059059372858, 0.09409406271419714]","[0.9952952968642901, 0.004704703135709857]",0.0


In [89]:
predictions = predictions.select(
    'Label_indexed',
    'prediction')

In [90]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol = 'Label_indexed',
    predictionCol = 'prediction',
    metricName = 'accuracy')

In [91]:
accuracy = evaluator.evaluate(predictions)
print("test Accuracy = ", accuracy)

test Accuracy =  0.8502099076406381
