<a href="https://colab.research.google.com/github/manojmanivannan/ApacheSparkEssentials/blob/master/ML_SparkByPluralsight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Environment

In [112]:
!apt-get update
!apt-get install -y openjdk-8-jdk-headless scala jq  > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!wget -q https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip
!pip install -q pyspark findspark

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [C                                                                               Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [2 InRelease 14.2 kB/88.0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (185.125.190.39                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [2 InRelease 51.8 kB/88.7 k                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-le

In [113]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [114]:
!python --version

Python 3.8.15


In [115]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Spark session & context
conf = SparkConf().set('spark.ui.port', '4050')
try:
  sc = SparkContext(conf=conf)
except ValueError:
  sc.stop()
  sc = SparkContext(conf=conf)

spark = SparkSession.builder.master('local[2]').getOrCreate()

In [116]:
sc

In [117]:
# create new token from ngrok
get_ipython().system_raw('./ngrok authtoken 2IaHtxKm7l0XqBXP6s9xWSyVHqE_4wYXtkZorquhudnHW23ci')
# revoke the token once you close the notebook
get_ipython().system_raw('./ngrok http 4050 &')

In [118]:
!sleep 10 && curl -s http://localhost:4040/api/tunnels | python3 -c "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

https://e030-34-123-69-186.ngrok.io


In [119]:
# Dataset
# https://goo.gl/pSKLMJ or https://archive.ics.uci.edu/ml/datasets/wine


In [120]:
rawData = sc.textFile('wine.data')

In [121]:
rawData.take(5)

['1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065',
 '1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050',
 '1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185',
 '1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480',
 '1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735']

In [122]:
# Column # 
# 0) - Wine maker 1,2 or 3
# 1) Alcohol
# 2) Malic acid
# 3) Ash
# 4) Alcalinity of ash
# 5) Magnesium
# 6) Total phenols
# 7) Flavanoids
# 8) Nonflavanoid phenols
# 9) Proanthocyanins
# 10)Color intensity
# 11)Hue
# 12)OD280/OD315 of diluted wines
# 13)Proline

# Using the older Spark mllib library

In [123]:
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
  values = [float(x) for x in line.split(',')]
  return LabeledPoint(values[0],values[1:])


In [124]:
parsedData = rawData.map(parsePoint)
parsedData.take(5)

[LabeledPoint(1.0, [14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0]),
 LabeledPoint(1.0, [13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0]),
 LabeledPoint(1.0, [13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0]),
 LabeledPoint(1.0, [14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0]),
 LabeledPoint(1.0, [13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0])]

In [125]:
(trainingData, testData) = parsedData.randomSplit([0.7,0.3])

In [126]:
from pyspark.mllib.tree import DecisionTree

model = DecisionTree.trainClassifier(trainingData,
                                     numClasses=4, # there are classes 1,2,3 as wine makers , so should 1+greatest value
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxDepth=3,
                                     maxBins=32)

In [127]:
predictions = model.predict(testData.map(lambda x: x.features))
predictions.take(5)

[1.0, 1.0, 1.0, 1.0, 1.0]

In [128]:
# compare predictions with actual labels
labelsAndPredictions = testData.map(lambda x: x.label).zip(predictions)
labelsAndPredictions.take(5)

[(1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0)]

In [129]:
testAcc = labelsAndPredictions\
                .filter(lambda x: x[0] == x[1])\
                .count() / float(testData.count())
print('Test Accuracy = ', testAcc)                

Test Accuracy =  0.9583333333333334


In [130]:
# we have a built-in function for evaluating metrics
from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics(labelsAndPredictions)

In [131]:
metrics.accuracy

0.9583333333333334

In [132]:
metrics.precision(1.0)

1.0

In [133]:
metrics.precision(3.0)

1.0

In [134]:
metrics.confusionMatrix().toArray()

array([[21.,  1.,  0.],
       [ 0., 12.,  0.],
       [ 0.,  1., 13.]])

In [135]:
# to view the decision tree
print(model.toDebugString())

DecisionTreeModel classifier of depth 3 with 9 nodes
  If (feature 9 <= 3.3899999999999997)
   Predict: 2.0
  Else (feature 9 > 3.3899999999999997)
   If (feature 6 <= 1.5950000000000002)
    If (feature 3 <= 17.05)
     Predict: 2.0
    Else (feature 3 > 17.05)
     Predict: 3.0
   Else (feature 6 > 1.5950000000000002)
    If (feature 12 <= 679.0)
     Predict: 2.0
    Else (feature 12 > 679.0)
     Predict: 1.0



# LIBSVM data format

Each row in a dataset, has a label and features.
Features are referred as index.
i.e, index1 => feature1
so a record is represented as

| label | index1:value1 | index2:value2 | index3:value3 | ... |
|-------|---------------|---------------|---------------|-----|
| 0     | 1:324         | 2:332         | 3:12          | ... |
| 1     | 1:455         | 2:213         | 3:85          | ... |
| 0     | 1:344         | 2:165         |               | ... |

missing feature can be represented by simply omitting them.


In [136]:
from pyspark.mllib.util import MLUtils
# dataset https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/wine.scale
libsvmData = MLUtils.loadLibSVMFile(sc, 'wine.scale')

In [137]:
libsvmData.take(5)

[LabeledPoint(1.0, (13,[0,1,2,3,4,5,6,7,8,9,10,11,12],[0.68421,-0.616601,0.144385,-0.484536,0.23913,0.255172,0.147679,-0.433962,0.18612,-0.255973,-0.089431,0.941392,0.122682])),
 LabeledPoint(1.0, (13,[0,1,2,3,4,5,6,7,8,9,10,11,12],[0.142105,-0.588933,-0.165775,-0.938144,-0.347826,0.151724,0.0210971,-0.509434,-0.451104,-0.47099,-0.0731708,0.56044,0.101284])),
 LabeledPoint(1.0, (13,[0,1,2,3,4,5,6,7,8,9,10,11,12],[0.121053,-0.359684,0.40107,-0.175258,-0.326087,0.255172,0.223629,-0.358491,0.514196,-0.249147,-0.105691,0.391941,0.293866])),
 LabeledPoint(1.0, (13,[0,1,2,3,4,5,6,7,8,9,10,11,12],[0.757895,-0.521739,0.219251,-0.360825,-0.0652174,0.97931,0.329114,-0.584906,0.116719,0.112628,-0.382114,0.59707,0.714693])),
 LabeledPoint(1.0, (13,[0,1,2,3,4,5,6,7,8,9,10,11,12],[0.163158,-0.268775,0.614973,0.0721649,0.0434783,0.255172,-0.00843878,-0.018868,-0.11041,-0.481229,-0.089431,0.216117,-0.348074]))]

In [138]:
(trainingData, testData) = libsvmData.randomSplit([0.8,0.2])

In [139]:
libsvmmodel = DecisionTree.trainClassifier(trainingData,
                                     numClasses=4, # there are classes 1,2,3 as wine makers , so should 1+greatest value
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxDepth=5,
                                     maxBins=32)

In [140]:
predictions = libsvmmodel.predict(testData.map(lambda x: x.features))

In [141]:
labelsAndPredictions = testData.map(lambda x: x.label).zip(predictions)
labelsAndPredictions.take(5)

[(1.0, 1.0), (1.0, 1.0), (1.0, 2.0), (1.0, 1.0), (1.0, 1.0)]

In [142]:
metrics = MulticlassMetrics(labelsAndPredictions)

In [143]:
metrics.accuracy

0.9459459459459459

In [144]:
metrics.confusionMatrix().toArray()

array([[ 9.,  0.,  0.],
       [ 1., 11.,  0.],
       [ 0.,  1., 15.]])

In [145]:
print(model.toDebugString())

DecisionTreeModel classifier of depth 3 with 9 nodes
  If (feature 9 <= 3.3899999999999997)
   Predict: 2.0
  Else (feature 9 > 3.3899999999999997)
   If (feature 6 <= 1.5950000000000002)
    If (feature 3 <= 17.05)
     Predict: 2.0
    Else (feature 3 > 17.05)
     Predict: 3.0
   Else (feature 6 > 1.5950000000000002)
    If (feature 12 <= 679.0)
     Predict: 2.0
    Else (feature 12 > 679.0)
     Predict: 1.0



# Using the newer Spark ml library

In [146]:
 rawData = spark.read\
                .format('csv')\
                .option('header','false')\
                .load('wine.data')

In [147]:
rawData

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string]

In [149]:
rawData.show(5)

+---+-----+----+----+----+---+----+----+---+----+----+----+----+----+
|_c0|  _c1| _c2| _c3| _c4|_c5| _c6| _c7|_c8| _c9|_c10|_c11|_c12|_c13|
+---+-----+----+----+----+---+----+----+---+----+----+----+----+----+
|  1|14.23|1.71|2.43|15.6|127| 2.8|3.06|.28|2.29|5.64|1.04|3.92|1065|
|  1| 13.2|1.78|2.14|11.2|100|2.65|2.76|.26|1.28|4.38|1.05| 3.4|1050|
|  1|13.16|2.36|2.67|18.6|101| 2.8|3.24| .3|2.81|5.68|1.03|3.17|1185|
|  1|14.37|1.95| 2.5|16.8|113|3.85|3.49|.24|2.18| 7.8| .86|3.45|1480|
|  1|13.24|2.59|2.87|  21|118| 2.8|2.69|.39|1.82|4.32|1.04|2.93| 735|
+---+-----+----+----+----+---+----+----+---+----+----+----+----+----+
only showing top 5 rows



In [150]:
dataset = rawData.toDF('Label',
                      'Alcohol',
                      'MalicAcid',
                      'Ash',
                      'AshAlkalinity',
                      'Magnesium',
                      'TotalPhenols',
                      'Flavanoids',
                      'NonflavanoidPhenols',
                      'Proanthocyanin',
                      'ColorIntensity',
                      'Hue',
                      'OD',
                      'Proline')

In [151]:
dataset

DataFrame[Label: string, Alcohol: string, MalicAcid: string, Ash: string, AshAlkalinity: string, Magnesium: string, TotalPhenols: string, Flavanoids: string, NonflavanoidPhenols: string, Proanthocyanin: string, ColorIntensity: string, Hue: string, OD: string, Proline: string]

In [152]:
dataset.show(5)

+-----+-------+---------+----+-------------+---------+------------+----------+-------------------+--------------+--------------+----+----+-------+
|Label|Alcohol|MalicAcid| Ash|AshAlkalinity|Magnesium|TotalPhenols|Flavanoids|NonflavanoidPhenols|Proanthocyanin|ColorIntensity| Hue|  OD|Proline|
+-----+-------+---------+----+-------------+---------+------------+----------+-------------------+--------------+--------------+----+----+-------+
|    1|  14.23|     1.71|2.43|         15.6|      127|         2.8|      3.06|                .28|          2.29|          5.64|1.04|3.92|   1065|
|    1|   13.2|     1.78|2.14|         11.2|      100|        2.65|      2.76|                .26|          1.28|          4.38|1.05| 3.4|   1050|
|    1|  13.16|     2.36|2.67|         18.6|      101|         2.8|      3.24|                 .3|          2.81|          5.68|1.03|3.17|   1185|
|    1|  14.37|     1.95| 2.5|         16.8|      113|        3.85|      3.49|                .24|          2.18|     

In [153]:
from pyspark.ml.linalg import Vectors

def vectorize(data):
  return data.rdd.map(lambda r: [r[0], Vectors.dense(r[1:])]).toDF(['label','features'])
  

In [154]:
vectorizedData = vectorize(dataset)

In [155]:
vectorizedData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|[14.23,1.71,2.43,...|
|    1|[13.2,1.78,2.14,1...|
|    1|[13.16,2.36,2.67,...|
|    1|[14.37,1.95,2.5,1...|
|    1|[13.24,2.59,2.87,...|
+-----+--------------------+
only showing top 5 rows



In [156]:
vectorizedData.take(5)

[Row(label='1', features=DenseVector([14.23, 1.71, 2.43, 15.6, 127.0, 2.8, 3.06, 0.28, 2.29, 5.64, 1.04, 3.92, 1065.0])),
 Row(label='1', features=DenseVector([13.2, 1.78, 2.14, 11.2, 100.0, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.4, 1050.0])),
 Row(label='1', features=DenseVector([13.16, 2.36, 2.67, 18.6, 101.0, 2.8, 3.24, 0.3, 2.81, 5.68, 1.03, 3.17, 1185.0])),
 Row(label='1', features=DenseVector([14.37, 1.95, 2.5, 16.8, 113.0, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480.0])),
 Row(label='1', features=DenseVector([13.24, 2.59, 2.87, 21.0, 118.0, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735.0]))]

In [157]:
from pyspark.ml.feature import StringIndexer

labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel')

In [159]:
indexedData = labelIndexer.fit(vectorizedData).transform(vectorizedData)
indexedData.take(2)

[Row(label='1', features=DenseVector([14.23, 1.71, 2.43, 15.6, 127.0, 2.8, 3.06, 0.28, 2.29, 5.64, 1.04, 3.92, 1065.0]), indexedLabel=1.0),
 Row(label='1', features=DenseVector([13.2, 1.78, 2.14, 11.2, 100.0, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.4, 1050.0]), indexedLabel=1.0)]

In [160]:
indexedData

DataFrame[label: string, features: vector, indexedLabel: double]

In [161]:
indexedData.select('label').distinct().show()

+-----+
|label|
+-----+
|    3|
|    1|
|    2|
+-----+



In [162]:
indexedData.select('indexedLabel').distinct().show()

+------------+
|indexedLabel|
+------------+
|         0.0|
|         1.0|
|         2.0|
+------------+



In [163]:
(trainingData, testData) = indexedData.randomSplit([0.8,0.2])

In [166]:
from pyspark.ml.classification import DecisionTreeClassifier

dtree = DecisionTreeClassifier(
    labelCol='indexedLabel',
    featuresCol='features',
    maxDepth=3,
    impurity='gini'
)

In [167]:
model = dtree.fit(trainingData)

In [168]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel',
                                              predictionCol='prediction',
                                              metricName='f1')

In [170]:
transformed_data = model.transform(testData)
transformed_data.show(5,truncate=False)

+-----+---------------------------------------------------------------------+------------+--------------+--------------------------------------------+----------+
|label|features                                                             |indexedLabel|rawPrediction |probability                                 |prediction|
+-----+---------------------------------------------------------------------+------------+--------------+--------------------------------------------+----------+
|1    |[13.05,1.65,2.55,18.0,98.0,2.45,2.43,0.29,1.44,4.25,1.12,2.51,1105.0]|1.0         |[1.0,48.0,0.0]|[0.02040816326530612,0.9795918367346939,0.0]|1.0       |
|1    |[13.05,1.77,2.1,17.0,107.0,3.0,3.0,0.28,2.03,5.04,0.88,3.35,885.0]   |1.0         |[1.0,48.0,0.0]|[0.02040816326530612,0.9795918367346939,0.0]|1.0       |
|1    |[13.28,1.64,2.84,15.5,110.0,2.6,2.68,0.34,1.36,4.6,1.09,2.78,880.0]  |1.0         |[1.0,48.0,0.0]|[0.02040816326530612,0.9795918367346939,0.0]|1.0       |
|1    |[13.72,1.43,2.5,16.7,

In [171]:
print(evaluator.getMetricName(),
      'accuracy: ',
      evaluator.evaluate(transformed_data))

f1 accuracy:  0.9697597088901437


# Random Forest ML

In [228]:
rawData = spark.read\
                .format('csv')\
                .option('header','false')\
                .option('ignoreLeadingWhiteSpace','true')\
                .load('adult.csv')

In [229]:
dataset = rawData.toDF('Age',
                       'WorkClass',
                       'FnlWgt',
                       'Education',
                       'EducationNum',
                       'MaritalStatus',
                       'Occupation',
                       'Relationship',
                       'Race',
                       'Gender',
                       'CapitalGain',
                       'CapitalLoss',
                       'HoursPerWeek',
                       'NativeCountry',
                       'Label')

In [230]:
dataset.take(5)

[Row(Age='39', WorkClass='State-gov', FnlWgt='77516', Education='Bachelors', EducationNum='13', MaritalStatus='Never-married', Occupation='Adm-clerical', Relationship='Not-in-family', Race='White', Gender='Male', CapitalGain='2174', CapitalLoss='0', HoursPerWeek='40', NativeCountry='United-States', Label='<=50K'),
 Row(Age='50', WorkClass='Self-emp-not-inc', FnlWgt='83311', Education='Bachelors', EducationNum='13', MaritalStatus='Married-civ-spouse', Occupation='Exec-managerial', Relationship='Husband', Race='White', Gender='Male', CapitalGain='0', CapitalLoss='0', HoursPerWeek='13', NativeCountry='United-States', Label='<=50K'),
 Row(Age='38', WorkClass='Private', FnlWgt='215646', Education='HS-grad', EducationNum='9', MaritalStatus='Divorced', Occupation='Handlers-cleaners', Relationship='Not-in-family', Race='White', Gender='Male', CapitalGain='0', CapitalLoss='0', HoursPerWeek='40', NativeCountry='United-States', Label='<=50K'),
 Row(Age='53', WorkClass='Private', FnlWgt='234721', 

In [231]:
dataset.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [232]:
dataset = dataset.drop('FnlWgt')
dataset = dataset.replace('?',None)
dataset.count()


32561

In [233]:
dataset = dataset.dropna(how='any')
dataset.count()

30162

In [234]:
dataset.describe()

DataFrame[summary: string, Age: string, WorkClass: string, Education: string, EducationNum: string, MaritalStatus: string, Occupation: string, Relationship: string, Race: string, Gender: string, CapitalGain: string, CapitalLoss: string, HoursPerWeek: string, NativeCountry: string, Label: string]

In [235]:
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col

dataset = dataset.withColumn('Age',dataset['Age'].cast(FloatType()))
dataset = dataset.withColumn('EducationNum',dataset['EducationNum'].cast(FloatType()))
dataset = dataset.withColumn('CapitalGain',dataset['CapitalGain'].cast(FloatType()))
dataset = dataset.withColumn('CapitalLoss',dataset['CapitalLoss'].cast(FloatType()))
dataset = dataset.withColumn('HoursPerWeek',dataset['HoursPerWeek'].cast(FloatType()))

In [236]:
dataset.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [237]:
indexedDF = StringIndexer(inputCol='WorkClass', outputCol='WorkClass_index').fit(dataset).transform(dataset)


In [238]:
indexedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label,WorkClass_index
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,3.0
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,1.0
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,0.0
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,0.0
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,0.0


In [239]:
from pyspark.ml.feature import OneHotEncoder

encodedDF = OneHotEncoder(
    inputCol="WorkClass_index",
    outputCol="WorkClass_encoded"
).fit(indexedDF).transform(indexedDF)

In [240]:
encodedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label,WorkClass_index,WorkClass_encoded
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


In [241]:
encodedDF.select('WorkClass','WorkClass_index','WorkClass_encoded').toPandas().head()

Unnamed: 0,WorkClass,WorkClass_index,WorkClass_encoded
0,State-gov,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
1,Self-emp-not-inc,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
2,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


In [242]:
(trainingData,testData) = dataset.randomSplit([0.8,0.2])

In [243]:
categoricalFeatures = [
    'WorkClass',
    'Education',
    'MaritalStatus',
    'Relationship',
    'Race',
    'Gender',
    'NativeCountry',
    'Occupation'
]
indexers = [StringIndexer(inputCol=column, 
                          outputCol=column+'_index',
                          handleInvalid='keep') for column in categoricalFeatures]

In [244]:
encoders = [OneHotEncoder(inputCol=column+'_index',
                          outputCol=column+'_encoded') for column in categoricalFeatures]

In [245]:
labelIndexer = [StringIndexer(inputCol='Label',outputCol='Label_index')]

In [246]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=indexers+encoders+labelIndexer)

In [247]:
transformedDF = pipeline.fit(trainingData).transform(trainingData)

In [248]:
transformedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Occupation_index,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Occupation_encoded,Label_index
0,17.0,Federal-gov,11th,7.0,Never-married,Adm-clerical,Not-in-family,Black,Female,0.0,...,3.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
1,17.0,Local-gov,10th,6.0,Never-married,Other-service,Own-child,White,Female,0.0,...,5.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",0.0
2,17.0,Local-gov,10th,6.0,Never-married,Protective-serv,Own-child,White,Female,0.0,...,11.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
3,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,3.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
4,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,3.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


In [249]:
requiredFeatures =[
    'Age',
    'EducationNum',
    'CapitalGain',
    'CapitalGain',
    'HoursPerWeek',
    'WorkClass_encoded',
    'Education_encoded',
    'MaritalStatus_encoded',
    'Occupation_encoded',
    'Relationship_encoded',
    'Race_encoded',
    'Gender_encoded',
    'NativeCountry_encoded'
]

In [250]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=requiredFeatures, outputCol='features')

In [251]:
transformedDF = assembler.transform(transformedDF)
transformedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Occupation_encoded,Label_index,features
0,17.0,Federal-gov,11th,7.0,Never-married,Adm-clerical,Not-in-family,Black,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 40.0, 0.0, 0.0, 0.0, 0.0..."
1,17.0,Local-gov,10th,6.0,Never-married,Other-service,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 25.0, 0.0, 0.0, 1.0, 0.0..."
2,17.0,Local-gov,10th,6.0,Never-married,Protective-serv,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 40.0, 0.0, 0.0, 1.0, 0.0..."
3,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 12.0, 0.0, 0.0, 1.0, 0.0..."
4,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 15.0, 0.0, 0.0, 1.0, 0.0..."


In [252]:
transformedDF.select('features').toPandas().head()

Unnamed: 0,features
0,"(17.0, 7.0, 0.0, 0.0, 40.0, 0.0, 0.0, 0.0, 0.0..."
1,"(17.0, 6.0, 0.0, 0.0, 25.0, 0.0, 0.0, 1.0, 0.0..."
2,"(17.0, 6.0, 0.0, 0.0, 40.0, 0.0, 0.0, 1.0, 0.0..."
3,"(17.0, 7.0, 0.0, 0.0, 12.0, 0.0, 0.0, 1.0, 0.0..."
4,"(17.0, 7.0, 0.0, 0.0, 15.0, 0.0, 0.0, 1.0, 0.0..."


In [261]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='Label_index',
                            featuresCol='features',
                            maxDepth=10)

In [262]:
pipeline = Pipeline(
    stages=indexers+encoders+labelIndexer + [assembler,rf]
)

In [263]:
model = pipeline.fit(trainingData)

In [264]:
predictions = model.transform(testData)
predictionsDF = predictions.toPandas()
predictionsDF.head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Occupation_encoded,Label_index,features,rawPrediction,probability,prediction
0,17.0,Local-gov,9th,5.0,Never-married,Other-service,Own-child,White,Male,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 5.0, 0.0, 0.0, 45.0, 0.0, 0.0, 1.0, 0.0...","[19.667656293397805, 0.3323437066021966]","[0.9833828146698902, 0.016617185330109828]",0.0
1,17.0,Private,10th,6.0,Never-married,Farming-fishing,Own-child,White,Male,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 20.0, 1.0, 0.0, 0.0, 0.0...","[19.82028165595313, 0.17971834404687206]","[0.9910140827976563, 0.008985917202343601]",0.0
2,17.0,Private,10th,6.0,Never-married,Handlers-cleaners,Own-child,White,Female,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 23.0, 1.0, 0.0, 0.0, 0.0...","[19.870343933622205, 0.12965606637779215]","[0.9935171966811105, 0.006482803318889609]",0.0
3,17.0,Private,10th,6.0,Never-married,Handlers-cleaners,Own-child,White,Male,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 0.0...","[19.83561685820668, 0.16438314179332036]","[0.9917808429103341, 0.008219157089666019]",0.0
4,17.0,Private,10th,6.0,Never-married,Handlers-cleaners,Own-child,White,Male,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 20.0, 1.0, 0.0, 0.0, 0.0...","[19.83561685820668, 0.16438314179332036]","[0.9917808429103341, 0.008219157089666019]",0.0


In [265]:
predictions = predictions.select('Label_index','prediction')

In [266]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol='Label_index',
    predictionCol='prediction',
    metricName='accuracy'
)

In [267]:
accuracy = evaluator.evaluate(predictions)
print('Test Accuracy',accuracy)

Test Accuracy 0.843279797125951


In [268]:
predictionsDF.loc[
    predictionsDF['Label_index'] != predictionsDF['prediction']
]

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Occupation_encoded,Label_index,features,rawPrediction,probability,prediction
578,22.0,Private,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",1.0,"(22.0, 9.0, 0.0, 0.0, 50.0, 1.0, 0.0, 0.0, 0.0...","[15.116369209925255, 4.883630790074745]","[0.7558184604962628, 0.24418153950373728]",0.0
608,22.0,Private,Some-college,10.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",1.0,"(22.0, 10.0, 0.0, 0.0, 48.0, 1.0, 0.0, 0.0, 0....","[14.391043670897348, 5.608956329102652]","[0.7195521835448674, 0.2804478164551326]",0.0
728,23.0,Private,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Wife,White,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",1.0,"(23.0, 9.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0.0...","[15.446937839222544, 4.553062160777453]","[0.7723468919611274, 0.2276531080388727]",0.0
854,24.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 13.0, 0.0, 0.0, 44.0, 1.0, 0.0, 0.0, 0....","[10.422586401668315, 9.577413598331683]","[0.5211293200834157, 0.47887067991658416]",0.0
865,24.0,Private,Bachelors,13.0,Never-married,Exec-managerial,Unmarried,White,Male,0.0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 13.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0....","[18.03187156795525, 1.9681284320447507]","[0.9015935783977624, 0.09840642160223753]",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5869,74.0,Self-emp-inc,Some-college,10.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(74.0, 10.0, 0.0, 0.0, 35.0, 0.0, 0.0, 0.0, 0....","[10.691472259448496, 9.308527740551503]","[0.5345736129724248, 0.4654263870275751]",0.0
5873,74.0,State-gov,Doctorate,16.0,Never-married,Prof-specialty,Other-relative,White,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(74.0, 16.0, 0.0, 0.0, 20.0, 0.0, 0.0, 0.0, 1....","[15.611581100822276, 4.388418899177723]","[0.7805790550411138, 0.21942094495888614]",0.0
5894,79.0,Self-emp-inc,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(79.0, 9.0, 0.0, 0.0, 40.0, 0.0, 0.0, 0.0, 0.0...","[12.159844448385305, 7.840155551614695]","[0.6079922224192653, 0.39200777758073474]",0.0
5896,80.0,Private,Doctorate,16.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(80.0, 16.0, 0.0, 0.0, 30.0, 1.0, 0.0, 0.0, 0....","[5.584426584566316, 14.415573415433684]","[0.2792213292283158, 0.7207786707716842]",1.0


# Regression