In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('prepare_data').getOrCreate()

In [3]:
from pyspark.sql.types import *
from pyspark.sql import Row
from time import time

In [4]:
train_data = "../datasets/dummyTrain.csv"

custRDD = sc.textFile(train_data)

In [5]:
custRDD.take(5)

[u'Customer_ID,Name,Gender,Address,Nationality,Account_Type,Age,Education,Employment,Salary,Employer_Stability,Customer_Loyalty,Balance,Residential_Status,Service_Level',
 u'0,Ashley Payne,Male,1114 Odonnell Camp,Zimbabwean,Current,0,0,1,2,1,1,1,1,1',
 u'1,Michelle Clark,Female,23560 Ayala Spring,Zimbabwean,Current,0,0,0,2,1,1,2,1,1',
 u'2,Rodney Rich PhD,Male,883 Franco Knolls,Zimbabwean,Savings,2,1,2,2,0,0,0,0,1',
 u'3,Janet Hernandez,Male,2440 Strickland Park,Zimbabwean,Savings,0,1,1,1,0,1,0,1,0']

In [6]:
# Splitting the lines on the commas
custRDD = custRDD.map(lambda line: line.split(','))

In [7]:
custRDD.take(2)

[[u'Customer_ID',
  u'Name',
  u'Gender',
  u'Address',
  u'Nationality',
  u'Account_Type',
  u'Age',
  u'Education',
  u'Employment',
  u'Salary',
  u'Employer_Stability',
  u'Customer_Loyalty',
  u'Balance',
  u'Residential_Status',
  u'Service_Level'],
 [u'0',
  u'Ashley Payne',
  u'Male',
  u'1114 Odonnell Camp',
  u'Zimbabwean',
  u'Current',
  u'0',
  u'0',
  u'1',
  u'2',
  u'1',
  u'1',
  u'1',
  u'1',
  u'1']]

In [8]:
# Stripping the header from the RDD
# passing a filter tells Spark to select all RDD member
header = custRDD.first()
custRDD = custRDD.filter(lambda line:line != header)

In [9]:
custRDD.take(2)

[[u'0',
  u'Ashley Payne',
  u'Male',
  u'1114 Odonnell Camp',
  u'Zimbabwean',
  u'Current',
  u'0',
  u'0',
  u'1',
  u'2',
  u'1',
  u'1',
  u'1',
  u'1',
  u'1'],
 [u'1',
  u'Michelle Clark',
  u'Female',
  u'23560 Ayala Spring',
  u'Zimbabwean',
  u'Current',
  u'0',
  u'0',
  u'0',
  u'2',
  u'1',
  u'1',
  u'2',
  u'1',
  u'1']]

In [10]:
# mapping evry line from the RDD to a row in the DataFrame
# piping the result with .toDF() to create the DataFrame

df = custRDD.map(lambda line: Row(Age = line[6], Education = line[7], Employment = line[8], Salary = line[9]
                                  , Employer_Stability = line[10], Customer_Loyalty = line[11], Balance = line[12]
                                  , Residential_Status = line[13], Service_Level = line[14])).toDF()

In [11]:
df.take(5)

[Row(Age=u'0', Balance=u'1', Customer_Loyalty=u'1', Education=u'0', Employer_Stability=u'1', Employment=u'1', Residential_Status=u'1', Salary=u'2', Service_Level=u'1'),
 Row(Age=u'0', Balance=u'2', Customer_Loyalty=u'1', Education=u'0', Employer_Stability=u'1', Employment=u'0', Residential_Status=u'1', Salary=u'2', Service_Level=u'1'),
 Row(Age=u'2', Balance=u'0', Customer_Loyalty=u'0', Education=u'1', Employer_Stability=u'0', Employment=u'2', Residential_Status=u'0', Salary=u'2', Service_Level=u'1'),
 Row(Age=u'0', Balance=u'0', Customer_Loyalty=u'1', Education=u'1', Employer_Stability=u'0', Employment=u'1', Residential_Status=u'1', Salary=u'1', Service_Level=u'0'),
 Row(Age=u'1', Balance=u'0', Customer_Loyalty=u'1', Education=u'1', Employer_Stability=u'1', Employment=u'0', Residential_Status=u'1', Salary=u'2', Service_Level=u'1')]

In [12]:
df.show(5)

+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
|Age|Balance|Customer_Loyalty|Education|Employer_Stability|Employment|Residential_Status|Salary|Service_Level|
+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
|  0|      1|               1|        0|                 1|         1|                 1|     2|            1|
|  0|      2|               1|        0|                 1|         0|                 1|     2|            1|
|  2|      0|               0|        1|                 0|         2|                 0|     2|            1|
|  0|      0|               1|        1|                 0|         1|                 1|     1|            0|
|  1|      0|               1|        1|                 1|         0|                 1|     2|            1|
+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
o

In [13]:
df.toPandas().head()

Unnamed: 0,Age,Balance,Customer_Loyalty,Education,Employer_Stability,Employment,Residential_Status,Salary,Service_Level
0,0,1,1,0,1,1,1,2,1
1,0,2,1,0,1,0,1,2,1
2,2,0,0,1,0,2,0,2,1
3,0,0,1,1,0,1,1,1,0
4,1,0,1,1,1,0,1,2,1


In [14]:
favorite_Age = df[df.Age == 0]

In [15]:
favorite_Age.show(5)

+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
|Age|Balance|Customer_Loyalty|Education|Employer_Stability|Employment|Residential_Status|Salary|Service_Level|
+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
|  0|      1|               1|        0|                 1|         1|                 1|     2|            1|
|  0|      2|               1|        0|                 1|         0|                 1|     2|            1|
|  0|      0|               1|        1|                 0|         1|                 1|     1|            0|
|  0|      0|               1|        1|                 1|         0|                 0|     1|            0|
|  0|      2|               0|        1|                 0|         2|                 1|     1|            1|
+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
o

In [16]:
df.select('Age','Balance').show(10)

+---+-------+
|Age|Balance|
+---+-------+
|  0|      1|
|  0|      2|
|  2|      0|
|  0|      0|
|  1|      0|
|  0|      0|
|  2|      0|
|  1|      1|
|  2|      0|
|  1|      1|
+---+-------+
only showing top 10 rows



In [17]:
df.groupBy("Age").count().show()

+---+-----+
|Age|count|
+---+-----+
|  0| 3295|
|  1| 3363|
|  2| 3342|
+---+-----+



In [19]:
df.describe(['Age', 'Balance', 'Customer_Loyalty', 'Education', 'Employer_Stability', 'Employment', 'Residential_Status', 'Salary', 'Service_Level']).show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|               Age|           Balance|  Customer_Loyalty|         Education|Employer_Stability|        Employment|Residential_Status|            Salary|     Service_Level|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|  count|             10000|             10000|             10000|             10000|             10000|             10000|             10000|             10000|             10000|
|   mean|            1.0047|            0.9979|            0.5023|            0.5053|            0.4961|            1.0092|             0.505|            1.0027|            0.4988|
| stddev|0.8147050290923954|0.8193672719071882|0.5000197115826506|0.4999969096814193|0.50000979

## Logistic Regression Model with MLlib

In [20]:
from pyspark.mllib.regression import LabeledPoint
from numpy import array

In [21]:
df = df.select('Service_Level', 'Age', 'Balance', 'Customer_Loyalty', 'Education', 'Employer_Stability', 'Employment'
              , 'Residential_Status', 'Salary')

In [22]:
df.describe(['Service_Level', 'Age', 'Balance', 'Customer_Loyalty', 'Education', 'Employer_Stability', 'Employment'
              , 'Residential_Status', 'Salary']).show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|     Service_Level|               Age|           Balance|  Customer_Loyalty|         Education|Employer_Stability|        Employment|Residential_Status|            Salary|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|  count|             10000|             10000|             10000|             10000|             10000|             10000|             10000|             10000|             10000|
|   mean|            0.4988|            1.0047|            0.9979|            0.5023|            0.5053|            0.4961|            1.0092|             0.505|            1.0027|
| stddev|0.6695098089934411|0.8147050290923954|0.8193672719071882|0.5000197115826506|0.49999690

In [23]:
df.show(5)

+-------------+---+-------+----------------+---------+------------------+----------+------------------+------+
|Service_Level|Age|Balance|Customer_Loyalty|Education|Employer_Stability|Employment|Residential_Status|Salary|
+-------------+---+-------+----------------+---------+------------------+----------+------------------+------+
|            1|  0|      1|               1|        0|                 1|         1|                 1|     2|
|            1|  0|      2|               1|        0|                 1|         0|                 1|     2|
|            1|  2|      0|               0|        1|                 0|         2|                 0|     2|
|            0|  0|      0|               1|        1|                 0|         1|                 1|     1|
|            1|  1|      0|               1|        1|                 1|         0|                 1|     2|
+-------------+---+-------+----------------+---------+------------------+----------+------------------+------+
o

### Labeled Points and Data Scaling

In [34]:
# created a tuple 'temp' of class/ output and a vector of predictors/ features
# called rdd.map on the df to return an RDD of LabeledPoints
# parsing the data

temp = df.rdd.map(lambda line:LabeledPoint(line[0],[line[1:]]))
temp.take(20)

[LabeledPoint(1.0, [0.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0]),
 LabeledPoint(1.0, [0.0,2.0,1.0,0.0,1.0,0.0,1.0,2.0]),
 LabeledPoint(1.0, [2.0,0.0,0.0,1.0,0.0,2.0,0.0,2.0]),
 LabeledPoint(0.0, [0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0]),
 LabeledPoint(1.0, [1.0,0.0,1.0,1.0,1.0,0.0,1.0,2.0]),
 LabeledPoint(0.0, [0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0]),
 LabeledPoint(0.0, [2.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0]),
 LabeledPoint(1.0, [1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0]),
 LabeledPoint(1.0, [2.0,0.0,1.0,1.0,1.0,2.0,0.0,1.0]),
 LabeledPoint(1.0, [1.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0]),
 LabeledPoint(1.0, [0.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0]),
 LabeledPoint(2.0, [2.0,2.0,1.0,0.0,0.0,2.0,1.0,1.0]),
 LabeledPoint(0.0, [0.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0]),
 LabeledPoint(1.0, [2.0,2.0,0.0,1.0,1.0,0.0,1.0,1.0]),
 LabeledPoint(1.0, [2.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0]),
 LabeledPoint(1.0, [2.0,1.0,0.0,0.0,1.0,2.0,1.0,1.0]),
 LabeledPoint(1.0, [2.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0]),
 LabeledPoint(1.0, [2.0,1.0,0.0,0.0,1.0,2.0,1.0,0.0]),
 LabeledPo

In [108]:
# Save the file as a parquet file with LabeledPoints

# t0 = time()

# temp.saveAsTextFile('../datasets/training')

# tt = time() - t0

In [112]:
# # load data
# data = spark.read.format("csv").load('../datasets/training/part-00000')

# # making a test/ train split 

# splits = data.randomSplit([0.7, 0.3], 1234)

# trainingSet = splits[0]
# testingSet = splits[1]

In [115]:
# # MultiClass Classification using a MultiLayer Perceptron Neural Network
# from pyspark.ml.classification import MultilayerPerceptronClassifier
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# # Layers:
# # input layers of size 8 (features), two intermediate (hidden) of size 9 and 8
# # and output of size 3 (classes)

# layers = [8, 9, 8, 3]


# # NeuralNet trainer with parameters
# trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)


# # training the model
# t0 = time()

# model = trainer.fit(trainingSet)

# # compute accuracy on test set
# result = model.transform(testingSet)
# predictionAndLabels = result.select('prediction', 'label')
# evaluator = MulticlassClassificationEvaluator(metricName='accuracy')


# tt = time() - t0

# print('MultiLayerPerceptron Classifier trained in {} seconds').format(round(tt, 3))
# print ('Test set accuracy = ' + str(evaluator.evaluate(predictionAndLabels)))

In [37]:
trainingSet, testingSet = temp.randomSplit([0.8,0.2], seed=1234)

In [46]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
# from pyspark.ml.classification import LogisticRegression

# # building the model
t0 = time()

logist_model = LogisticRegressionWithLBFGS.train(trainingSet, iterations=10000, numClasses=3)


tt = time() - t0

print('Classifier trained in {} seconds').format(round(tt, 3))


Classifier trained in 2.484 seconds


In [47]:
# # evaluating the model on training data
labelsAndPreds = temp.map(lambda p: (p.label, logist_model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(temp.count())

print('Training Error = ' + str(trainErr))



Training Error = 0.4892


In [48]:
# Testing the accuracy of prediction 
t0 = time()
test_accuracy = labelsAndPreds.filter(lambda (v, p): v == p).count() / float(temp.count())

print('Prediction made in {} seconds. test accuracy is {}').format(round(tt, 3), round(test_accuracy, 4))

Prediction made in 2.484 seconds. test accuracy is 0.5108


In [31]:
# Save and load the model
lrModel = logist_model.predict(temp)

In [49]:
# printing the intercept coefficient
logist_model.intercept

0.0

In [50]:
# printing the weights of the variables
logist_model.weights

DenseVector([0.0356, 0.0544, -0.2298, -0.3428, -0.2695, 0.1208, -0.2128, 0.0737, -0.0644, -0.0148, -0.4695, -0.46, -0.4099, -0.047, -0.3341, -0.078])

In [51]:
# examine testingSet and predicting
testingSet.take(10)

[LabeledPoint(1.0, [0.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0]),
 LabeledPoint(1.0, [2.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0]),
 LabeledPoint(1.0, [0.0,2.0,1.0,0.0,1.0,2.0,0.0,2.0]),
 LabeledPoint(2.0, [0.0,2.0,0.0,1.0,1.0,2.0,1.0,2.0]),
 LabeledPoint(0.0, [2.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0]),
 LabeledPoint(0.0, [0.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0]),
 LabeledPoint(1.0, [1.0,2.0,1.0,0.0,1.0,2.0,0.0,1.0]),
 LabeledPoint(1.0, [2.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0]),
 LabeledPoint(0.0, [1.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0]),
 LabeledPoint(0.0, [1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0])]

In [52]:
# making prediction with sample data
logist_model.predict([1.0,2.0,1.0,0.0,1.0,2.0,0.0,1.0])

0