In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('prepare_data').getOrCreate()

In [3]:
from pyspark.sql.types import *
from pyspark.sql import Row
from time import time

In [16]:
# load training data

train_data = "../datasets/oneMillTrain.csv"

custRDD = sc.textFile(train_data)

print "Train data size is {}".format(custRDD.count())

Train data size is 1000001


In [17]:
custRDD.take(5)

[u'Customer_ID,Name,Gender,Address,Nationality,Account_Type,Age,Education,Employment,Salary,Employer_Stability,Consistency,Balance,Residential_Status,Service_Level',
 u'0,Diane Houston,Female,679 Morales Lane,Zimbabwean,Savings,0,0,2,1,1,1,2,1,1',
 u'1,Kristen Kelly,Male,5762 Robert Plaza Suite 861,Zimbabwean,Savings,2,1,2,0,1,1,2,1,2',
 u'2,Jeffrey Miller,Female,9865 James Tunnel,Zimbabwean,Current,2,1,0,1,1,1,1,1,1',
 u'3,Timothy Savage,Male,484 Rodriguez Viaduct,Zimbabwean,Current,1,0,1,0,0,1,2,1,0']

In [18]:
# Splitting the lines on the commas on Train data
custRDD = custRDD.map(lambda line: line.split(','))

In [19]:
custRDD.take(2)

[[u'Customer_ID',
  u'Name',
  u'Gender',
  u'Address',
  u'Nationality',
  u'Account_Type',
  u'Age',
  u'Education',
  u'Employment',
  u'Salary',
  u'Employer_Stability',
  u'Consistency',
  u'Balance',
  u'Residential_Status',
  u'Service_Level'],
 [u'0',
  u'Diane Houston',
  u'Female',
  u'679 Morales Lane',
  u'Zimbabwean',
  u'Savings',
  u'0',
  u'0',
  u'2',
  u'1',
  u'1',
  u'1',
  u'2',
  u'1',
  u'1']]

In [20]:
# Stripping the header from the RDD
# passing a filter tells Spark to select all RDD member
header = custRDD.first()
custRDD = custRDD.filter(lambda line:line != header)

In [21]:
custRDD.take(2)

[[u'0',
  u'Diane Houston',
  u'Female',
  u'679 Morales Lane',
  u'Zimbabwean',
  u'Savings',
  u'0',
  u'0',
  u'2',
  u'1',
  u'1',
  u'1',
  u'2',
  u'1',
  u'1'],
 [u'1',
  u'Kristen Kelly',
  u'Male',
  u'5762 Robert Plaza Suite 861',
  u'Zimbabwean',
  u'Savings',
  u'2',
  u'1',
  u'2',
  u'0',
  u'1',
  u'1',
  u'2',
  u'1',
  u'2']]

In [22]:
# mapping evry line from the RDD to a row in the DataFrame
# piping the result with .toDF() to create the DataFrame
# Gender = line[2], Account_Type = line[5], 

df = custRDD.map(lambda line: Row(Age = line[6], Education = line[7], Employment = line[8], Salary = line[9]
                                  , Employer_Stability = line[10], Customer_Loyalty = line[11], Balance = line[12]
                                  , Residential_Status = line[13], Service_Level = line[14])).toDF()

In [23]:
df.take(5)

[Row(Age=u'0', Balance=u'2', Customer_Loyalty=u'1', Education=u'0', Employer_Stability=u'1', Employment=u'2', Residential_Status=u'1', Salary=u'1', Service_Level=u'1'),
 Row(Age=u'2', Balance=u'2', Customer_Loyalty=u'1', Education=u'1', Employer_Stability=u'1', Employment=u'2', Residential_Status=u'1', Salary=u'0', Service_Level=u'2'),
 Row(Age=u'2', Balance=u'1', Customer_Loyalty=u'1', Education=u'1', Employer_Stability=u'1', Employment=u'0', Residential_Status=u'1', Salary=u'1', Service_Level=u'1'),
 Row(Age=u'1', Balance=u'2', Customer_Loyalty=u'1', Education=u'0', Employer_Stability=u'0', Employment=u'1', Residential_Status=u'1', Salary=u'0', Service_Level=u'0'),
 Row(Age=u'1', Balance=u'2', Customer_Loyalty=u'0', Education=u'0', Employer_Stability=u'1', Employment=u'1', Residential_Status=u'0', Salary=u'2', Service_Level=u'1')]

In [24]:
df.show(5)

+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
|Age|Balance|Customer_Loyalty|Education|Employer_Stability|Employment|Residential_Status|Salary|Service_Level|
+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
|  0|      2|               1|        0|                 1|         2|                 1|     1|            1|
|  2|      2|               1|        1|                 1|         2|                 1|     0|            2|
|  2|      1|               1|        1|                 1|         0|                 1|     1|            1|
|  1|      2|               1|        0|                 0|         1|                 1|     0|            0|
|  1|      2|               0|        0|                 1|         1|                 0|     2|            1|
+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
o

In [25]:
df.toPandas().head()

Unnamed: 0,Age,Balance,Customer_Loyalty,Education,Employer_Stability,Employment,Residential_Status,Salary,Service_Level
0,0,2,1,0,1,2,1,1,1
1,2,2,1,1,1,2,1,0,2
2,2,1,1,1,1,0,1,1,1
3,1,2,1,0,0,1,1,0,0
4,1,2,0,0,1,1,0,2,1


In [26]:
favorite_Age = df[df.Age == 0]

In [27]:
favorite_Age.show(5)

+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
|Age|Balance|Customer_Loyalty|Education|Employer_Stability|Employment|Residential_Status|Salary|Service_Level|
+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
|  0|      2|               1|        0|                 1|         2|                 1|     1|            1|
|  0|      1|               0|        0|                 1|         0|                 1|     1|            0|
|  0|      1|               1|        0|                 1|         0|                 0|     0|            0|
|  0|      0|               1|        0|                 0|         2|                 0|     0|            0|
|  0|      1|               1|        1|                 0|         0|                 1|     1|            0|
+---+-------+----------------+---------+------------------+----------+------------------+------+-------------+
o

In [28]:
df.select('Age','Balance').show(10)

+---+-------+
|Age|Balance|
+---+-------+
|  0|      2|
|  2|      2|
|  2|      1|
|  1|      2|
|  1|      2|
|  2|      2|
|  2|      1|
|  0|      1|
|  1|      1|
|  0|      1|
+---+-------+
only showing top 10 rows



In [29]:
df.groupBy("Age").count().show()

+---+------+
|Age| count|
+---+------+
|  0|333024|
|  1|333891|
|  2|333085|
+---+------+



In [99]:
df.describe(['Gender', 'Account_Type', 'Age', 'Balance', 'Customer_Loyalty', 'Education', 'Employer_Stability', 'Employment', 'Residential_Status', 'Salary', 'Service_Level']).show()

+-------+-------+------------+------------------+-----------------+-------------------+-----------------+------------------+------------------+-------------------+------------------+------------------+
|summary| Gender|Account_Type|               Age|          Balance|   Customer_Loyalty|        Education|Employer_Stability|        Employment| Residential_Status|            Salary|     Service_Level|
+-------+-------+------------+------------------+-----------------+-------------------+-----------------+------------------+------------------+-------------------+------------------+------------------+
|  count|1000000|     1000000|           1000000|          1000000|            1000000|          1000000|           1000000|           1000000|            1000000|           1000000|           1000000|
|   mean|   null|        null|          1.000061|         0.999579|           0.499428|         0.500304|          0.499703|          0.999441|           0.499295|           1.00123|          

## Logistic Regression Model with MLlib

In [30]:
from pyspark.mllib.regression import LabeledPoint
from numpy import array

In [32]:
df = df.select('Service_Level', 'Age', 'Balance', 'Customer_Loyalty', 'Education', 'Employer_Stability', 'Employment'
              , 'Residential_Status', 'Salary')



In [39]:
df.describe(['Service_Level', 'Age', 'Balance', 'Customer_Loyalty', 'Education', 'Employer_Stability'
             , 'Employment', 'Residential_Status', 'Salary']).show()

+-------+------------------+------------------+-----------------+-------------------+-----------------+------------------+------------------+-------------------+------------------+
|summary|     Service_Level|               Age|          Balance|   Customer_Loyalty|        Education|Employer_Stability|        Employment| Residential_Status|            Salary|
+-------+------------------+------------------+-----------------+-------------------+-----------------+------------------+------------------+-------------------+------------------+
|  count|           1000000|           1000000|          1000000|            1000000|          1000000|           1000000|           1000000|            1000000|           1000000|
|   mean|          0.495119|          1.000061|         0.999579|           0.499428|         0.500304|          0.499703|          0.999441|           0.499295|           1.00123|
| stddev|0.6652462840118804|0.8161554155849643|0.816237397454008|0.49999992281591665|0.50000015

In [40]:
df.show(5)

+-------------+---+-------+----------------+---------+------------------+----------+------------------+------+
|Service_Level|Age|Balance|Customer_Loyalty|Education|Employer_Stability|Employment|Residential_Status|Salary|
+-------------+---+-------+----------------+---------+------------------+----------+------------------+------+
|            1|  0|      2|               1|        0|                 1|         2|                 1|     1|
|            2|  2|      2|               1|        1|                 1|         2|                 1|     0|
|            1|  2|      1|               1|        1|                 1|         0|                 1|     1|
|            0|  1|      2|               1|        0|                 0|         1|                 1|     0|
|            1|  1|      2|               0|        0|                 1|         1|                 0|     2|
+-------------+---+-------+----------------+---------+------------------+----------+------------------+------+
o

### Labeled Points and Data Scaling

In [41]:
# created a tuple 'temp' of class/ output and a vector of predictors/ features
# called rdd.map on the df to return an RDD of LabeledPoints
# parsing the data

temp = df.rdd.map(lambda line:LabeledPoint(line[0],[line[1:]]))
#temp.take(5)

In [108]:
# Save the file as a parquet file with LabeledPoints

# t0 = time()

# temp.saveAsTextFile('../datasets/training')

# tt = time() - t0

In [112]:
# # load data
# data = spark.read.format("csv").load('../datasets/training/part-00000')

# # making a test/ train split 

# splits = data.randomSplit([0.7, 0.3], 1234)

# trainingSet = splits[0]
# testingSet = splits[1]

In [115]:
# # MultiClass Classification using a MultiLayer Perceptron Neural Network
# from pyspark.ml.classification import MultilayerPerceptronClassifier
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# # Layers:
# # input layers of size 8 (features), two intermediate (hidden) of size 9 and 8
# # and output of size 3 (classes)

# layers = [8, 9, 8, 3]


# # NeuralNet trainer with parameters
# trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)


# # training the model
# t0 = time()

# model = trainer.fit(trainingSet)

# # compute accuracy on test set
# result = model.transform(testingSet)
# predictionAndLabels = result.select('prediction', 'label')
# evaluator = MulticlassClassificationEvaluator(metricName='accuracy')


# tt = time() - t0

# print('MultiLayerPerceptron Classifier trained in {} seconds').format(round(tt, 3))
# print ('Test set accuracy = ' + str(evaluator.evaluate(predictionAndLabels)))

In [42]:
trainingSet, testingSet = temp.randomSplit([0.8,0.2], seed=1234)

In [43]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
# from pyspark.ml.classification import LogisticRegression

# # building the model
t0 = time()

logist_model = LogisticRegressionWithLBFGS.train(trainingSet, iterations=10000, numClasses=3)


tt = time() - t0

print('Classifier trained in {} seconds').format(round(tt, 3))


Classifier trained in 80.026 seconds


In [53]:
# # evaluating the model on test data
labelsAndPreds = testingSet.map(lambda p: (p.label, logist_model.predict(p.features)))

In [54]:
# calculating error 
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(trainingSet.count())

print('Training Error = ' + str(trainErr))

Training Error = 0.114383384498


In [55]:
# Testing the accuracy of prediction 
t0 = time()
test_accuracy = labelsAndPreds.filter(lambda (v, p): v == p).count() / float(testingSet.count())
tt = time() - t0

print('Prediction made in {} seconds. The test accuracy is {}').format(round(tt, 3), round(test_accuracy, 4))

Prediction made in 79.58 seconds. The test accuracy is 0.5414


In [56]:
# Save and load the model
lrModel = logist_model.predict(trainingSet)

In [57]:
# printing the intercept coefficient
logist_model.intercept

0.0

In [58]:
# printing the weights of the variables
logist_model.weights

DenseVector([0.0525, 0.0525, -0.2146, -0.2154, -0.2136, 0.0465, -0.2107, 0.0521, -0.0636, -0.0627, -0.421, -0.412, -0.4163, -0.064, -0.4094, -0.0608])

In [49]:
# examine testingSet and predicting
testingSet.take(10)

[LabeledPoint(1.0, [1.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0]),
 LabeledPoint(0.0, [1.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0]),
 LabeledPoint(1.0, [0.0,2.0,0.0,0.0,1.0,2.0,1.0,1.0]),
 LabeledPoint(1.0, [2.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0]),
 LabeledPoint(2.0, [1.0,2.0,0.0,1.0,0.0,2.0,1.0,2.0]),
 LabeledPoint(2.0, [0.0,2.0,1.0,0.0,1.0,2.0,1.0,2.0]),
 LabeledPoint(0.0, [2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0]),
 LabeledPoint(0.0, [1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0]),
 LabeledPoint(0.0, [2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0]),
 LabeledPoint(0.0, [2.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0])]

In [59]:
# making prediction with sample data
logist_model.predict([0.0,2.0,0.0,0.0,1.0,2.0,1.0,1.0])

0