In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('prepare_data').getOrCreate()

In [3]:
from pyspark.sql.types import *
from pyspark.sql import Row
from time import time

In [4]:
dataset = spark.read.csv('../datasets/dummyTrain.csv', header='true', inferSchema='true')

In [5]:
# Creates a temporary view using the DataFrame
dataset.createOrReplaceTempView("customers")

# SQL can be run over DataFrames that have been registered as a table.
results = spark.sql("SELECT Age, Education, Employment, Salary, Employer_Stability, Customer_Loyalty, Balance, Residential_Status, Service_Level FROM customers")
#results = spark.sql("SELECT Gender, Account_Type, Age, Education, Employment, Salary, Employer_Stability, Customer_Loyalty, Balance, Residential_Status, Service_Level FROM customers")
results.show()

+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|    Age|           Education|Employment|Salary|Employer_Stability|Customer_Loyalty|Balance|Residential_Status|Service_Level|
+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|   60 +|Highschool and below| Permanent| 57311|            Stable|               7| 123003|            Rented|          2.0|
|36 - 59|  Tertiary and above| Permanent|   597|        Not Stable|               3|   9443|             Owned|          2.0|
|18 - 35|  Tertiary and above| Permanent|   597|        Not Stable|               3| 123003|            Rented|          2.0|
|36 - 59|  Tertiary and above|  Contract|  6525|            Stable|               9| 123003|             Owned|          2.0|
|36 - 59|Highschool and below|   Student| 57311|        Not Stable|               5|   1350|            Rented|       

In [50]:
#spark.sql("DROP TABLE IF EXISTS customers")

In [49]:
# spark.sql("CREATE TABLE IF NOT EXISTS customers (Customer_ID DOUBLE, Name STRING, Gender STRING, Address STRING, Nationality DOUBLE, Account_Type STRING, Age STRING, Education STRING, Employment STRING, Salary DOUBLE, Employer_Stability STRING, Customer_Loyalty DOUBLE, Balance DOUBLE, Residential_Status STRING, Service_Level STRING)")
# spark.sql("LOAD DATA LOCAL INPATH '../datasets/dummyTrain.csv' INTO TABLE customers")

# spark.sql("CREATE TABLE customers (Gender STRING, Account_Type STRING, Age STRING, Education STRING, Employment STRING, Salary DOUBLE, Employer_Stability STRING, Customer_Loyalty DOUBLE, Balance DOUBLE, Residential_Status STRING, Service_Level STRING)")
# spark.sql("LOAD DATA LOCAL INPATH '../datasets/dummyTrain.csv' INTO TABLE customers")

# USING com.databricks.spark.csv
#OPTIONS (path "../datasets/dummyTrain.csv", header "true")

In [6]:
#results = spark.table("customers")
cols = results.columns

In [7]:
display(results)

DataFrame[Age: string, Education: string, Employment: string, Salary: int, Employer_Stability: string, Customer_Loyalty: int, Balance: int, Residential_Status: string, Service_Level: double]

In [8]:
results.show(5)

+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|    Age|           Education|Employment|Salary|Employer_Stability|Customer_Loyalty|Balance|Residential_Status|Service_Level|
+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|   60 +|Highschool and below| Permanent| 57311|            Stable|               7| 123003|            Rented|          2.0|
|36 - 59|  Tertiary and above| Permanent|   597|        Not Stable|               3|   9443|             Owned|          2.0|
|18 - 35|  Tertiary and above| Permanent|   597|        Not Stable|               3| 123003|            Rented|          2.0|
|36 - 59|  Tertiary and above|  Contract|  6525|            Stable|               9| 123003|             Owned|          2.0|
|36 - 59|Highschool and below|   Student| 57311|        Not Stable|               5|   1350|            Rented|       

In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = ["Age","Education", "Employment", "Employer_Stability", "Residential_Status"]
stages = [] # stages in the pipeline

# "Gender","Account_Type",

for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
    # Using OneHotEncoder to convert categorical variables into binary SparseVectors
    #encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol=categoricalCol+"classVec")
    # Adding the stages: will be run all at once later on
    stages += [stringIndexer, encoder]

In [15]:
# convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol = "Service_Level", outputCol = "label")
stages += [label_stringIdx]

In [16]:
# Transform all features into a vector using VectorAssembler
numericCols = ["Salary", "Customer_Loyalty", "Balance"]
assemblerInputs = map(lambda c: c + "classVec", categoricalColumns) + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [17]:
# Creating a Pipeline
pipeline = Pipeline(stages=stages)
# Running the feature transformations.
# - fit() computes feature statistics as needed
# - transform() actually transforms the features
pipelineModel = pipeline.fit(results)
results = pipelineModel.transform(results)

# Keep relevant columns
selectedcols = ["label", "features"] + cols
results = results.select(selectedcols)
display(results)

IllegalArgumentException: u'requirement failed: Output column label already exists.'

In [19]:
results.show(100)

+-----+--------------------+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|label|            features|    Age|           Education|Employment|Salary|Employer_Stability|Customer_Loyalty|Balance|Residential_Status|Service_Level|
+-----+--------------------+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|  0.0|[1.0,0.0,0.0,0.0,...|   60 +|Highschool and below| Permanent| 57311|            Stable|               7| 123003|            Rented|          2.0|
|  0.0|(10,[2,7,8,9],[1....|36 - 59|  Tertiary and above| Permanent|   597|        Not Stable|               3|   9443|             Owned|          2.0|
|  0.0|[0.0,1.0,1.0,0.0,...|18 - 35|  Tertiary and above| Permanent|   597|        Not Stable|               3| 123003|            Rented|          2.0|
|  0.0|[0.0,0.0,1.0,1.0,...|36 - 59|  Tertiary and above|  Contract|  6525|       

In [20]:
### Splitting data randomly into training and test sets. set seed for reproducibility
(trainingData, testData) = results.randomSplit([0.7, 0.3], seed = 100)
print trainingData.count()
print testData.count()

70173
29827


In [35]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

reg=100.0 
lr=LogisticRegression(labelCol="label", featuresCol="features", maxIter=500) 

paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.02,0.01,0.2,0.1]).addGrid(lr.elasticNetParam, [0.0, 1.0]).build() 

crossval = CrossValidator(estimator=lr, 
                          estimatorParamMaps=paramGrid, 
                          evaluator=BinaryClassificationEvaluator(), 
                          numFolds=3) 
 
## initial Logistic Regression model
#lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=100, regParam=0.3, elasticNetParam=0.8)
#lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=1000, regParam=0.1, elasticNetParam=0)

## Training model with Training Data
#lrModel = lr.fit(trainingData)
lrModel = crossval.fit(trainingData)

# Make predictions.
predict = lrModel.transform(testData)

# Select example rows to display.
predict.select("prediction", "label", "features").show(1000)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predict)
print("Test Error = %g " % (1.0 - accuracy))

# treeModel = lrModel.stages[2]
# # summary only
# print(treeModel)
print("Accuracy = %g Percent" % (accuracy * 100))

# Print the coefficients and intercept for multinomial logistic regression
# print("Coefficients: \n" + str(lrModel.coefficientMatrix))
# print("Intercept: " + str(lrModel.interceptVector))

#spark.stop()

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|
|       0.0|  0.0|(10,[0,2,7,8,9],[...|


In [24]:
print evaluator.explainParams()

labelCol: label column name. (default: label, current: label)
metricName: metric name in evaluation (f1|weightedPrecision|weightedRecall|accuracy) (default: f1, current: accuracy)
predictionCol: prediction column name. (default: prediction, current: prediction)


In [34]:
lrModel.transform(testData).first()

Row(label=0.0, features=SparseVector(10, {0: 1.0, 2: 1.0, 7: 597.0, 8: 3.0, 9: 123003.0}), Age=u'60 +', Education=u'Tertiary and above', Employment=u'Permanent', Salary=597, Employer_Stability=u'Not Stable', Customer_Loyalty=3, Balance=123003, Residential_Status=u'Owned', Service_Level=2.0, rawPrediction=DenseVector([1.374, 0.1996, -1.5736]), probability=DenseVector([0.7345, 0.227, 0.0385]), prediction=0.0)

In [19]:
# Making predictions on test data using the transform() method.
# LogisticRegression.transform() only uses the 'features' column
predictions = lrModel.transform(testData)

In [20]:
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- Age: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Employment: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Employer_Stability: string (nullable = true)
 |-- Customer_Loyalty: integer (nullable = true)
 |-- Balance: integer (nullable = true)
 |-- Residential_Status: string (nullable = true)
 |-- Service_Level: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [21]:
# View model's predictions and probabilities of each prediction class
# Can select any columns in the schema to view as well. 
selected = predictions.select("label", "prediction", "probability", "Age", "Employment")
display(selected)

AttributeError: 'NoneType' object has no attribute '_jvm'

In [18]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

Coefficients: 
DenseMatrix([[ -1.01966765e+00,  -9.29605994e-01,   1.59263680e+00,
               -4.68339287e-02,   1.01933855e+00,  -4.26833573e-01,
               -3.36912828e+00,   9.49289368e-01,  -2.90188441e+00,
                2.00285903e-05,   1.49403389e+00,   4.20976530e-06],
             [ -3.74169879e-02,  -8.45326488e-02,   1.44373891e-01,
                2.04956642e-01,  -5.43927989e-02,   3.89900570e-01,
                5.01293357e-01,  -2.56733990e-02,  -8.22671214e-02,
               -1.47559803e-08,  -1.17629892e-01,  -4.20443723e-08],
             [  1.05708463e+00,   1.01413864e+00,  -1.73701070e+00,
               -1.58122713e-01,  -9.64945748e-01,   3.69330029e-02,
                2.86783492e+00,  -9.23615969e-01,   2.98415153e+00,
               -2.00138343e-05,  -1.37640400e+00,  -4.16772089e-06]])
Intercept: [-2.20139847162,0.96748825639,1.23391021523]


In [81]:
# evaluating the model on test data
# labelsAndPreds = testData.map(lambda p: (p.label, lr.predict(p.features)))

In [39]:
### MultiLayerPerceptron Model

from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Split the data into train and test
splits = results.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [10, 11, 10, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)


In [40]:
model.layers

[10, 11, 10, 3]

In [41]:
model.weights.size

274

In [111]:
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
# evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

In [68]:
## Decison Tree Classifier
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(results)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=3).fit(results)

# Split the data into training and test sets (30% held out for testing)
(trainData, testingData) = results.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
Tmodel = pipeline.fit(trainingData)

# Make predictions.
predictions = Tmodel.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = Tmodel.stages[2]
# summary only
print(treeModel)
print("The accuracy is %g Percent" % (accuracy * 100))

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(12,[0,1,2,9,10,1...|
|       0.0|         0.0|(12,[0,1,2,9,10,1...|
|       0.0|         0.0|(12,[0,1,2,9,10,1...|
|       0.0|         0.0|(12,[0,1,2,9,10,1...|
|       0.0|         0.0|(12,[0,1,2,9,10,1...|
+----------+------------+--------------------+
only showing top 5 rows

Test Error = 0.197836 
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4d589af44c5af0bd3de3) of depth 5 with 61 nodes
The accuracy is 80.2164 Percent
