In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Multiclass Classification: TBank').getOrCreate()

In [4]:
from pyspark.sql.types import *
from pyspark.sql import Row
from time import time

In [5]:
# Load train data : 1 million rows
TrainSet = spark.read.csv('../datasets/oneMill.csv', header='true', inferSchema='true')

# Load test data: 1 thousand rows
TestSet = spark.read.csv('../datasets/testData.csv', header='true', inferSchema='true')

In [6]:
# Creates a temporary view using the DataFrames
## Train View
TrainSet.createOrReplaceTempView("customers")

## Test View
TestSet.createOrReplaceTempView("testing")


# SQL can be run over DataFrames that have been registered as a table.
## Train
results = spark.sql("SELECT Age, Education, Employment, Salary, Employer_Stability, Customer_Loyalty, Balance, Residential_Status, Service_Level FROM customers")


## Test
tests = spark.sql("SELECT Age, Education, Employment, Salary, Employer_Stability, Customer_Loyalty, Balance, Residential_Status, Service_Level FROM testing")


#results = spark.sql("SELECT Gender, Account_Type, Age, Education, Employment, Salary, Employer_Stability, Customer_Loyalty, Balance, Residential_Status, Service_Level FROM customers")
results.show()

+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|    Age|           Education|Employment|Salary|Employer_Stability|Customer_Loyalty|Balance|Residential_Status|Service_Level|
+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|   60 +|Highschool and below|  Contract| 29633|            Stable|               8|    386|             Owned|          1.0|
|   60 +|Highschool and below|   Student|  5622|            Stable|               0|    386|             Owned|          0.0|
|   60 +|  Tertiary and above|  Contract|  5622|          Unstable|               5| 269684|             Owned|          0.0|
|18 - 35|Highschool and below| Permanent|  5622|            Stable|               5| 269684|             Owned|          1.0|
|   60 +|Highschool and below|  Contract|  5622|          Unstable|               9|   8024|             Owned|       

In [50]:
#spark.sql("DROP TABLE IF EXISTS customers")

In [49]:
# spark.sql("CREATE TABLE IF NOT EXISTS customers (Customer_ID DOUBLE, Name STRING, Gender STRING, Address STRING, Nationality DOUBLE, Account_Type STRING, Age STRING, Education STRING, Employment STRING, Salary DOUBLE, Employer_Stability STRING, Customer_Loyalty DOUBLE, Balance DOUBLE, Residential_Status STRING, Service_Level STRING)")
# spark.sql("LOAD DATA LOCAL INPATH '../datasets/dummyTrain.csv' INTO TABLE customers")

# spark.sql("CREATE TABLE customers (Gender STRING, Account_Type STRING, Age STRING, Education STRING, Employment STRING, Salary DOUBLE, Employer_Stability STRING, Customer_Loyalty DOUBLE, Balance DOUBLE, Residential_Status STRING, Service_Level STRING)")
# spark.sql("LOAD DATA LOCAL INPATH '../datasets/dummyTrain.csv' INTO TABLE customers")

# USING com.databricks.spark.csv
#OPTIONS (path "../datasets/dummyTrain.csv", header "true")

In [7]:
#results = spark.table("customers")
cols = results.columns


## tests columns
testcols = tests.columns

In [8]:
display(results)

DataFrame[Age: string, Education: string, Employment: string, Salary: int, Employer_Stability: string, Customer_Loyalty: int, Balance: int, Residential_Status: string, Service_Level: double]

In [9]:
results.show(5)

+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|    Age|           Education|Employment|Salary|Employer_Stability|Customer_Loyalty|Balance|Residential_Status|Service_Level|
+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|   60 +|Highschool and below|  Contract| 29633|            Stable|               8|    386|             Owned|          1.0|
|   60 +|Highschool and below|   Student|  5622|            Stable|               0|    386|             Owned|          0.0|
|   60 +|  Tertiary and above|  Contract|  5622|          Unstable|               5| 269684|             Owned|          0.0|
|18 - 35|Highschool and below| Permanent|  5622|            Stable|               5| 269684|             Owned|          1.0|
|   60 +|Highschool and below|  Contract|  5622|          Unstable|               9|   8024|             Owned|       

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = ["Age","Education", "Employment", "Employer_Stability", "Residential_Status"]
stages = [] # stages in the pipeline

# "Gender","Account_Type",

for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
    # Using OneHotEncoder to convert categorical variables into binary SparseVectors
    #encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol=categoricalCol+"classVec")
    # Adding the stages: will be run all at once later on
    stages += [stringIndexer, encoder]

In [11]:
# convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol = "Service_Level", outputCol = "label")
stages += [label_stringIdx]

In [12]:
# Transform all features into a vector using VectorAssembler
numericCols = ["Salary", "Customer_Loyalty", "Balance"]
assemblerInputs = map(lambda c: c + "classVec", categoricalColumns) + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [13]:
# Creating a Pipeline for Training
pipeline = Pipeline(stages=stages)
# Running the feature transformations.
# - fit() computes feature statistics as needed
# - transform() actually transforms the features
pipelineModel = pipeline.fit(results)
results = pipelineModel.transform(results)

# Keep relevant columns
selectedcols = ["label", "features"] + cols
TrainingData = results.select(selectedcols)
display(TrainingData)



DataFrame[label: double, features: vector, Age: string, Education: string, Employment: string, Salary: int, Employer_Stability: string, Customer_Loyalty: int, Balance: int, Residential_Status: string, Service_Level: double]

In [14]:
#### Creating a Pipeline for Testing

pipeline = Pipeline(stages=stages)
# Running the feature transformations.
# - fit() computes feature statistics as needed
# - transform() actually transforms the features
pipelineModel = pipeline.fit(tests)
tests = pipelineModel.transform(tests)

# Keep relevant columns
selectedcols = ["label", "features"] + testcols
TestingData = tests.select(selectedcols)
display(TestingData)

DataFrame[label: double, features: vector, Age: string, Education: string, Employment: string, Salary: int, Employer_Stability: string, Customer_Loyalty: int, Balance: int, Residential_Status: string, Service_Level: double]

In [15]:
### Splitting data randomly into training and test sets. set seed for reproducibility
# (trainingData, testData) = results.randomSplit([0.7, 0.3], seed = 100)
# print trainingData.count()
# print testData.count()

print TrainingData.count()
print TestingData.count()

1000000
1000


In [16]:
TestingData.show()

+-----+--------------------+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|label|            features|    Age|           Education|Employment|Salary|Employer_Stability|Customer_Loyalty|Balance|Residential_Status|Service_Level|
+-----+--------------------+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|  0.0|[1.0,0.0,1.0,1.0,...|36 - 59|  Tertiary and above|  Contract|   886|          Unstable|               4|   2789|            Rented|          0.0|
|  0.0|(10,[1,6,7,8,9],[...|   60 +|Highschool and below|   Student|  4627|            Stable|               7|   2333|            Rented|          0.0|
|  0.0|[0.0,1.0,1.0,0.0,...|   60 +|  Tertiary and above|   Student| 24001|          Unstable|               2|   2789|            Rented|          0.0|
|  0.0|[1.0,0.0,1.0,1.0,...|36 - 59|  Tertiary and above|  Contract| 24001|       

In [34]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# reg=100.0 
# lr=LogisticRegression(labelCol="label", featuresCol="features", maxIter=500) 

# paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.02,0.01,0.2,0.1]).addGrid(lr.elasticNetParam, [0.0, 1.0]).build() 

# crossval = CrossValidator(estimator=lr, 
#                           estimatorParamMaps=paramGrid, 
#                           evaluator=BinaryClassificationEvaluator(), 
#                           numFolds=3) 
 
## initial Logistic Regression model
#lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=100, regParam=0.3, elasticNetParam=0.8)
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=1000, regParam=0.1, elasticNetParam=0)

## Training model with Training Data
lrModel = lr.fit(TrainingData)
#lrModel = crossval.fit(TrainingData)

# Make predictions.
predict = lrModel.transform(TestingData)

# Select example rows to display.
predict.select("prediction", "label", "features").show(100)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predict)
print("Test Error = %g " % (1.0 - accuracy))


print("Accuracy = %g Percent" % (accuracy * 100))

# Print the coefficients and intercept for multinomial logistic regression comment
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

#spark.stop()

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,6,7,9],[...|
|       0.0|  0.0|(10,[1,4,6,7,9],[...|
|       0.0|  0.0|(10,[1,4,6,7,9],[...|
|       0.0|  0.0|(10,[1,4,6,7,9],[...|
|       0.0|  0.0|(10,[1,4,6,7,9],[...|
|       0.0|  0.0|(10,[1,4,6,7,9],[...|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|
|       0.0|  0.0|(10,[1,4,7,9],[1....|


In [18]:
print evaluator.explainParams()

labelCol: label column name. (default: label, current: label)
metricName: metric name in evaluation (f1|weightedPrecision|weightedRecall|accuracy) (default: f1, current: accuracy)
predictionCol: prediction column name. (default: prediction, current: prediction)


In [19]:
lrModel.transform(TestingData).first()

Row(label=0.0, features=DenseVector([1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 886.0, 4.0, 2789.0]), Age=u'36 - 59', Education=u'Tertiary and above', Employment=u'Contract', Salary=886, Employer_Stability=u'Unstable', Customer_Loyalty=4, Balance=2789, Residential_Status=u'Rented', Service_Level=0.0, rawPrediction=DenseVector([6.0866, 0.4766, -6.5631]), probability=DenseVector([0.9963, 0.0036, 0.0]), prediction=0.0)

In [21]:
# Making predictions on test data using the transform() method.
# LogisticRegression.transform() only uses the 'features' column
predictions = lrModel.transform(TestingData)

In [22]:
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- Age: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Employment: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Employer_Stability: string (nullable = true)
 |-- Customer_Loyalty: integer (nullable = true)
 |-- Balance: integer (nullable = true)
 |-- Residential_Status: string (nullable = true)
 |-- Service_Level: double (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [23]:
# View model's predictions and probabilities of each prediction class
# Can select any columns in the schema to view as well. 
selected = predictions.select("label", "prediction", "probability", "Age", "Employment")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, Age: string, Employment: string]

In [24]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

AttributeError: 'CrossValidatorModel' object has no attribute 'coefficientMatrix'

In [81]:
# evaluating the model on test data
# labelsAndPreds = testData.map(lambda p: (p.label, lr.predict(p.features)))

In [25]:
### MultiLayerPerceptron Model

from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Split the data into train and test
# splits = results.randomSplit([0.6, 0.4], 1234)
# train = splits[0]
# test = splits[1]

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [8, 9, 8, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(TrainingData)


In [26]:
model.layers

[8, 9, 8, 3]

In [27]:
model.weights.size

188

In [30]:
# compute accuracy on the test set
MlTest = model.transform(TestingData)
predictionAndLabels = MlTest.select("prediction", "label")
# evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

In [32]:
## Decison Tree Classifier
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(results)
#labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(tests)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=3).fit(results)
    
# featureIndexer =\
#     VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=3).fit(tests)
    

# Split the data into training and test sets (30% held out for testing)
(TrainingData, TestingData) = results.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
Tmodel = pipeline.fit(TrainingData)

# Make predictions.
predictions = Tmodel.transform(TestingData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(10000)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = Tmodel.stages[2]
# summary only
print(treeModel)
print("The accuracy is %g Percent" % (accuracy * 100))

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0|         0.0|(10,[1,4,6,7,9],[...|
|       0.0|         0.0|(10,[1,4,6,7,9],[...|
|       0.0|         0.0|(10,[1,4,6,7,9],[...|
|       0.0|         0.0|(10,[1,4,6,7,9],[...|
|       0.0|         0.0|(10,[1,4,6,7,9],[...|
|       0.0|         0.0|(10,[1,4,6,7,9],[...|
|       0.0|         0.0|(10,[1,4,7,9],[1....|
|       0.0| 