In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row
from time import time

# One Spark session shared by every cell below, run against the 'local[*]' master.
# NOTE(review): the warehouse directory is an absolute, machine-specific path.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Multiclass Classification: TBank')
    .config("spark.sql.warehouse.dir", "/home/maffsojah/Projects/HIT_400/capstone_project/web/tbank/spark-warehouse")
    .getOrCreate()
)

In [14]:
# Training data: the one-million-row CSV on the local HDFS instance.
# The header row supplies the column names and Spark infers the column types.
# (A commented-out variant of this cell read '../datasets/predict.csv' instead.)
TrainSet = spark.read.csv(
    'hdfs://localhost:9000/user/hduser/datasets/oneMill.csv',
    header='true',
    inferSchema='true'
)

In [15]:
# Register the loaded DataFrame with Spark SQL under the view name "customers".
TrainSet.createOrReplaceTempView("customers")

# Project the ten predictor columns plus Service_Level, which a later cell
# indexes into the classifier's label column.
results = spark.sql(
    "SELECT Gender, Account_Type, Age, Education, Employment, Salary, "
    "Employer_Stability, Customer_Loyalty, Balance, Residential_Status, "
    "Service_Level FROM customers"
)

# Print a preview of the selected rows.
results.show()

+------+---------------+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|Gender|   Account_Type|    Age|           Education|Employment|Salary|Employer_Stability|Customer_Loyalty|Balance|Residential_Status|Service_Level|
+------+---------------+-------+--------------------+----------+------+------------------+----------------+-------+------------------+-------------+
|  Male|Current Account|   60 +|Highschool and below|  Contract| 29633|            Stable|               8|    386|             Owned|          1.0|
|  Male|Current Account|   60 +|Highschool and below|   Student|  5622|            Stable|               0|    386|             Owned|          0.0|
|  Male|Savings Account|   60 +|  Tertiary and above|  Contract|  5622|          Unstable|               5| 269684|             Owned|          0.0|
|Female|Current Account|18 - 35|Highschool and below| Permanent|  5622|            Stable|               5

In [16]:
# Snapshot the raw column names now; a later cell appends them to the
# engineered "label" and "features" columns when selecting the training frame.
cols = list(results.columns)

# Show the schema summary of the query result.
display(results)

DataFrame[Gender: string, Account_Type: string, Age: string, Education: string, Employment: string, Salary: int, Employer_Stability: string, Customer_Loyalty: int, Balance: int, Residential_Status: string, Service_Level: double]

In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Categorical inputs: each one is string-indexed and then one-hot encoded.
categoricalColumns = ["Gender", "Account_Type", "Age", "Education", "Employment", "Employer_Stability", "Residential_Status"]
# Numeric inputs: handed to the assembler as they are.
numericCols = ["Salary", "Customer_Loyalty", "Balance"]

# Always start from the raw input columns. `results` is rebound to the
# transformed frame at the end of this cell, so without this projection a
# second run of the cell would try to create output columns that already exist.
rawInput = results.select(cols)

stages = []  # pipeline stages, in execution order

for categoricalCol in categoricalColumns:
    # Category indexing: map each category string to a numeric index.
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")

    # One-hot encode the index into a binary SparseVector.
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol=categoricalCol + "classVec")

    # Stages are only collected here; they run later, all at once.
    stages += [stringIndexer, encoder]

# Convert the Service_Level target into label indices.
label_stringIdx = StringIndexer(inputCol="Service_Level", outputCol="label")
stages += [label_stringIdx]

# Combine the encoded categoricals and the numeric columns into one vector.
# A list comprehension is used instead of `map(...) + list`, which only works
# on Python 2 where map() returns a list.
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# Build and run the feature pipeline:
# - fit() computes feature statistics as needed
# - transform() actually transforms the features
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(rawInput)
results = pipelineModel.transform(rawInput)

# Keep the engineered columns plus the original inputs for inspection.
selectedcols = ["label", "features"] + cols
TrainingData = results.select(selectedcols)
display(TrainingData)

DataFrame[label: double, features: vector, Gender: string, Account_Type: string, Age: string, Education: string, Employment: string, Salary: int, Employer_Stability: string, Customer_Loyalty: int, Balance: int, Residential_Status: string, Service_Level: double]

In [18]:
# Random 70/30 split into training and held-out test rows; the fixed seed
# makes the split reproducible.
(trainData, testData) = TrainingData.randomSplit([0.7, 0.3], seed=100)

# Row counts of each split. print(...) with a single argument prints the same
# text on Python 2 and Python 3, and matches the print(...) calls used in the
# model-evaluation cell.
print(trainData.count())
print(testData.count())

699367
300633


In [20]:
from pyspark.ml.classification import LogisticRegression, OneVsRest, LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import tempfile

# Multinomial logistic regression on the assembled feature vector.
# (A stray character after the closing parenthesis made this line a syntax error.)
reg = LogisticRegression(labelCol="label", featuresCol="features", maxIter=1000, regParam=0.01, family="multinomial")
regModel = reg.fit(trainData)

# Score the held-out split and preview predictions next to the true labels.
predict = regModel.transform(testData)
predict.select("prediction", "label", "features").show()

# Accuracy of the predictions on the test split.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predict)

# Round-trip the un-fitted estimator through a fresh scratch directory.
temp_path = tempfile.mkdtemp()
reg_path = temp_path + '/reg'
reg.save(reg_path)
# Kept under its own name so that `model2` always refers to a fitted model.
reg2 = LogisticRegression.load(reg_path)
assert reg2.getMaxIter() == reg.getMaxIter()

# Round-trip the fitted model. The path separator was missing before, which
# wrote the model next to the scratch directory instead of inside it.
model_path = temp_path + '/reg_model'
regModel.save(model_path)
model2 = LogisticRegressionModel.load(model_path)

print("Test Error = %g " % (1.0 - accuracy))
print("Accuracy = %g Percent" % (accuracy * 100))
print("Coefficients: \n" + str(regModel.coefficientMatrix))
print("Intercept: " + str(regModel.interceptVector))
# The two checks print 1 when the reloaded model matches the fitted one.
print("coefficientMatrix check = %g " % (regModel.coefficientMatrix[0, 1] == model2.coefficientMatrix[0, 1]))
print("interceptVector check = %g " % (regModel.interceptVector == model2.interceptVector))


+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
|       0.0|  0.0|(12,[0,1,2,4,9,11...|
+----------+-----+--------------------+
only showing top 20 rows

Test Error = 0

In [21]:
# List the evaluator's parameters with their default and current values.
# print(...) with one argument works on both Python 2 and Python 3 and matches
# the print(...) calls in the model-evaluation cell.
print(evaluator.explainParams())

labelCol: label column name. (default: label, current: label)
metricName: metric name in evaluation (f1|weightedPrecision|weightedRecall|accuracy) (default: f1, current: accuracy)
predictionCol: prediction column name. (default: prediction, current: prediction)


In [22]:
# Inspect one fully scored row: the original columns plus rawPrediction,
# probability and prediction.
scoredTest = regModel.transform(testData)
scoredTest.first()

Row(label=0.0, features=SparseVector(12, {0: 1.0, 1: 1.0, 2: 1.0, 4: 1.0, 9: 236.0, 11: 386.0}), Gender=u'Male', Account_Type=u'Current Account', Age=u'60 +', Education=u'Tertiary and above', Employment=u'Permanent', Salary=236, Employer_Stability=u'Stable', Customer_Loyalty=0, Balance=386, Residential_Status=u'Owned', Service_Level=0.0, rawPrediction=DenseVector([6.1675, 0.0231, -6.1905]), probability=DenseVector([0.9979, 0.0021, 0.0]), prediction=0.0)

In [23]:
# Score a prediction set with the reloaded model.
# NOTE(review): `predictData` is not assigned anywhere in the visible notebook
# (it appears only in commented-out lines), so this cell raises NameError on a
# fresh kernel -- confirm where it is meant to come from.
predict = model2.transform(predictData)
# predict.select("prediction", "label", "features").show()

In [24]:
from os.path import join as pjoin

# Persist the estimator and the fitted model inside the Spark warehouse directory.
# NOTE(review): this is an absolute, machine-specific path.
temp_path = "/home/maffsojah/Projects/HIT_400/capstone_project/web/tbank/spark-warehouse"

# write().overwrite() replaces an earlier save, so this cell can be re-run;
# a plain save() raises once the target path exists.
reg_path = pjoin(temp_path, 'reg')
reg.write().overwrite().save(reg_path)

# Joining with pjoin supplies the path separator that was missing before; the
# model used to be written to a sibling path outside the warehouse directory.
model_path = pjoin(temp_path, 'reg_model')
regModel.write().overwrite().save(model_path)

In [25]:
# Reload the un-fitted LogisticRegression estimator saved at reg_path.
# NOTE(review): this rebinds `model2`, which an earlier cell bound to a
# LogisticRegressionModel and another cell calls transform() on -- confirm the
# reuse of the name is intended.
model2 = LogisticRegression.load(reg_path)
# Displayed as the cell result; the recorded output (1000) matches the maxIter
# the estimator was constructed with.
model2.getMaxIter()

1000