## Load Data

In [2]:
# Install the `wget` package into the user site-packages; the next cell imports it.
!pip install wget --user



In [3]:
import os

import wget

# NOTE(review): the access key is embedded in the URL. If it is not meant to be
# public, read it from an environment variable instead of committing it here.
my_data = 'https://apsportal.ibm.com/exchange-api/v1/entries/8044492073eb964f46597b4be06ff5ea/data?accessKey=9561295fa407698694b1e254d0099600'

# wget.download() never overwrites an existing file: each re-run of this cell
# stores another copy with a " (n)" suffix. Download to a fixed name once and
# reuse the local copy on later runs so the cell is idempotent.
data_file = 'GoSales_Tx_NaiveBayes.csv'
if os.path.exists(data_file):
    data_name = data_file
else:
    data_name = wget.download(my_data, out=data_file)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(data_name)

GoSales_Tx_NaiveBayes (2).csv


### Load the Data into an Apache Spark DataFrame


In [4]:
# Imported explicitly so this cell does not depend on the kernel pre-loading it.
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Read the downloaded CSV file: the first row holds the column names and the
# column types are inferred from the data.
df_data = spark.read\
  .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
  .option('header', 'true')\
  .option('inferSchema', 'true')\
  .load(data_name)


In [5]:
### Data Exploration

In [6]:
# Print every column with its inferred data type and nullability.
df_data.printSchema()

root
 |-- PRODUCT_LINE: string (nullable = true)
 |-- GENDER: string (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- MARITAL_STATUS: string (nullable = true)
 |-- PROFESSION: string (nullable = true)



In [7]:
# Preview the data; show() prints the first 20 rows by default.
df_data.show()

+--------------------+------+---+--------------+------------+
|        PRODUCT_LINE|GENDER|AGE|MARITAL_STATUS|  PROFESSION|
+--------------------+------+---+--------------+------------+
|Personal Accessories|     M| 27|        Single|Professional|
|Personal Accessories|     F| 39|       Married|       Other|
|Mountaineering Eq...|     F| 39|       Married|       Other|
|Personal Accessories|     F| 56|   Unspecified| Hospitality|
|      Golf Equipment|     M| 45|       Married|     Retired|
|      Golf Equipment|     M| 45|       Married|     Retired|
|   Camping Equipment|     F| 39|       Married|       Other|
|   Camping Equipment|     F| 49|       Married|       Other|
|  Outdoor Protection|     F| 49|       Married|       Other|
|      Golf Equipment|     M| 47|       Married|     Retired|
|      Golf Equipment|     M| 47|       Married|     Retired|
|Mountaineering Eq...|     M| 21|        Single|      Retail|
|Personal Accessories|     F| 66|       Married|       Other|
|   Camp

### ML Model

In [8]:
# Randomly split the rows with weights 0.7 / 0.27 / 0.03 into training, testing
# and prediction sets; the seed (24) is fixed.
splitted_data = df_data.randomSplit([0.7, 0.27, 0.03], seed=24)
train_data, test_data, predict_data = splitted_data


In [9]:
# Report the size of each split. print() with one parenthesised argument runs
# on both Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
print("Number of training records: " + str(train_data.count()))
print("Number of testing records : " + str(test_data.count()))
print("Number of prediction records : " + str(predict_data.count()))

Number of training records: 42124
Number of testing records : 16307
Number of prediction records : 1821


### Create Pipeline

In [10]:
#import Spark ML packages
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model


In [11]:
# Convert the string columns to numeric indices with StringIndexer.
# The label indexer is fitted on the full data set here, which makes its
# `labels` list available before the pipeline itself is fitted.
si_label = StringIndexer(inputCol="PRODUCT_LINE", outputCol="label").fit(df_data)

# One indexer per categorical feature; each writes to "<COLUMN>_I".
si_prof, si_gend, si_mar = (
    StringIndexer(inputCol=column, outputCol=column + "_I")
    for column in ("PROFESSION", "GENDER", "MARITAL_STATUS")
)

In [12]:
# Combine the indexed categorical columns and AGE into a single "features" vector.
va_features = VectorAssembler(
    inputCols=["GENDER_I", "AGE", "MARITAL_STATUS_I", "PROFESSION_I"],
    outputCol="features"
)

In [13]:
# Random forest classifier that reads the "features" column and learns the "label" column.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

In [14]:
# Convert the numeric "prediction" back to the original string label, using the
# labels learned by the fitted label indexer.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=si_label.labels)

In [15]:
# Chain the stages in order: indexers -> feature assembler -> classifier -> label converter.
pipeline_rf = Pipeline(stages=[si_label, si_prof, si_gend, si_mar, va_features, rf, labelConverter])

In [16]:
# Fit the whole pipeline on the training split.
model_rf = pipeline_rf.fit(train_data)

In [25]:
# Apply the fitted pipeline to the test split and measure its accuracy.
predictions = model_rf.transform(test_data)
evaluatorRF = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)
accuracy = evaluatorRF.evaluate(predictions)

# The test error is the complement of the accuracy.
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.590605
Test Error = 0.409395


In [18]:
# Show the first 5 rows of the test-set predictions, including the intermediate pipeline columns.
predictions.show(5)

+-----------------+------+---+--------------+-----------+-----+------------+--------+----------------+------------------+--------------------+--------------------+----------+--------------------+
|     PRODUCT_LINE|GENDER|AGE|MARITAL_STATUS| PROFESSION|label|PROFESSION_I|GENDER_I|MARITAL_STATUS_I|          features|       rawPrediction|         probability|prediction|      predictedLabel|
+-----------------+------+---+--------------+-----------+-----+------------+--------+----------------+------------------+--------------------+--------------------+----------+--------------------+
|Camping Equipment|     F| 18|        Single|      Other|  0.0|         0.0|     1.0|             1.0|[1.0,18.0,1.0,0.0]|[5.21023104170881...|[0.26051155208544...|       1.0|Personal Accessories|
|Camping Equipment|     F| 18|        Single|      Other|  0.0|         0.0|     1.0|             1.0|[1.0,18.0,1.0,0.0]|[5.21023104170881...|[0.26051155208544...|       1.0|Personal Accessories|
|Camping Equipment| 

In [19]:
# Count how many test rows were assigned to each predicted label.
(predictions
    .select("predictedLabel")
    .groupBy("predictedLabel")
    .count()
    .show())

+--------------------+-----+
|      predictedLabel|count|
+--------------------+-----+
|   Camping Equipment| 9599|
|      Golf Equipment|  888|
|Mountaineering Eq...| 1109|
|Personal Accessories| 4711|
+--------------------+-----+



In [29]:
import os  # previously missing: os.environ is read below
import sys

import pandas
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode for this notebook.
init_notebook_mode(connected=True)

# Add the user's home directory to the module search path.
# ("".join([x]) on a one-element list is just x, so the wrapper was dropped.)
sys.path.append(os.environ["HOME"])

In [21]:
# Select the prediction and customer columns and convert them to a pandas DataFrame.
predictions_pdf = predictions.select("prediction", "predictedLabel", "GENDER", "AGE", "PROFESSION", "MARITAL_STATUS").toPandas()

In [24]:
# Display only the first rows instead of dumping the whole test-set frame into
# the notebook output.
predictions_pdf.head(10)

Unnamed: 0,prediction,predictedLabel,GENDER,AGE,PROFESSION,MARITAL_STATUS
0,1,Personal Accessories,F,18,Other,Single
1,1,Personal Accessories,F,18,Other,Single
2,1,Personal Accessories,F,18,Retail,Single
3,1,Personal Accessories,F,18,Retail,Single
4,0,Camping Equipment,F,19,Hospitality,Single
5,0,Camping Equipment,F,19,Hospitality,Single
6,0,Camping Equipment,F,19,Hospitality,Single
7,0,Camping Equipment,F,19,Hospitality,Single
8,0,Camping Equipment,F,19,Hospitality,Single
9,0,Camping Equipment,F,19,Hospitality,Single
