In [1]:
!apt-get install openjdk-8-jdk-headless

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following packages were automatically installed and are no longer required:
  libnvidia-common-460 nsight-compute-2020.2.0
Use 'apt autoremove' to remove them.
The following additional packages will be installed:
  openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra
  fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhei
  fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 2 newly installed, 0 to remove and 67 not upgraded.
Need to get 36.5 MB of archives.
After this operation, 143 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jre-headless amd64 8u312-b07-0ubuntu1~18.04 [28.2 MB]
Get:2 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jdk-headless

In [2]:
!wget https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz
!tar xf /content/spark-3.2.1-bin-hadoop2.7.tgz
!pip install -q findspark

--2022-05-17 22:19:00--  https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz
Resolving archive.apache.org (archive.apache.org)... 138.201.131.134, 2a01:4f8:172:2ec5::2
Connecting to archive.apache.org (archive.apache.org)|138.201.131.134|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 272637746 (260M) [application/x-gzip]
Saving to: ‘spark-3.2.1-bin-hadoop2.7.tgz’


2022-05-17 22:19:04 (77.2 MB/s) - ‘spark-3.2.1-bin-hadoop2.7.tgz’ saved [272637746/272637746]



In [3]:
import os

# Record where the JDK and the unpacked Spark distribution live so that
# findspark / PySpark can locate them through the environment.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.1-bin-hadoop2.7",
    }
)

In [4]:
import findspark

# Initialise findspark, then display the Spark home it resolved
# (find() is the cell's last expression, so its return value is the cell output).
findspark.init()
findspark.find()

'/content/spark-3.2.1-bin-hadoop2.7'

In [5]:
import pyspark
import numpy as np
import pandas as pd

In [6]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session named 'Buyers' with master 'local[*]'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Buyers')
    .getOrCreate()
)

# Read data from CSV file: the first row is the header, column types are
# inferred, and the literal text NULL is read as a missing value.
# NOTE(review): the download link previously given here,
# https://raw.githubusercontent.com/besherh/BigDataManagement/main/SparkCSV/flights-larger.csv
# names a different file from the one read below; the source of
# combined-data.csv still needs to be documented.
customer_df = spark.read.csv(
    'combined-data.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NULL',
)

In [7]:
# View table structure: column names with the types inferred while reading the CSV
customer_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- userSessionId: integer (nullable = true)
 |-- teamLevel: integer (nullable = true)
 |-- platformType: string (nullable = true)
 |-- count_gameclicks: integer (nullable = true)
 |-- count_hits: integer (nullable = true)
 |-- count_buyId: integer (nullable = true)
 |-- avg_price: double (nullable = true)



In [8]:
# Total number of records (rows) in the loaded DataFrame
customer_df.count()

4619

In [None]:
# Find the count of missing (null) values in every DataFrame column.
from pyspark.sql.functions import col, isnan, when, count

# when(...) without otherwise() yields null for rows that do not match, and
# count() only counts non-null values, so each cell of the one-row result is
# the number of nulls in that column.
# Only isNull() is tested: the frame was read with nullValue='NULL', so missing
# entries arrive as nulls. isnan stays imported but unused.
null_counts = customer_df.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in customer_df.columns]
)
null_counts.show()

In [9]:
# Check for missing values: count the rows that have no null in any column
customer_df.dropna().count()


1411

In [10]:
# Show the rows that have at least 2 non-null values.
# `thresh` overrides `how`, so the how="any" argument that used to be passed
# here was silently ignored; it is removed so the call no longer suggests that
# every row containing a null is dropped. The rows shown are unchanged.
customer_df.na.drop(thresh=2).show()

+------+-------------+---------+------------+----------------+----------+-----------+---------+
|userId|userSessionId|teamLevel|platformType|count_gameclicks|count_hits|count_buyId|avg_price|
+------+-------------+---------+------------+----------------+----------+-----------+---------+
|   812|         5648|        1|     android|              69|         8|       null|     null|
|  1658|         5649|        1|      iphone|              31|         5|       null|     null|
|  1589|         5650|        1|      iphone|              26|         2|       null|     null|
|  1863|         5651|        1|     android|              35|         4|       null|     null|
|   937|         5652|        1|     android|              39|         0|          1|      1.0|
|   342|         5653|        1|     android|              36|         5|       null|     null|
|   849|         5654|        1|      iphone|              40|         5|       null|     null|
|  1277|         5655|        1|     win

In [11]:
from pyspark.ml.feature import Imputer

# Columns to impute; each one gets a companion "<name>_imputed" output column.
impute_cols = ['count_buyId', 'avg_price']

# Configure mean imputation for those columns (nothing is computed yet).
imputer = Imputer(
    strategy="mean",
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(name) for name in impute_cols],
)

In [12]:
 ###Add imputation cols to df
# Fit the imputer on the raw frame, then use the fitted model to append the
# *_imputed columns to that same frame.
imputer_model = imputer.fit(customer_df)
customer_df2 = imputer_model.transform(customer_df)

In [13]:
# Preview the frame with the count_buyId_imputed / avg_price_imputed columns added
customer_df2.show()

+------+-------------+---------+------------+----------------+----------+-----------+---------+-------------------+-----------------+
|userId|userSessionId|teamLevel|platformType|count_gameclicks|count_hits|count_buyId|avg_price|count_buyId_imputed|avg_price_imputed|
+------+-------------+---------+------------+----------------+----------+-----------+---------+-------------------+-----------------+
|   812|         5648|        1|     android|              69|         8|       null|     null|                  1|7.214323175053155|
|  1658|         5649|        1|      iphone|              31|         5|       null|     null|                  1|7.214323175053155|
|  1589|         5650|        1|      iphone|              26|         2|       null|     null|                  1|7.214323175053155|
|  1863|         5651|        1|     android|              35|         4|       null|     null|                  1|7.214323175053155|
|   937|         5652|        1|     android|              39|

In [14]:
# Check the column types after imputation, including the two *_imputed columns
customer_df2.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- userSessionId: integer (nullable = true)
 |-- teamLevel: integer (nullable = true)
 |-- platformType: string (nullable = true)
 |-- count_gameclicks: integer (nullable = true)
 |-- count_hits: integer (nullable = true)
 |-- count_buyId: integer (nullable = true)
 |-- avg_price: double (nullable = true)
 |-- count_buyId_imputed: integer (nullable = true)
 |-- avg_price_imputed: double (nullable = true)



In [15]:
# Create the label: 1 when the row belongs to a big player, 0 for a small one.

# A row counts as a "big player" once count_gameclicks reaches this value.
BIG_PLAYER_MIN_CLICKS = 143

is_big_player = customer_df2['count_gameclicks'] >= BIG_PLAYER_MIN_CLICKS
customer_df_players = customer_df2.withColumn('label', is_big_player.cast('integer'))


In [16]:
# Preview the first 5 rows, now carrying the label column
customer_df_players.show(5)

+------+-------------+---------+------------+----------------+----------+-----------+---------+-------------------+-----------------+-----+
|userId|userSessionId|teamLevel|platformType|count_gameclicks|count_hits|count_buyId|avg_price|count_buyId_imputed|avg_price_imputed|label|
+------+-------------+---------+------------+----------------+----------+-----------+---------+-------------------+-----------------+-----+
|   812|         5648|        1|     android|              69|         8|       null|     null|                  1|7.214323175053155|    0|
|  1658|         5649|        1|      iphone|              31|         5|       null|     null|                  1|7.214323175053155|    0|
|  1589|         5650|        1|      iphone|              26|         2|       null|     null|                  1|7.214323175053155|    0|
|  1863|         5651|        1|     android|              35|         4|       null|     null|                  1|7.214323175053155|    0|
|   937|         565

In [17]:
# Keep every column except count_gameclicks, the column the label was computed from.
kept_columns = [
    "userId", "userSessionId", "teamLevel", "platformType",
    "count_hits", "count_buyId", "avg_price",
    "count_buyId_imputed", "avg_price_imputed", "label",
]
customer_df_players = customer_df_players.select(*kept_columns)

In [None]:
# Preview the first 5 rows after count_gameclicks has been left out
customer_df_players.show(5)

+------+-------------+---------+------------+----------+-----------+---------+-------------------+-----------------+-----+
|userId|userSessionId|teamLevel|platformType|count_hits|count_buyId|avg_price|count_buyId_imputed|avg_price_imputed|label|
+------+-------------+---------+------------+----------+-----------+---------+-------------------+-----------------+-----+
|   812|         5648|        1|     android|         8|       null|     null|                  1|7.214323175053155|    0|
|  1658|         5649|        1|      iphone|         5|       null|     null|                  1|7.214323175053155|    0|
|  1589|         5650|        1|      iphone|         2|       null|     null|                  1|7.214323175053155|    0|
|  1863|         5651|        1|     android|         4|       null|     null|                  1|7.214323175053155|    0|
|   937|         5652|        1|     android|         0|          1|      1.0|                  1|              1.0|    0|
+------+--------

In [18]:
# Categorical transformation of the platformType column to an indexed numerical value

from pyspark.ml.feature import StringIndexer

# Create an indexer that reads platformType and writes platformType_idx
indexer = StringIndexer(inputCol='platformType', outputCol='platformType_idx')

# Fitting lets the indexer identify the categories present in the data
indexer_model = indexer.fit(customer_df_players)


In [19]:
# Indexer creates a new column (platformType_idx) with numeric index values.
# The result is a separate DataFrame; customer_df_players is not reassigned.
# NOTE(review): the VectorAssembler cell further down reads customer_df_players
# and does not list platformType_idx, so this DataFrame appears to be unused.
platformType_idx = indexer_model.transform(customer_df_players)


In [20]:
# Categorical transformation of the teamLevel column to an indexed numerical value.
# NOTE(review): teamLevel is already an integer column and is passed to the
# VectorAssembler un-indexed, so the index built here appears to be unused.

from pyspark.ml.feature import StringIndexer

# Create an indexer (rebinds the `indexer` name used for platformType above)
indexer = StringIndexer(inputCol='teamLevel', outputCol='teamLevel_idx')

# Fitting lets the indexer identify the categories present in the data
indexer_model = indexer.fit(customer_df_players)


In [21]:
# Add the teamLevel_idx column; the result is kept in its own DataFrame
teamLevel_idx = indexer_model.transform(customer_df_players)

In [22]:
# Categorical transformation of the userId column to an indexed numerical value.
# NOTE(review): userId is an identifier, and userId_idx is not listed in the
# VectorAssembler inputs — confirm whether this cell is still needed.

from pyspark.ml.feature import StringIndexer

# Create an indexer that reads userId and writes userId_idx
indexer = StringIndexer(inputCol='userId', outputCol='userId_idx')

# Fit the indexer, then add the userId_idx column to a separate DataFrame
indexer_model = indexer.fit(customer_df_players)
userId_idx = indexer_model.transform(customer_df_players)

In [23]:
# Categorical transformation of the userSessionId column to an indexed numerical value.
# NOTE(review): userSessionId is an identifier, and userSessionId_idx is not
# listed in the VectorAssembler inputs — confirm whether this cell is still needed.

from pyspark.ml.feature import StringIndexer

# Create an indexer that reads userSessionId and writes userSessionId_idx
indexer = StringIndexer(inputCol='userSessionId', outputCol='userSessionId_idx')

# Fit the indexer, then add the userSessionId_idx column to a separate DataFrame
indexer_model = indexer.fit(customer_df_players)
userSessionId_idx = indexer_model.transform(customer_df_players)

In [24]:
# Categorical transformation of the count_buyId_imputed column to an indexed numerical value.
# NOTE(review): the VectorAssembler uses count_buyId_imputed directly, not
# count_buyId_imputed_idx — confirm whether this cell is still needed.

from pyspark.ml.feature import StringIndexer

# Create an indexer that reads count_buyId_imputed and writes count_buyId_imputed_idx
indexer = StringIndexer(inputCol='count_buyId_imputed', outputCol='count_buyId_imputed_idx')

# Fit the indexer, then add the count_buyId_imputed_idx column to a separate DataFrame
indexer_model = indexer.fit(customer_df_players)
count_buyId_imputed_idx = indexer_model.transform(customer_df_players)

In [25]:
# List the columns of customer_df_players (the *_idx columns live in separate DataFrames)
customer_df_players.columns

['userId',
 'userSessionId',
 'teamLevel',
 'platformType',
 'count_hits',
 'count_buyId',
 'avg_price',
 'count_buyId_imputed',
 'avg_price_imputed',
 'label']

In [26]:
# Assemble the predictor columns into a single vector column

from pyspark.ml.feature import VectorAssembler

# Predictors fed to the model.
# NOTE(review): platformType / platformType_idx is not among them — confirm
# that leaving the platform out is intentional.
feature_cols = ['teamLevel', 'count_hits', 'count_buyId_imputed', 'avg_price_imputed']

# Create an assembler object
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Consolidate the predictor columns into 'features'
customer_assembled = assembler.transform(customer_df_players)

# Check the resulting column next to the label
customer_assembled.select('features', 'label').show(5, truncate=False)

+-------------------------------+-----+
|features                       |label|
+-------------------------------+-----+
|[1.0,8.0,1.0,7.214323175053155]|0    |
|[1.0,5.0,1.0,7.214323175053155]|0    |
|[1.0,2.0,1.0,7.214323175053155]|0    |
|[1.0,4.0,1.0,7.214323175053155]|0    |
|[1.0,0.0,1.0,1.0]              |0    |
+-------------------------------+-----+
only showing top 5 rows



Decision Tree
Train/test split

To objectively assess a machine learning model, you need to be able to test it on an independent set of data. You can't use the same data that you used to train the model: of course the model will perform (relatively) well on those data!

You will split the data into two components:

- training data (used to train the model), and
- testing data (used to test the model).


In [27]:
# Split into training and test sets in a 70:30 ratio, with a fixed seed
customers_train, customers_test = customer_assembled.randomSplit([0.7, 0.3], seed=17)

# Check that the training set has around 70% of the records
n_train = customers_train.count()
n_total = customer_assembled.count()
training_ratio = n_train / n_total
print(training_ratio)

0.6958216064083135


In [28]:
# Peek at 2 rows of the test split, including the assembled features column
customers_test.show(2)

+------+-------------+---------+------------+----------+-----------+---------+-------------------+-----------------+-----+--------------------+
|userId|userSessionId|teamLevel|platformType|count_hits|count_buyId|avg_price|count_buyId_imputed|avg_price_imputed|label|            features|
+------+-------------+---------+------------+----------+-----------+---------+-------------------+-----------------+-----+--------------------+
|     0|        23473|        1|      iphone|        28|       null|     null|                  1|7.214323175053155|    1|[1.0,28.0,1.0,7.2...|
|     1|        10041|        3|     android|         9|          2|      3.0|                  2|              3.0|    0|   [3.0,9.0,2.0,3.0]|
+------+-------------+---------+------------+----------+-----------+---------+-------------------+-----------------+-----+--------------------+
only showing top 2 rows



Build a Decision Tree
Now that you've split the customer data into training and testing sets, you can use the training set to fit a Decision Tree model.

In [29]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create a classifier object and fit it to the training data.
# featuresCol / labelCol are spelled out for readability; 'features' and
# 'label' are also the classifier's default column names.
tree = DecisionTreeClassifier(featuresCol='features', labelCol='label')
tree_model = tree.fit(customers_train)


In [36]:
# Display the fitted model's summary line (depth, node count, classes, features)
tree_model

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b75c05678f8c, depth=5, numNodes=39, numClasses=2, numFeatures=4

In [30]:
# Score the test rows, then show the known label beside the predicted class
# and the class probabilities (columns printed without truncation)
prediction = tree_model.transform(customers_test)
prediction.select('label', 'prediction', 'probability').show(5, truncate=False)

+-----+----------+-----------------------------------------+
|label|prediction|probability                              |
+-----+----------+-----------------------------------------+
|1    |1.0       |[0.005454545454545455,0.9945454545454545]|
|0    |0.0       |[0.9925,0.0075]                          |
|0    |0.0       |[0.9137055837563451,0.08629441624365482] |
|0    |0.0       |[0.5764705882352941,0.4235294117647059]  |
|0    |0.0       |[0.9925,0.0075]                          |
+-----+----------+-----------------------------------------+
only showing top 5 rows



Evaluate the Decision Tree
You can assess the quality of your model by evaluating how well it performs on the testing data. Because the model was not trained on these data, this represents an objective assessment of the model.

A confusion matrix gives a useful breakdown of predictions versus known values. It has four cells which represent the counts of:

- True Negatives (TN) — model predicts negative outcome & known outcome is negative
- True Positives (TP) — model predicts positive outcome & known outcome is positive
- False Negatives (FN) — model predicts negative outcome but known outcome is positive
- False Positives (FP) — model predicts positive outcome but known outcome is negative

In [31]:
# Create a confusion matrix: one row per (label, prediction) combination
confusion = prediction.groupBy('label', 'prediction').count()
confusion.show()

# Calculate the elements of the confusion matrix, one SQL condition per cell
cell_conditions = (
    'prediction = 0 AND label = prediction',  # TN
    'prediction = 1 AND label = prediction',  # TP
    'prediction = 0 AND label = 1',           # FN
    'prediction = 1 AND label = 0',           # FP
)
TN, TP, FN, FP = [prediction.filter(cond).count() for cond in cell_conditions]

# Accuracy measures the proportion of correct predictions
correct = TN + TP
accuracy = correct / (correct + FN + FP)
print(accuracy)

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   76|
|    0|       0.0|  934|
|    1|       1.0|  341|
|    0|       1.0|   54|
+-----+----------+-----+

0.9074733096085409


In [32]:
# Importance of each assembled feature, in VectorAssembler input order:
# teamLevel, count_hits, count_buyId_imputed, avg_price_imputed
tree_model.featureImportances

SparseVector(4, {0: 0.0037, 1: 0.9792, 2: 0.0071, 3: 0.0099})