In [1]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer,VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
from sklearn.metrics import classification_report
from IPython.display import display
import pyspark.sql.functions as F
import pandas as pd
from data_processing import *

# Decision Tree Algorithm 
This function trains a Decision Tree model using the input DataFrame, evaluates the model's performance using a classification report and a confusion matrix, and returns the predictions made by the model on the test data.

In [2]:
# Create a sample DataFrame
df = sampled_data()

# Apply string indexer
indexer = StringIndexer(inputCol="label_position", outputCol="label_position_index")
df = indexer.fit(df).transform(df)

# Drop the label_position column to only get the features
list_of_features = df.drop("label_position").drop("label_position_index").columns
assembler = VectorAssembler(inputCols=list_of_features, outputCol="indexed_features")
df = assembler.transform(df)

# Display the DataFrame
sample = df.take(5)
sample = pd.DataFrame(sample, columns=sample[0].__fields__)
display(sample)

Unnamed: 0,pace,shooting,passing,dribbling,defending,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,...,power_long_shots,mentality_interceptions,mentality_positioning,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,label_position,label_position_index,indexed_features
0,82,49,64,69,67,65,36,48,68,46,...,57,70,70,47,65,72,69,Defender,0.0,"[82.0, 49.0, 64.0, 69.0, 67.0, 65.0, 36.0, 48...."
1,70,37,42,55,64,37,31,63,57,34,...,27,62,31,34,61,68,66,Defender,0.0,"[70.0, 37.0, 42.0, 55.0, 64.0, 37.0, 31.0, 63...."
2,77,60,61,63,55,63,59,54,60,52,...,62,54,59,57,56,54,53,Defender,0.0,"[77.0, 60.0, 61.0, 63.0, 55.0, 63.0, 59.0, 54...."
3,79,31,50,61,58,62,27,46,55,33,...,35,55,56,42,57,62,66,Defender,0.0,"[79.0, 31.0, 50.0, 61.0, 58.0, 62.0, 27.0, 46...."
4,67,35,54,58,58,58,27,56,52,34,...,35,57,50,41,60,58,56,Defender,0.0,"[67.0, 35.0, 54.0, 58.0, 58.0, 58.0, 27.0, 56...."


# Training Decision Tree Model
To train the decision tree model, the dataset is split into a training set and a test set. The training set is created by using 2/3 of the dataset, while the remaining 1/3 is used for testing. The training set contains all the features of the dataset, including the target variable "label_position_index". A subset of the training set is displayed in the table below.

In [3]:
trainingData, testData = df.randomSplit([0.67, 0.33], 24)
dt = DecisionTreeClassifier(labelCol="label_position_index", featuresCol="indexed_features", impurity="entropy",
                                maxDepth=15)
sample= trainingData.take(10)
sample= pd.DataFrame(sample, columns=sample[0].__fields__)
display(sample)

Unnamed: 0,pace,shooting,passing,dribbling,defending,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,...,power_long_shots,mentality_interceptions,mentality_positioning,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,label_position,label_position_index,indexed_features
0,28,40,47,38,71,42,32,75,50,28,...,49,71,33,50,69,71,70,Defender,0.0,"[28.0, 40.0, 47.0, 38.0, 71.0, 42.0, 32.0, 75...."
1,29,28,59,41,75,32,28,77,72,25,...,24,78,36,42,76,75,66,Defender,0.0,"[29.0, 28.0, 59.0, 41.0, 75.0, 32.0, 28.0, 77...."
2,30,33,44,36,71,19,25,73,57,23,...,21,66,35,44,75,70,71,Defender,0.0,"[30.0, 33.0, 44.0, 36.0, 71.0, 19.0, 25.0, 73...."
3,30,57,49,44,69,39,52,70,53,40,...,57,71,45,80,70,69,60,Defender,0.0,"[30.0, 57.0, 49.0, 44.0, 69.0, 39.0, 52.0, 70...."
4,31,32,39,39,51,45,23,45,43,25,...,36,51,33,40,49,55,49,Defender,0.0,"[31.0, 32.0, 39.0, 39.0, 51.0, 45.0, 23.0, 45...."
5,31,34,49,45,65,46,25,67,55,22,...,22,67,37,55,66,63,63,Defender,0.0,"[31.0, 34.0, 49.0, 45.0, 65.0, 46.0, 25.0, 67...."
6,31,51,59,61,67,60,40,69,58,54,...,66,73,23,48,61,65,72,Defender,0.0,"[31.0, 51.0, 59.0, 61.0, 67.0, 60.0, 40.0, 69...."
7,33,34,51,54,74,38,22,75,63,28,...,38,73,31,35,75,75,68,Defender,0.0,"[33.0, 34.0, 51.0, 54.0, 74.0, 38.0, 22.0, 75...."
8,34,33,35,37,67,32,30,64,39,33,...,28,66,29,40,68,69,58,Defender,0.0,"[34.0, 33.0, 35.0, 37.0, 67.0, 32.0, 30.0, 64...."
9,34,34,34,31,69,25,30,80,35,29,...,16,56,39,49,69,74,73,Defender,0.0,"[34.0, 34.0, 34.0, 31.0, 69.0, 25.0, 30.0, 80...."


In [4]:
sample= testData.take(10)
sample= pd.DataFrame(sample, columns=sample[0].__fields__)
display(sample)

Unnamed: 0,pace,shooting,passing,dribbling,defending,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,...,power_long_shots,mentality_interceptions,mentality_positioning,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,label_position,label_position_index,indexed_features
0,32,34,44,42,63,32,32,68,53,28,...,26,59,28,37,55,70,64,Defender,0.0,"[32.0, 34.0, 44.0, 42.0, 63.0, 32.0, 32.0, 68...."
1,34,29,48,50,68,37,27,79,60,30,...,26,62,41,39,71,70,56,Defender,0.0,"[34.0, 29.0, 48.0, 50.0, 68.0, 37.0, 27.0, 79...."
2,40,23,37,37,69,46,17,62,46,18,...,19,69,23,26,70,71,70,Defender,0.0,"[40.0, 23.0, 37.0, 37.0, 69.0, 46.0, 17.0, 62...."
3,41,27,49,45,73,32,21,68,66,30,...,21,66,24,35,74,78,76,Defender,0.0,"[41.0, 27.0, 49.0, 45.0, 73.0, 32.0, 21.0, 68...."
4,42,33,53,43,68,40,24,71,66,23,...,28,62,39,40,73,71,52,Defender,0.0,"[42.0, 33.0, 53.0, 43.0, 68.0, 40.0, 24.0, 71...."
5,42,47,50,48,74,38,40,81,62,30,...,49,73,58,45,73,75,72,Defender,0.0,"[42.0, 47.0, 50.0, 48.0, 74.0, 38.0, 40.0, 81...."
6,43,29,38,37,63,25,26,72,45,27,...,23,61,28,32,66,60,55,Defender,0.0,"[43.0, 29.0, 38.0, 37.0, 63.0, 25.0, 26.0, 72...."
7,43,32,45,33,66,26,18,69,59,27,...,49,60,25,30,67,69,63,Defender,0.0,"[43.0, 32.0, 45.0, 33.0, 66.0, 26.0, 18.0, 69...."
8,45,37,44,38,65,32,20,69,53,31,...,51,70,40,39,64,63,58,Defender,0.0,"[45.0, 37.0, 44.0, 38.0, 65.0, 32.0, 20.0, 69...."
9,46,55,63,63,67,62,49,72,66,54,...,56,67,60,63,66,66,68,Defender,0.0,"[46.0, 55.0, 63.0, 63.0, 67.0, 62.0, 49.0, 72...."


In [5]:
model = dt.fit(trainingData)
predictions = model.transform(testData)

# Evalution Metrics 
To produce the classification report, we choose the "prediction" and "label_position_index" columns from the "predictions" dataframe. We then convert the resulting DataFrame into a pandas dataframe, which enables us to employ the classification_report() function from the scikit-learn metrics module.

In [6]:
predictions_and_labels_pd = predictions.select("prediction", "label_position_index").toPandas()
class_report_dict = classification_report(predictions_and_labels_pd['label_position_index'],
                                              predictions_and_labels_pd['prediction'], output_dict=True)
class_report_df = pd.DataFrame.from_dict(class_report_dict).transpose()
print("Classification Report:")
print(class_report_df)

Classification Report:
              precision    recall  f1-score     support
0.0            0.883871  0.878205  0.881029  156.000000
1.0            0.815951  0.858065  0.836478  155.000000
2.0            0.744681  0.709459  0.726644  148.000000
accuracy       0.816993  0.816993  0.816993    0.816993
macro avg      0.814834  0.815243  0.814717  459.000000
weighted avg   0.816055  0.816993  0.816204  459.000000


To create the confusion matrix, we pick the "prediction" and "label_position_index" columns from the dataframe, and then we feed the resulting dataframe into the confusionMatrix() function available in the MulticlassMetrics module.

In [7]:
predictions_and_labels = predictions.select("prediction", "label_position_index").rdd
metrics = MulticlassMetrics(predictions_and_labels)
confusion_matrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:")
print(confusion_matrix)



Confusion Matrix:
[[137.   2.  17.]
 [  3. 133.  19.]
 [ 15.  28. 105.]]
