In [74]:
#Imports
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# 1.0 Classification Using Decision Tree Model

### Loading Data

In [75]:
col_names = ['rank', 'name', 'platform', 'year', 'genre', 'publisher', 'na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'global_sales']
# load dataset
sales = pd.read_csv("./assets/sales_table.csv", header=None, names=col_names)
# Removing the first row since it has duplicate column names.
sales = sales.tail(-1)
sales.head()

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
0.0,14691,Cossacks: European Wars,PC,2001,Strategy,Strategy First,0.0,0.02,0.0,0.0,0.03
1.0,61,Just Dance 3,Wii,2011,Misc,Ubisoft,6.05,3.15,0.0,1.07,10.26
2.0,69,Just Dance 2,Wii,2010,Misc,Ubisoft,5.84,2.89,0.01,0.78,9.52
3.0,103,Just Dance,Wii,2009,Misc,Ubisoft,3.51,3.03,0.0,0.73,7.27
4.0,112,Just Dance 4,Wii,2012,Misc,Ubisoft,4.14,2.21,0.0,0.56,6.91


In [76]:
# Converting attributes from string to a more relevant data type.
sales['rank'] = sales['rank'].astype(int)
sales['year'] = sales['year'].astype(int)
sales['na_sales'] = sales['na_sales'].astype(float)
sales['eu_sales'] = sales['eu_sales'].astype(float)
sales['jp_sales'] = sales['jp_sales'].astype(float)
sales['other_sales'] = sales['other_sales'].astype(float)
sales['global_sales'] = sales['global_sales'].astype(float)
sales.head()

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
0.0,14691,Cossacks: European Wars,PC,2001,Strategy,Strategy First,0.0,0.02,0.0,0.0,0.03
1.0,61,Just Dance 3,Wii,2011,Misc,Ubisoft,6.05,3.15,0.0,1.07,10.26
2.0,69,Just Dance 2,Wii,2010,Misc,Ubisoft,5.84,2.89,0.01,0.78,9.52
3.0,103,Just Dance,Wii,2009,Misc,Ubisoft,3.51,3.03,0.0,0.73,7.27
4.0,112,Just Dance 4,Wii,2012,Misc,Ubisoft,4.14,2.21,0.0,0.56,6.91


In [77]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 2338 entries, 0.0 to 2337.0
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   rank          2338 non-null   int64  
 1   name          2338 non-null   object 
 2   platform      2338 non-null   object 
 3   year          2338 non-null   int64  
 4   genre         2338 non-null   object 
 5   publisher     2338 non-null   object 
 6   na_sales      2338 non-null   float64
 7   eu_sales      2338 non-null   float64
 8   jp_sales      2338 non-null   float64
 9   other_sales   2338 non-null   float64
 10  global_sales  2338 non-null   float64
dtypes: float64(5), int64(2), object(4)
memory usage: 219.2+ KB


In [78]:
# Encoding the publisher and platform attributes for training the model.
sales.platform = pd.Categorical(pd.factorize(sales.platform)[0])
sales.publisher = pd.Categorical(pd.factorize(sales.publisher)[0])
sales.genre = pd.Categorical(pd.factorize(sales.genre)[0])
sales.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 2338 entries, 0.0 to 2337.0
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   rank          2338 non-null   int64   
 1   name          2338 non-null   object  
 2   platform      2338 non-null   category
 3   year          2338 non-null   int64   
 4   genre         2338 non-null   category
 5   publisher     2338 non-null   category
 6   na_sales      2338 non-null   float64 
 7   eu_sales      2338 non-null   float64 
 8   jp_sales      2338 non-null   float64 
 9   other_sales   2338 non-null   float64 
 10  global_sales  2338 non-null   float64 
dtypes: category(3), float64(5), int64(2), object(1)
memory usage: 173.0+ KB


### Feature Selection

In [79]:
#split dataset in features and target variable
feature_cols = ['rank', 'publisher','year', 'genre', 'na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'global_sales']
X = sales[feature_cols] # Features
y = sales.platform # Target variable

### Spitting the Data

In [80]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

### Building Decision Tree Model

In [81]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

### Evaluation of the Decision Tree Model

In [84]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
pre = precision_score(y_test, y_pred, average='micro')
re = recall_score(y_test, y_pred, average='micro')
print("Precision:", pre)
print("Recall:", re)

Accuracy: 0.45014245014245013
Precision: 0.45014245014245013
Recall: 0.45014245014245013


# 2.0 Classification Using Gradient Boosting

### Building Gradient Boosting Model

In [85]:
gradient_booster = GradientBoostingClassifier(learning_rate=0.1)
gradient_booster.fit(X_train,y_train)


### Evaluation of the Gradient Boosting Model

In [86]:
print(classification_report(y_test,gradient_booster.predict(X_test)))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78        89
           1       0.27      0.26      0.27        65
           2       0.41      0.43      0.42        67
           3       0.67      0.60      0.63        97
           4       0.49      0.54      0.51       114
           5       0.83      0.83      0.83       106
           6       0.20      0.33      0.25         3
           7       0.29      0.13      0.18        15
           8       0.37      0.45      0.41        22
           9       0.21      0.44      0.28        16
          10       0.50      0.21      0.30        33
          11       0.84      0.93      0.88        41
          12       0.00      0.00      0.00         4
          13       0.78      0.67      0.72        27
          14       0.00      0.00      0.00         3

    accuracy                           0.58       702
   macro avg       0.44      0.44      0.43       702
weighted avg       0.59   

# 3.0 Classification Using Random Forest Algorithm

### Building Random Forest Alogrithm Model

In [89]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

### Evaluation of the Gradient Boosting Mode

In [92]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='macro')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5712250712250713
Precision: 0.5712250712250713
Recall: 0.40499613802540024
