In [14]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor  # Import Decision Tree Regressor
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.metrics import r2_score

In [5]:
# create a pandas dataframe from the data in GitHub

games = pd.read_csv("https://raw.githubusercontent.com/dmml-heriot-watt/group-coursework-ha/mark-branch/data/games_clean_ohe.csv?token=GHSAT0AAAAAACJ3NTU4BL3NCACCIWYSIC5MZKGLZPA")

In [6]:
games.head()

Unnamed: 0,Title,Rating,Number of Reviews,Plays,Playing,Active Users,Team_07th Expansion,Team_2K Games,Team_2K Marin,Team_343 Industries,...,Genres_Point-and-Click,Genres_Puzzle,Genres_RPG,Genres_Racing,Genres_Real Time Strategy,Genres_Shooter,Genres_Simulator,Genres_Sport,Genres_Strategy,Genres_Visual Novel
0,Elden Ring,4.5,3900,17000,3800,0.22,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Hades,4.3,2900,21000,3200,0.15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Legend of Zelda: Breath of the Wild,4.4,4300,30000,2500,0.08,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Undertale,4.2,3500,28000,679,0.02,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Hollow Knight,4.4,3000,21000,2400,0.11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# split dataset into features and target variable

# remove 'Title' from features as its only used for context on which game is being referred to
# remove 'Rating' as thats the target variable

feature_columns = games.loc[:, ~games.columns.isin(['Title','Rating'])]

In [8]:
feature_columns

Unnamed: 0,Number of Reviews,Plays,Playing,Active Users,Team_07th Expansion,Team_2K Games,Team_2K Marin,Team_343 Industries,Team_38 Studios,Team_3909,...,Genres_Point-and-Click,Genres_Puzzle,Genres_RPG,Genres_Racing,Genres_Real Time Strategy,Genres_Shooter,Genres_Simulator,Genres_Sport,Genres_Strategy,Genres_Visual Novel
0,3900,17000,3800,0.22,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2900,21000,3200,0.15,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4300,30000,2500,0.08,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3500,28000,679,0.02,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3000,21000,2400,0.11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,94,763,5,0.01,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1111,264,1500,49,0.03,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1112,210,1100,45,0.04,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1113,165,269,79,0.29,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
X = feature_columns # features

y = games.Rating # target variable

In [10]:
# split dataset into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 70% training and 30% test

# Build Decision Tree Regression Model

In [14]:
# create Decision Tree regressor object

dtr = DecisionTreeRegressor(random_state=42)

dtr = dtr.fit(X_train,y_train)

# Grid Search CV

In [24]:
# Using GridSearchCV to try to establish the optimal hyperparameters for the Decision Tree models

from sklearn.model_selection import GridSearchCV

parameters = [{'max_depth': [None, 3, 8, 15, 25, 50, 75], 'min_samples_split': [3, 8, 15, 25, 50, 75], 'min_samples_leaf': [3, 8, 15, 25, 50, 75]}]

grid_search = GridSearchCV(estimator = dtr,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10)

grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_

print(best_parameters)

{'max_depth': None, 'min_samples_leaf': 50, 'min_samples_split': 3}


# Train Decision Tree with Optimal Params from GridSearchCV

In [11]:
dtr_optimal = DecisionTreeRegressor(max_depth=None, min_samples_split=3, min_samples_leaf=50, random_state=42)

dtr_optimal = dtr_optimal.fit(X_train,y_train)

y_pred = dtr_optimal.predict(X_test)

# Evaluate Model Accuracy

In [12]:
y_pred = np.round(y_pred, 1)

accuracy_comparison = pd.DataFrame({'Predicted':y_pred, 'Actual':y_test})

print(accuracy_comparison)

      Predicted  Actual
265         3.4     3.6
101         4.0     4.3
1045        3.7     3.1
792         3.7     3.3
902         3.4     3.0
...         ...     ...
591         4.0     4.2
65          3.7     4.3
462         3.4     2.5
1002        3.1     3.3
866         3.9     4.2

[335 rows x 2 columns]


In [15]:
# calculate R2

print('R2 for DT for optimal params from GridSearchCV:', round(r2_score(y_test, y_pred),3))

R2 for DT for optimal params from GridSearchCV: 0.298
