In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor  # Import Random Forest Regressor 
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.metrics import r2_score

In [2]:
# create a pandas dataframe from the data in GitHub

games = pd.read_csv("https://raw.githubusercontent.com/dmml-heriot-watt/group-coursework-ha/mark-branch/data/games_clean.csv?token=GHSAT0AAAAAACJ3NTU5TFVWE4RA26PX6CLIZKGOGWA")

In [3]:
games.head()

Unnamed: 0,Title,Team,Rating,Number of Reviews,Genres,Plays,Playing,Active Users
0,Elden Ring,Bandai Namco Entertainment,4.5,3900,Adventure,17000,3800,0.22
1,Hades,Supergiant Games,4.3,2900,Adventure,21000,3200,0.15
2,The Legend of Zelda: Breath of the Wild,Nintendo,4.4,4300,Adventure,30000,2500,0.08
3,Undertale,tobyfox,4.2,3500,Adventure,28000,679,0.02
4,Hollow Knight,Team Cherry,4.4,3000,Adventure,21000,2400,0.11


In [4]:
# split dataset into features and target variable

# remove 'Title' from features as its only used for context on which game is being referred to
# remove 'Rating' as thats the target variable

feature_columns = games.loc[:, ~games.columns.isin(['Title','Team','Rating','Genres'])]

In [5]:
print(feature_columns)

      Number of Reviews  Plays  Playing  Active Users
0                  3900  17000     3800          0.22
1                  2900  21000     3200          0.15
2                  4300  30000     2500          0.08
3                  3500  28000      679          0.02
4                  3000  21000     2400          0.11
...                 ...    ...      ...           ...
1110                 94    763        5          0.01
1111                264   1500       49          0.03
1112                210   1100       45          0.04
1113                165    269       79          0.29
1114                184   1700       11          0.01

[1115 rows x 4 columns]


In [6]:
X = feature_columns # features

y = games.Rating # target variable

# Model Testing - Splitting the data - Holdout Method

In [7]:
# split dataset into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 70% training and 30% test

# Build Random Forest Regression Models

In [8]:
# create Random Forest regressor objects

rf1 = RandomForestRegressor(random_state=42)

rf2 = RandomForestRegressor(max_depth=3, random_state=42)
rf3 = RandomForestRegressor(max_depth=8, random_state=42)
rf4 = RandomForestRegressor(max_depth=15, random_state=42)

rf5 = RandomForestRegressor(min_samples_split=3, random_state=42)
rf6 = RandomForestRegressor(min_samples_split=8, random_state=42)
rf7 = RandomForestRegressor(min_samples_split=15, random_state=42)

rf8 = RandomForestRegressor(min_samples_leaf=3, random_state=42)
rf9 = RandomForestRegressor(min_samples_leaf=8, random_state=42)
rf10 = RandomForestRegressor(min_samples_leaf=15, random_state=42)

rf11 = RandomForestRegressor(max_depth=1, min_samples_split=1.0, min_samples_leaf=1, random_state=42)
rf12 = RandomForestRegressor(max_depth=2, min_samples_split=2, min_samples_leaf=2, random_state=42)
rf13 = RandomForestRegressor(max_depth=3, min_samples_split=3, min_samples_leaf=3, random_state=42)

rf14 = RandomForestRegressor(max_depth=15, min_samples_split=15, min_samples_leaf=15, random_state=42)
rf15 = RandomForestRegressor(max_depth=25, min_samples_split=25, min_samples_leaf=25, random_state=42)
rf16 = RandomForestRegressor(max_depth=50, min_samples_split=50, min_samples_leaf=50, random_state=42)

# train Random Forest regressors

rf1 = rf1.fit(X_train,y_train)
rf2 = rf2.fit(X_train,y_train)
rf3 = rf3.fit(X_train,y_train)
rf4 = rf4.fit(X_train,y_train)
rf5 = rf5.fit(X_train,y_train)
rf6 = rf6.fit(X_train,y_train)
rf7 = rf7.fit(X_train,y_train)
rf8 = rf8.fit(X_train,y_train)
rf9 = rf9.fit(X_train,y_train)
rf10 = rf10.fit(X_train,y_train)
rf11 = rf11.fit(X_train,y_train)
rf12 = rf12.fit(X_train,y_train)
rf13 = rf13.fit(X_train,y_train)
rf14 = rf14.fit(X_train,y_train)
rf15 = rf15.fit(X_train,y_train)
rf16 = rf16.fit(X_train,y_train)

# predictions for test dataset

y_pred1 = rf1.predict(X_test)
y_pred2 = rf2.predict(X_test)
y_pred3 = rf3.predict(X_test)
y_pred4 = rf4.predict(X_test)
y_pred5 = rf5.predict(X_test)
y_pred6 = rf6.predict(X_test)
y_pred7 = rf7.predict(X_test)
y_pred8 = rf8.predict(X_test)
y_pred9 = rf9.predict(X_test)
y_pred10 = rf10.predict(X_test)
y_pred11 = rf11.predict(X_test)
y_pred12 = rf12.predict(X_test)
y_pred13 = rf13.predict(X_test)
y_pred14 = rf14.predict(X_test)
y_pred15 = rf15.predict(X_test)
y_pred16 = rf16.predict(X_test)

# Evaulating Model Accuracy

In [9]:
# evaluate the accuracy of the regression models performance

# create dataframes comparing the actual and predicted values for the target variable

y_pred1 = np.round(y_pred1, 1)
y_pred2 = np.round(y_pred2, 1)
y_pred3 = np.round(y_pred3, 1)
y_pred4 = np.round(y_pred4, 1)
y_pred5 = np.round(y_pred5, 1)
y_pred6 = np.round(y_pred6, 1)
y_pred7 = np.round(y_pred7, 1)
y_pred8 = np.round(y_pred8, 1)
y_pred9 = np.round(y_pred9, 1)
y_pred10 = np.round(y_pred10, 1)
y_pred11 = np.round(y_pred11, 1)
y_pred12 = np.round(y_pred12, 1)
y_pred13 = np.round(y_pred13, 1)
y_pred14 = np.round(y_pred14, 1)
y_pred15 = np.round(y_pred15, 1)
y_pred16 = np.round(y_pred16, 1)

accuracy_comparison1 = pd.DataFrame({'Predicted':y_pred1, 'Actual':y_test})
accuracy_comparison2 = pd.DataFrame({'Predicted':y_pred2, 'Actual':y_test})
accuracy_comparison3 = pd.DataFrame({'Predicted':y_pred3, 'Actual':y_test})
accuracy_comparison4 = pd.DataFrame({'Predicted':y_pred4, 'Actual':y_test})
accuracy_comparison5 = pd.DataFrame({'Predicted':y_pred5, 'Actual':y_test})
accuracy_comparison6 = pd.DataFrame({'Predicted':y_pred6, 'Actual':y_test})
accuracy_comparison7 = pd.DataFrame({'Predicted':y_pred7, 'Actual':y_test})
accuracy_comparison8 = pd.DataFrame({'Predicted':y_pred8, 'Actual':y_test})
accuracy_comparison9 = pd.DataFrame({'Predicted':y_pred9, 'Actual':y_test})
accuracy_comparison10 = pd.DataFrame({'Predicted':y_pred10, 'Actual':y_test})
accuracy_comparison11 = pd.DataFrame({'Predicted':y_pred11, 'Actual':y_test})
accuracy_comparison12 = pd.DataFrame({'Predicted':y_pred12, 'Actual':y_test})
accuracy_comparison13 = pd.DataFrame({'Predicted':y_pred13, 'Actual':y_test})
accuracy_comparison14 = pd.DataFrame({'Predicted':y_pred14, 'Actual':y_test})
accuracy_comparison15 = pd.DataFrame({'Predicted':y_pred15, 'Actual':y_test})
accuracy_comparison16 = pd.DataFrame({'Predicted':y_pred16, 'Actual':y_test})

print(accuracy_comparison1)
print(accuracy_comparison2)
print(accuracy_comparison3)
print(accuracy_comparison4)
print(accuracy_comparison5)
print(accuracy_comparison6)
print(accuracy_comparison7)
print(accuracy_comparison8)
print(accuracy_comparison9)
print(accuracy_comparison10)
print(accuracy_comparison11)
print(accuracy_comparison12)
print(accuracy_comparison13)
print(accuracy_comparison14)
print(accuracy_comparison15)
print(accuracy_comparison16)

      Predicted  Actual
265         3.5     3.6
101         3.9     4.3
1045        3.9     3.1
792         4.1     3.3
902         3.0     3.0
...         ...     ...
591         4.1     4.2
65          3.7     4.3
462         3.2     2.5
1002        2.8     3.3
866         3.9     4.2

[335 rows x 2 columns]
      Predicted  Actual
265         3.5     3.6
101         4.0     4.3
1045        3.7     3.1
792         3.7     3.3
902         3.4     3.0
...         ...     ...
591         4.0     4.2
65          3.6     4.3
462         3.5     2.5
1002        3.1     3.3
866         3.8     4.2

[335 rows x 2 columns]
      Predicted  Actual
265         3.5     3.6
101         3.9     4.3
1045        3.8     3.1
792         4.0     3.3
902         3.2     3.0
...         ...     ...
591         4.1     4.2
65          3.7     4.3
462         3.4     2.5
1002        3.0     3.3
866         3.8     4.2

[335 rows x 2 columns]
      Predicted  Actual
265         3.5     3.6
101         3.9 

In [11]:
# calculate R2

print('R2 first experiment:', round(r2_score(y_test, y_pred1),3))
print('R2 second experiment:', round(r2_score(y_test, y_pred2),3))
print('R2 third experiment:', round(r2_score(y_test, y_pred3),3))
print('R2 fourth experiment:', round(r2_score(y_test, y_pred4),3))
print('R2 fifth experiment:', round(r2_score(y_test, y_pred5),3))
print('R2 sixth experiment:', round(r2_score(y_test, y_pred6),3))
print('R2 seventh experiment:', round(r2_score(y_test, y_pred7),3))
print('R2 eigth experiment:', round(r2_score(y_test, y_pred8),3))
print('R2 ninth experiment:', round(r2_score(y_test, y_pred9),3))
print('R2 tenth experiment:', round(r2_score(y_test, y_pred10),3))
print('R2 eleventh experiment:', round(r2_score(y_test, y_pred11),3))
print('R2 twelth experiment:', round(r2_score(y_test, y_pred12),3))
print('R2 thirteenth experiment:', round(r2_score(y_test, y_pred13),3))
print('R2 fourteenth experiment:', round(r2_score(y_test, y_pred14),3))
print('R2 fifteenth experiment:', round(r2_score(y_test, y_pred15),3))
print('R2 sexteenth experiment:', round(r2_score(y_test, y_pred16),3))

R2 first experiment: 0.281
R2 second experiment: 0.286
R2 third experiment: 0.326
R2 fourth experiment: 0.285
R2 fifth experiment: 0.288
R2 sixth experiment: 0.313
R2 seventh experiment: 0.324
R2 eigth experiment: 0.326
R2 ninth experiment: 0.338
R2 tenth experiment: 0.345
R2 eleventh experiment: -0.007
R2 twelth experiment: 0.237
R2 thirteenth experiment: 0.288
R2 fourteenth experiment: 0.345
R2 fifteenth experiment: 0.332
R2 sexteenth experiment: 0.304
