In [96]:
# initialisation
import pandas as pd
import numpy as np
import plotly.express as px
import itertools
import joblib
from scipy.stats import *
from scipy.optimize import *
from pandasgui import show
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

In [97]:
# load the data
raw_data = pd.read_csv('../data/Games Database - Main Page.csv')
#show(raw_data)
#drop fields we won't be needing
raw_data.drop(['Me - Review', 'Percentile', 'Launch Platform(s)', 'Hours Played', 'Completion', 'Year(s) revisited/ continued/ earlier dabbled', 'First Rating', 'Playthroughs'], axis = 1, inplace = True)
raw_data

Unnamed: 0,Game,Rating,Reviewscore,Played on,Launch Year,Play Year,DLC/ Major update played,Publisher,Developer,Franchise,Key words,Remaster Reviewscore,Remaster Release Year,Developer2
0,The Legend of Zelda: Tears of the Kingdom,100,96,Switch,2023,2023,No,Nintendo,Nintendo,The Legend of Zelda,"Adventure, open-world, sandbox, exploration, p...",,,
1,Hi-Fi Rush,97,89,XBS,2023,2023,No,Microsoft,Tango Gameworks,Hi-Fi Rush,"Action, hack and slash, rythm, linear, platfor...",,,
2,Marvel's Spider-Man 2,96,90,PS5,2023,2023,No,Sony,Insomniac,Marvel's Spider-Man,"Story-driven, open-world, superhero, platformi...",,,
3,Cocoon,94,88,XBS,2023,2023,No,Annapurna,Geometric Interactive,Cocoon,"Puzzle, adventure, linear",,,
4,Final Fantasy XVI,92,88,PS5,2023,2023,No,Square Enix,Square Enix,Final Fantasy,"Action-RPG, story-driven, action, third-person",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,Super Metroid,93,96,Switch,1994,2022,Yes,Nintendo,Nintendo,Metroid,"Platformer, metroidvania, 2D",,,
464,Donkey Kong Country,87,93,Switch,1994,2023,Yes,Nintendo,Rare,Donkey Kong,"2D, platformer",,,
465,The Legend of Zelda: A Link to the Past,80,95,Switch,1991,2023,No,Nintendo,Nintendo,The Legend of Zelda,"Adventure, top-down",,,
466,Super Mario World,91,94,Switch,1990,2023,Yes,Nintendo,Nintendo,Super Mario (2D),"2D, platformer",,,


In [98]:
# a bit of basic plotting
fig = px.scatter(raw_data, x="Reviewscore", y="Rating", color="Played on", hover_name="Game")
fig

In [99]:
# target to beat - pearson r relationship between review score and rating
print('Pearson R - ', pearsonr(raw_data['Reviewscore'],raw_data['Rating']))
# this is on all the data

Pearson R -  PearsonRResult(statistic=0.7458807032658183, pvalue=2.865976748298686e-84)


In [100]:
#create set of unique key words from the data frame
str_data = raw_data.copy()
str_data['Key words'] = str_data['Key words'].apply(lambda x: x.split(',')) #convert each entry in key words separated by comma to a list
lst = str_data['Key words'].to_list() #append all lists together
combine = sum(lst, []) #roll the lists into one
remove_space = [s.strip(' ') for s in combine] #remove leading and trailing spaces in list items
capitalize = [s.upper() for s in remove_space] #convert all characters to upper case
kwords = set(capitalize) #remove duplicate list items
kwords = list(kwords)

# add new columns which track the keywords
kw_data = raw_data.copy()
kw_data['Key words'] = kw_data['Key words'].str.upper()
for i in kwords:
    kw_data[i] = kw_data['Key words'].apply(lambda x: 1 if str(i) in x else 0)

In [101]:
#create set of unique platforms from the data frame
str_data = raw_data.copy()
str_data['Played on'] = str_data['Played on'].apply(lambda x: x.split(',')) #convert each entry in key words to a list
lst = str_data['Played on'].to_list() #append all lists together
combine = sum(lst, []) #roll the lists into one
remove_space = [s.strip(' ') for s in combine] #remove leading and trailing spaces in list items
capitalize_plats = [s.upper() for s in remove_space] #convert all characters to upper case
plats = set(capitalize_plats) #remove duplicate list items
plats = list(plats)

# add new columns which track the platforms
data = kw_data.copy()
data['Played on'] = data['Played on'].str.upper()
for i in plats:
    data[i] = data['Played on'].apply(lambda x: 1 if str(i) in x else 0)

In [102]:
# update publisher to group to other if count less than n
counts_p = data['Publisher'].value_counts() < 3 #creates a series with publisher names as index and values as false, for those not meeting condition
data.loc[data['Publisher'].isin(counts_p.index[counts_p]) == True, 'Publisher'] = 'Other' #sets publisher = other if not in counts
#notes - isin(counts.index[counts]) returns false for all publishers not meeting condition, true for those meeting the condition

#update developer to group to other if count less than n
counts_d = data['Developer'].value_counts() < 2 #creates a series with publisher names as index and values as false, for those not meeting condition
data.loc[data['Developer'].isin(counts_d.index[counts_d]) == True, 'Developer'] = 'Other' #sets publisher = other if not in counts

#update franchise to group to other if count less than n
counts_f = data['Franchise'].value_counts() < 2 #creates a series with publisher names as index and values as false, for those not meeting condition
data.loc[data['Franchise'].isin(counts_f.index[counts_f]) == True, 'Franchise'] = 'Other' #sets publisher = other if not in counts


In [103]:
#only use one of this or the next one
'''
#replace publisher string with its average review score
publisher_avg = (data[['Publisher','Reviewscore']]).groupby(['Publisher']).mean()['Reviewscore']
data = pd.merge(data, publisher_avg, on = 'Publisher')
data.rename(columns= {'Reviewscore_y' : 'Publisher avg', 'Reviewscore_x' : 'Reviewscore'}, inplace = True)

#replace developer string with its average review score
developer_avg = (data[['Developer','Reviewscore']]).groupby(['Developer']).mean()['Reviewscore']
data = pd.merge(data, developer_avg, on = 'Developer')
data.rename(columns= {'Reviewscore_y' : 'Developer avg', 'Reviewscore_x' : 'Reviewscore'}, inplace = True)

#replace franchise string with its average review score
franchise_avg = (data[['Franchise','Reviewscore']]).groupby(['Franchise']).mean()['Reviewscore']
data = pd.merge(data, franchise_avg, on = 'Franchise')
data.rename(columns= {'Reviewscore_y' : 'Franchise avg', 'Reviewscore_x' : 'Reviewscore'}, inplace = True)
'''


"\n#replace publisher string with its average review score\npublisher_avg = (data[['Publisher','Reviewscore']]).groupby(['Publisher']).mean()['Reviewscore']\ndata = pd.merge(data, publisher_avg, on = 'Publisher')\ndata.rename(columns= {'Reviewscore_y' : 'Publisher avg', 'Reviewscore_x' : 'Reviewscore'}, inplace = True)\n\n#replace developer string with its average review score\ndeveloper_avg = (data[['Developer','Reviewscore']]).groupby(['Developer']).mean()['Reviewscore']\ndata = pd.merge(data, developer_avg, on = 'Developer')\ndata.rename(columns= {'Reviewscore_y' : 'Developer avg', 'Reviewscore_x' : 'Reviewscore'}, inplace = True)\n\n#replace franchise string with its average review score\nfranchise_avg = (data[['Franchise','Reviewscore']]).groupby(['Franchise']).mean()['Reviewscore']\ndata = pd.merge(data, franchise_avg, on = 'Franchise')\ndata.rename(columns= {'Reviewscore_y' : 'Franchise avg', 'Reviewscore_x' : 'Reviewscore'}, inplace = True)\n"

In [104]:
# one hot encoding of publisher
one_hot = pd.get_dummies(data['Publisher'])
data = data.merge(one_hot, left_index=True, right_index=True)

# one hot encoding of developer
one_hot = pd.get_dummies(data['Developer'])
data = data.merge(one_hot, left_index=True, right_index=True)

# one hot encoding of franchise
one_hot = pd.get_dummies(data['Franchise'])
data = data.merge(one_hot, left_index=True, right_index=True)

In [105]:
# calculate play - launch field
data['Play - launch'] = data['Play Year'] - data['Launch Year']
data['Play - launch'].fillna(0, inplace=True)
# calculate upgraded played field
data.loc[np.isnan(data['Remaster Reviewscore']) == False, 'Remaster played'] = 1
data['Remaster played'].fillna(0, inplace=True)


# calculate latest launch field 
data['Latest launch year'] = data['Remaster Release Year']
data['Latest launch year'].fillna(data['Launch Year'], inplace = True)
# calculate play - latest launch field (remaster)
data['Play - latest launch'] = data['Play Year'] - data['Latest launch year']
data['Play - latest launch'].fillna(0, inplace=True)

# calculate any multiplayer field
data['Key words'] = data['Key words'].str.upper()
data.loc[data['Key words'].str.contains('MULTIPLAYER') == True, 'Any multiplayer'] = 1
data['Any multiplayer'].fillna(0, inplace=True)
# calculate any racing field
data.loc[data['Key words'].str.contains('RACING') == True, 'Any racing'] = 1
data['Any racing'].fillna(0, inplace=True)
# calculate any sports field
data.loc[data['Key words'].str.contains('SPORTS') == True, 'Any sports'] = 1
data['Any sports'].fillna(0, inplace=True)
# calculate any simulation field
data.loc[data['Key words'].str.contains('SIMULATION') == True, 'Any simulation'] = 1
data['Any simulation'].fillna(0, inplace=True)
# calculate any strategy field
data.loc[data['Key words'].str.contains('STRATEGY') == True, 'Any strategy'] = 1
data['Any strategy'].fillna(0, inplace=True)


In [106]:
#Not needed when allowing models to fit y-intercept (offset)
'''
# reduce launch year by the minimum value (minus 1)
min_ly = min(data['Launch Year'])
data['Launch Year'] = data['Launch Year'] - min_ly + 1
# reduce play year by the minimum value (minus 1)
min_py = min(data['Play Year'])
data['Play Year'] = data['Play Year'] - min_py + 1
# reduce latest launch year by the minimum value (minus 1)
min_lly = min(data['Latest launch year'])
data['Latest launch year'] = data['Latest launch year'] - min_lly + 1
'''

"\n# reduce launch year by the minimum value (minus 1)\nmin_ly = min(data['Launch Year'])\ndata['Launch Year'] = data['Launch Year'] - min_ly + 1\n# reduce play year by the minimum value (minus 1)\nmin_py = min(data['Play Year'])\ndata['Play Year'] = data['Play Year'] - min_py + 1\n# reduce latest launch year by the minimum value (minus 1)\nmin_lly = min(data['Latest launch year'])\ndata['Latest launch year'] = data['Latest launch year'] - min_lly + 1\n"

In [107]:
# convert the yes no fields to 1 or 0
data['DLC/ Major update played'] = data['DLC/ Major update played'].replace({'Yes': 1, 'No': 0})

In [108]:
# tidy missing values
data['Remaster Reviewscore'].fillna(0, inplace=True)

In [109]:
#have another look at the data
data

Unnamed: 0,Game,Rating,Reviewscore,Played on,Launch Year,Play Year,DLC/ Major update played,Publisher,Developer,Franchise,...,inFAMOUS,Play - launch,Remaster played,Latest launch year,Play - latest launch,Any multiplayer,Any racing,Any sports,Any simulation,Any strategy
0,The Legend of Zelda: Tears of the Kingdom,100,96,SWITCH,2023,2023,0,Nintendo,Nintendo,The Legend of Zelda,...,0,0,0.0,2023.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Hi-Fi Rush,97,89,XBS,2023,2023,0,Microsoft,Tango Gameworks,Other,...,0,0,0.0,2023.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Marvel's Spider-Man 2,96,90,PS5,2023,2023,0,Sony,Insomniac,Marvel's Spider-Man,...,0,0,0.0,2023.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Cocoon,94,88,XBS,2023,2023,0,Annapurna,Other,Other,...,0,0,0.0,2023.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Final Fantasy XVI,92,88,PS5,2023,2023,0,Square Enix,Square Enix,Final Fantasy,...,0,0,0.0,2023.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,Super Metroid,93,96,SWITCH,1994,2022,1,Nintendo,Nintendo,Metroid,...,0,28,0.0,1994.0,28.0,0.0,0.0,0.0,0.0,0.0
464,Donkey Kong Country,87,93,SWITCH,1994,2023,1,Nintendo,Rare,Donkey Kong,...,0,29,0.0,1994.0,29.0,0.0,0.0,0.0,0.0,0.0
465,The Legend of Zelda: A Link to the Past,80,95,SWITCH,1991,2023,0,Nintendo,Nintendo,Other,...,0,32,0.0,1991.0,32.0,0.0,0.0,0.0,0.0,0.0
466,Super Mario World,91,94,SWITCH,1990,2023,1,Nintendo,Nintendo,Super Mario (2D),...,0,33,0.0,1990.0,33.0,0.0,0.0,0.0,0.0,0.0


In [110]:
# prep for model
X = data.copy()
X.drop(['Rating', 'Game', 'Played on', 'Publisher', 'Developer', 'Franchise', 'Key words', 'Remaster Release Year', 'Developer2'], axis = 1, inplace = True)
y = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [111]:
#initialize and fit models
model1 = GradientBoostingRegressor(random_state=0)
model1.fit(X_train, y_train)
'''
model2 = LinearRegression()
model2.fit(X_train, y_train)
'''
model3 = RandomForestRegressor(random_state=0)
model3.fit(X_train, y_train)

model4 = AdaBoostRegressor(random_state=0)
model4.fit(X_train, y_train)

model5 = DecisionTreeRegressor(random_state=0)
model5.fit(X_train, y_train)

model6 = BayesianRidge()
model6.fit(X_train, y_train)

model7 = Ridge(random_state=0)
model7.fit(X_train, y_train)

model8 = RidgeCV()
model8.fit(X_train, y_train)

model9 = ElasticNet(random_state=0)
model9.fit(X_train, y_train)

model10 = ElasticNetCV(random_state=0)
model10.fit(X_train, y_train)

model11 = Lasso(random_state=0)
model11.fit(X_train, y_train)

model12 = LassoCV(random_state=0)
model12.fit(X_train, y_train)

model13 = VotingRegressor(estimators=[('Lasso', model12), ('ElasticNet', model10), ('Ridge', model8), ('BayesianRidge', model6)])
model13.fit(X_train, y_train)

In [112]:
#output some actual model performance scores
print('Model 1 - Gradient Boosting Regressor')
print('Coefficient of determination - ', model1.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model1.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model1.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model1.predict(X_test)))
print('Std of predictions - ', np.std(model1.predict(X_test)-y_test))
print('')
'''
print('Model 2 - Linear Regressor')
print('Coefficient of determination - ', model2.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model2.predict(X_test)))
'''
print('Model 3 - Random Forest Regressor')
print('Coefficient of determination - ', model3.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model3.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model3.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model3.predict(X_test)))
print('Std of predictions - ', np.std(model3.predict(X_test)-y_test))
print('')

print('Model 4 - AdaBoost Forest Regressor')
print('Coefficient of determination - ', model4.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model4.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model4.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model4.predict(X_test)))
print('Std of predictions - ', np.std(model4.predict(X_test)-y_test))
print('')

print('Model 5 - Decision Tree Regressor')
print('Coefficient of determination - ', model5.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model5.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model5.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model5.predict(X_test)))
print('Std of predictions - ', np.std(model5.predict(X_test)-y_test))
print('')

print('Model 6 - Bayesian Ridge Linear Regressor')
print('Coefficient of determination - ', model6.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model6.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model6.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model6.predict(X_test)))
print('Std of predictions - ', np.std(model6.predict(X_test)-y_test))
print('')

print('Model 7 - Ridge Linear Regressor')
print('Coefficient of determination - ', model7.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model7.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model7.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model7.predict(X_test)))
print('Std of predictions - ', np.std(model7.predict(X_test)-y_test))
print('')

print('Model 8 - RidgeCV Linear Regressor')
print('Coefficient of determination - ', model8.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model8.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model8.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model8.predict(X_test)))
print('Std of predictions - ', np.std(model8.predict(X_test)-y_test))
print('')

print('Model 9 - ElasticNet Linear Regressor')
print('Coefficient of determination - ', model9.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model9.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model9.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model9.predict(X_test)))
print('Std of predictions - ', np.std(model9.predict(X_test)-y_test))
print('')

print('Model 10 - ElasticNetCV Linear Regressor')
print('Coefficient of determination - ', model10.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model10.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model10.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model10.predict(X_test)))
print('Std of predictions - ', np.std(model10.predict(X_test)-y_test))
print('')

print('Model 11 - Lasso Linear Regressor')
print('Coefficient of determination - ', model11.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model11.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model11.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model11.predict(X_test)))
print('Std of predictions - ', np.std(model11.predict(X_test)-y_test))
print('')

print('Model 12 - LassoCV Linear Regressor')
print('Coefficient of determination - ', model12.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model12.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model12.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model12.predict(X_test)))
print('Std of predictions - ', np.std(model12.predict(X_test)-y_test))
print('')

print('Model 13 - Voting Regressor')
print('Coefficient of determination - ', model13.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model13.predict(X_test)))
print('Mean absolute error - ', mean_absolute_error(y_test,model13.predict(X_test)))
print('Root mean square error - ', mean_squared_error(y_test, model13.predict(X_test)))
print('Std of predictions - ', np.std(model13.predict(X_test)-y_test))
print('')

Model 1 - Gradient Boosting Regressor
Coefficient of determination -  0.49373762248492614
Pearson R -  PearsonRResult(statistic=0.7091524034275316, pvalue=3.6473029956741404e-19)
Mean absolute error -  6.691142997309614
Root mean square error -  82.45053652795136
Std of predictions -  9.033362566657457

Model 3 - Random Forest Regressor
Coefficient of determination -  0.4529287353862552
Pearson R -  PearsonRResult(statistic=0.6854919722428168, pvalue=1.5176135249719476e-17)
Mean absolute error -  6.873076923076923
Root mean square error -  89.09672393162391
Std of predictions -  9.397875604502968

Model 4 - AdaBoost Forest Regressor
Coefficient of determination -  0.42728512982162414
Pearson R -  PearsonRResult(statistic=0.687944426384408, pvalue=1.0482977304861878e-17)
Mean absolute error -  7.683183999108679
Root mean square error -  93.27307424170012
Std of predictions -  9.26503098903321

Model 5 - Decision Tree Regressor
Coefficient of determination -  0.09544955432114732
Pearson 

In [113]:
#compare to the pearson r for review avg in test set
print('Pearson R - ', pearsonr(X_test['Reviewscore'],y_test))

#models that have a better pearson R than benchmark:
# 4, 6, 7, 8, 9, 10, 11, 12, 13
#best model = 13, Voting Regressor (6, 8, 10, 12). Best individual = 6 - Bayesian Ridge Linear Regressor

#investigate models
#print(pd.DataFrame(zip(X.columns, model2.coef_)).to_string()) # check coefficients of linearregressor
# linear regressor seems to require scaling to size of output e.g. 0-100

Pearson R -  PearsonRResult(statistic=0.6992519917346369, pvalue=1.8142480190078242e-18)


In [114]:
# save the best model
joblib.dump(model6, 'gbrmodel.joblib')
#save the data
data.to_pickle('../data/processed_data.pkl')
X.to_pickle('../data/X.pkl')
y.to_pickle('../data/y.pkl')
X_test.to_pickle('../data/X_test.pkl')
y_test.to_pickle('../data/y_test.pkl')