In [381]:
# initialisation
import pandas as pd
import numpy as np
import plotly.express as px
import itertools
from scipy.stats import *
from pandasgui import show
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [353]:
# load the data
raw_data = pd.read_csv('../data/Games Database - Main Page.csv')
#show(raw_data)
#drop fields we won't be needing
raw_data.drop(['Me - Review', 'Percentile', 'Launch Platform(s)', 'Hours Played', 'Completion', 'Year(s) revisited/ continued/ earlier dabbled'], axis = 1, inplace = True)
raw_data

Unnamed: 0,Game,Rating,Reviewscore,Played on,Launch Year,Play Year,DLC/ Major update played,Publisher,Developer,Franchise,Key words,Remaster Reviewscore,Remaster Release Year,Developer2
0,The Legend of Zelda: Tears of the Kingdom,100,96,Switch,2023,2023,No,Nintendo,Nintendo,The Legend of Zelda,"Adventure, open-world, sandbox, exploration, p...",,,
1,The Legend of Zelda: Breath of the Wild,100,97,Switch,2017,2017,Yes,Nintendo,Nintendo,The Legend of Zelda,"Adventure, open-world, sandbox, exploration, p...",,,
2,God of War (2018),99,94,PS4,2018,2018,No,Sony,Santa Monica Studio,God of War,"Hack and slash, third-person combat, story-driven",,,
3,The Last of Us,99,95,PS4,2013,2014,Yes,Sony,Naughty Dog,The Last of Us,"Horror, TPS, stealth, survival, story-driven, ...",94.0,2014.0,
4,Metroid Prime,99,97,Switch,2002,2023,No,Nintendo,Retro Studios,Metroid,"Metroidvania, FPS, platforming, exploration",94.0,2023.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,Fifa 06 Game Boy Advance,44,49,GBA,2005,2005,No,EA,EA sports,Fifa,Sports (football),,,
434,Cruis'n Blast,42,73,Switch,2021,2021,No,Indie,Raw Thrills,Cruise'n Blast,Racing (arcade),,,
435,Fifa 14 Legacy Edition,38,38,Vita,2013,2013,No,EA,EA Sports,Fifa,Sports (football),,,
436,Sunlight,34,78,PC,2021,2023,No,Indie,Krillbite Studio,Sunlight,Adventure,,,


In [354]:
# a bit of basic plotting
fig = px.scatter(raw_data, x="Reviewscore", y="Rating", color="Played on", hover_name="Game")
fig

In [355]:
#create set of unique key words from the data frame
str_data = raw_data.copy()
str_data['Key words'] = str_data['Key words'].apply(lambda x: x.split(',')) #convert each entry in key words to a list
lst = str_data['Key words'].to_list() #append all lists together
combine = sum(lst, []) #roll the lists into one
remove_space = [s.strip(' ') for s in combine] #remove leading and trailing spaces in list items
capitalize = [s.upper() for s in remove_space] #convert all characters to upper case
kwords = set(capitalize) #remove duplicate list items
kwords = list(kwords)

# add new columns which track the keywords
kw_data = raw_data.copy()
kw_data['Key words'] = kw_data['Key words'].str.upper()
for i in kwords:
    kw_data[i] = kw_data['Key words'].apply(lambda x: 1 if str(i) in x else 0)

In [356]:
#create set of unique platforms from the data frame
str_data = raw_data.copy()
str_data['Played on'] = str_data['Played on'].apply(lambda x: x.split(',')) #convert each entry in key words to a list
lst = str_data['Played on'].to_list() #append all lists together
combine = sum(lst, []) #roll the lists into one
remove_space = [s.strip(' ') for s in combine] #remove leading and trailing spaces in list items
capitalize_plats = [s.upper() for s in remove_space] #convert all characters to upper case
plats = set(capitalize_plats) #remove duplicate list items
plats = list(plats)

# add new columns which track the keywords
data = kw_data.copy()
data['Played on'] = data['Played on'].str.upper()
for i in plats:
    data[i] = data['Played on'].apply(lambda x: 1 if str(i) in x else 0)

In [357]:
# calculate play - launch field
data['Play - launch'] = data['Play Year'] - data['Launch Year']
data['Play - launch'].fillna(0, inplace=True)
# calculate upgraded played field
data.loc[np.isnan(data['Remaster Reviewscore']) == False, 'Remaster played'] = 1
data['Remaster played'].fillna(0, inplace=True)
# calculate any multiplayer field
data['Key words'] = data['Key words'].str.upper()
data.loc[data['Key words'].str.contains('MULTIPLAYER') == True, 'Any multiplayer'] = 1
data['Any multiplayer'].fillna(0, inplace=True)
# calculate any racing field
data.loc[data['Key words'].str.contains('RACING') == True, 'Any racing'] = 1
data['Any racing'].fillna(0, inplace=True)
# calculate any sports field
data.loc[data['Key words'].str.contains('SPORTS') == True, 'Any sports'] = 1
data['Any sports'].fillna(0, inplace=True)
# calculate any simulation field
data.loc[data['Key words'].str.contains('SIMULATION') == True, 'Any simulation'] = 1
data['Any simulation'].fillna(0, inplace=True)
# calculate any strategy field
data.loc[data['Key words'].str.contains('STRATEGY') == True, 'Any strategy'] = 1
data['Any strategy'].fillna(0, inplace=True)


In [358]:
# convert the yes no fields to 1 or 0
data['DLC/ Major update played'] = data['DLC/ Major update played'].replace({'Yes': 1, 'No': 0})

In [359]:
# tidy missing values
data['Remaster Reviewscore'].fillna(0, inplace=True)

In [360]:
#have another look at the data
data

Unnamed: 0,Game,Rating,Reviewscore,Played on,Launch Year,Play Year,DLC/ Major update played,Publisher,Developer,Franchise,...,XB1,PS5,PS3,Play - launch,Remaster played,Any multiplayer,Any racing,Any sports,Any simulation,Any strategy
0,The Legend of Zelda: Tears of the Kingdom,100,96,SWITCH,2023,2023,0,Nintendo,Nintendo,The Legend of Zelda,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,The Legend of Zelda: Breath of the Wild,100,97,SWITCH,2017,2017,1,Nintendo,Nintendo,The Legend of Zelda,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,God of War (2018),99,94,PS4,2018,2018,0,Sony,Santa Monica Studio,God of War,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,The Last of Us,99,95,PS4,2013,2014,1,Sony,Naughty Dog,The Last of Us,...,0,0,0,1,1.0,0.0,0.0,0.0,0.0,0.0
4,Metroid Prime,99,97,SWITCH,2002,2023,0,Nintendo,Retro Studios,Metroid,...,0,0,0,21,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,Fifa 06 Game Boy Advance,44,49,GBA,2005,2005,0,EA,EA sports,Fifa,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
434,Cruis'n Blast,42,73,SWITCH,2021,2021,0,Indie,Raw Thrills,Cruise'n Blast,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
435,Fifa 14 Legacy Edition,38,38,VITA,2013,2013,0,EA,EA Sports,Fifa,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
436,Sunlight,34,78,PC,2021,2023,0,Indie,Krillbite Studio,Sunlight,...,0,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0


In [361]:
# prep for model
X = data.copy()
X.drop(['Rating', 'Game', 'Played on', 'Publisher', 'Developer', 'Franchise', 'Key words', 'Remaster Release Year', 'Developer2'], axis = 1, inplace = True)
y = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [367]:
#initialize and fit model
model = GradientBoostingRegressor(random_state=0)
model.fit(X_train, y_train)

In [369]:
# obtain feature importances
importance = model.feature_importances_
importances_df = pd.DataFrame([importance], columns = X.columns)
importances_df = importances_df.transpose()
importances_df.to_csv('../data/feature_importances.csv') # send importances to csv

In [378]:
# analyse performance using test set
test_labels = pd.merge(y_test, data['Game'], left_index= True, right_index = True) #join the game name back in to rating scores
test_scores = pd.DataFrame({'Game': test_labels['Game'], 'Predicted Rating':model.predict(X_test), 'Actual Rating': test_labels['Rating']}) #join the test set to the predictions
test_scores.to_csv('../data/test_scores_output.csv')

0.5854584801265793


In [379]:
# scatter plot of scores vs actual
fig = px.scatter(test_scores, x="Actual Rating", y="Predicted Rating", hover_name= 'Game')
fig

In [385]:
#output some actual model performance scores
print('Coefficient of determination - ', model.score(X_test, y_test))
print('Pearson R - ', pearsonr(y_test,model.predict(X_test)))

Coefficient of determination -  0.5854584801265793
Pearson R -  PearsonRResult(statistic=0.7746471268206356, pvalue=3.156324140228438e-23)


In [388]:
#obtain plot of predicted rating vs review score
all_labels = pd.merge(y, data['Game'], left_index= True, right_index = True) #join the game name back in to rating scores
scores_labels = pd.merge(all_labels, data['Reviewscore'], left_index= True, right_index = True) #join the review avg back in to rating scores
scores_comp = pd.DataFrame({'Game': scores_labels['Game'], 
                            'Predicted Rating': model.predict(X), 
                            'Actual Rating': scores_labels['Rating'], 
                            'Review avg': scores_labels['Reviewscore']}) #join the test set to the predictions
# scatter plot
fig = px.scatter(scores_comp, x="Review avg", y="Predicted Rating", hover_name= 'Game')
fig
