In [73]:
# initialisation
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import itertools
import joblib
from scipy.stats import *
from pandasgui import show
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [74]:
# Load the persisted model together with the pickled objects used for evaluation.
# NOTE(review): joblib.load / pd.read_pickle unpickle arbitrary objects -- only load trusted files.
model = joblib.load('gbrmodel.joblib')
data = pd.read_pickle('../data/processed_data.pkl')

# Features and targets: the test split first, then the X / y pair.
X_test, y_test = (pd.read_pickle(path) for path in ('../data/X_test.pkl', '../data/y_test.pkl'))
X, y = (pd.read_pickle(path) for path in ('../data/X.pkl', '../data/y.pkl'))

In [75]:
# Test-set performance table: game name, predicted rating and actual rating.

# Re-attach each game's name to its test rating via an index-on-index merge.
test_labels = pd.merge(y_test, data['Game'], left_index=True, right_index=True)

# NOTE(review): the predictions are passed in without an index (presumably a NumPy
# array), so they are paired with test_labels by position -- this assumes the merge
# keeps X_test's row order and row count; confirm.
test_predictions = model.predict(X_test)
test_scores = pd.DataFrame({
    'Game': test_labels['Game'],
    'Predicted Rating': test_predictions,
    'Actual Rating': test_labels['Rating'],
})

# Persist the table (index included) for use outside the notebook.
test_scores.to_csv('../data/test_scores_output.csv')

In [76]:
# Predicted vs. actual rating for the test set; hovering a point shows the game name.
fig = px.scatter(test_scores, x="Actual Rating", y="Predicted Rating", hover_name='Game')

# The dashed reference line starts at the smallest value found on either axis ...
min1, min2 = (min(test_scores[column]) for column in ("Actual Rating", "Predicted Rating"))
min3 = min(min1, min2)

# ... and ends at 100 (hard-coded here; presumably the top of the rating scale -- confirm).
reference_line = go.Scatter(x=[min3, 100], y=[min3, 100], line=dict(dash='dash'))
fig.add_trace(reference_line)

In [77]:
# Comparison table for the X / y data (not just the test split):
# game name, predicted rating, actual rating and review score.

# Two index-on-index merges pull 'Game' and then 'Reviewscore' from the processed data.
all_labels = pd.merge(y, data['Game'], left_index=True, right_index=True)
scores_labels = pd.merge(all_labels, data['Reviewscore'], left_index=True, right_index=True)

# NOTE(review): the predictions are passed in without an index (presumably a NumPy
# array), so they are paired with scores_labels by position -- this assumes the merges
# keep X's row order and row count; confirm.
all_predictions = model.predict(X)
scores_comp = pd.DataFrame({
    'Game': scores_labels['Game'],
    'Predicted Rating': all_predictions,
    'Actual Rating': scores_labels['Rating'],
    'Review avg': scores_labels['Reviewscore'],
})

In [78]:
# Predicted rating against the review average; hovering a point shows the game name.
# TODO: consider a chart type that shows the spread of predictions instead.
review_axes = dict(x="Review avg", y="Predicted Rating", hover_name='Game')
fig = px.scatter(scores_comp, **review_axes)
fig

In [79]:
# Prediction error (actual - predicted) plotted against the actual rating,
# with a dashed first-degree line of best fit.
scores_comp['Prediction delta'] = scores_comp['Actual Rating'].sub(scores_comp['Predicted Rating'])
fig = px.scatter(scores_comp, x="Actual Rating", y="Prediction delta", hover_name='Game')

# Degree-1 fit: lobf[0] is the gradient, lobf[1] the y-intercept.
lobf = np.polyfit(scores_comp["Actual Rating"], scores_comp["Prediction delta"], 1)
delta_trend = np.poly1d(lobf)
best_fit_points = delta_trend(scores_comp["Actual Rating"])

# Overlay the fitted values; the cell's last expression displays the figure.
trend_trace = go.Scatter(
    x=scores_comp["Actual Rating"],
    y=best_fit_points,
    line=dict(dash='dash'),
)
fig.add_trace(trend_trace)

In [80]:
lobf_gradient, lobf_intercept = lobf[0], lobf[1]
print("gradient = ", lobf_gradient, ", y-intercept = ", lobf_intercept)
# Reading the fit of (actual - predicted) against actual rating: a positive gradient
# with a negative intercept means predictions are too high for low actual scores and
# too low for high actual scores. Author's note: expected for linear regression.

gradient =  0.3295120899705617 , y-intercept =  -24.97446706559056


In [81]:
# Prediction error plotted against the review average, with a dashed first-degree
# line of best fit. The error is again actual - predicted, stored under its own column name.
scores_comp['Prediction delta rvw'] = scores_comp['Actual Rating'].sub(scores_comp['Predicted Rating'])
fig = px.scatter(scores_comp, x="Review avg", y="Prediction delta rvw", hover_name='Game')

# Degree-1 fit: lobf1[0] is the gradient, lobf1[1] the y-intercept.
lobf1 = np.polyfit(scores_comp["Review avg"], scores_comp["Prediction delta rvw"], 1)
review_trend = np.poly1d(lobf1)
best_fit_points1 = review_trend(scores_comp["Review avg"])

# Overlay the fitted values; the cell's last expression displays the figure.
review_trend_trace = go.Scatter(
    x=scores_comp["Review avg"],
    y=best_fit_points1,
    line=dict(dash='dash'),
)
fig.add_trace(review_trend_trace)

In [82]:
lobf1_gradient, lobf1_intercept = lobf1[0], lobf1[1]
print("gradient = ", lobf1_gradient, ", y-intercept = ", lobf1_intercept)
# Author's reading: no bias of the prediction error with review score
# (i.e. the fitted gradient and intercept are both close to zero).

gradient =  0.002102778498397109 , y-intercept =  -0.021268218827039845


In [83]:
# feature analysis
# Print each feature column of X paired with the model's per-feature weight.
# Linear estimators expose these as `coef_`; tree ensembles such as
# GradientBoostingRegressor have no `coef_` and expose `feature_importances_`
# instead, so fall back to that rather than failing with AttributeError.
feature_weights = getattr(model, 'coef_', None)
if feature_weights is None:
    feature_weights = getattr(model, 'feature_importances_', None)
if feature_weights is None:
    raise AttributeError(
        f"{type(model).__name__} exposes neither 'coef_' nor 'feature_importances_'"
    )

print(tuple(zip(X.columns, feature_weights)))

(('Reviewscore', 1.019110948118653), ('Launch Year', 0.07069003074139465), ('Play Year', -0.20743542049769237), ('DLC/ Major update played', 4.2518179611373945), ('Remaster Reviewscore', 0.02147749171610644), ('INTERACTIVE-STORY', -0.0342594420829347), ('LINEAR', 1.3465107657719506), ('COMPILATION', 0.17104290387143292), ('RACING (ARCADE)', -0.3224781953562501), ('STRATEGY (RTS)', -0.10799943632545606), ('SPORTS (FOOTBALL)', -0.212180656191922), ('OPEN-WORLD', -0.6654073742748533), ('RPG', -0.42142306230635207), ('TOP-DOWN', 0.39541326836364676), ('MONSTER COLLECTING', 0.9129206625453055), ('COLLECTATHON', 0.3397264947478086), ('SOULSLIKE', 1.258263751079561), ('EDUCATIONAL', -3.163954136304256e-14), ('STRATEGY (REAL-TIME)', 0.3351822828457781), ('SUPERHERO', -0.12708495449369495), ('RACING (SIMCADE)', 0.3267481136693211), ('BRAWLER', 0.21080089577640926), ('SPORTS (GOLF)', 0.4793030600958485), ('VR', -1.2897331517539832), ('ADVENTURE', -0.8663245876728121), ('STORY-DRIVEN', 2.83305593

[8.24814815e+01 2.01506838e+03 2.01782621e+03 1.42450142e-01
 5.52706553e+00 5.69800570e-03 8.54700855e-02 2.84900285e-03
 4.55840456e-02 2.84900285e-03 3.13390313e-02 1.53846154e-01
 2.10826211e-01 7.40740741e-02 2.84900285e-02 5.69800570e-03
 2.27920228e-02 0.00000000e+00 2.27920228e-02 1.99430199e-02
 3.98860399e-02 5.69800570e-03 8.54700855e-03 5.69800570e-03
 6.83760684e-02 1.90883191e-01 3.24786325e-01 1.13960114e-02
 5.98290598e-02 1.79487179e-01 0.00000000e+00 1.19658120e-01
 8.54700855e-03 0.00000000e+00 0.00000000e+00 2.19373219e-01
 2.84900285e-03 1.19658120e-01 5.98290598e-02 3.70370370e-02
 5.69800570e-03 0.00000000e+00 1.99430199e-02 5.12820513e-02
 7.97720798e-02 1.50997151e-01 8.83190883e-02 1.16809117e-01
 2.84900285e-03 5.69800570e-03 1.22507123e-01 5.69800570e-03
 2.84900285e-03 1.70940171e-02 2.84900285e-03 3.13390313e-02
 2.56410256e-02 0.00000000e+00 2.84900285e-03 4.84330484e-02
 2.84900285e-03 8.54700855e-03 0.00000000e+00 4.27350427e-02
 0.00000000e+00 8.547008