In [1]:
import re

import numpy as np
import pandas as pd
import scipy.stats as stats
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor

In [2]:
df = pd.read_csv('data/data.csv')
df = df.set_index('AppID')

df

Unnamed: 0_level_0,Score,name,recent_percent,recent_count,all_percent,all_count,short_desc,long_desc,tags
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
335300,6.0,DARK SOULS™ II: Scholar of the First Sin,83.0,735.0,86.0,46611.0,DARK SOULS™ II: Scholar of the First Sin bring...,Gamers are in for a big surprise in DARK SOULS...,"['Souls-like', 'Dark Fantasy', 'RPG', 'Difficu..."
374320,6.0,DARK SOULS™ III,94.0,2741.0,94.0,187094.0,Dark Souls continues to push the boundaries wi...,Get the DARK SOULS™ III Season Pass now and ch...,"['Souls-like', 'Dark Fantasy', 'Difficult', 'R..."
570940,6.0,DARK SOULS™: REMASTERED,92.0,742.0,86.0,30940.0,"Then, there was fire. Re-experience the critic...","Then, there was fire. Re-experience the critic...","['Souls-like', 'Education', 'Dark Fantasy', 'A..."
435150,6.0,Divinity: Original Sin 2 - Definitive Edition,93.0,2560.0,95.0,113478.0,The critically acclaimed RPG that raised the b...,The Divine is dead. The Void approaches. And t...,"['Tactical RPG', 'Exploration', 'Story Rich', ..."
427520,6.0,Factorio,98.0,1816.0,98.0,100653.0,Factorio is a game about building and creating...,is a game in which you build and maintain fact...,"['Automation', 'Base Building', 'Resource Mana..."
...,...,...,...,...,...,...,...,...,...
1283410,,Tails of Iron,89.0,1548.0,89.0,1548.0,Tails of Iron is an epic RPG Adventure with pu...,"Set in a grim land plagued by war, Tails of Ir...","['Action RPG', 'Adventure', 'Souls-like', 'Act..."
367500,,Dragon's Dogma: Dark Arisen,92.0,179.0,89.0,18804.0,"Set in a huge open world, Dragon’s Dogma: Dark...","Set in a huge open world, presents a rewarding...","['RPG', 'Open World', 'Character Customization..."
1481400,,Dagon: by H. P. Lovecraft,97.0,2509.0,97.0,2509.0,Face unspeakable horrors. Succumb to madness. ...,----------------------------------------------...,"['Horror', 'Lovecraftian', 'Atmospheric', 'Vis..."
1210320,,Potion Craft: Alchemist Simulator,92.0,2749.0,92.0,2749.0,Potion Craft is an alchemist simulator where y...,"Leaves, flowers, berries, roots, fruits, miner...","['Early Access', 'Singleplayer', 'Crafting', '..."


In [3]:
df = df[df['name'].notnull()]

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df

Unnamed: 0,column_name,percent_missing
Score,Score,90.341578
name,name,0.0
recent_percent,recent_percent,0.0
recent_count,recent_count,0.0
all_percent,all_percent,0.0
all_count,all_count,0.0
short_desc,short_desc,9.658422
long_desc,long_desc,1.531213
tags,tags,0.0


In [4]:
### Data Pre-process

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model.max_seq_length = 30

# Remove appids that were unable to pull information and fill null
df = df[df['name'].notnull()]

# Fill Null
df['short_desc'] = df['short_desc'].fillna('')
df['long_desc'] = df['short_desc'].fillna('')
df['tags'] = df['tags'].fillna('')

# Normalize ratings cols
df['recent_percent']=(df['recent_percent']-df['recent_percent'].mean())/df['recent_percent'].std()
df['recent_count']=(df['recent_count']-df['recent_count'].mean())/df['recent_count'].std()
df['all_percent']=(df['all_percent']-df['all_percent'].mean())/df['all_percent'].std()
df['all_count']=(df['all_count']-df['all_count'].mean())/df['all_count'].std()

# Process name
# Get name embeddings
df['name_emb'] = None
for idx, sentence in zip(df.index, df['name'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'name_emb'] = sentence_emb

# Explode name_emb to multiple cols
emb_len = df['name_emb'].values[0].shape[0]
emb_cols = [f'name_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['name_emb'].tolist(), index=df.index)
df = df.drop(['name_emb'], axis=1)

# Process short_desc
# Get short_desc embeddings
df['short_desc_emb'] = None
for idx, sentence in zip(df.index, df['short_desc'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'short_desc_emb'] = sentence_emb

# Explode short_desc_emb to multiple cols
emb_len = df['short_desc_emb'].values[0].shape[0]
emb_cols = [f'short_desc_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['short_desc_emb'].tolist(), index=df.index)
df = df.drop(['short_desc_emb'], axis=1)

# Process long_desc
# Get long_desc embeddings
df['long_desc_emb'] = None
for idx, sentence in zip(df.index, df['long_desc'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'long_desc_emb'] = sentence_emb

# Explode short_desc_emb to multiple cols
emb_len = df['long_desc_emb'].values[0].shape[0]
emb_cols = [f'long_desc_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['long_desc_emb'].tolist(), index=df.index)
df = df.drop(['long_desc_emb'], axis=1)

# Process tags
# Get tags embeddings
df['tags_emb'] = None
for idx, sentence in zip(df.index, df['tags'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'tags_emb'] = sentence_emb

# Explode tags to multiple cols
emb_len = df['tags_emb'].values[0].shape[0]
emb_cols = [f'tags_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['tags_emb'].tolist(), index=df.index)
df = df.drop(['tags_emb'], axis=1)

# Drop unneeded cols
df_proc = df.drop(['name', 'short_desc', 'long_desc', 'tags'], axis=1)

df_proc

  self[k1] = value[k2]


Unnamed: 0_level_0,Score,recent_percent,recent_count,all_percent,all_count,name_emb_0,name_emb_1,name_emb_2,name_emb_3,name_emb_4,...,tags_emb_374,tags_emb_375,tags_emb_376,tags_emb_377,tags_emb_378,tags_emb_379,tags_emb_380,tags_emb_381,tags_emb_382,tags_emb_383
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
335300,6.0,0.039777,-0.060399,0.184392,0.046743,-0.205885,0.530513,0.019095,0.098417,-0.168850,...,0.367525,0.402845,0.248234,0.347452,-0.244038,0.217287,0.596662,-0.185484,0.025290,0.248362
374320,6.0,0.630355,0.424180,0.634407,0.678312,-0.425299,0.070268,0.467150,-0.283747,-0.155813,...,-0.007539,0.256561,0.118918,0.446231,-0.118015,-0.008100,0.701982,-0.163791,-0.200916,0.074816
570940,6.0,0.522977,-0.058709,0.184392,-0.023709,-0.411845,0.084661,0.643239,-0.140084,0.178791,...,0.318000,0.147942,0.137227,0.104347,-0.584267,0.060784,0.632545,-0.213510,0.025605,0.109319
435150,6.0,0.576666,0.380456,0.690659,0.347357,-0.083358,0.879001,0.190073,-0.143532,-0.288290,...,0.749291,0.440631,-0.025958,0.194535,-0.064091,0.150000,-0.038542,0.379924,-0.027450,0.074269
427520,6.0,0.845110,0.200732,0.859414,0.289700,-0.203325,-0.352310,0.348856,0.072503,0.431592,...,0.598158,0.255851,-0.037133,-0.383200,0.143489,0.491210,-0.059223,-0.007958,0.271041,0.497386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283410,,0.361910,0.135993,0.353148,-0.155847,-0.872054,0.652232,0.284597,0.426859,-0.615087,...,0.357731,0.420448,0.082633,0.339275,0.304262,0.160785,0.563766,-0.053411,-0.139348,0.180551
367500,,0.522977,-0.194710,0.353148,-0.078269,-0.070725,-0.392730,0.143116,0.816427,-0.205678,...,0.335243,0.437943,-0.019936,0.203400,0.053612,0.318893,0.228779,-0.026610,-0.135828,0.252787
1481400,,0.791422,0.368137,0.803162,-0.151526,-0.445134,0.463027,0.055783,-0.008295,0.002106,...,0.460440,0.226008,0.429912,0.420732,-0.389293,0.259608,0.723123,-0.323522,-0.453848,-0.061351
1210320,,0.522977,0.426112,0.521903,-0.150447,-0.145114,0.433815,-0.178932,0.141523,-0.502249,...,0.639029,0.421100,0.033655,0.116816,-0.047695,0.274811,0.128277,0.361566,-0.034780,0.601590


In [5]:
### ML Model

# data split
df_model = df_proc[df_proc['Score'].notnull()]
X = df_model.drop(['Score'], axis=1)
y = df_model['Score']

df_pred = df_proc[df_proc['Score'].isnull()]
X_pred = df_pred.drop(['Score'], axis=1)

# Fit Model
model = XGBRegressor(
    max_depth=4,  # 32
    n_estimators=354,  # 250
    objective='reg:squarederror',
    random_state=42,
    verbosity=0,
    n_jobs=-1)
model.fit(X, y)

# Get Cross Val Score
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
print(f' Avg. MSE: {scores.mean():0.4f} (+/- {scores.std():0.4f})')

# pred
y_pred = model.predict(X_pred)

df_pred = pd.DataFrame({
    'AppID': X_pred.index.values,
    'Pred Score': y_pred
}).sort_values('Pred Score', ascending=False).set_index('AppID')
df_pred = df_pred.join(df[['name']], how='left')
df_pred = df_pred[['name', 'Pred Score']]

 Avg. MSE: -2.2944 (+/- 1.6671)


In [7]:
"""
504230	Celeste	5.581610
236430	DARK SOULS™ II	5.548808
22380	Fallout: New Vegas	5.480839
256290	Child of Light	5.277474
1030300	Hollow Knight: Silksong	5.207141
594650	Hunt: Showdown	5.200999
489520	Minion Masters	5.153164
704450	Neverwinter Nights: Enhanced Edition	5.141562
954740	Terminator: Resistance	5.114006
632470	Disco Elysium - The Final Cut	5.104737
288470	Fable Anniversary	5.088338
230230	Divinity: Original Sin (Classic)	5.075427
207170	Legend of Grimrock	5.075035
1086940	Baldur's Gate 3	5.059088
307690	Sleeping Dogs: Definitive Edition	5.048236
610180	The Jackbox Party Pack 4	5.042070
1160220	Paradise Killer	5.023338
1218500	Galactic Mining Corp	5.018690
1281930	tModLoader	5.016625
1369630	ENDER LILIES: Quietus of the Knights	5.014005
234390	Teleglitch: Die More Edition	4.997182
383870	Firewatch	4.959198
377840	FINAL FANTASY IX	4.955317
774461	The Jackbox Party Pack 5	4.934936
1284410	GWENT: The Witcher Card Game	4.933845
"""

df_pred.head(25)

Unnamed: 0_level_0,name,Pred Score
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1
377160,Fallout 4,5.414262
221380,Age of Empires II (2013),5.362202
720620,TaleSpire,5.261229
253230,A Hat in Time,5.117088
1517290,Battlefield™ 2042,5.046405
1281930,tModLoader,5.041458
34330,Total War: SHOGUN 2,5.038711
257350,Baldur's Gate II: Enhanced Edition,5.025123
552500,Warhammer: Vermintide 2,5.021749
204360,Castle Crashers®,5.020782


In [8]:
df_pred[df_pred['name'].str.contains('Celeste')]  # Picked - Pred: 5.6, Steam Rank 11

Unnamed: 0_level_0,name,Pred Score
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1
504230,Celeste,4.435533


In [9]:
for i , (col, imp) in enumerate(sorted(zip(X_pred.columns, model.feature_importances_), key=lambda x: x[1], reverse=True)[:50]):
    print(f'{i+1}:\t {col} - {imp}')

1:	 short_desc_emb_281 - 0.18665967881679535
2:	 short_desc_emb_327 - 0.11323479562997818
3:	 name_emb_286 - 0.11280108988285065
4:	 name_emb_36 - 0.08334245532751083
5:	 name_emb_185 - 0.07473339140415192
6:	 name_emb_206 - 0.03661862760782242
7:	 short_desc_emb_300 - 0.036281660199165344
8:	 all_count - 0.02914009802043438
9:	 name_emb_45 - 0.02824324555695057
10:	 short_desc_emb_191 - 0.027385303750634193
11:	 short_desc_emb_331 - 0.02246381714940071
12:	 short_desc_emb_202 - 0.021862972527742386
13:	 name_emb_229 - 0.01981320232152939
14:	 short_desc_emb_274 - 0.019718211144208908
15:	 name_emb_208 - 0.015994984656572342
16:	 tags_emb_76 - 0.015754524618387222
17:	 tags_emb_295 - 0.012575052678585052
18:	 name_emb_218 - 0.012003966607153416
19:	 name_emb_189 - 0.008926288224756718
20:	 name_emb_231 - 0.008072699420154095
21:	 tags_emb_352 - 0.0073201823979616165
22:	 short_desc_emb_72 - 0.007291710935533047
23:	 name_emb_311 - 0.007240083534270525
24:	 name_emb_21 - 0.0069947009906

In [10]:
## Analysis

In [11]:
df = pd.read_excel('/mnt/c/Users/mcmin/gdrive/video_games/reviews/reviews_and_wishlist.xlsx', skiprows=2)

df

Unnamed: 0,Rank,Game,AppID,Platform,Year(s) Played,Score,GoG Rating,Unnamed: 7,0.2722682126,Unnamed: 9,Unnamed: 10,0.7277317874,Unnamed: 12,Unnamed: 13
0,1,Dark Souls II: Scholar of the First Sin,335300.0,PC,2020.0,6.0,5.0,,,83,4.175,1.205,,
1,2,Dark Souls III,374320.0,PC,2020.0,6.0,5.0,,Year,# of Games,Avg Rating,SD Rating,,
2,3,Dark Souls Remastered,570940.0,PC,2020.0,6.0,5.0,,2020.0,58,4.31,1.2,,
3,4,Divinity: Original Sin 2,435150.0,PC,2020.0,6.0,5.0,,2021.0,25,4.04,1.21,,
4,5,Factorio,427520.0,PC,2020.0,6.0,5.0,,2022.0,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,79,Outward,794260.0,PC,2021.0,2.0,2.0,,,,,,,
79,80,Phasmophobia,739630.0,PC,2021.0,2.0,2.0,,,,,,,
80,81,Superbrothers: Sword & Sworcery EP,204060.0,PC,2020.0,2.0,2.0,,,,,,,
81,82,War Thunder,236390.0,PC,2020.0,2.0,2.0,,,,,,,


In [None]:
## Hyper Opt Testing

from hyperopt import tpe, hp, fmin

def objective(params):
    MAX_SEQ_LENGTH = params['max_seq_length']
    MAX_DEPTH = params['max_depth']
    N_ESTIMATORS = params['n_estimators']
    
    df = pd.read_csv('data/data.csv')
    df = df.set_index('AppID')
    
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    model.max_seq_length = MAX_SEQ_LENGTH

    # Remove appids that were unable to pull information and fill null
    df = df[df['name'].notnull()]

    # Fill Null
    df['short_desc'] = df['short_desc'].fillna('')
    df['long_desc'] = df['short_desc'].fillna('')
    df['tags'] = df['tags'].fillna('')

    # Normalize ratings cols
    df['recent_percent']=(df['recent_percent']-df['recent_percent'].mean())/df['recent_percent'].std()
    df['recent_count']=(df['recent_count']-df['recent_count'].mean())/df['recent_count'].std()
    df['all_percent']=(df['all_percent']-df['all_percent'].mean())/df['all_percent'].std()
    df['all_count']=(df['all_count']-df['all_count'].mean())/df['all_count'].std()

    # Process name
    # Get name embeddings
    df['name_emb'] = None
    for idx, sentence in zip(df.index, df['name'].values):
        sentence_emb = model.encode(sentence)
        df.at[idx, 'name_emb'] = sentence_emb

    # Explode name_emb to multiple cols
    emb_len = df['name_emb'].values[0].shape[0]
    emb_cols = [f'name_emb_{i}' for i in range(0, emb_len)]
    df[emb_cols] = pd.DataFrame(df['name_emb'].tolist(), index=df.index)
    df = df.drop(['name_emb'], axis=1)

    # Process short_desc
    # Get short_desc embeddings
    df['short_desc_emb'] = None
    for idx, sentence in zip(df.index, df['short_desc'].values):
        sentence_emb = model.encode(sentence)
        df.at[idx, 'short_desc_emb'] = sentence_emb

    # Explode short_desc_emb to multiple cols
    emb_len = df['short_desc_emb'].values[0].shape[0]
    emb_cols = [f'short_desc_emb_{i}' for i in range(0, emb_len)]
    df[emb_cols] = pd.DataFrame(df['short_desc_emb'].tolist(), index=df.index)
    df = df.drop(['short_desc_emb'], axis=1)

    # Process long_desc
    # Get long_desc embeddings
    df['long_desc_emb'] = None
    for idx, sentence in zip(df.index, df['long_desc'].values):
        sentence_emb = model.encode(sentence)
        df.at[idx, 'long_desc_emb'] = sentence_emb

    # Explode short_desc_emb to multiple cols
    emb_len = df['long_desc_emb'].values[0].shape[0]
    emb_cols = [f'long_desc_emb_{i}' for i in range(0, emb_len)]
    df[emb_cols] = pd.DataFrame(df['long_desc_emb'].tolist(), index=df.index)
    df = df.drop(['long_desc_emb'], axis=1)

    # Process tags
    # Get tags embeddings
    df['tags_emb'] = None
    for idx, sentence in zip(df.index, df['tags'].values):
        sentence_emb = model.encode(sentence)
        df.at[idx, 'tags_emb'] = sentence_emb

    # Explode tags to multiple cols
    emb_len = df['tags_emb'].values[0].shape[0]
    emb_cols = [f'tags_emb_{i}' for i in range(0, emb_len)]
    df[emb_cols] = pd.DataFrame(df['tags_emb'].tolist(), index=df.index)
    df = df.drop(['tags_emb'], axis=1)

    # Drop unneeded cols
    df_proc = df.drop(['name', 'short_desc', 'long_desc', 'tags'], axis=1).copy()
    
    # data split
    df_model = df_proc[df_proc['Score'].notnull()].copy()
    X = df_model.drop(['Score'], axis=1).copy()
    y = df_model['Score'].copy()

    df_pred = df_proc[df_proc['Score'].isnull()].copy()
    X_pred = df_pred.drop(['Score'], axis=1).copy()

    # Fit Model
    model = XGBRegressor(
        max_depth=MAX_DEPTH,
        n_estimators=N_ESTIMATORS,
        objective='reg:squarederror',
        random_state=42,
        verbosity=0,
        n_jobs=-1)
    model.fit(X, y)

    # Get Cross Val Score
    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    
    return scores.mean()*-1


space = {
    'max_seq_length': hp.randint('max_seq_length', 20, 40),
    'max_depth': hp.randint('max_depth', 1, 8),
    'n_estimators': hp.randint('n_estimators', 200, 500)
}


best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=60
)

print(best)