In [28]:
import re

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor

In [29]:
df = pd.read_csv('data/data.csv')
df = df.set_index('AppID')

df

Unnamed: 0_level_0,Score,name,recent_percent,recent_count,all_percent,all_count,short_desc,long_desc,tags
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
335300,6.0,DARK SOULS™ II: Scholar of the First Sin,84.0,704.0,86.0,42211.0,DARK SOULS™ II: Scholar of the First Sin bring...,Gamers are in for a big surprise in DARK SOULS...,"['Souls-like', 'Dark Fantasy', 'RPG', 'Difficu..."
374320,6.0,DARK SOULS™ III,93.0,2870.0,93.0,170504.0,Dark Souls continues to push the boundaries wi...,Get the DARK SOULS™ III Season Pass now and ch...,"['Souls-like', 'Dark Fantasy', 'Difficult', 'R..."
570940,6.0,DARK SOULS™: REMASTERED,92.0,720.0,85.0,28282.0,"Then, there was fire. Re-experience the critic...","Then, there was fire. Re-experience the critic...","['Souls-like', 'Education', 'Dark Fantasy', 'A..."
435150,6.0,Divinity: Original Sin 2 - Definitive Edition,95.0,1780.0,95.0,106442.0,The critically acclaimed RPG that raised the b...,The Divine is dead. The Void approaches. And t...,"['Exploration', 'Tactical RPG', 'Story Rich', ..."
427520,6.0,Factorio,98.0,1305.0,98.0,93926.0,Factorio is a game about building and creating...,is a game in which you build and maintain fact...,"['Automation', 'Base Building', 'Resource Mana..."
...,...,...,...,...,...,...,...,...,...
300550,,Shadowrun: Dragonfall - Director's Cut,89.0,28.0,89.0,4273.0,Harebrained Schemes' biggest Shadowrun game to...,is a standalone release of Harebrained Schemes...,"['RPG', 'Cyberpunk', 'Turn-Based', 'Story Rich..."
1123770,,Curse of the Dead Gods,82.0,85.0,87.0,3653.0,"You seek untold riches, eternal life, divine p...","You seek untold riches, eternal life, divine p...","['Action Roguelike', 'Dark Fantasy', 'Isometri..."
1656220,,Lost Ark Platinum Founder's Pack,0.0,0.0,0.0,0.0,,Enjoy Lost Ark's launch in luxurious comfort w...,"['Action', 'Adventure', 'Free to Play', 'Massi..."
1599340,,Lost Ark,95.0,420.0,95.0,420.0,Embark on an odyssey for the Lost Ark in a vas...,Embark on an odyssey for the Lost Ark in a vas...,"['Action', 'RPG', 'MMORPG', 'Adventure', 'Acti..."


In [30]:
df = df[df['name'].notnull()]

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df

Unnamed: 0,column_name,percent_missing
Score,Score,91.286863
name,name,0.0
recent_percent,recent_percent,0.0
recent_count,recent_count,0.0
all_percent,all_percent,0.0
all_count,all_count,0.0
short_desc,short_desc,10.723861
long_desc,long_desc,1.608579
tags,tags,0.0


In [32]:
### Data Pre-process

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model.max_seq_length = 10000

# Remove appids that were unable to pull information and fill null
df = df[df['name'].notnull()]

# Fill Null
df['short_desc'] = df['short_desc'].fillna('')
df['long_desc'] = df['short_desc'].fillna('')
df['tags'] = df['tags'].fillna('')

# Normalize ratings cols
df['recent_percent']=(df['recent_percent']-df['recent_percent'].mean())/df['recent_percent'].std()
df['recent_count']=(df['recent_count']-df['recent_count'].mean())/df['recent_count'].std()
df['all_percent']=(df['all_percent']-df['all_percent'].mean())/df['all_percent'].std()
df['all_count']=(df['all_count']-df['all_count'].mean())/df['all_count'].std()

# Process name
# Get name embeddings
df['name_emb'] = None
for idx, sentence in zip(df.index, df['name'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'name_emb'] = sentence_emb

# Explode name_emb to multiple cols
emb_len = df['name_emb'].values[0].shape[0]
emb_cols = [f'name_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['name_emb'].tolist(), index=df.index)
df = df.drop(['name_emb'], axis=1)

# Process short_desc
# Get short_desc embeddings
df['short_desc_emb'] = None
for idx, sentence in zip(df.index, df['short_desc'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'short_desc_emb'] = sentence_emb

# Explode short_desc_emb to multiple cols
emb_len = df['short_desc_emb'].values[0].shape[0]
emb_cols = [f'short_desc_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['short_desc_emb'].tolist(), index=df.index)
df = df.drop(['short_desc_emb'], axis=1)

# Process long_desc
# Get long_desc embeddings
df['long_desc_emb'] = None
for idx, sentence in zip(df.index, df['long_desc'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'long_desc_emb'] = sentence_emb

# Explode short_desc_emb to multiple cols
emb_len = df['long_desc_emb'].values[0].shape[0]
emb_cols = [f'long_desc_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['long_desc_emb'].tolist(), index=df.index)
df = df.drop(['long_desc_emb'], axis=1)

# Process tags
# Get tags embeddings
df['tags_emb'] = None
for idx, sentence in zip(df.index, df['tags'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'tags_emb'] = sentence_emb

# Explode tags to multiple cols
emb_len = df['tags_emb'].values[0].shape[0]
emb_cols = [f'tags_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['tags_emb'].tolist(), index=df.index)
df = df.drop(['tags_emb'], axis=1)

# Drop unneeded cols
df_proc = df.drop(['name', 'short_desc', 'long_desc', 'tags'], axis=1)

df_proc

Unnamed: 0_level_0,Score,recent_percent,recent_count,all_percent,all_count,name_emb_0,name_emb_1,name_emb_2,name_emb_3,name_emb_4,...,tags_emb_374,tags_emb_375,tags_emb_376,tags_emb_377,tags_emb_378,tags_emb_379,tags_emb_380,tags_emb_381,tags_emb_382,tags_emb_383
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
335300,6.0,0.135476,-0.067126,0.192202,0.021239,-0.205885,0.530514,0.019095,0.098417,-0.168850,...,0.115262,0.345862,0.314601,0.397895,0.191249,0.155089,0.538477,-0.224874,-0.174770,0.102378
374320,6.0,0.594169,0.467825,0.563386,0.590301,-0.425300,0.070268,0.467150,-0.283747,-0.155813,...,0.116821,0.334152,0.317752,0.427623,0.259606,0.187720,0.463271,-0.159190,-0.183715,0.048535
570940,6.0,0.543203,-0.063174,0.139176,-0.040546,-0.411845,0.084661,0.643239,-0.140084,0.178792,...,0.225787,0.307348,0.386953,0.379711,0.068206,0.046789,0.564373,-0.161320,-0.144110,0.041049
435150,6.0,0.696101,0.198621,0.669438,0.306145,-0.083358,0.879001,0.190073,-0.143532,-0.288290,...,0.347019,0.380082,0.186806,0.406189,0.202399,0.070317,0.111917,0.159146,0.006333,0.042420
427520,6.0,0.848999,0.081307,0.828517,0.250628,-0.203325,-0.352310,0.348856,0.072503,0.431592,...,0.356976,0.407364,0.030227,-0.060064,0.199887,0.386202,-0.035973,-0.105921,0.050927,0.349570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300550,,0.390306,-0.234082,0.351281,-0.147041,-0.056121,-0.387639,0.244720,-0.006604,0.354779,...,0.284871,0.598091,0.153172,0.397115,-0.090651,0.215774,0.145057,-0.100049,0.025129,0.260629
1123770,,0.033545,-0.220004,0.245228,-0.149791,-0.134340,0.665992,0.176863,-0.211233,-0.215129,...,0.247612,0.505186,0.259316,0.296274,-0.013975,0.291900,0.476312,-0.227017,0.209345,0.065049
1656220,,-4.145658,-0.240997,-4.368051,-0.165995,-0.504908,0.580813,0.138195,0.232517,0.339013,...,0.267895,0.377920,0.187860,0.376677,0.092930,0.217142,0.113441,-0.224005,0.028210,0.128886
1599340,,0.696101,-0.137267,0.669438,-0.164132,0.179558,0.133478,0.584903,0.291661,0.171639,...,0.259837,0.393009,0.190811,0.325813,0.104222,0.272540,0.120247,-0.210933,0.116108,0.048274


In [40]:
### ML Model

# data split
df_model = df_proc[df_proc['Score'].notnull()]
X = df_model.drop(['Score'], axis=1)
y = df_model['Score']

df_pred = df_proc[df_proc['Score'].isnull()]
X_pred = df_pred.drop(['Score'], axis=1)

# model
objective = ['reg:squarederror']
max_depth = [int(x) for x in np.linspace(25, 50, num=5) if x != 0]
n_estimators = [int(x) for x in np.linspace(50, 250, num=10) if x != 0]
random_grid = {
    'objective': objective,
    'max_depth': max_depth,
    'n_estimators': n_estimators,
}
model = XGBRegressor(random_state=42, verbosity=0, n_jobs=-1)
search = GridSearchCV(estimator=model, param_grid=random_grid, scoring='neg_mean_squared_error', cv=3, verbose=1, n_jobs=-1)
search.fit(X, y)
print(search.best_params_)

# Fit Model
model = XGBRegressor(**search.best_params_, random_state=42, verbosity=0, n_jobs=-1)
model.fit(X, y)

# Get Cross Val Score
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
print(f' Avg. MSE: {scores.mean():0.4f} (+/- {scores.std():0.4f})')

# pred
y_pred = model.predict(X_pred)

df_pred = pd.DataFrame({
    'AppID': X_pred.index.values,
    'Pred Score': y_pred
}).sort_values('Pred Score', ascending=False).set_index('AppID')
df_pred = df_pred.join(df[['name']], how='left')
df_pred = df_pred[['name', 'Pred Score']]

df_pred

Fitting 3 folds for each of 50 candidates, totalling 150 fits
{'max_depth': 25, 'n_estimators': 72, 'objective': 'reg:squarederror'}
 Avg. MSE: -2.3128 (+/- 1.4824)


Unnamed: 0_level_0,name,Pred Score
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1
640820,Pathfinder: Kingmaker - Enhanced Plus Edition,5.656024
632470,Disco Elysium - The Final Cut,5.580381
1139900,Ghostrunner,5.534202
373420,Divinity: Original Sin - Enhanced Edition,5.524318
22330,The Elder Scrolls IV: Oblivion® Game of the Ye...,5.486963
...,...,...
1418630,Dread Hunger,2.844273
601840,Griftlands,2.745710
848450,Subnautica: Below Zero,2.740181
433080,XCOM 2: Anarchy's Children,2.716713


In [41]:
"""
No Name
78	588650	5.660944 - Dead Cells
411	288470	5.444581 - Fable Anniversy
642	1174180	5.382903 - RDR2
185	550	5.309083     - L4D2
376	17460	5.304587 - Mass Effect

Name
576	976730	5.124917 - Halo: MCC
627	1174180	4.943077 - RDR2
626	1172620	4.901459 - Sea of Thieves
334	391540	4.901459 - Undertale
535	812140	4.883088 - AC: Odyssey
Avg. MSE: -1.9802 (+/- 1.4576)
"""

'\nNo Name\n78\t588650\t5.660944 - Dead Cells\n411\t288470\t5.444581 - Fable Anniversy\n642\t1174180\t5.382903 - RDR2\n185\t550\t5.309083     - L4D2\n376\t17460\t5.304587 - Mass Effect\n\nName\n576\t976730\t5.124917 - Halo: MCC\n627\t1174180\t4.943077 - RDR2\n626\t1172620\t4.901459 - Sea of Thieves\n334\t391540\t4.901459 - Undertale\n535\t812140\t4.883088 - AC: Odyssey\nAvg. MSE: -1.9802 (+/- 1.4576)\n'

In [42]:
for i , (col, imp) in enumerate(sorted(zip(X_pred.columns, model.feature_importances_), key=lambda x: x[1], reverse=True)):
    print(f'{i+1}:\t {col} - {imp}')

1:	 short_desc_emb_291 - 0.21761205792427063
2:	 name_emb_583 - 0.12712204456329346
3:	 short_desc_emb_281 - 0.11921712011098862
4:	 name_emb_343 - 0.0727023109793663
5:	 name_emb_45 - 0.05890677124261856
6:	 name_emb_289 - 0.052350860089063644
7:	 name_emb_732 - 0.04873856157064438
8:	 name_emb_658 - 0.037858348339796066
9:	 name_emb_281 - 0.026585353538393974
10:	 name_emb_120 - 0.024303916841745377
11:	 short_desc_emb_571 - 0.020129768177866936
12:	 tags_emb_286 - 0.016809502616524696
13:	 name_emb_744 - 0.01641199178993702
14:	 tags_emb_25 - 0.013854063116014004
15:	 tags_emb_211 - 0.01361350528895855
16:	 short_desc_emb_643 - 0.011295730248093605
17:	 name_emb_717 - 0.010061802342534065
18:	 name_emb_229 - 0.00984121672809124
19:	 short_desc_emb_530 - 0.008785940706729889
20:	 name_emb_12 - 0.00877720769494772
21:	 short_desc_emb_182 - 0.0076879700645804405
22:	 short_desc_emb_412 - 0.007244313135743141
23:	 short_desc_emb_114 - 0.007057476323097944
24:	 name_emb_142 - 0.006797822

1462:	 short_desc_emb_661 - 0.0
1463:	 short_desc_emb_662 - 0.0
1464:	 short_desc_emb_663 - 0.0
1465:	 short_desc_emb_665 - 0.0
1466:	 short_desc_emb_666 - 0.0
1467:	 short_desc_emb_667 - 0.0
1468:	 short_desc_emb_668 - 0.0
1469:	 short_desc_emb_669 - 0.0
1470:	 short_desc_emb_670 - 0.0
1471:	 short_desc_emb_671 - 0.0
1472:	 short_desc_emb_672 - 0.0
1473:	 short_desc_emb_673 - 0.0
1474:	 short_desc_emb_674 - 0.0
1475:	 short_desc_emb_675 - 0.0
1476:	 short_desc_emb_676 - 0.0
1477:	 short_desc_emb_677 - 0.0
1478:	 short_desc_emb_678 - 0.0
1479:	 short_desc_emb_679 - 0.0
1480:	 short_desc_emb_680 - 0.0
1481:	 short_desc_emb_681 - 0.0
1482:	 short_desc_emb_682 - 0.0
1483:	 short_desc_emb_683 - 0.0
1484:	 short_desc_emb_684 - 0.0
1485:	 short_desc_emb_685 - 0.0
1486:	 short_desc_emb_686 - 0.0
1487:	 short_desc_emb_687 - 0.0
1488:	 short_desc_emb_688 - 0.0
1489:	 short_desc_emb_689 - 0.0
1490:	 short_desc_emb_690 - 0.0
1491:	 short_desc_emb_691 - 0.0
1492:	 short_desc_emb_692 - 0.0
1493:	 s