In [6]:
import re

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor

In [7]:
df = pd.read_csv('data/data.csv')
df = df.set_index('AppID')

df

Unnamed: 0_level_0,Score,name,recent_percent,recent_count,all_percent,all_count,short_desc,long_desc,tags
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
335300,6.0,DARK SOULS™ II: Scholar of the First Sin,83.0,1886.0,86.0,43864.0,DARK SOULS™ II: Scholar of the First Sin bring...,Gamers are in for a big surprise in DARK SOULS...,"['Souls-like', 'Dark Fantasy', 'RPG', 'Difficu..."
374320,6.0,DARK SOULS™ III,94.0,7339.0,94.0,176926.0,Dark Souls continues to push the boundaries wi...,Get the DARK SOULS™ III Season Pass now and ch...,"['Souls-like', 'Dark Fantasy', 'Difficult', 'R..."
570940,6.0,DARK SOULS™: REMASTERED,93.0,852.0,85.0,28933.0,"Then, there was fire. Re-experience the critic...","Then, there was fire. Re-experience the critic...","['Souls-like', 'Education', 'Dark Fantasy', 'A..."
435150,6.0,Divinity: Original Sin 2 - Definitive Edition,94.0,2043.0,95.0,108079.0,The critically acclaimed RPG that raised the b...,The Divine is dead. The Void approaches. And t...,"['Tactical RPG', 'Exploration', 'Story Rich', ..."
427520,6.0,Factorio,98.0,2838.0,98.0,96196.0,Factorio is a game about building and creating...,is a game in which you build and maintain fact...,"['Automation', 'Base Building', 'Resource Mana..."
...,...,...,...,...,...,...,...,...,...
1536610,,OpenTTD,94.0,267.0,95.0,2812.0,OpenTTD is a business simulation game in which...,OpenTTD is a business simulation game in which...,"['Simulation', 'Trains', 'Building', 'Sandbox'..."
1569090,,Vivid Knight,92.0,830.0,92.0,830.0,Vivid Knight is a roguelike adventure game in ...,Explore a dungeon that changes with every play...,"['Roguelike Deckbuilder', 'Difficult', 'Choice..."
1582510,,Mercenaries Blaze,75.0,150.0,75.0,150.0,“Mercenaries Blaze”is the 5th fantasy tactical...,A story about a corrupt kingdom and a fight fo...,"['RPG', 'Strategy', 'Turn-Based Tactics', 'Str..."
1599340,,Lost Ark,97.0,1083.0,97.0,1083.0,Embark on an odyssey for the Lost Ark in a vas...,Embark on an odyssey for the Lost Ark in a vas...,"['Action', 'RPG', 'MMORPG', 'Action RPG', 'Adv..."


In [8]:
df = df[df['name'].notnull()]

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df

Unnamed: 0,column_name,percent_missing
Score,Score,91.076115
name,name,0.0
recent_percent,recent_percent,0.0
recent_count,recent_count,0.0
all_percent,all_percent,0.0
all_count,all_count,0.0
short_desc,short_desc,10.629921
long_desc,long_desc,1.83727
tags,tags,0.0


In [9]:
### Data Pre-process

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model.max_seq_length = 10000

# Remove appids that were unable to pull information and fill null
df = df[df['name'].notnull()]

# Fill Null
df['short_desc'] = df['short_desc'].fillna('')
df['long_desc'] = df['short_desc'].fillna('')
df['tags'] = df['tags'].fillna('')

# Normalize ratings cols
df['recent_percent']=(df['recent_percent']-df['recent_percent'].mean())/df['recent_percent'].std()
df['recent_count']=(df['recent_count']-df['recent_count'].mean())/df['recent_count'].std()
df['all_percent']=(df['all_percent']-df['all_percent'].mean())/df['all_percent'].std()
df['all_count']=(df['all_count']-df['all_count'].mean())/df['all_count'].std()

# Process name
# Get name embeddings
df['name_emb'] = None
for idx, sentence in zip(df.index, df['name'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'name_emb'] = sentence_emb

# Explode name_emb to multiple cols
emb_len = df['name_emb'].values[0].shape[0]
emb_cols = [f'name_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['name_emb'].tolist(), index=df.index)
df = df.drop(['name_emb'], axis=1)

# Process short_desc
# Get short_desc embeddings
df['short_desc_emb'] = None
for idx, sentence in zip(df.index, df['short_desc'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'short_desc_emb'] = sentence_emb

# Explode short_desc_emb to multiple cols
emb_len = df['short_desc_emb'].values[0].shape[0]
emb_cols = [f'short_desc_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['short_desc_emb'].tolist(), index=df.index)
df = df.drop(['short_desc_emb'], axis=1)

# Process long_desc
# Get long_desc embeddings
df['long_desc_emb'] = None
for idx, sentence in zip(df.index, df['long_desc'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'long_desc_emb'] = sentence_emb

# Explode short_desc_emb to multiple cols
emb_len = df['long_desc_emb'].values[0].shape[0]
emb_cols = [f'long_desc_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['long_desc_emb'].tolist(), index=df.index)
df = df.drop(['long_desc_emb'], axis=1)

# Process tags
# Get tags embeddings
df['tags_emb'] = None
for idx, sentence in zip(df.index, df['tags'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'tags_emb'] = sentence_emb

# Explode tags to multiple cols
emb_len = df['tags_emb'].values[0].shape[0]
emb_cols = [f'tags_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['tags_emb'].tolist(), index=df.index)
df = df.drop(['tags_emb'], axis=1)

# Drop unneeded cols
df_proc = df.drop(['name', 'short_desc', 'long_desc', 'tags'], axis=1)

df_proc

  self[k1] = value[k2]


Unnamed: 0_level_0,Score,recent_percent,recent_count,all_percent,all_count,name_emb_0,name_emb_1,name_emb_2,name_emb_3,name_emb_4,...,tags_emb_374,tags_emb_375,tags_emb_376,tags_emb_377,tags_emb_378,tags_emb_379,tags_emb_380,tags_emb_381,tags_emb_382,tags_emb_383
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
335300,6.0,0.119388,0.126654,0.223968,0.029267,-0.205885,0.530513,0.019095,0.098417,-0.168850,...,0.115262,0.345862,0.314601,0.397895,0.191249,0.155089,0.538477,-0.224875,-0.174770,0.102378
374320,6.0,0.643869,1.197689,0.614615,0.618583,-0.425299,0.070268,0.467150,-0.283747,-0.155813,...,0.126674,0.339611,0.323297,0.429290,0.262892,0.199058,0.464258,-0.147462,-0.180969,0.051385
570940,6.0,0.596189,-0.076436,0.175137,-0.036860,-0.411845,0.084661,0.643239,-0.140084,0.178791,...,0.225787,0.307348,0.386953,0.379711,0.068206,0.046789,0.564373,-0.161320,-0.144110,0.041049
435150,6.0,0.643869,0.157491,0.663446,0.313668,-0.083358,0.879001,0.190073,-0.143532,-0.288290,...,0.368902,0.377223,0.195811,0.414621,0.222195,0.056970,0.097474,0.168953,0.013213,0.036279
427520,6.0,0.834589,0.313638,0.809939,0.261040,-0.203325,-0.352310,0.348856,0.072503,0.431592,...,0.362606,0.404979,0.033295,-0.065764,0.200150,0.384024,-0.044277,-0.100570,0.045499,0.350224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536610,,0.643869,-0.191337,0.663446,-0.152547,-0.991390,-0.133603,-0.438819,0.421941,-0.044087,...,0.424816,0.414719,-0.098012,0.051488,-0.005264,0.162055,-0.048542,-0.050706,0.006860,0.362241
1569090,,0.548509,-0.080757,0.516953,-0.161325,0.290014,0.133307,0.122345,0.043768,0.076412,...,0.056362,0.609228,0.119564,0.231422,0.050099,0.187765,0.506191,-0.376130,0.170645,0.256374
1582510,,-0.262053,-0.214317,-0.313171,-0.164337,-0.098120,0.447348,0.058343,-0.344018,0.370707,...,0.344954,0.644763,0.145340,0.265822,-0.108715,0.220161,-0.029297,-0.116917,0.115908,0.220530
1599340,,0.786909,-0.031065,0.761108,-0.160205,0.179558,0.133478,0.584903,0.291661,0.171639,...,0.269069,0.345348,0.190187,0.354339,0.104349,0.270990,0.176236,-0.248504,0.104688,0.063225


In [17]:
### ML Model

# data split
df_model = df_proc[df_proc['Score'].notnull()]
X = df_model.drop(['Score'], axis=1)
y = df_model['Score']

df_pred = df_proc[df_proc['Score'].isnull()]
X_pred = df_pred.drop(['Score'], axis=1)

# Fit Model
model = XGBRegressor(
    max_depth=32,
    n_estimators=250,
    objective='reg:squarederror',
    random_state=42,
    verbosity=0,
    n_jobs=-1)
model.fit(X, y)

# Get Cross Val Score
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
print(f' Avg. MSE: {scores.mean():0.4f} (+/- {scores.std():0.4f})')

# pred
y_pred = model.predict(X_pred)

df_pred = pd.DataFrame({
    'AppID': X_pred.index.values,
    'Pred Score': y_pred
}).sort_values('Pred Score', ascending=False).set_index('AppID')
df_pred = df_pred.join(df[['name']], how='left')
df_pred = df_pred[['name', 'Pred Score']]

df_pred

 Avg. MSE: -2.3078 (+/- 1.8037)


Unnamed: 0_level_0,name,Pred Score
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1
1173820,FINAL FANTASY VI,5.428536
1307550,Craftopia,5.380709
1328670,Mass Effect™ Legendary Edition,5.372001
320,Half-Life 2: Deathmatch,5.317527
373420,Divinity: Original Sin - Enhanced Edition,5.306143
...,...,...
361420,ASTRONEER,2.261503
460950,Katana ZERO,2.200018
1536610,OpenTTD,2.028811
808090,"Bury Me, My Love",2.020130


In [18]:
"""
No Name
78	588650	5.660944 - Dead Cells
411	288470	5.444581 - Fable Anniversy
642	1174180	5.382903 - RDR2
185	550	5.309083     - L4D2
376	17460	5.304587 - Mass Effect

Name
576	976730	5.124917 - Halo: MCC
627	1174180	4.943077 - RDR2
626	1172620	4.901459 - Sea of Thieves
334	391540	4.901459 - Undertale
535	812140	4.883088 - AC: Odyssey
Avg. MSE: -1.9802 (+/- 1.4576)

Pathfinder: Kingmaker - Enhanced Plus Edition	5.656024
Disco Elysium - The Final Cut	5.580381
Ghostrunner	5.534202
Divinity: Original Sin - Enhanced Edition	5.524318
The Elder Scrolls IV: Oblivion® Game of the Ye...	5.486963
"""

'\nNo Name\n78\t588650\t5.660944 - Dead Cells\n411\t288470\t5.444581 - Fable Anniversy\n642\t1174180\t5.382903 - RDR2\n185\t550\t5.309083     - L4D2\n376\t17460\t5.304587 - Mass Effect\n\nName\n576\t976730\t5.124917 - Halo: MCC\n627\t1174180\t4.943077 - RDR2\n626\t1172620\t4.901459 - Sea of Thieves\n334\t391540\t4.901459 - Undertale\n535\t812140\t4.883088 - AC: Odyssey\nAvg. MSE: -1.9802 (+/- 1.4576)\n\nPathfinder: Kingmaker - Enhanced Plus Edition\t5.656024\nDisco Elysium - The Final Cut\t5.580381\nGhostrunner\t5.534202\nDivinity: Original Sin - Enhanced Edition\t5.524318\nThe Elder Scrolls IV: Oblivion® Game of the Ye...\t5.486963\n'

In [19]:
for i , (col, imp) in enumerate(sorted(zip(X_pred.columns, model.feature_importances_), key=lambda x: x[1], reverse=True)):
    print(f'{i+1}:\t {col} - {imp}')

1:	 short_desc_emb_77 - 0.2072708010673523
2:	 tags_emb_45 - 0.1323198825120926
3:	 tags_emb_2 - 0.11169445514678955
4:	 short_desc_emb_72 - 0.06746085733175278
5:	 name_emb_14 - 0.06168180704116821
6:	 name_emb_44 - 0.04708661511540413
7:	 name_emb_343 - 0.04054763913154602
8:	 name_emb_270 - 0.03490351513028145
9:	 tags_emb_331 - 0.02970220148563385
10:	 short_desc_emb_113 - 0.029453029856085777
11:	 short_desc_emb_273 - 0.028360877186059952
12:	 name_emb_320 - 0.027783123776316643
13:	 short_desc_emb_74 - 0.026059841737151146
14:	 short_desc_emb_144 - 0.01763145811855793
15:	 name_emb_246 - 0.01581518165767193
16:	 tags_emb_251 - 0.009064163081347942
17:	 short_desc_emb_183 - 0.009015262126922607
18:	 tags_emb_295 - 0.008626204915344715
19:	 name_emb_32 - 0.008284243755042553
20:	 short_desc_emb_76 - 0.008005697280168533
21:	 name_emb_300 - 0.007071197498589754
22:	 tags_emb_14 - 0.0067472136579453945
23:	 short_desc_emb_123 - 0.006734563037753105
24:	 short_desc_emb_3 - 0.006384422