In [11]:
import re

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor

In [12]:
df = pd.read_csv('data/data.csv')
df = df.set_index('AppID')

df

Unnamed: 0_level_0,Score,name,recent_percent,recent_count,all_percent,all_count,short_desc,long_desc,tags
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
335300,6.0,DARK SOULS™ II: Scholar of the First Sin,83.0,1979.0,86.0,44834.0,DARK SOULS™ II: Scholar of the First Sin bring...,Gamers are in for a big surprise in DARK SOULS...,"['Souls-like', 'Dark Fantasy', 'RPG', 'Difficu..."
374320,6.0,DARK SOULS™ III,94.0,7512.0,94.0,180498.0,Dark Souls continues to push the boundaries wi...,Get the DARK SOULS™ III Season Pass now and ch...,"['Souls-like', 'Dark Fantasy', 'Difficult', 'R..."
570940,6.0,DARK SOULS™: REMASTERED,93.0,788.0,85.0,29344.0,"Then, there was fire. Re-experience the critic...","Then, there was fire. Re-experience the critic...","['Souls-like', 'Education', 'Dark Fantasy', 'A..."
435150,6.0,Divinity: Original Sin 2 - Definitive Edition,95.0,2304.0,95.0,109262.0,The critically acclaimed RPG that raised the b...,The Divine is dead. The Void approaches. And t...,"['Tactical RPG', 'Exploration', 'Story Rich', ..."
427520,6.0,Factorio,98.0,2185.0,98.0,97200.0,Factorio is a game about building and creating...,is a game in which you build and maintain fact...,"['Automation', 'Base Building', 'Resource Mana..."
...,...,...,...,...,...,...,...,...,...
1569090,,Vivid Knight,90.0,114.0,92.0,884.0,Vivid Knight is a roguelike adventure game in ...,Explore a dungeon that changes with every play...,"['Roguelike Deckbuilder', 'Difficult', 'Choice..."
1582510,,Mercenaries Blaze,71.0,32.0,75.0,166.0,“Mercenaries Blaze”is the 5th fantasy tactical...,A story about a corrupt kingdom and a fight fo...,"['RPG', 'Strategy', 'Turn-Based Tactics', 'Str..."
1599340,,Lost Ark,97.0,1083.0,97.0,1083.0,Embark on an odyssey for the Lost Ark in a vas...,Embark on an odyssey for the Lost Ark in a vas...,"['Action', 'RPG', 'MMORPG', 'Action RPG', 'Adv..."
1656220,,Lost Ark Platinum Founder's Pack,0.0,0.0,0.0,0.0,,Enjoy Lost Ark's launch in luxurious comfort w...,"['Action', 'Adventure', 'Free to Play', 'Massi..."


In [13]:
df = df[df['name'].notnull()]

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df

Unnamed: 0,column_name,percent_missing
Score,Score,90.944373
name,name,0.0
recent_percent,recent_percent,0.0
recent_count,recent_count,0.0
all_percent,all_percent,0.0
all_count,all_count,0.0
short_desc,short_desc,10.478655
long_desc,long_desc,1.811125
tags,tags,0.0


In [14]:
### Data Pre-process

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model.max_seq_length = 10000

# Remove appids that were unable to pull information and fill null
df = df[df['name'].notnull()]

# Fill Null
df['short_desc'] = df['short_desc'].fillna('')
df['long_desc'] = df['short_desc'].fillna('')
df['tags'] = df['tags'].fillna('')

# Normalize ratings cols
df['recent_percent']=(df['recent_percent']-df['recent_percent'].mean())/df['recent_percent'].std()
df['recent_count']=(df['recent_count']-df['recent_count'].mean())/df['recent_count'].std()
df['all_percent']=(df['all_percent']-df['all_percent'].mean())/df['all_percent'].std()
df['all_count']=(df['all_count']-df['all_count'].mean())/df['all_count'].std()

# Process name
# Get name embeddings
df['name_emb'] = None
for idx, sentence in zip(df.index, df['name'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'name_emb'] = sentence_emb

# Explode name_emb to multiple cols
emb_len = df['name_emb'].values[0].shape[0]
emb_cols = [f'name_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['name_emb'].tolist(), index=df.index)
df = df.drop(['name_emb'], axis=1)

# Process short_desc
# Get short_desc embeddings
df['short_desc_emb'] = None
for idx, sentence in zip(df.index, df['short_desc'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'short_desc_emb'] = sentence_emb

# Explode short_desc_emb to multiple cols
emb_len = df['short_desc_emb'].values[0].shape[0]
emb_cols = [f'short_desc_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['short_desc_emb'].tolist(), index=df.index)
df = df.drop(['short_desc_emb'], axis=1)

# Process long_desc
# Get long_desc embeddings
df['long_desc_emb'] = None
for idx, sentence in zip(df.index, df['long_desc'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'long_desc_emb'] = sentence_emb

# Explode short_desc_emb to multiple cols
emb_len = df['long_desc_emb'].values[0].shape[0]
emb_cols = [f'long_desc_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['long_desc_emb'].tolist(), index=df.index)
df = df.drop(['long_desc_emb'], axis=1)

# Process tags
# Get tags embeddings
df['tags_emb'] = None
for idx, sentence in zip(df.index, df['tags'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'tags_emb'] = sentence_emb

# Explode tags to multiple cols
emb_len = df['tags_emb'].values[0].shape[0]
emb_cols = [f'tags_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['tags_emb'].tolist(), index=df.index)
df = df.drop(['tags_emb'], axis=1)

# Drop unneeded cols
df_proc = df.drop(['name', 'short_desc', 'long_desc', 'tags'], axis=1)

df_proc

  self[k1] = value[k2]


Unnamed: 0_level_0,Score,recent_percent,recent_count,all_percent,all_count,name_emb_0,name_emb_1,name_emb_2,name_emb_3,name_emb_4,...,tags_emb_374,tags_emb_375,tags_emb_376,tags_emb_377,tags_emb_378,tags_emb_379,tags_emb_380,tags_emb_381,tags_emb_382,tags_emb_383
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
335300,6.0,0.110442,0.208409,0.220522,0.033741,-0.205885,0.530513,0.019095,0.098417,-0.168850,...,0.115262,0.345862,0.314601,0.397895,0.191249,0.155089,0.538477,-0.224875,-0.174770,0.102378
374320,6.0,0.641903,1.538811,0.617411,0.633974,-0.425299,0.070268,0.467150,-0.283747,-0.155813,...,0.126674,0.339611,0.323297,0.429290,0.262892,0.199058,0.464258,-0.147462,-0.180969,0.051385
570940,6.0,0.593589,-0.077965,0.170911,-0.034793,-0.411845,0.084661,0.643239,-0.140084,0.178791,...,0.235841,0.311501,0.388661,0.383162,0.072721,0.044173,0.577991,-0.172254,-0.140407,0.039733
435150,6.0,0.690218,0.286555,0.667022,0.318797,-0.083358,0.879001,0.190073,-0.143532,-0.288290,...,0.370163,0.379355,0.195057,0.410373,0.220993,0.058797,0.099029,0.168630,0.011805,0.039026
427520,6.0,0.835162,0.257941,0.815855,0.265430,-0.203325,-0.352310,0.348856,0.072503,0.431592,...,0.362606,0.404979,0.033295,-0.065764,0.200150,0.384024,-0.044277,-0.100570,0.045499,0.350224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1569090,,0.448645,-0.240028,0.518188,-0.160711,0.290014,0.133307,0.122345,0.043768,0.076412,...,0.056362,0.609228,0.119564,0.231422,0.050099,0.187765,0.506191,-0.376130,0.170645,0.256374
1582510,,-0.469333,-0.259745,-0.325199,-0.163888,-0.098120,0.447348,0.058343,-0.344018,0.370707,...,0.328313,0.634138,0.151306,0.273260,-0.119691,0.218492,-0.013120,-0.126462,0.119150,0.207751
1599340,,0.786847,-0.007033,0.766244,-0.159831,0.179558,0.133478,0.584903,0.291661,0.171639,...,0.247450,0.369483,0.189386,0.351842,0.107656,0.282955,0.151401,-0.236875,0.094177,0.057548
1656220,,-3.899671,-0.267439,-4.046028,-0.164622,-0.504908,0.580813,0.138196,0.232518,0.339013,...,0.267895,0.377920,0.187860,0.376677,0.092930,0.217142,0.113441,-0.224005,0.028210,0.128886


In [15]:
### ML Model

# data split
df_model = df_proc[df_proc['Score'].notnull()]
X = df_model.drop(['Score'], axis=1)
y = df_model['Score']

df_pred = df_proc[df_proc['Score'].isnull()]
X_pred = df_pred.drop(['Score'], axis=1)

# Fit Model
model = XGBRegressor(
    max_depth=32,
    n_estimators=250,
    objective='reg:squarederror',
    random_state=42,
    verbosity=0,
    n_jobs=-1)
model.fit(X, y)

# Get Cross Val Score
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
print(f' Avg. MSE: {scores.mean():0.4f} (+/- {scores.std():0.4f})')

# pred
y_pred = model.predict(X_pred)

df_pred = pd.DataFrame({
    'AppID': X_pred.index.values,
    'Pred Score': y_pred
}).sort_values('Pred Score', ascending=False).set_index('AppID')
df_pred = df_pred.join(df[['name']], how='left')
df_pred = df_pred[['name', 'Pred Score']]

 Avg. MSE: -2.2045 (+/- 1.3941)


In [16]:
df_pred.head(25)

Unnamed: 0_level_0,name,Pred Score
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1
373420,Divinity: Original Sin - Enhanced Edition,5.889911
230230,Divinity: Original Sin (Classic),5.59472
258970,Gauntlet™ Slayer Edition,5.530334
727850,ELDERBORN,5.454527
236430,DARK SOULS™ II,5.429187
610180,The Jackbox Party Pack 4,5.410854
1086940,Baldur's Gate 3,5.398372
1090202,Destiny 2: Shadowkeep Digital Deluxe,5.38759
57300,Amnesia: The Dark Descent,5.353191
812140,Assassin's Creed® Odyssey,5.337534


In [18]:
for i , (col, imp) in enumerate(sorted(zip(X_pred.columns, model.feature_importances_), key=lambda x: x[1], reverse=True)):
    print(f'{i+1}:\t {col} - {imp}')

1:	 tags_emb_45 - 0.15889500081539154
2:	 short_desc_emb_77 - 0.12083449959754944
3:	 name_emb_320 - 0.08438920229673386
4:	 tags_emb_369 - 0.08187878131866455
5:	 tags_emb_140 - 0.06282053887844086
6:	 tags_emb_2 - 0.04937141016125679
7:	 tags_emb_183 - 0.040391936898231506
8:	 short_desc_emb_72 - 0.0393349714577198
9:	 name_emb_291 - 0.03929606080055237
10:	 name_emb_270 - 0.028644585981965065
11:	 name_emb_44 - 0.027433916926383972
12:	 tags_emb_331 - 0.02735893987119198
13:	 short_desc_emb_273 - 0.02616981230676174
14:	 tags_emb_164 - 0.02198970317840576
15:	 name_emb_236 - 0.019435252994298935
16:	 tags_emb_354 - 0.013787875883281231
17:	 name_emb_308 - 0.013079879805445671
18:	 short_desc_emb_270 - 0.012608706951141357
19:	 tags_emb_146 - 0.011722777038812637
20:	 name_emb_120 - 0.011319129727780819
21:	 name_emb_22 - 0.01026031281799078
22:	 name_emb_13 - 0.008552712388336658
23:	 short_desc_emb_114 - 0.008478990755975246
24:	 short_desc_emb_361 - 0.00742401834577322
25:	 name_e