In [65]:
import re

import numpy as np
import pandas as pd
import scipy.stats as stats
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor

In [66]:
df = pd.read_csv('data/data.csv')
df = df.set_index('AppID')

df

Unnamed: 0_level_0,Score,name,recent_percent,recent_count,all_percent,all_count,short_desc,long_desc,tags
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
335300,6.0,DARK SOULS™ II: Scholar of the First Sin,83.0,2014.0,86.0,44866.0,DARK SOULS™ II: Scholar of the First Sin bring...,Gamers are in for a big surprise in DARK SOULS...,"['Souls-like', 'Dark Fantasy', 'RPG', 'Difficu..."
374320,6.0,DARK SOULS™ III,94.0,7602.0,94.0,180590.0,Dark Souls continues to push the boundaries wi...,Get the DARK SOULS™ III Season Pass now and ch...,"['Souls-like', 'Dark Fantasy', 'Difficult', 'R..."
570940,6.0,DARK SOULS™: REMASTERED,93.0,799.0,85.0,29354.0,"Then, there was fire. Re-experience the critic...","Then, there was fire. Re-experience the critic...","['Souls-like', 'Education', 'Dark Fantasy', 'A..."
435150,6.0,Divinity: Original Sin 2 - Definitive Edition,95.0,2343.0,95.0,109299.0,The critically acclaimed RPG that raised the b...,The Divine is dead. The Void approaches. And t...,"['Tactical RPG', 'Exploration', 'Story Rich', ..."
427520,6.0,Factorio,98.0,2224.0,98.0,97239.0,Factorio is a game about building and creating...,is a game in which you build and maintain fact...,"['Automation', 'Base Building', 'Resource Mana..."
...,...,...,...,...,...,...,...,...,...
1599340,,Lost Ark,97.0,1083.0,97.0,1083.0,Embark on an odyssey for the Lost Ark in a vas...,Embark on an odyssey for the Lost Ark in a vas...,"['Action', 'RPG', 'MMORPG', 'Action RPG', 'Adv..."
1656220,,Lost Ark Platinum Founder's Pack,0.0,0.0,0.0,0.0,,Enjoy Lost Ark's launch in luxurious comfort w...,"['Action', 'Adventure', 'Free to Play', 'Massi..."
1277400,,Monster Hunter Stories 2: Wings of Ruin,75.0,5965.0,75.0,5965.0,A new adventure awaits you in this second inst...,A new adventure awaits you in this second inst...,"['RPG', 'Adventure', 'JRPG', 'Exploration', '3..."
1076750,,Dream Engines: Nomad Cities - A survival city ...,77.0,171.0,77.0,171.0,A survival city-building game with flying citi...,"Advanced society is long gone, the world is ov...","['Early Access', 'City Builder', 'Survival', '..."


In [67]:
df = df[df['name'].notnull()]

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df

Unnamed: 0,column_name,percent_missing
Score,Score,90.803109
name,name,0.0
recent_percent,recent_percent,0.0
recent_count,recent_count,0.0
all_percent,all_percent,0.0
all_count,all_count,0.0
short_desc,short_desc,10.492228
long_desc,long_desc,1.813472
tags,tags,0.0


In [68]:
### Data Pre-process

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model.max_seq_length = 30

# Remove appids that were unable to pull information and fill null
df = df[df['name'].notnull()]

# Fill Null
df['short_desc'] = df['short_desc'].fillna('')
df['long_desc'] = df['short_desc'].fillna('')
df['tags'] = df['tags'].fillna('')

# Normalize ratings cols
df['recent_percent']=(df['recent_percent']-df['recent_percent'].mean())/df['recent_percent'].std()
df['recent_count']=(df['recent_count']-df['recent_count'].mean())/df['recent_count'].std()
df['all_percent']=(df['all_percent']-df['all_percent'].mean())/df['all_percent'].std()
df['all_count']=(df['all_count']-df['all_count'].mean())/df['all_count'].std()

# Process name
# Get name embeddings
df['name_emb'] = None
for idx, sentence in zip(df.index, df['name'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'name_emb'] = sentence_emb

# Explode name_emb to multiple cols
emb_len = df['name_emb'].values[0].shape[0]
emb_cols = [f'name_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['name_emb'].tolist(), index=df.index)
df = df.drop(['name_emb'], axis=1)

# Process short_desc
# Get short_desc embeddings
df['short_desc_emb'] = None
for idx, sentence in zip(df.index, df['short_desc'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'short_desc_emb'] = sentence_emb

# Explode short_desc_emb to multiple cols
emb_len = df['short_desc_emb'].values[0].shape[0]
emb_cols = [f'short_desc_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['short_desc_emb'].tolist(), index=df.index)
df = df.drop(['short_desc_emb'], axis=1)

# Process long_desc
# Get long_desc embeddings
df['long_desc_emb'] = None
for idx, sentence in zip(df.index, df['long_desc'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'long_desc_emb'] = sentence_emb

# Explode short_desc_emb to multiple cols
emb_len = df['long_desc_emb'].values[0].shape[0]
emb_cols = [f'long_desc_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['long_desc_emb'].tolist(), index=df.index)
df = df.drop(['long_desc_emb'], axis=1)

# Process tags
# Get tags embeddings
df['tags_emb'] = None
for idx, sentence in zip(df.index, df['tags'].values):
    sentence_emb = model.encode(sentence)
    df.at[idx, 'tags_emb'] = sentence_emb

# Explode tags to multiple cols
emb_len = df['tags_emb'].values[0].shape[0]
emb_cols = [f'tags_emb_{i}' for i in range(0, emb_len)]
df[emb_cols] = pd.DataFrame(df['tags_emb'].tolist(), index=df.index)
df = df.drop(['tags_emb'], axis=1)

# Drop unneeded cols
df_proc = df.drop(['name', 'short_desc', 'long_desc', 'tags'], axis=1)

df_proc

  self[k1] = value[k2]


Unnamed: 0_level_0,Score,recent_percent,recent_count,all_percent,all_count,name_emb_0,name_emb_1,name_emb_2,name_emb_3,name_emb_4,...,tags_emb_374,tags_emb_375,tags_emb_376,tags_emb_377,tags_emb_378,tags_emb_379,tags_emb_380,tags_emb_381,tags_emb_382,tags_emb_383
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
335300,6.0,0.110608,0.208854,0.220835,0.033608,-0.205885,0.530513,0.019095,0.098417,-0.168850,...,0.367525,0.402845,0.248234,0.347452,-0.244038,0.217287,0.596662,-0.185484,0.025290,0.248362
374320,6.0,0.642481,1.527405,0.617540,0.633537,-0.425299,0.070268,0.467150,-0.283747,-0.155813,...,-0.007539,0.256561,0.118918,0.446231,-0.118015,-0.008100,0.701982,-0.163791,-0.200916,0.074816
570940,6.0,0.594129,-0.077839,0.171246,-0.034958,-0.411845,0.084661,0.643239,-0.140084,0.178791,...,0.341885,0.164670,0.137789,0.106830,-0.571389,0.048195,0.653776,-0.177162,0.020019,0.108367
435150,6.0,0.690833,0.286485,0.667128,0.318416,-0.083358,0.879001,0.190073,-0.143532,-0.288290,...,0.735162,0.449787,0.012366,0.168979,-0.092204,0.181183,-0.048598,0.356594,-0.059989,0.063845
427520,6.0,0.835889,0.258406,0.815893,0.265108,-0.203325,-0.352310,0.348856,0.072503,0.431592,...,0.598158,0.255851,-0.037133,-0.383200,0.143489,0.491210,-0.059223,-0.007958,0.271041,0.497386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599340,,0.787537,-0.010826,0.766305,-0.159922,0.179558,0.133478,0.584903,0.291661,0.171639,...,0.168111,0.576673,0.004248,0.144641,0.098493,0.077850,0.370460,0.116989,-0.011114,0.065285
1656220,,-3.902612,-0.266372,-4.043751,-0.164709,-0.504908,0.580813,0.138196,0.232518,0.339013,...,0.533803,0.646251,0.163006,0.224924,-0.116009,0.256123,-0.072009,-0.025624,0.023500,-0.010729
1277400,,-0.276208,1.141136,-0.324636,-0.138342,-0.014609,0.455735,0.329910,0.062382,-0.240313,...,0.798438,0.320465,-0.093224,0.063498,0.043771,0.094388,0.193889,0.212342,0.287899,-0.007700
1076750,,-0.179504,-0.226022,-0.225459,-0.163953,0.698049,0.558619,0.262506,-0.056114,0.078203,...,0.827848,0.236626,-0.131105,-0.040002,0.136860,0.453156,0.047654,0.425447,-0.085478,0.586692


In [69]:
### ML Model

# data split
df_model = df_proc[df_proc['Score'].notnull()]
X = df_model.drop(['Score'], axis=1)
y = df_model['Score']

df_pred = df_proc[df_proc['Score'].isnull()]
X_pred = df_pred.drop(['Score'], axis=1)

# Fit Model
model = XGBRegressor(
    max_depth=4,  # 32
    n_estimators=354,  # 250
    objective='reg:squarederror',
    random_state=42,
    verbosity=0,
    n_jobs=-1)
model.fit(X, y)

# Get Cross Val Score
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
print(f' Avg. MSE: {scores.mean():0.4f} (+/- {scores.std():0.4f})')

# pred
y_pred = model.predict(X_pred)

df_pred = pd.DataFrame({
    'AppID': X_pred.index.values,
    'Pred Score': y_pred
}).sort_values('Pred Score', ascending=False).set_index('AppID')
df_pred = df_pred.join(df[['name']], how='left')
df_pred = df_pred[['name', 'Pred Score']]

 Avg. MSE: -1.7352 (+/- 1.4558)


In [70]:
df_pred.head(25)

Unnamed: 0_level_0,name,Pred Score
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1
256290,Child of Light,5.676215
632470,Disco Elysium - The Final Cut,5.603426
434170,The Jackbox Party Pack 3,5.556976
1030300,Hollow Knight: Silksong,5.530453
504230,Celeste,5.490695
924980,Trials of Mana,5.490064
57300,Amnesia: The Dark Descent,5.449245
753640,Outer Wilds,5.424916
350970,Planet of the Eyes,5.405744
236430,DARK SOULS™ II,5.39151


In [71]:
df_pred[df_pred['name'] == 'Dishonored 2']

Unnamed: 0_level_0,name,Pred Score
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1
403640,Dishonored 2,4.456824


In [72]:
"""
-2.4472 (+/- 1.5821)
Picked - Elderborn - Actual 2
373420	Divinity: Original Sin - Enhanced Edition	5.889911
230230	Divinity: Original Sin (Classic)	5.594720
258970	Gauntlet™ Slayer Edition	5.530334
727850	ELDERBORN	5.454527
236430	DARK SOULS™ II	5.429187
610180	The Jackbox Party Pack 4	5.410854
1086940	Baldur's Gate 3	5.398372
1090202	Destiny 2: Shadowkeep Digital Deluxe	5.387590
57300	Amnesia: The Dark Descent	5.353191
812140	Assassin's Creed® Odyssey	5.337534
288470	Fable Anniversary	5.336144
972660	Spiritfarer®	5.321304
466300	Planescape: Torment: Enhanced Edition	5.316258
434170	The Jackbox Party Pack 3	5.305679
973760	Thronebreaker: The Witcher Tales	5.304965
366090	Colony Survival	5.298052
995980	Fae Tactics	5.249742
1090200	Destiny 2: Shadowkeep	5.246860
55230	Saints Row: The Third	5.240653
320	Half-Life 2: Deathmatch	5.222818
471810	Death Squared	5.222415
374040	Portal Knights	5.198342
740130	Tales of Arise	5.192035
247240	Volgarr the Viking	5.189807
792710	Levelhead	5.186869
"""

"\n-2.4472 (+/- 1.5821)\nPicked - Elderborn - Actual 2\n373420\tDivinity: Original Sin - Enhanced Edition\t5.889911\n230230\tDivinity: Original Sin (Classic)\t5.594720\n258970\tGauntlet™ Slayer Edition\t5.530334\n727850\tELDERBORN\t5.454527\n236430\tDARK SOULS™ II\t5.429187\n610180\tThe Jackbox Party Pack 4\t5.410854\n1086940\tBaldur's Gate 3\t5.398372\n1090202\tDestiny 2: Shadowkeep Digital Deluxe\t5.387590\n57300\tAmnesia: The Dark Descent\t5.353191\n812140\tAssassin's Creed® Odyssey\t5.337534\n288470\tFable Anniversary\t5.336144\n972660\tSpiritfarer®\t5.321304\n466300\tPlanescape: Torment: Enhanced Edition\t5.316258\n434170\tThe Jackbox Party Pack 3\t5.305679\n973760\tThronebreaker: The Witcher Tales\t5.304965\n366090\tColony Survival\t5.298052\n995980\tFae Tactics\t5.249742\n1090200\tDestiny 2: Shadowkeep\t5.246860\n55230\tSaints Row: The Third\t5.240653\n320\tHalf-Life 2: Deathmatch\t5.222818\n471810\tDeath Squared\t5.222415\n374040\tPortal Knights\t5.198342\n740130\tTales of Aris

In [73]:
for i , (col, imp) in enumerate(sorted(zip(X_pred.columns, model.feature_importances_), key=lambda x: x[1], reverse=True)[:50]):
    print(f'{i+1}:\t {col} - {imp}')

1:	 short_desc_emb_281 - 0.1938369870185852
2:	 short_desc_emb_272 - 0.1636776626110077
3:	 tags_emb_229 - 0.08847485482692719
4:	 tags_emb_297 - 0.0751568153500557
5:	 short_desc_emb_273 - 0.05691559240221977
6:	 name_emb_45 - 0.05176933482289314
7:	 name_emb_36 - 0.04654887691140175
8:	 short_desc_emb_23 - 0.03158954530954361
9:	 name_emb_131 - 0.029899485409259796
10:	 short_desc_emb_327 - 0.02649020217359066
11:	 name_emb_218 - 0.024021204560995102
12:	 short_desc_emb_225 - 0.022083839401602745
13:	 name_emb_21 - 0.0186562892049551
14:	 name_emb_229 - 0.018284283578395844
15:	 name_emb_22 - 0.013313963077962399
16:	 tags_emb_166 - 0.013094264082610607
17:	 name_emb_54 - 0.012116559781134129
18:	 name_emb_376 - 0.01161077432334423
19:	 name_emb_44 - 0.008474244736135006
20:	 tags_emb_295 - 0.008036486804485321
21:	 short_desc_emb_231 - 0.00603900570422411
22:	 tags_emb_225 - 0.005647892598062754
23:	 tags_emb_307 - 0.0053861550986766815
24:	 name_emb_213 - 0.005348252132534981
25:	 

In [54]:
## Analysis

In [55]:
df = pd.read_excel('/mnt/c/Users/mcmin/gdrive/video_games/reviews/reviews_and_wishlist.xlsx', skiprows=2)

df

Unnamed: 0,Rank,Game,AppID,Base Category,Tier 1 Category,Platform,Year(s) Played,Score,GoG Rating,Unnamed: 9,0.2626849304,Unnamed: 11,Unnamed: 12,0.7373150696,Unnamed: 14,Unnamed: 15
0,1.0,Dark Souls II: Scholar of the First Sin,335300.0,RPG,Adventure RPG,PC,2020.0,6.0,5.0,,,79,4.225,1.22,,
1,2.0,Dark Souls III,374320.0,RPG,Adventure RPG,PC,2020.0,6.0,5.0,,Year,# of Games,Avg Rating,SD Rating,,
2,3.0,Dark Souls Remastered,570940.0,RPG,Adventure RPG,PC,2020.0,6.0,5.0,,2020.0,58,4.31,1.2,,
3,4.0,Divinity: Original Sin 2,435150.0,RPG,Traditional RPG,PC,2020.0,6.0,5.0,,2021.0,21,4.14,1.24,,
4,5.0,Factorio,427520.0,Management,Automation,PC,2020.0,6.0,5.0,,2022.0,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,75.0,Outward,794260.0,RPG,Adventure RPG,PC,2021.0,2.0,2.0,,,,,,,
75,76.0,Superbrothers: Sword & Sworcery EP,204060.0,Adventure,Point and Click,PC,2020.0,2.0,2.0,,,,,,,
76,77.0,War Thunder,236390.0,Arena,Vehicle Shooter,PC,2020.0,2.0,2.0,,,,,,,
77,78.0,"Warhammer 40,000: Mechanicus",673880.0,Strategy,Tactical Strategy,PC,2020.0,1.0,1.0,,,,,,,


In [56]:
tmp = df.groupby(['Base Category']).agg({'Score': [np.mean, np.std, np.count_nonzero]})

tmp[('Score', 'ci_left')] = stats.truncnorm.interval(
    0.95,
    (1-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    (6-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    loc=tmp[('Score', 'mean')],
    scale=tmp[('Score', 'std')]
)[0]
tmp[('Score', 'ci_right')] = stats.truncnorm.interval(
    0.95,
    (1-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    (6-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    loc=tmp[('Score', 'mean')],
    scale=tmp[('Score', 'std')]
)[1]
tmp[('Score', 'prob_like')] = 1-stats.truncnorm.cdf(
    3.5,
    (1-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    (6-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    loc=tmp[('Score', 'mean')],
    scale=tmp[('Score', 'std')]
)
tmp[('Score', 'prob_dislike')] = stats.truncnorm.cdf(
    3.5,
    (1-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    (6-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    loc=tmp[('Score', 'mean')],
    scale=tmp[('Score', 'std')]
)

tmp = tmp.sort_values(('Score', 'prob_like'), ascending=False)

tmp

Unnamed: 0_level_0,Score,Score,Score,Score,Score,Score,Score
Unnamed: 0_level_1,mean,std,count_nonzero,ci_left,ci_right,prob_like,prob_dislike
Base Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Action,4.272727,0.64667,11,3.004232,5.501697,0.883503,0.116497
Management,4.666667,1.032796,6,2.600668,5.875609,0.856759,0.143241
RPG,4.526316,1.428613,19,1.792589,5.877116,0.727495,0.272505
Arena,4.055556,1.109967,18,1.914737,5.746453,0.680923,0.319077
Adventure,4.25,1.38873,8,1.67392,5.840981,0.678597,0.321403
Strategy,4.0,1.3484,12,1.573968,5.791777,0.627055,0.372945
Other,2.0,,1,,,,
Party,5.0,,1,,,,
Platformer,6.0,,1,,,,
Puzzle,4.0,,1,,,,


In [57]:
tmp = df.groupby(['Base Category', 'Tier 1 Category']).agg({'Score': [np.mean, np.std, np.count_nonzero]})

tmp[('Score', 'ci_left')] = stats.truncnorm.interval(
    0.95,
    (1-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    (6-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    loc=tmp[('Score', 'mean')],
    scale=tmp[('Score', 'std')]
)[0]
tmp[('Score', 'ci_right')] = stats.truncnorm.interval(
    0.95,
    (1-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    (6-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    loc=tmp[('Score', 'mean')],
    scale=tmp[('Score', 'std')]
)[1]
tmp[('Score', 'prob_like')] = 1-stats.truncnorm.cdf(
    3.5,
    (1-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    (6-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    loc=tmp[('Score', 'mean')],
    scale=tmp[('Score', 'std')]
)
tmp[('Score', 'prob_dislike')] = stats.truncnorm.cdf(
    3.5,
    (1-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    (6-tmp[('Score', 'mean')])/tmp[('Score', 'std')],
    loc=tmp[('Score', 'mean')],
    scale=tmp[('Score', 'std')]
)

tmp = tmp.sort_values(('Score', 'prob_like'), ascending=False)

tmp

  lower_bound = _a * scale + loc
  upper_bound = _b * scale + loc
  x = np.asarray((x - loc)/scale, dtype=dtyp)


Unnamed: 0_level_0,Unnamed: 1_level_0,Score,Score,Score,Score,Score,Score,Score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,count_nonzero,ci_left,ci_right,prob_like,prob_dislike
Base Category,Tier 1 Category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Management,Automation,5.5,0.707107,2,4.032951,5.957628,0.996924,0.003076
RPG,Traditional RPG,5.2,0.83666,5,3.494687,5.933722,0.974615,0.025385
Action,Narrative Action,4.5,0.57735,4,3.367254,5.589546,0.958172,0.041828
Adventure,Sandbox Adventure,4.333333,0.57735,3,3.201267,5.446743,0.925397,0.074603
Management,Tycoon,4.5,0.707107,2,3.108938,5.725608,0.919995,0.080005
Arena,Sports,4.5,0.707107,2,3.108938,5.725608,0.919995,0.080005
Strategy,Tabletop-Like,5.0,1.414214,2,2.132156,5.915513,0.812533,0.187467
Strategy,Strategic Roguelike,4.25,0.957427,4,2.365058,5.755405,0.775987,0.224013
Action,Action Roguelike,4.25,0.957427,4,2.365058,5.755405,0.775987,0.224013
Arena,Corridor Shooter,4.0,1.0,3,2.052667,5.673163,0.685226,0.314774


In [64]:
## Hyper Opt Testing

from hyperopt import tpe, hp, fmin

def objective(params):
    MAX_SEQ_LENGTH = params['max_seq_length']
    MAX_DEPTH = params['max_depth']
    N_ESTIMATORS = params['n_estimators']
    
    df = pd.read_csv('data/data.csv')
    df = df.set_index('AppID')
    
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    model.max_seq_length = MAX_SEQ_LENGTH

    # Remove appids that were unable to pull information and fill null
    df = df[df['name'].notnull()]

    # Fill Null
    df['short_desc'] = df['short_desc'].fillna('')
    df['long_desc'] = df['short_desc'].fillna('')
    df['tags'] = df['tags'].fillna('')

    # Normalize ratings cols
    df['recent_percent']=(df['recent_percent']-df['recent_percent'].mean())/df['recent_percent'].std()
    df['recent_count']=(df['recent_count']-df['recent_count'].mean())/df['recent_count'].std()
    df['all_percent']=(df['all_percent']-df['all_percent'].mean())/df['all_percent'].std()
    df['all_count']=(df['all_count']-df['all_count'].mean())/df['all_count'].std()

    # Process name
    # Get name embeddings
    df['name_emb'] = None
    for idx, sentence in zip(df.index, df['name'].values):
        sentence_emb = model.encode(sentence)
        df.at[idx, 'name_emb'] = sentence_emb

    # Explode name_emb to multiple cols
    emb_len = df['name_emb'].values[0].shape[0]
    emb_cols = [f'name_emb_{i}' for i in range(0, emb_len)]
    df[emb_cols] = pd.DataFrame(df['name_emb'].tolist(), index=df.index)
    df = df.drop(['name_emb'], axis=1)

    # Process short_desc
    # Get short_desc embeddings
    df['short_desc_emb'] = None
    for idx, sentence in zip(df.index, df['short_desc'].values):
        sentence_emb = model.encode(sentence)
        df.at[idx, 'short_desc_emb'] = sentence_emb

    # Explode short_desc_emb to multiple cols
    emb_len = df['short_desc_emb'].values[0].shape[0]
    emb_cols = [f'short_desc_emb_{i}' for i in range(0, emb_len)]
    df[emb_cols] = pd.DataFrame(df['short_desc_emb'].tolist(), index=df.index)
    df = df.drop(['short_desc_emb'], axis=1)

    # Process long_desc
    # Get long_desc embeddings
    df['long_desc_emb'] = None
    for idx, sentence in zip(df.index, df['long_desc'].values):
        sentence_emb = model.encode(sentence)
        df.at[idx, 'long_desc_emb'] = sentence_emb

    # Explode short_desc_emb to multiple cols
    emb_len = df['long_desc_emb'].values[0].shape[0]
    emb_cols = [f'long_desc_emb_{i}' for i in range(0, emb_len)]
    df[emb_cols] = pd.DataFrame(df['long_desc_emb'].tolist(), index=df.index)
    df = df.drop(['long_desc_emb'], axis=1)

    # Process tags
    # Get tags embeddings
    df['tags_emb'] = None
    for idx, sentence in zip(df.index, df['tags'].values):
        sentence_emb = model.encode(sentence)
        df.at[idx, 'tags_emb'] = sentence_emb

    # Explode tags to multiple cols
    emb_len = df['tags_emb'].values[0].shape[0]
    emb_cols = [f'tags_emb_{i}' for i in range(0, emb_len)]
    df[emb_cols] = pd.DataFrame(df['tags_emb'].tolist(), index=df.index)
    df = df.drop(['tags_emb'], axis=1)

    # Drop unneeded cols
    df_proc = df.drop(['name', 'short_desc', 'long_desc', 'tags'], axis=1).copy()
    
    # data split
    df_model = df_proc[df_proc['Score'].notnull()].copy()
    X = df_model.drop(['Score'], axis=1).copy()
    y = df_model['Score'].copy()

    df_pred = df_proc[df_proc['Score'].isnull()].copy()
    X_pred = df_pred.drop(['Score'], axis=1).copy()

    # Fit Model
    model = XGBRegressor(
        max_depth=MAX_DEPTH,
        n_estimators=N_ESTIMATORS,
        objective='reg:squarederror',
        random_state=42,
        verbosity=0,
        n_jobs=-1)
    model.fit(X, y)

    # Get Cross Val Score
    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    
    return scores.mean()*-1


space = {
    'max_seq_length': hp.randint('max_seq_length', 20, 40),
    'max_depth': hp.randint('max_depth', 1, 8),
    'n_estimators': hp.randint('n_estimators', 200, 500)
}


best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=60
)

print(best)

  0%|                                                                                                                                | 0/60 [00:00<?, ?trial/s, best loss=?]

  self[k1] = value[k2]



  2%|█▋                                                                                                    | 1/60 [00:51<51:01, 51.89s/trial, best loss: 2.3684961152294135]

  self[k1] = value[k2]



  3%|███▍                                                                                                  | 2/60 [01:31<42:58, 44.45s/trial, best loss: 2.1966338656140634]

  self[k1] = value[k2]



  5%|█████                                                                                                 | 3/60 [02:26<46:51, 49.33s/trial, best loss: 2.1966338656140634]

  self[k1] = value[k2]



  7%|██████▊                                                                                               | 4/60 [03:22<48:38, 52.12s/trial, best loss: 2.1966338656140634]

  self[k1] = value[k2]



  8%|████████▌                                                                                             | 5/60 [04:13<47:17, 51.59s/trial, best loss: 2.1966338656140634]

  self[k1] = value[k2]



 10%|██████████▏                                                                                           | 6/60 [05:05<46:33, 51.73s/trial, best loss: 2.1556771200535736]

  self[k1] = value[k2]



 12%|███████████▉                                                                                          | 7/60 [05:51<44:06, 49.93s/trial, best loss: 2.1556771200535736]

  self[k1] = value[k2]



 13%|█████████████▌                                                                                        | 8/60 [06:34<41:18, 47.67s/trial, best loss: 1.8909220800119062]

  self[k1] = value[k2]



 15%|███████████████▎                                                                                      | 9/60 [07:19<39:52, 46.91s/trial, best loss: 1.8909220800119062]

  self[k1] = value[k2]



 17%|████████████████▊                                                                                    | 10/60 [08:09<39:51, 47.83s/trial, best loss: 1.8909220800119062]

  self[k1] = value[k2]



 18%|██████████████████▋                                                                                   | 11/60 [08:57<39:00, 47.77s/trial, best loss: 1.890501218245418]

  self[k1] = value[k2]



 20%|████████████████████▍                                                                                 | 12/60 [09:39<36:57, 46.21s/trial, best loss: 1.890501218245418]

  self[k1] = value[k2]



 22%|██████████████████████                                                                                | 13/60 [10:31<37:33, 47.95s/trial, best loss: 1.890501218245418]

  self[k1] = value[k2]



 23%|███████████████████████▊                                                                              | 14/60 [11:25<38:08, 49.74s/trial, best loss: 1.890501218245418]

  self[k1] = value[k2]



 25%|█████████████████████████▌                                                                            | 15/60 [12:14<37:08, 49.53s/trial, best loss: 1.890501218245418]

  self[k1] = value[k2]



 27%|███████████████████████████▏                                                                          | 16/60 [13:08<37:13, 50.76s/trial, best loss: 1.890501218245418]

  self[k1] = value[k2]



 28%|████████████████████████████▉                                                                         | 17/60 [13:57<35:58, 50.21s/trial, best loss: 1.890501218245418]

  self[k1] = value[k2]



 30%|██████████████████████████████▌                                                                       | 18/60 [14:50<35:46, 51.10s/trial, best loss: 1.890501218245418]

  self[k1] = value[k2]



 32%|████████████████████████████████▎                                                                     | 19/60 [15:38<34:18, 50.20s/trial, best loss: 1.890501218245418]

  self[k1] = value[k2]



 33%|██████████████████████████████████                                                                    | 20/60 [16:28<33:21, 50.04s/trial, best loss: 1.890501218245418]

  self[k1] = value[k2]



 35%|███████████████████████████████████▎                                                                 | 21/60 [17:15<31:59, 49.22s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 37%|█████████████████████████████████████                                                                | 22/60 [18:08<31:51, 50.30s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 38%|██████████████████████████████████████▋                                                              | 23/60 [18:59<31:08, 50.51s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 40%|████████████████████████████████████████▍                                                            | 24/60 [19:48<30:01, 50.04s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 42%|██████████████████████████████████████████                                                           | 25/60 [20:42<29:59, 51.43s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 43%|███████████████████████████████████████████▊                                                         | 26/60 [21:27<28:03, 49.53s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 45%|█████████████████████████████████████████████▍                                                       | 27/60 [22:08<25:45, 46.84s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 47%|███████████████████████████████████████████████▏                                                     | 28/60 [23:02<26:09, 49.05s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 48%|████████████████████████████████████████████████▊                                                    | 29/60 [23:43<23:59, 46.42s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 50%|██████████████████████████████████████████████████▌                                                  | 30/60 [24:33<23:53, 47.77s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 52%|████████████████████████████████████████████████████▏                                                | 31/60 [25:27<23:53, 49.43s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 53%|█████████████████████████████████████████████████████▊                                               | 32/60 [26:01<20:59, 44.97s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 55%|███████████████████████████████████████████████████████▌                                             | 33/60 [27:01<22:15, 49.45s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 57%|█████████████████████████████████████████████████████████▏                                           | 34/60 [27:52<21:32, 49.70s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 58%|██████████████████████████████████████████████████████████▉                                          | 35/60 [28:37<20:09, 48.37s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 60%|████████████████████████████████████████████████████████████▌                                        | 36/60 [29:30<19:55, 49.81s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 62%|██████████████████████████████████████████████████████████████▎                                      | 37/60 [30:15<18:30, 48.27s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 63%|███████████████████████████████████████████████████████████████▉                                     | 38/60 [31:01<17:31, 47.81s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 65%|█████████████████████████████████████████████████████████████████▋                                   | 39/60 [31:48<16:34, 47.37s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 67%|███████████████████████████████████████████████████████████████████▎                                 | 40/60 [32:46<16:52, 50.64s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 68%|█████████████████████████████████████████████████████████████████████                                | 41/60 [33:37<16:03, 50.69s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 70%|██████████████████████████████████████████████████████████████████████▋                              | 42/60 [34:13<13:56, 46.48s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 72%|████████████████████████████████████████████████████████████████████████▍                            | 43/60 [34:56<12:48, 45.19s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 73%|██████████████████████████████████████████████████████████████████████████                           | 44/60 [35:40<11:59, 44.95s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 75%|███████████████████████████████████████████████████████████████████████████▊                         | 45/60 [36:25<11:16, 45.08s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 77%|█████████████████████████████████████████████████████████████████████████████▍                       | 46/60 [37:18<11:02, 47.33s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 78%|███████████████████████████████████████████████████████████████████████████████                      | 47/60 [38:09<10:29, 48.42s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 80%|████████████████████████████████████████████████████████████████████████████████▊                    | 48/60 [39:02<09:56, 49.68s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 82%|██████████████████████████████████████████████████████████████████████████████████▍                  | 49/60 [39:50<09:02, 49.33s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 83%|████████████████████████████████████████████████████████████████████████████████████▏                | 50/60 [40:48<08:38, 51.84s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 85%|█████████████████████████████████████████████████████████████████████████████████████▊               | 51/60 [41:27<07:11, 47.99s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 87%|███████████████████████████████████████████████████████████████████████████████████████▌             | 52/60 [42:21<06:39, 49.93s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 88%|█████████████████████████████████████████████████████████████████████████████████████████▏           | 53/60 [43:13<05:52, 50.42s/trial, best loss: 1.8893689741334285]

  self[k1] = value[k2]



 90%|██████████████████████████████████████████████████████████████████████████████████████████▉          | 54/60 [44:02<04:59, 50.00s/trial, best loss: 1.7643679802188594]

  self[k1] = value[k2]



 92%|████████████████████████████████████████████████████████████████████████████████████████████▌        | 55/60 [44:51<04:09, 49.80s/trial, best loss: 1.7643679802188594]

  self[k1] = value[k2]



 93%|██████████████████████████████████████████████████████████████████████████████████████████████▎      | 56/60 [45:47<03:26, 51.61s/trial, best loss: 1.7643679802188594]

  self[k1] = value[k2]



 95%|███████████████████████████████████████████████████████████████████████████████████████████████▉     | 57/60 [46:38<02:34, 51.50s/trial, best loss: 1.7643679802188594]

  self[k1] = value[k2]



 97%|█████████████████████████████████████████████████████████████████████████████████████████████████▋   | 58/60 [47:28<01:41, 50.85s/trial, best loss: 1.7643679802188594]

  self[k1] = value[k2]



 98%|███████████████████████████████████████████████████████████████████████████████████████████████████▎ | 59/60 [48:14<00:49, 49.46s/trial, best loss: 1.7643679802188594]

  self[k1] = value[k2]



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 60/60 [49:02<00:00, 49.03s/trial, best loss: 1.7643679802188594]
{'max_depth': 4, 'max_seq_length': 29, 'n_estimators': 354}
