In [1]:
import pandas as pd
import numpy as np
import warnings

# Import stopwords with nltk.
from nltk.corpus import stopwords
# English stop-word list; the cleaning cell drops every description token found in it.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be installed
# locally (e.g. via nltk.download('stopwords')) - confirm for fresh environments.
stop = stopwords.words('english')

#tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#cos-simul
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances

In [2]:
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

## Locations of the two input CSV files.
steam_games_meta_path = "new_data/steam.csv"
steam_games_description_path = "new_data/steam_description_data.csv"

## Raw data frames.
steam_games_meta_df = pd.read_csv(steam_games_meta_path)

# The description file names its id column "steam_appid"; rename it to
# "appid" so both frames share the same join key.
steam_games_description_df = pd.read_csv(steam_games_description_path)
steam_games_description_df = steam_games_description_df.rename(
    columns={"steam_appid": "appid"}
)

# Inner-join on appid, then keep only the id, the title and the long description.
merge = steam_games_meta_df.merge(
    steam_games_description_df,
    how="inner",
    on="appid",
).loc[:, ['appid', 'name', 'detailed_description']]

In [3]:
# Preview the first five rows of the game metadata.
steam_games_meta_df.head(n=5)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [4]:
# Preview the first five rows of the description data (id column already renamed to appid).
steam_games_description_df.head(n=5)

Unnamed: 0,appid,detailed_description,about_the_game,short_description
0,10,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...
1,20,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...
2,30,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...
3,40,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...
4,50,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...


In [5]:
# Preview the first five rows of the joined frame (appid, name, detailed_description).
merge.head(n=5)

Unnamed: 0,appid,name,detailed_description
0,10,Counter-Strike,Play the world's number 1 online action game. ...
1,20,Team Fortress Classic,One of the most popular online action games of...
2,30,Day of Defeat,Enlist in an intense brand of Axis vs. Allied ...
3,40,Deathmatch Classic,Enjoy fast-paced multiplayer gaming with Death...
4,50,Half-Life: Opposing Force,Return to the Black Mesa Research Facility as ...


In [6]:
#clean the data
# A set gives O(1) membership checks; testing every token against the original
# list rescans the whole stop-word list each time.
stop_set = set(stop)


def _remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop_set` removed."""
    return ' '.join(word for word in text.split() if word not in stop_set)


# Missing descriptions are read from CSV as NaN (a float), on which
# `x.strip()` raises AttributeError. Replace them with an empty string first so
# the whole column can be normalised, then trim, lower-case and drop stop words
# in a single pass.
merge['detailed_description'] = (
    merge['detailed_description']
    .fillna('')
    .astype(str)
    .str.strip()
    .str.lower()
    .apply(_remove_stopwords)
)
merge.head()

Unnamed: 0,appid,name,detailed_description
0,10,Counter-Strike,play world's number 1 online action game. enga...
1,20,Team Fortress Classic,"one popular online action games time, team for..."
2,30,Day of Defeat,enlist intense brand axis vs. allied teamplay ...
3,40,Deathmatch Classic,enjoy fast-paced multiplayer gaming deathmatch...
4,50,Half-Life: Opposing Force,return black mesa research facility one milita...


In [7]:
#tf-idf
# Fit the vocabulary on the cleaned descriptions and encode them in one step.
# The result is a sparse matrix with one row per row of `merge`.
vectorizer = TfidfVectorizer()
tf_idf_csr_metrix = vectorizer.fit_transform(
    raw_documents=merge['detailed_description']
)
print(tf_idf_csr_metrix)

  (0, 108809)	0.32451167878064435
  (0, 23431)	0.39873042580564305
  (0, 96691)	0.24900567277735502
  (0, 63705)	0.23576963714836302
  (0, 95080)	0.15272189242520404
  (0, 102221)	0.20860476747771353
  (0, 49828)	0.10285135188653086
  (0, 110495)	0.07978181762987942
  (0, 78889)	0.12313720472985915
  (0, 108167)	0.14101898406116375
  (0, 39283)	0.1031123881198517
  (0, 111130)	0.19896542422609478
  (0, 24484)	0.2008044158091303
  (0, 29621)	0.08798121664483846
  (0, 111102)	0.3184618005506822
  (0, 89706)	0.15335269513390998
  (0, 121372)	0.2267051138182715
  (0, 120133)	0.1721921014168428
  (0, 111680)	0.21744832445410314
  (0, 33129)	0.14836511688198753
  (0, 93479)	0.12898090305265622
  (0, 65850)	0.18382589710276703
  (0, 49897)	0.1572516480757748
  (0, 56536)	0.08370961052161989
  (0, 22807)	0.08817285875146137
  :	:
  (27074, 58240)	0.05881696381313012
  (27074, 54021)	0.06773131952705896
  (27074, 22794)	0.05560717408499956
  (27074, 81838)	0.047139251496051214
  (27074, 38666)	

In [8]:
#cos-sim
# Pairwise cosine similarity between every pair of TF-IDF rows; entry [i, j]
# compares game i with game j, so the matrix is square and symmetric.
# NOTE(review): the result is dense. With the ~27k rows shown in the TF-IDF
# output this is presumably several GB of float64 - confirm it fits in memory.
cos_sim = cosine_similarity(X=tf_idf_csr_metrix)
print(cos_sim)

[[1.         0.19036726 0.06563372 ... 0.00658977 0.04533405 0.01946187]
 [0.19036726 1.         0.02490299 ... 0.01528716 0.00572197 0.0153241 ]
 [0.06563372 0.02490299 1.         ... 0.02577979 0.02460648 0.00383763]
 ...
 [0.00658977 0.01528716 0.02577979 ... 1.         0.1103611  0.08589958]
 [0.04533405 0.00572197 0.02460648 ... 0.1103611  1.         0.14449795]
 [0.01946187 0.0153241  0.00383763 ... 0.08589958 0.14449795 1.        ]]


In [31]:
def name_to_id(name):
    """Return the appid of the first game in `merge` whose name equals `name`.

    Parameters
    ----------
    name : str
        Exact (case-sensitive) game title as stored in ``merge['name']``.

    Returns
    -------
    The appid of the first matching row, as a plain Python scalar.

    Raises
    ------
    IndexError
        If no row of `merge` has that name. The exception type matches what the
        previous implementation raised, but the message now names the title.
    """
    # Select by boolean mask directly. The previous version took an index
    # *label* and passed it to `iloc`, which is only correct while `merge`
    # carries a default 0..n-1 index.
    matching_ids = merge.loc[merge['name'] == name, 'appid'].tolist()
    if not matching_ids:
        raise IndexError(f"no game named {name!r} found in the merged data")
    return matching_ids[0]

def top_similar(name, limit=20):
    """Return the names of the games whose descriptions are most similar to `name`.

    Parameters
    ----------
    name : str
        Exact game title; resolved to an appid with `name_to_id`.
    limit : int, optional
        Maximum number of names to return (default 20, the previously
        hard-coded value). Values <= 0 yield an empty list.

    Returns
    -------
    list of str
        Up to `limit` game names ordered by descending cosine similarity,
        never including the query game itself.

    Raises
    ------
    IndexError
        Propagated from `name_to_id` when `name` is not present in `merge`.
    """
    appid = name_to_id(name)
    if limit <= 0:
        return []

    # Positional row of the query game. `cos_sim` is indexed by row position,
    # not by index label, so look the position up explicitly.
    position = int(np.flatnonzero((merge['appid'] == appid).to_numpy())[0])
    scores = cos_sim[position]

    # Highest similarity first.
    ranked = np.argsort(scores)[::-1]

    # Drop the query game by position. Popping the first element is unsafe:
    # another game with an identical description also scores 1.0 and may sort
    # ahead of the query, in which case a real neighbour would be discarded
    # and the query itself returned.
    ranked = ranked[ranked != position][:limit]

    # Resolve names only for the rows that are returned, not for every game.
    return merge['name'].iloc[ranked].tolist()

In [33]:
# Show the titles most similar to "Left 4 Dead".
print(top_similar(name='Left 4 Dead'))

['Krystal the Adventurer', 'Left 4 Dead 2', 'Zero-G', 'Slashers: The Power Battle', 'The Path of Greatest Resistance', 'Arcana Heart 3 LOVE MAX!!!!!', 'Prismata', 'GUILTY GEAR Xrd -REVELATOR-', 'Hive', 'Machines At War 3', 'Electronic Super Joy', 'Think To Die', 'Color Party', 'Bob The Cube', 'Magic Tower', 'Non-Linear Text Quests', 'HotHead', 'Splody', 'Splendor', 'Shoot Mania VR: Fun Zombies']
