In [1]:
import pandas as pd
import numpy as np
import scipy

In [2]:
#load dataset

import os.path

datasetdir = os.path.join('.', 'dataset')
filename = 'games.clean.csv'
path = os.path.join(datasetdir, filename)

data = pd.read_csv(path, index_col='id')
data.fillna('', inplace=True)



 N   Column               Non-Null Count   Dtype      Type    Points  
---  ------               --------------   -----      ----    ------
 0   rating               103536 non-null  float64     S
 1   aggregated_rating    103536 non-null  float64     S
 2   follows              103536 non-null  float64     S
 3   game_modes           103536 non-null  object      N        20
 4   genres               93346 non-null   object      N        16
 5   involved_companies   103536 non-null  object      D
 6   keywords             56122 non-null   object      N        12
 7   summary              97520 non-null   object      N        6
 8   storyline            12658 non-null   object      N        4
 9   name                 103536 non-null  object      R
 10  platforms            103536 non-null  object      S
 11  player_perspectives  46982 non-null   object      N        6
 12  themes               63848 non-null   object      N        12
 13  campaigncoop         103536 non-null  bool        N        1
 14  dropin               103536 non-null  bool        N        1
 15  lancoop              103536 non-null  bool        N        1
 16  offlinecoop          103536 non-null  bool        N        1
 17  onlinecoop           103536 non-null  bool        N        1
 18  splitscreen          103536 non-null  bool        N        1
 19  year                 103536 non-null  int64       S

In [3]:
import stopwords
stopw = stopwords.ENGLISH_STOP_WORDS

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
    
vectorizer = CountVectorizer(stop_words=stopw)
keywordsbag = vectorizer.fit_transform(data['keywords'])
sum_words = keywordsbag.sum(axis=0)

words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)

words_freq

[('game', 23117),
 ('steam', 9036),
 ('adventure', 7206),
 ('protagonist', 6747),
 ('digital', 6505),
 ('distribution', 5996),
 ('character', 5807),
 ('puzzle', 5730),
 ('play', 5622),
 ('fantasy', 5387),
 ('shooter', 5371),
 ('fi', 4903),
 ('achievement', 4794),
 ('sci', 4761),
 ('base', 4749),
 ('person', 4688),
 ('strategy', 4597),
 ('gun', 4519),
 ('combat', 4488),
 ('time', 4342),
 ('anime', 4304),
 ('e3', 4201),
 ('playstation', 4161),
 ('art', 4029),
 ('level', 3918),
 ('simulation', 3814),
 ('bos', 3722),
 ('support', 3669),
 ('based', 3658),
 ('platformer', 3579),
 ('role', 3530),
 ('weapon', 3522),
 ('3d', 3504),
 ('jump', 3503),
 ('player', 3476),
 ('world', 3351),
 ('fight', 3252),
 ('indie', 3225),
 ('perspective', 3188),
 ('sport', 3154),
 ('card', 3134),
 ('pax', 3117),
 ('title', 3082),
 ('action', 3036),
 ('health', 3000),
 ('multiple', 2948),
 ('screen', 2857),
 ('license', 2829),
 ('race', 2804),
 ('war', 2756),
 ('xbox', 2747),
 ('real', 2725),
 ('point', 2694),
 ('

In [5]:
for e in ['game', 'steam', 'protagonist','digital','distribution', 'character', 'play']:
    stopw.add(e)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
    
vectorizer = CountVectorizer(stop_words=stopw)
summarybag = vectorizer.fit_transform(data['summary'])
sum_words = summarybag.sum(axis=0)

words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)

words_freq

[('player', 30742),
 ('world', 23047),
 ('new', 18095),
 ('time', 14989),
 ('use', 14128),
 ('level', 13763),
 ('puzzle', 12136),
 ('action', 12078),
 ('adventure', 11938),
 ('make', 11733),
 ('enemy', 10690),
 ('battle', 10220),
 ('release', 10219),
 ('way', 10086),
 ('mode', 9778),
 ('different', 9308),
 ('series', 9207),
 ('feature', 9179),
 ('control', 8993),
 ('fight', 8968),
 ('story', 8805),
 ('like', 8252),
 ('set', 7899),
 ('challenge', 7795),
 ('experience', 7517),
 ('include', 7471),
 ('friend', 7356),
 ('base', 7336),
 ('life', 6946),
 ('power', 6707),
 ('explore', 6634),
 ('help', 6621),
 ('create', 6617),
 ('unique', 6356),
 ('race', 6288),
 ('weapon', 6224),
 ('gameplay', 6128),
 ('develop', 6077),
 ('team', 6067),
 ('arcade', 5769),
 ('space', 5552),
 ('build', 5524),
 ('turn', 5397),
 ('collect', 5373),
 ('skill', 5289),
 ('place', 5212),
 ('try', 5205),
 ('style', 5201),
 ('need', 5069),
 ('fun', 5015),
 ('city', 4964),
 ('choose', 4942),
 ('real', 4873),
 ('save', 48

In [7]:
for e in ['player', 'new', 'world', 'use', 'level', 'gameplay']:
    stopw.add(e)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
    
vectorizer = CountVectorizer(stop_words=stopw)
storylinebag = vectorizer.fit_transform(data['storyline'])
sum_words = storylinebag.sum(axis=0)

words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)

words_freq

[('time', 3696),
 ('make', 2734),
 ('way', 2614),
 ('life', 2399),
 ('year', 2380),
 ('story', 2376),
 ('power', 2216),
 ('help', 2087),
 ('city', 2055),
 ('know', 1996),
 ('fight', 1981),
 ('day', 1851),
 ('place', 1793),
 ('friend', 1792),
 ('come', 1745),
 ('force', 1694),
 ('set', 1663),
 ('battle', 1646),
 ('begin', 1640),
 ('war', 1604),
 ('end', 1556),
 ('people', 1554),
 ('human', 1504),
 ('like', 1455),
 ('save', 1450),
 ('leave', 1447),
 ('land', 1435),
 ('planet', 1421),
 ('defeat', 1421),
 ('discover', 1414),
 ('enemy', 1397),
 ('start', 1377),
 ('order', 1370),
 ('mysterious', 1340),
 ('adventure', 1338),
 ('turn', 1333),
 ('try', 1311),
 ('control', 1309),
 ('destroy', 1294),
 ('evil', 1284),
 ('earth', 1276),
 ('escape', 1271),
 ('attack', 1228),
 ('need', 1226),
 ('create', 1218),
 ('island', 1203),
 ('hero', 1197),
 ('different', 1193),
 ('live', 1182),
 ('girl', 1177),
 ('home', 1165),
 ('lead', 1161),
 ('great', 1146),
 ('meet', 1135),
 ('dark', 1128),
 ('monster', 1

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

tovectorize = ['game_modes',
               'genres',
               'keywords',
               'summary',
               'storyline',
               'player_perspectives',
               'themes']
data['player_perspectives'] = '0' + data['player_perspectives']
data['game_modes'] = '0' + data['game_modes']

vectorizers = {feature: CountVectorizer(stop_words=stopw) for feature in tovectorize}
bagsofwords = {feature: vectorizers[feature].fit_transform(data[feature]) for feature in tovectorize}


In [10]:
bagsofwords

{'game_modes': <103536x6 sparse matrix of type '<class 'numpy.int64'>'
 	with 103536 stored elements in Compressed Sparse Row format>,
 'genres': <103536x17 sparse matrix of type '<class 'numpy.int64'>'
 	with 147558 stored elements in Compressed Sparse Row format>,
 'keywords': <103536x17719 sparse matrix of type '<class 'numpy.int64'>'
 	with 885859 stored elements in Compressed Sparse Row format>,
 'summary': <103536x94057 sparse matrix of type '<class 'numpy.int64'>'
 	with 2481512 stored elements in Compressed Sparse Row format>,
 'storyline': <103536x42608 sparse matrix of type '<class 'numpy.int64'>'
 	with 546999 stored elements in Compressed Sparse Row format>,
 'player_perspectives': <103536x7 sparse matrix of type '<class 'numpy.int64'>'
 	with 46982 stored elements in Compressed Sparse Row format>,
 'themes': <103536x21 sparse matrix of type '<class 'numpy.int64'>'
 	with 56647 stored elements in Compressed Sparse Row format>}

In [11]:
tosparse = ['campaigncoop',
            'dropin',
            'lancoop',
            'offlinecoop',
            'onlinecoop',
            'splitscreen']

bagsofwords['multiplayermodes'] = scipy.sparse.csr_matrix(data[tosparse].astype(np.int64).values)

In [12]:
weights = {'game_modes': 20,
           'genres': 16,
           'keywords':12,
           'summary':6,
           'storyline':4,
           'player_perspectives':6,
           'themes':12,
           'multiplayermodes':1}

for i in bagsofwords:
    bagsofwords[i] = bagsofwords[i].multiply(weights[i])

X = scipy.sparse.hstack(list(bagsofwords.values()))

In [13]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(X)

In [27]:
print(data.iloc[82045])
distances, indices = model_knn.kneighbors(X.getrow(82045), n_neighbors = 100)
data.iloc[indices[0]][data.iloc[indices[0]]['platforms'] == '6']

rating                                                              70.0
aggregated_rating                                                   70.0
follows                                                              0.0
game_modes                                                            01
genres                                                            8 9 31
involved_companies                                                     1
keywords                               metroidvania side-scrolling mayan
summary                Balam and the Spirit Within be an action-adven...
storyline                                                               
name                                         Balam and the Spirit Within
platforms                                                              6
player_perspectives                                                   04
themes                                                                 1
campaigncoop                                       

Unnamed: 0_level_0,rating,aggregated_rating,follows,game_modes,genres,involved_companies,keywords,summary,storyline,name,platforms,player_perspectives,themes,campaigncoop,dropin,lancoop,offlinecoop,onlinecoop,splitscreen,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
201325,70.0,70.0,0.0,1,8 9 31,1,metroidvania side-scrolling mayan,Balam and the Spirit Within be an action-adven...,,Balam and the Spirit Within,6,4,1.0,False,False,False,False,False,False,2020
129214,70.0,70.0,0.0,1,31,1,,An otherworldly adventure in the heart of the ...,,Secret City: Chalk of Fate - Collector's Edition,6,0,,False,False,False,False,False,False,2020
33180,70.0,70.0,0.0,1,31,49087,,The final adventure conclude here,,Awakening: The Redleaf Forest - Collector's Ed...,6,0,,False,False,False,False,False,False,2016
58038,70.0,70.0,0.0,1,31,1,,An adventure hidden object game,,Alice: Behind the Mirror,6,0,,False,False,False,False,False,False,2018
93035,70.0,70.0,0.0,1,31,122262,,,,Maniac Jackson and the Moonwalking Mindbenders,6,4,,False,False,False,False,False,False,2003
32611,70.0,70.0,0.0,1,31,1,,Action adventure and exploration Descend into ...,,Cavernus,6,0,1.0,False,False,False,False,False,False,2016
128019,70.0,70.0,0.0,1,31,1,,vacation to India become legendary adventure i...,,Labyrinths of the World: The Wild Side - Colle...,6,0,,False,False,False,False,False,False,2020
53958,70.0,70.0,0.0,1,31,81809 81810 81811,,Explore the ancient world of Egypt in this mys...,,Diamon Jones Eye of the Dragon,6,4,1.0,False,False,False,False,False,False,2009
189032,70.0,70.0,0.0,1,31,1,,2D adventure game with trap to keep you on you...,,East Trapper,6,0,,False,False,False,False,False,False,2021
114060,70.0,70.0,0.0,1,31,1,,One adventure two story Play a one or both to ...,,HeartandAxe,6,0,,False,False,False,False,False,False,2019
