In [1]:
import pandas as pd
import numpy as np

In [2]:
#load dataset

import os.path

datasetdir = os.path.join('.', 'dataset')
filename = 'games.csv'
path = os.path.join(datasetdir, filename)

data = pd.read_csv(path, index_col='id')

In [3]:
rmed = data['rating'].median()
armed = data['aggregated_rating'].median()
for idx in data.index:
    if np.isnan(data.loc[idx, 'rating']) and np.isnan(data.loc[idx, 'aggregated_rating']):
        data.loc[idx, 'rating'] = rmed
        data.loc[idx, 'aggregated_rating'] = armed
    elif np.isnan(data.loc[idx, 'rating']):
        data.loc[idx, 'rating'] = data.loc[idx, 'aggregated_rating']
    elif np.isnan(data.loc[idx, 'aggregated_rating']):
        data.loc[idx, 'aggregated_rating'] = data.loc[idx, 'rating']
    else:
        data.loc[idx, 'rating'] = rmed
        data.loc[idx, 'aggregated_rating'] = armed
        

data['rating_count'].fillna(value=0, inplace=True)
data['follows'].fillna(value=0, inplace=True)
data['game_modes'].fillna(value='1', inplace=True)


data['keywords'].fillna(value='', inplace=True)
data['summary'].fillna(value='', inplace=True)
data['storyline'].fillna(value='', inplace=True)
for idx in data.index:
    if (data.loc[idx, 'summary'] != '') and (data.loc[idx, 'storyline'] != ''):
        data.loc[idx, 'storyline'] = data.loc[idx, 'storyline'] + ' ' + data.loc[idx, 'summary']
    if (data.loc[idx, 'summary'] != '') and (data.loc[idx, 'storyline'] == ''):
        data.loc[idx, 'storyline'] = data.loc[idx, 'summary']


data['year'] = pd.to_datetime(data['first_release_date'], unit='s').dt.year
data['player_perspectives'].fillna(value='7', inplace=True)

data['campaigncoop'].fillna(value=False, inplace=True)
data['dropin'].fillna(value=False, inplace=True)
data['lancoop'].fillna(value=False, inplace=True)
data['offlinecoop'].fillna(value=False, inplace=True)
data['onlinecoop'].fillna(value=False, inplace=True)
data['splitscreen'].fillna(value=False, inplace=True)

data.drop(['category', 'multiplayer_modes', 'similar_games', 'offlinecoopmax', 'rating_count', 'summary', 
           'offlinemax',  'onlinecoopmax', 'onlinemax', 'platform', 'first_release_date', 'involved_companies'], axis=1, inplace=True)

In [4]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tony\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Tony\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tony\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tony\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def lemmatizer(text):
    wl = WordNetLemmatizer()
    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    lemmatized_sentence = []
    tokenizer = RegexpTokenizer(r'\w+[-]?\w+')
    words = tokenizer.tokenize(text)
    word_pos_tags = nltk.pos_tag(words)

    for idx, tag in enumerate(word_pos_tags):
        lemmatized_sentence.append(wl.lemmatize(tag[0], get_wordnet_pos(tag[1])))
    return " ".join(lemmatized_sentence)

In [6]:
data['keywords'] = data['keywords'].apply(lambda x: x if x == '' else lemmatizer(x).lower())
data['storyline'] = data['storyline'].apply(lambda x: x if x == '' else lemmatizer(x).lower())

In [7]:
import stopwords
stopw = stopwords.ENGLISH_STOP_WORDS

keywords_dict = dict()
for s in data['keywords'].dropna().values:
    for w in s.split():
        if w in keywords_dict:
            keywords_dict[w] += 1
        else:
            keywords_dict[w] = 1

keywords_dict = {k: keywords_dict[k] for k in keywords_dict if (keywords_dict[k] > 30) and (k not in stopw)}
keywords_set = set(keywords_dict.keys())

In [8]:
for idx in data.index:
    if data.loc[idx, 'keywords'] == '':
        keywords = []
        for k in data.loc[idx, 'storyline'].split():
            if k in keywords_set:
                keywords.append(k)
        data.loc[idx, 'keywords'] = ' '.join(keywords)
    if data.loc[idx, 'storyline'] == '':
        data.loc[idx, 'storyline'] = str(idx)

In [9]:
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
import scipy
from random import choices

X, y = data[~pd.isna(data['themes'])][['keywords', 'storyline']], data[~pd.isna(data['themes'])]['themes']

y = [[int(j) for j in i.split()] for i in y]
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
print(y.shape)
labels = [i[0] for i in sorted(enumerate(y.sum(axis=0)), key=lambda x: x[1] )] # indexes of class samples in descending order of quantity
y = pd.DataFrame(y, index=X.index)
y_buf = pd.DataFrame(columns=y.columns)


for l in labels:
    k = 15000 - int(y_buf.sum()[l])
    if k > 0:
        y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )

y = y_buf.astype(np.int).values       
X = X.loc[y_buf.index]

(63848, 22)


  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] == 1].index), k=k) ] )
  y_buf = y_buf.append(y.loc[choices( list(y[y[l] =

In [None]:
vectorizer_key = CountVectorizer(stop_words=stopw)
vectorizer_story = CountVectorizer(stop_words=stopw)
X_key = vectorizer_key.fit_transform(X['keywords'])
X_story = vectorizer_story.fit_transform(X['storyline'])
X = scipy.sparse.hstack((X_key, X_story))

clf = OneVsRestClassifier(XGBClassifier(n_jobs=-1, max_depth=5,  n_estimators=100))
clf.fit(X, y)

In [None]:
emptythemes = pd.isna(data['themes'])
X_key = vectorizer_key.transform(data[emptythemes]['keywords'])
X_story = vectorizer_story.transform(data[emptythemes]['storyline'])
X = scipy.sparse.hstack((X_key, X_story))
predictionthemes = mlb.inverse_transform(clf.predict(X))

In [None]:
for idx, value in zip(emptythemes.index, predictionthemes):
    data.loc[idx, 'themes'] = ' '.join([str(i) for i in value])

In [None]:
predictionthemes

In [None]:
data['themes'].fillna(value='', inplace=True)
themes = {'thriller': '20',
          'sci-fi': '18',
          'action': '1',
          'horror': '19',
          'survival': '21',
          'fantasy': '17',
          'historical': '22',
          'stealth': '23',
          'comedy': '27',
          'business': '28',
          'drama': '31',
          'non-fiction': '32',
          'kids': '35',
          'sandbox': '33',
          'open world': '38',
          'warfare': '39',
          '4x': '41',
          'educational': '34',
          'mystery': '43',
          'party': '40',
          'romance': '44',
          'erotic': '42'}

data['genres'].fillna(value='', inplace=True)
genres = {'fighting': '4',
          'shooter': '5',
          'music': '7',
          'platform': '8',
          'puzzle': '9',
          'racing': '10',
          'real time strategy': '11',
          'rts': '11',
          'role-playing': '12',
          'rpg': '12',
          'simulator': '13',
          'sport': '14',
          'strategy': '15',
          'turn-based strategy': '16',
          'turn based strategy': '16',
          'tbs': '16',
          'tactical': '24',
          'quiz': '26',
          'trivia': '26',
          'hack and slash': '25',
          'hack slash': '25',
          'beat em up': '25',
          'pinball': '30',
          'adventure': '31',
          'arcade': '33',
          'visual novel': '34',
          'indie': '32',
          'card game': '35',
          'board game': '35',
          'moba': '36',
          'point and click': '2',
          'point click': '2'}

In [None]:
import os.path

datasetdir = os.path.join('.', 'dataset')
filename = 'games.clean.csv'
path = os.path.join(datasetdir, filename)
data.to_csv(path)

In [None]:
pcgames = data[data['platforms'].apply(str.split).apply(lambda x: [int(i) for i in x]).apply(lambda x: 6 in x)]
pcgames.loc[[pcgames[(pcgames['year'] == i)]['follows'].idxmax() for i in range(1990, 2023)]]

In [None]:
data['themes'][100:140]