In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import hstack


pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [3]:
# Importing the files you want
united_df = pd.read_csv('/kaggle/input/rs-final/clean_data.csv')
# united_df = import_data('/kaggle/input/rs-final/clean_data.csv')

In [4]:
united_df.head()

Unnamed: 0,name,steam_appid,required_age,is_free,about_the_game,short_description,developers,platforms,release_date,positive,negative,owners,average_forever,median_forever,languages,tags
0,Counter-Strike,10,0.0,False,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,['Valve'],"{'windows': True, 'mac': True, 'linux': True}","{'coming_soon': False, 'date': '1 Nov, 2000'}",124534,3339,10000000-20000000,17612,317,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
1,Team Fortress Classic,20,0.0,False,One of the most popular online action games of...,One of the most popular online action games of...,['Valve'],"{'windows': True, 'mac': True, 'linux': True}","{'coming_soon': False, 'date': '1 Apr, 1999'}",3318,633,5000000-10000000,277,62,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
2,Day of Defeat,30,0.0,False,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,['Valve'],"{'windows': True, 'mac': True, 'linux': True}","{'coming_soon': False, 'date': '1 May, 2003'}",3416,398,5000000-10000000,187,34,"English, French, German, Italian, Spanish - Spain",FPS;World War II;Multiplayer
3,Deathmatch Classic,40,0.0,False,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,['Valve'],"{'windows': True, 'mac': True, 'linux': True}","{'coming_soon': False, 'date': '1 Jun, 2001'}",1273,267,5000000-10000000,258,184,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
4,Half-Life: Opposing Force,50,0.0,False,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,['Gearbox Software'],"{'windows': True, 'mac': True, 'linux': True}","{'coming_soon': False, 'date': '1 Nov, 1999'}",5250,288,5000000-10000000,624,415,"English, French, German, Korean",FPS;Action;Sci-fi


In [5]:
# Separating tags with a space
def globalization(strok:str):
    return strok.replace(';', ' ')

# Removing duplicate tags
def del_rep(strok:str):
    spl_strok = strok.split()
    return ' '.join(sorted(set(spl_strok), key=spl_strok.index))

# Converting to lower case
def to_low(strok:str):
    return strok.lower()

In [6]:
united_df.developers = united_df.developers.str[2:-2]

In [7]:
# Separating tags with a space
united_df['tags'] = united_df['tags'].apply(globalization)
united_df['tags'] = united_df['tags'].apply(del_rep)

# Convert strings to lowercase and concatenate into one column
cols = ['developers', 'tags']

united_df['alls'] = united_df['name'].copy().apply(to_low)

for i in cols:
  united_df['alls'] += ' ' + united_df[i].apply(to_low)

In [8]:
# How it looks
united_df['alls']

0              counter-strike valve action fps multiplayer
1        team fortress classic valve action fps multipl...
2         day of defeat valve fps world war ii multiplayer
3          deathmatch classic valve action fps multiplayer
4        half-life: opposing force gearbox software fps...
                               ...                        
27793    room of pandora shen jiawei adventure indie ca...
27794      cyber gun semyon maximov action indie adventure
27795     super star blast entwicklerx action indie casual
27796    new yankee 7: deer hunters yustas game studio ...
27797    rune lord adept studios gd indie casual adventure
Name: alls, Length: 27798, dtype: object

In [9]:
# Convert a collection of text documents to a matrix of token counts
count_vec = CountVectorizer(stop_words='english')
count_matrix = count_vec.fit_transform(united_df['alls'])

# Compute cosine similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [10]:
import string # used for preprocessing
import re # used for preprocessing
import nltk # the Natural Language Toolkit, used for preprocessing
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords # used for preprocessing
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [12]:
# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
def remove_urls(text):
    new_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())
    return new_text
# make all text lowercase
def text_lowercase(text):
    return text.lower()
# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result
# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
# tokenize
def tokenize(text):
    text = word_tokenize(text)
    return text
# remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    text = [i for i in text if not i in stop_words]
    return text
# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    text = [lemmatizer.lemmatize(token) for token in text]
    return text

def preprocessing(text):
    text = text_lowercase(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    text = ' '.join(text)
    return text

In [13]:
united_df['short_description']=united_df['short_description'].apply(preprocessing)

In [14]:
united_df['short_description'].head()

0    play world number online action game engage in...
1    one popular online action game time team fortr...
2    enlist intense brand axis v allied teamplay se...
3    enjoy fast paced multiplayer gaming deathmatch...
4    return black mesa research facility one milita...
Name: short_description, dtype: object

In [15]:
# Convert a collection of raw documents to a matrix of TF-IDF features
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(united_df['short_description'])

# Stack sparse matrices horizontally
st = hstack([count_matrix, tfidf_matrix])

# Compute cosine similarity
cosine_sim2 = cosine_similarity(st, st)

In [16]:
# The main function that makes recommendations
def get_rec(nam:str, cosine):
#     Determine the index
    ind = united_df[united_df['name'] == nam].index.to_list()[0]
    
#     Obtaining cosine convergence by index
    cos_scor = list(enumerate(cosine[ind]))
    
#     Getting the most suitable games
    cos_scor = sorted(cos_scor, key=lambda x: x[1], reverse=True)
    cos_scor = cos_scor[1:11]
    ten_ind = [i[0] for i in cos_scor]
    return united_df['name'].iloc[ten_ind]

In [17]:
# Example 1
naz = "S.T.A.L.K.E.R.: Shadow of Chernobyl"
print('Recommendation for', naz, '\n')
print('Recommendations using game name, developer, genre and tags \n')
print(get_rec(naz, cosine_sim),'\n')
print(print('Recommendations using game name, developer, genre, tags and description \n'))
print(get_rec(naz, cosine_sim2))

Recommendation for S.T.A.L.K.E.R.: Shadow of Chernobyl 

Recommendations using game name, developer, genre and tags 

846          S.T.A.L.K.E.R.: Call of Pripyat
471                S.T.A.L.K.E.R.: Clear Sky
509                                Fallout 3
6209                               Fallout 4
514      Fallout 3: Game of the Year Edition
17577      BLOCK WARRIORS: "Open World" Game
2961      Hard Truck Apocalypse / Ex Machina
7196                    World Ship Simulator
14447                           Fallout 4 VR
515                       Fallout: New Vegas
Name: name, dtype: object 

Recommendations using game name, developer, genre, tags and description 

None
846          S.T.A.L.K.E.R.: Call of Pripyat
471                S.T.A.L.K.E.R.: Clear Sky
509                                Fallout 3
6209                               Fallout 4
514      Fallout 3: Game of the Year Edition
17577      BLOCK WARRIORS: "Open World" Game
7196                    World Ship Simulator
14447     

In [18]:
# Example 2
naz = "Call of Duty® 4: Modern Warfare®"
print('Recommendation for', naz, '\n')
print('Recommendations using game name, developer, genre and tags \n')
print(get_rec(naz, cosine_sim),'\n')
print(print('Recommendations using game name, developer, genre, tags and description \n'))
print(get_rec(naz, cosine_sim2))

Recommendation for Call of Duty® 4: Modern Warfare® 

Recommendations using game name, developer, genre and tags 

279              Call of Duty®: Modern Warfare® 2
3176              Call of Duty®: Infinite Warfare
72                                Call of Duty® 2
6802    Call of Duty®: Modern Warfare® Remastered
1432                        Call of Duty®: Ghosts
71                                  Call of Duty®
1518        Call of Duty: Black Ops - Mac Edition
3702                 Call of Duty®: Black Ops III
1304                  Call of Duty®: Black Ops II
5                                        Ricochet
Name: name, dtype: object 

Recommendations using game name, developer, genre, tags and description 

None
279              Call of Duty®: Modern Warfare® 2
3176              Call of Duty®: Infinite Warfare
72                                Call of Duty® 2
6802    Call of Duty®: Modern Warfare® Remastered
1432                        Call of Duty®: Ghosts
71                          

In [19]:
# Example 3
naz = "Tropico 5"
print('Recommendation for', naz, '\n')
print('Recommendations using game name, developer, genre and tags \n')
print(get_rec(naz, cosine_sim),'\n')
print(print('Recommendations using game name, developer, genre, tags and description \n'))
print(get_rec(naz, cosine_sim2))

Recommendation for Tropico 5 

Recommendations using game name, developer, genre and tags 

1010                         Tropico 4
2007                         Tropico 5
528                   Grand Ages: Rome
526      Imperium Romanum Gold Edition
11086                        Caesar™ 3
13365                      Constructor
1416        Omerta - City of Gangsters
5224                      Urban Empire
9352                    Surviving Mars
12795              Pharaoh + Cleopatra
Name: name, dtype: object 

Recommendations using game name, developer, genre, tags and description 

None
1010                         Tropico 4
530                          Tropico 3
528                   Grand Ages: Rome
526      Imperium Romanum Gold Edition
11086                        Caesar™ 3
677                   Tropico Reloaded
9352                    Surviving Mars
5224                      Urban Empire
12795              Pharaoh + Cleopatra
1416        Omerta - City of Gangsters
Name: name, dtype: ob

In [20]:
united_df['alls'] = united_df['alls'] + " " + united_df['short_description']

In [21]:
united_df['alls'][0]

'counter-strike valve action fps multiplayer play world number online action game engage incredibly realistic brand terrorist warfare wildly popular team based game ally teammate complete strategic mission take enemy site rescue hostage role affect team success team success affect role'

In [22]:
united_df['short_description'][0]

'play world number online action game engage incredibly realistic brand terrorist warfare wildly popular team based game ally teammate complete strategic mission take enemy site rescue hostage role affect team success team success affect role'

In [23]:
corpus = []
for words in united_df['short_description']:
    corpus.append(words.split())

In [24]:
from gensim.models import Word2Vec

In [25]:
# EMBEDDING_FILE = "/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin"
# google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

google_model = Word2Vec(vector_size = 300, window=5, min_count = 2, workers = -1)
google_model.build_vocab(corpus)

# google_model.wv.intersect_word2vec_format(EMBEDDING_FILE, lockf=1, binary=True)

google_model.train(corpus, total_examples=google_model.corpus_count, epochs = 5)

(0, 0)

In [26]:
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df = 5, stop_words='english')
tfidf.fit(united_df['short_description'])

tfidf_list = dict(zip(tfidf.get_feature_names_out(), list(tfidf.idf_)))
tfidf_feature = tfidf.get_feature_names_out()

In [27]:
from tqdm import tqdm

In [28]:
tfidf_vectors = []; 
line = 0;
for desc in tqdm(corpus): 
    sent_vec = np.zeros(300) 
    weight_sum =0; 
    for word in desc: 
        if word in google_model.wv.key_to_index and word in tfidf_feature:
            vec = google_model.wv[word]
            tf_idf = tfidf_list[word] * (desc.count(word) / len(desc))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_vectors.append(sent_vec)
    line += 1

100%|██████████| 27798/27798 [03:55<00:00, 118.25it/s]


In [29]:
cosine_similarities = cosine_similarity(tfidf_vectors,  tfidf_vectors)

In [30]:
naz = "Counter-Strike"
print('Recommendation for', naz, '\n')
print('Recommendations using game name, developer, genre and tags \n')
print(get_rec(naz, cosine_similarities),'\n')

Recommendation for Counter-Strike 

Recommendations using game name, developer, genre and tags 

14937                                    RUGBY 18
15099           NARUTO TO BORUTO: SHINOBI STRIKER
17649                                      Astroe
23705                 Pro Basketball Manager 2019
23477                               Under The War
13530                              Operation swat
12048                 Pro Basketball Manager 2017
15863                                  Jammerball
9519                                  Legion TD 2
10461    Airport Fire Department - The Simulation
Name: name, dtype: object 



In [31]:
naz = "Counter-Strike"
print('Recommendation for', naz, '\n')
print('Recommendations using game name, developer, genre and tags \n')
print(get_rec(naz, cosine_similarities),'\n')

Recommendation for Counter-Strike 

Recommendations using game name, developer, genre and tags 

14937                                    RUGBY 18
15099           NARUTO TO BORUTO: SHINOBI STRIKER
17649                                      Astroe
23705                 Pro Basketball Manager 2019
23477                               Under The War
13530                              Operation swat
12048                 Pro Basketball Manager 2017
15863                                  Jammerball
9519                                  Legion TD 2
10461    Airport Fire Department - The Simulation
Name: name, dtype: object 



In [32]:
naz = "Tropico 5"
print('Recommendation for', naz, '\n')
print('Recommendations using game name, developer, genre and tags \n')
print(get_rec(naz, cosine_similarities),'\n')

Recommendation for Tropico 5 

Recommendations using game name, developer, genre and tags 

20053                           Citystate
2309                    Tales of Maj'Eyal
5465          Rise of the Triad: Dark War
18231                          Smart Cube
909            Reign: Conflict of Nations
12216                      Counter Agents
3668            The Nightmare Cooperative
1159     Steel Storm: Burning Retribution
20598                       AstroBlast VR
23325                   The Darkest Woods
Name: name, dtype: object 

