# Machine Learning Process

We are making a machine learning model that recommends games based on similarities on other games

## We import the libraries

In [3]:
import pandas as pd
import numpy as np
import scipy as sp
import pyarrow as pa
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Read the parquet

In [3]:
df_ml = pd.read_parquet("Api_DataFrame\df_ml.parquet")

In [4]:
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000582 entries, 6021 to 11749140
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   item_id             object 
 1   item_name           object 
 2   playtime_forever    float64
 3   user_id             object 
 4   genre               object 
 5   year                object 
 6   dev/publisher       object 
 7   recommend           bool   
 8   sentiment_analysis  float64
dtypes: bool(1), float64(2), object(6)
memory usage: 139.3+ MB


First, we are combaning the columns, since we are going to use the cosine similarity with text values

In [5]:
df_ml['combined_columns'] = df_ml["item_name"] + " " + df_ml["genre"] + ' ' + df_ml['dev/publisher'] + df_ml['item_name'] + ' ' + df_ml['user_id'] + ' ' + df_ml['genre'] + ' ' + df_ml['year'] + ' ' + df_ml['dev/publisher'] 


In [6]:
df_ml

Unnamed: 0,item_id,item_name,playtime_forever,user_id,genre,year,dev/publisher,recommend,sentiment_analysis,combined_columns
6021,240,Counter-Strike: Source,5751.0,maplemage,Action,2004,Valve,True,1.0,Counter-Strike: Source Action ValveCounter-Str...
6023,240,Counter-Strike: Source,5751.0,maplemage,Action,2004,Valve,True,2.0,Counter-Strike: Source Action ValveCounter-Str...
6024,300,Day of Defeat: Source,335.0,maplemage,Action,2010,Valve,True,1.0,Day of Defeat: Source Action ValveDay of Defea...
6026,300,Day of Defeat: Source,335.0,maplemage,Action,2010,Valve,True,2.0,Day of Defeat: Source Action ValveDay of Defea...
6027,320,Half-Life 2: Deathmatch,601.0,maplemage,Action,2004,Valve,True,1.0,Half-Life 2: Deathmatch Action ValveHalf-Life ...
...,...,...,...,...,...,...,...,...,...,...
11749135,434420,The Chosen RPG,0.0,76561198310819422,RPG,2016,Little Big Lee,True,1.0,The Chosen RPG RPG Little Big LeeThe Chosen RP...
11749137,461640,Sins Of The Demon RPG,0.0,76561198310819422,Action,2016,Chandler Rounsley,True,1.0,Sins Of The Demon RPG Action Chandler Rounsley...
11749138,461640,Sins Of The Demon RPG,0.0,76561198310819422,Adventure,2016,Chandler Rounsley,True,1.0,Sins Of The Demon RPG Adventure Chandler Rouns...
11749139,461640,Sins Of The Demon RPG,0.0,76561198310819422,Indie,2016,Chandler Rounsley,True,1.0,Sins Of The Demon RPG Indie Chandler RounsleyS...


We are going to ask for an id and then recive a similar game, so we are going to group the values by an id

In [7]:
grouped_df = df_ml.groupby('item_id').agg({
    'item_name': 'first',
    "combined_columns" : "first"
}).reset_index()

grouped_df

Unnamed: 0,item_id,item_name,combined_columns
0,10,Counter-Strike,Counter-Strike Action ValveCounter-Strike 7656...
1,1002,Rag Doll Kung Fu,Rag Doll Kung Fu Indie Mark HealeyRag Doll Kun...
2,100400,Silo 2,Silo 2 Animation &amp; Modeling Nevercenter Lt...
3,10090,Call of Duty: World at War,Call of Duty: World at War Action TreyarchCall...
4,100980,3d-coat,3d-coat Animation &amp; Modeling Pilgway3d-coa...
...,...,...,...
8028,9970,Star Raiders,Star Raiders Action Incinerator StudiosStar R...
8029,99700,NightSky,"NightSky Casual Nicalis, Inc.NightSky maplemag..."
8030,9980,The UnderGarden,The UnderGarden Casual Artech StudiosThe Under...
8031,99900,Spiral Knights,Spiral Knights Action Grey HavensSpiral Knight...


Now it's good enough to work with it

In [15]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(grouped_df['combined_columns'])

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

cosine_sim_df = pd.DataFrame(cosine_sim, index=grouped_df['item_id'], columns=grouped_df['item_id'])


Save the groupped dataframe in a csv

In [None]:
grouped_df.to_csv("Api_DataFrame/grouped_df.csv", index=False, encoding='utf-8')

The function that will go to the API

In [4]:
grouped_df = pd.read_csv(r"Api_DataFrame/grouped_df.csv")

In [24]:
def find_similar_games(item_id):
    #list to save the result of the loop
    list_ = []
    #the ML model, which a cosine_similarity
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(grouped_df['combined_columns'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    cosine_sim_df = pd.DataFrame(cosine_sim, index=grouped_df['item_id'], columns=grouped_df['item_id'])

    
    game_index = grouped_df[grouped_df['item_id'] == item_id].index[0]  #save the index
    
    similar_scores = cosine_sim_df.iloc[game_index]     #make a series with the similar games
    
    similar_games = similar_scores.sort_values(ascending=False)   #order the game
    
    # We dont include the game that we passed (similarity 1.0, the highest one)
    similar_games = similar_games[1:6]
    index = similar_games.index.values      #save the index of the similar games
    for i in index:                             #took the index that we saved, loop them into a mask, use the mask to
        mask = grouped_df["item_id"] == i #save the values in a list and then return it                     
        list_.append(grouped_df[mask].item_name.values[0])
    return list_

# Llama a la función para encontrar los 5 juegos más parecidos a un juego dado (reemplaza 'item_id' con el ID del juego deseado)
item_id = 342580
similar_games = find_similar_games(item_id)
similar_games

['12 Labours of Hercules V: Kids of Hellas',
 '12 Labours of Hercules III: Girl Power',
 '12 Labours of Hercules IV: Mother Nature',
 '12 Labours of Hercules II: The Cretan Bull',
 'Cursed']

The process where i made the function, step by step

In [13]:
game_index = grouped_df[grouped_df['item_id'] == 342580].index[0]
game_index

3776

In [17]:
similar_scores = cosine_sim_df.iloc[game_index]
similar_scores

item_id
10        0.000000
1002      0.000000
100400    0.000000
10090     0.032659
100980    0.000000
            ...   
9970      0.021903
99700     0.037095
9980      0.057377
99900     0.000000
99910     0.000000
Name: 342580, Length: 8033, dtype: float64

In [18]:
similar_games = similar_scores.sort_values(ascending=False)
similar_games

item_id
342580    1.000000
491330    0.848049
360650    0.822953
396800    0.812902
360640    0.789821
            ...   
317710    0.000000
317720    0.000000
317760    0.000000
317790    0.000000
99910     0.000000
Name: 342580, Length: 8033, dtype: float64

In [19]:
similar_games = similar_games[1:6]
similar_games

item_id
491330    0.848049
360650    0.822953
396800    0.812902
360640    0.789821
431260    0.304430
Name: 342580, dtype: float64

In [20]:
index = similar_games.index.values #save the index of the similar games
index

array([491330, 360650, 396800, 360640, 431260], dtype=int64)

In [22]:
list_ = []
for i in index:
    mask = grouped_df["item_id"] == i                      
    list_.append(grouped_df[mask].item_name.values[0])   

In [23]:
list_

['12 Labours of Hercules V: Kids of Hellas',
 '12 Labours of Hercules III: Girl Power',
 '12 Labours of Hercules IV: Mother Nature',
 '12 Labours of Hercules II: The Cretan Bull',
 'Cursed']

In [255]:
mask = grouped_df["item_id"] == "491330"
grouped_df[mask]

Unnamed: 0,item_id,item_name,combined_columns
7409,491330,12 Labours of Hercules V: Kids of Hellas,12 Labours of Hercules V: Kids of Hellas Casua...
