In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' package that backs the nltk.corpus.stopwords import above.
# NOTE(review): stopwords is not referenced in the cells visible here — confirm it is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\o0o0o0o\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the three cleaned parquet datasets (games, items, reviews), in that order.
df_games, df_items, df_reviews = map(
    pd.read_parquet,
    (
        './src/cleaned/games.parquet',
        './src/cleaned/items.parquet',
        './src/cleaned/reviews.parquet',
    ),
)

In [3]:
# Total playtime per game: sum playtime_forever over every df_items row sharing an item_id,
# then left-join onto df_games so games without any item record are kept.
df_hours_for_item = df_items.groupby('item_id').agg({'playtime_forever': 'sum'}).reset_index()
df_games_full = pd.merge(df_games, df_hours_for_item, left_on='id', right_on='item_id', how='left').drop('item_id', axis=1)

# Players per game: count the user_id entries recorded for each item_id.
df_players_for_item = df_items.groupby('item_id').agg({'user_id': 'count'}).reset_index()
# Merge onto df_games_full (not df_games) so the playtime_forever column added above is not thrown away.
df_games_full = pd.merge(df_games_full, df_players_for_item, left_on='id', right_on='item_id', how='left').drop('item_id', axis=1)
df_games_full = df_games_full.rename(columns={'user_id': 'player_count'})

# The left joins leave NaN for games with no matching item_id in df_items;
# record those as zero hours played and zero players.
df_games_full[['playtime_forever', 'player_count']] = df_games_full[['playtime_forever', 'player_count']].fillna(0)


In [4]:
# One-hot encode the tags column: strip the list punctuation from the text,
# split on ', ' and add one tag_<name> indicator column per distinct tag.
tags_text = df_games_full['tags']
for list_char in ('[', ']', "'"):
    # regex=False keeps '[' and ']' literal regardless of the pandas version's default for `regex`.
    tags_text = tags_text.str.replace(list_char, '', regex=False)
dummies = tags_text.str.get_dummies(sep=', ').add_prefix('tag_')
df_games_full = pd.concat([df_games_full, dummies], axis=1).drop('tags', axis=1)

In [5]:
# One-hot encode the genres column: strip the list punctuation from the text,
# split on ', ' and add one genre_<name> indicator column per distinct genre.
genres_text = df_games_full['genres']
for list_char in ('[', ']', "'"):
    # regex=False keeps '[' and ']' literal regardless of the pandas version's default for `regex`.
    genres_text = genres_text.str.replace(list_char, '', regex=False)
dummies = genres_text.str.get_dummies(sep=', ').add_prefix('genre_')
df_games_full = pd.concat([df_games_full, dummies], axis=1).drop('genres', axis=1)

In [6]:
# Clean metascore: treat the placeholder strings "None" and "NA" as missing,
# cast the column to float, then impute only the missing entries with the mean.
metascore_is_placeholder = df_games_full['metascore'].isin(["None", "NA"])
df_games_full.loc[metascore_is_placeholder, 'metascore'] = np.nan
df_games_full['metascore'] = df_games_full['metascore'].astype('float64')
# fillna, not plain assignment: assigning the scalar mean would overwrite every real score.
# Series.mean() skips NaN, so the mean is taken over the observed scores only.
df_games_full['metascore'] = df_games_full['metascore'].fillna(df_games_full['metascore'].mean())


In [7]:
# Drop specs (per the original author's note, its values are already covered by the tags)
# and app_name (practically identical to title). genres needs no drop here: it was
# already removed when its dummy columns were built.
df_games_full = df_games_full.drop(columns=['specs', 'app_name'])

In [8]:
# Persist the prepared games table as a parquet file.
df_games_full.to_parquet(path='./src/cleaned/games_for_recommendation.parquet')