### En este proceso la unión de los datasets y la selección de características numéricas son pasos esenciales para desarrollar un modelo de sistema de recomendación de videojuegos efectivo.

In [3]:
import pandas as pd
import pyarrow


### Se cargan los archivos .parquet.gz en DataFrames. 

In [4]:
# Read the two input Parquet files into DataFrames.
df_games = pd.read_parquet(path='output_steam_games_clean.parquet.gz')
df_items = pd.read_parquet(path='australian_users_items_clean.parquet.gz')

### Se unen los dos DataFrames por la columna “id” utilizando el método .merge()

In [22]:
# Inner join: keep only the rows whose 'id' value appears in both frames.
df_merged = df_games.merge(df_items, how='inner', on='id')

### Visualización del DataFrame Final.

In [23]:
# Preview the first three rows of the merged frame.
df_merged.head(n=3)

Unnamed: 0,publisher,genres,title,id,developer,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,...,Strategy,Utilities,Video Production,Web Publishing,year,user_id,items_count,steam_id,playtime_forever,playtime_2weeks
0,Valve,Action,Half-Life,70.0,Valve,1,0,0,0,0,...,0,0,0,0,1998,kube134,476,76561198031442694,4,0
1,Valve,Action,Half-Life,70.0,Valve,1,0,0,0,0,...,0,0,0,0,1998,76561198030567998,75,76561198030567998,21,0
2,Valve,Action,Half-Life,70.0,Valve,1,0,0,0,0,...,0,0,0,0,1998,itslonk,133,76561198027380739,8,0


### Se elimina la columna 'genres' para reducir la dimensionalidad; los géneros ya están codificados en columnas binarias (Action, Adventure, …).

In [24]:
# Drop the raw 'genres' column from the merged frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: executing it a second time no longer raises KeyError once
# 'genres' has already been removed.
df_merged = df_merged.drop(columns=['genres'], errors='ignore')

### El DataFrame Final se exporta a un archivo Parquet.

In [25]:
# Write the merged frame to Parquet with gzip compression, omitting the index.
# NOTE(review): a later cell in this notebook writes df_final to this same
# path, replacing this file -- confirm this intermediate export is still needed.
df_merged.to_parquet(
    path='dataset_final.parquet.gz',
    index=False,
    compression='gzip',
)

### Se carga el dataset australian_user_reviews_clean.parquet.gz

In [26]:
# Read the user reviews Parquet file into a DataFrame.
df_reviews = pd.read_parquet(path='australian_user_reviews_clean.parquet.gz')

### Se unen ambos DataFrames por la columna 'user_id' para obtener un DataFrame final.

In [27]:
# Join the merged games/items frame with the reviews on 'user_id'
# (pandas' default join type, i.e. inner).
# NOTE(review): matching on 'user_id' alone pairs every game row of a user with
# every review written by that user. In the preview printed further down, rows
# for id 70.0 carry reviews whose item_id is 251990 / 332800 / 319630. If the
# review columns are meant to describe the same game as the row, the join
# probably also needs to match 'id' against 'item_id' -- confirm the intent and
# the dtypes of both key columns before changing it.
df_final = df_merged.merge(df_reviews, on='user_id')

In [28]:
# Display the column labels of the joined frame.
df_final.columns

Index(['publisher', 'title', 'id', 'developer', 'Action', 'Adventure',
       'Animation &amp; Modeling', 'Audio Production', 'Casual',
       'Design &amp; Illustration', 'Early Access', 'Education',
       'Free to Play', 'Indie', 'Massively Multiplayer', 'Photo Editing',
       'RPG', 'Racing', 'Simulation', 'Software Training', 'Sports',
       'Strategy', 'Utilities', 'Video Production', 'Web Publishing', 'year',
       'user_id', 'items_count', 'steam_id', 'playtime_forever',
       'playtime_2weeks', 'user_url', 'funny', 'last_edited', 'item_id',
       'helpful', 'recommend', 'review', '0', 'posted_year',
       'sentiment_analysis'],
      dtype='object')

### Se elimina la columna '0' del DataFrame con el objetivo de reducir su tamaño y mejorar su eficiencia.

In [29]:
# Drop the column labelled '0' from the joined frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: executing it a second time no longer raises KeyError once
# the column has already been removed.
df_final = df_final.drop(columns=['0'], errors='ignore')

### El DataFrame obtenido de los diferentes procesos se exporta a un archivo Parquet.

In [30]:
# Write the joined frame to Parquet with gzip compression, omitting the index.
# This is the same path an earlier cell wrote df_merged to, so that file is
# replaced here.
df_final.to_parquet(
    path='dataset_final.parquet.gz',
    index=False,
    compression='gzip',
)

In [31]:
# Preview the first three rows of the joined frame.
df_final.head(n=3)

Unnamed: 0,publisher,title,id,developer,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,...,playtime_2weeks,user_url,funny,last_edited,item_id,helpful,recommend,review,posted_year,sentiment_analysis
0,Valve,Half-Life,70.0,Valve,1,0,0,0,0,0,...,0,http://steamcommunity.com/id/kube134,,,251990,1 of 1 people (100%) found this review helpful,True,It's good to be a magical queen... if you surv...,2014,2
1,Valve,Half-Life,70.0,Valve,1,0,0,0,0,0,...,0,http://steamcommunity.com/profiles/76561198030...,2 people found this review funny,,332800,194 of 282 people (69%) found this review helpful,True,10/10 would take kids here for birthday,2014,2
2,Valve,Half-Life,70.0,Valve,1,0,0,0,0,0,...,0,http://steamcommunity.com/profiles/76561198030...,,,319630,1 of 2 people (50%) found this review helpful,True,"Well for starters, when I write reviews they a...",2015,2


### Se crea un DataFrame con las características numéricas relevantes (sin las columnas binarias de género) antes de aplicar las técnicas de recomendación.

In [32]:
# Build the numeric feature frame in a single chained expression:
#   1. keep only the int/float columns of df_final;
#   2. remove the genre flag columns listed below.
df_ord = df_final.select_dtypes(include=(int, float)).drop(
    columns=[
        'Action',
        'Adventure',
        'Audio Production',
        'Casual',
        'Design &amp; Illustration',
        'Early Access',
        'Education',
        'Free to Play',
        'Indie',
        'Animation &amp; Modeling',
        'Massively Multiplayer',
        'Photo Editing',
        'RPG',
        'Racing',
        'Simulation',
        'Software Training',
        'Sports',
        'Strategy',
        'Utilities',
        'Video Production',
        'Web Publishing',
    ]
)

### El DataFrame obtenido de los diferentes procesos se exporta a un archivo Parquet.

In [33]:
# Write the numeric feature frame to Parquet with gzip compression, omitting
# the index.
df_ord.to_parquet(
    path='dataset_final_ord.parquet.gz',
    index=False,
    compression='gzip',
)