# Transformaciones


## Librerías a utilizar

In [1]:
import dask
import dask.dataframe as dd
import pandas as pd
import pyarrow.json
import gzip
import json
import ast

## Descompresión y limpieza de datasets

### steam_games

In [3]:
# Location of the gzip-compressed steam_games dump.
comprimido_games = 'C:/Users/57315/OneDrive/Documentos/Phyton_Henry/proyecto individual 1/PI MLOps - STEAM/datasets_iniciales/steam_games.json.gz'

# Build the Dask frame from the compressed JSON and, in the same chain,
# discard the rows in which every column is null.
df_games = (
    dd.read_json(comprimido_games, blocksize=None, compression='gzip')
    .dropna(how='all')
)

# Preview the first rows.
df_games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88310,Kotoshiro,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",http://steamcommunity.com/app/761140/reviews/?...,['Single-player'],4.99,0.0,761140.0,Kotoshiro
88311,"Making Fun, Inc.","['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"['Free to Play', 'Strategy', 'Indie', 'RPG', '...",http://steamcommunity.com/app/643980/reviews/?...,"['Single-player', 'Multi-player', 'Online Mult...",Free To Play,0.0,643980.0,Secret Level SRL
88312,Poolians.com,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"['Free to Play', 'Simulation', 'Sports', 'Casu...",http://steamcommunity.com/app/670290/reviews/?...,"['Single-player', 'Multi-player', 'Online Mult...",Free to Play,0.0,670290.0,Poolians.com
88313,彼岸领域,"['Action', 'Adventure', 'Casual']",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"['Action', 'Adventure', 'Casual']",http://steamcommunity.com/app/767400/reviews/?...,['Single-player'],0.99,0.0,767400.0,彼岸领域
88314,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"['Action', 'Indie', 'Casual', 'Sports']",http://steamcommunity.com/app/773570/reviews/?...,"['Single-player', 'Full controller support', '...",2.99,0.0,773570.0,


In [4]:
# Columns that are not needed to answer the queries or to prepare the
# machine-learning models.
columnas_descartadas_games = [
    'publisher', 'app_name', 'url', 'tags', 'reviews_url',
    'specs', 'price', 'early_access', 'developer',
]
df_games = df_games.drop(columns=columnas_descartadas_games)

In [5]:
# Boolean mask: True wherever a cell is null.
nulos_por_celda_games = df_games.isnull()

# One flag per column: True only when every cell of that column is null.
columnas_con_nulos_games = nulos_por_celda_games.all()

# Keep just the flagged columns and materialize their names.
solo_nulas_games = columnas_con_nulos_games[columnas_con_nulos_games]
columnas_con_todos_nulos_games = solo_nulas_games.index.compute()

# Report the columns that contain nothing but nulls.
print("Columnas con todos los valores nulos:", columnas_con_todos_nulos_games)

Columnas con todos los valores nulos: Index([], dtype='string')


In [6]:
# Drop rows that are duplicates of another row across all columns.
df_games = df_games.drop_duplicates()

In [7]:
# Keep a single row per game: rows sharing the same 'id' are collapsed.
df_games = df_games.drop_duplicates(subset=['id'])

# Preview the frame after removing the duplicated ids.
df_games.head()

Unnamed: 0,genres,title,release_date,id
88310,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,2018-01-04,761140.0
88311,"['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound,2018-01-04,643980.0
88312,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",Real Pool 3D - Poolians,2017-07-24,670290.0
88313,"['Action', 'Adventure', 'Casual']",弹炸人2222,2017-12-07,767400.0
88314,,,,773570.0


In [8]:
# Materialize the Dask DataFrame as a pandas DataFrame.
df_games_pandas = df_games.compute()

In [9]:
# Extract the 'genres' column as a Series and display it.
col_genres = df_games_pandas['genres']
col_genres

88310     ['Action', 'Casual', 'Indie', 'Simulation', 'S...
88311          ['Free to Play', 'Indie', 'RPG', 'Strategy']
88312     ['Casual', 'Free to Play', 'Indie', 'Simulatio...
88313                     ['Action', 'Adventure', 'Casual']
88314                                                  <NA>
                                ...                        
120440        ['Casual', 'Indie', 'Simulation', 'Strategy']
120441                      ['Casual', 'Indie', 'Strategy']
120442                    ['Indie', 'Racing', 'Simulation']
120443                                  ['Casual', 'Indie']
120444                                                 <NA>
Name: genres, Length: 32133, dtype: string

In [10]:
# Show the Python type of the extracted column.
print(type(col_genres))

<class 'pandas.core.series.Series'>


In [11]:
def convertir_list(cadena):
    """Turn the text of a Python literal (normally a list) into an object.

    Args:
        cadena: Text such as "['Action', 'Indie']". A value that is already
            a list or tuple is accepted too.

    Returns:
        The object produced by ``ast.literal_eval``; a list copy when
        ``cadena`` is already a list or tuple; ``[]`` when the value is
        missing or cannot be parsed.
    """
    # Already-parsed sequences must pass through untouched: literal_eval
    # rejects non-string input, which used to turn real lists into [].
    if isinstance(cadena, (list, tuple)):
        return list(cadena)
    try:
        return ast.literal_eval(cadena)
    except (ValueError, SyntaxError, TypeError):
        # Missing values and malformed text are treated as "no elements".
        return []

In [12]:
# Parse every genres cell with convertir_list, element by element.
col_genres_exp = col_genres.map(convertir_list)
col_genres_exp

88310         [Action, Casual, Indie, Simulation, Strategy]
88311                  [Free to Play, Indie, RPG, Strategy]
88312     [Casual, Free to Play, Indie, Simulation, Sports]
88313                           [Action, Adventure, Casual]
88314                                                    []
                                ...                        
120440                [Casual, Indie, Simulation, Strategy]
120441                            [Casual, Indie, Strategy]
120442                          [Indie, Racing, Simulation]
120443                                      [Casual, Indie]
120444                                                   []
Name: genres, Length: 32133, dtype: object

In [13]:
# Expand each list into one row per genre; the index label of the source row
# is repeated for every element.
explode_col_genres = col_genres_exp.explode()
explode_col_genres.head()

88310        Action
88310        Casual
88310         Indie
88310    Simulation
88310      Strategy
Name: genres, dtype: object

In [14]:
# Remove the original 'genres' column from the games frame.
df_games_pandas = df_games_pandas.drop(columns =['genres'])

In [15]:
# Attach one genre per row to the game data by matching both objects on
# their index. The join type is the merge default (inner), stated explicitly.
games_combined = df_games_pandas.merge(
    explode_col_genres,
    how='inner',
    left_index=True,
    right_index=True,
)
games_combined.head()

Unnamed: 0,title,release_date,id,genres
88310,Lost Summoner Kitty,2018-01-04,761140.0,Action
88310,Lost Summoner Kitty,2018-01-04,761140.0,Casual
88310,Lost Summoner Kitty,2018-01-04,761140.0,Indie
88310,Lost Summoner Kitty,2018-01-04,761140.0,Simulation
88310,Lost Summoner Kitty,2018-01-04,761140.0,Strategy


### user_reviews

In [18]:
# Path of the gzip-compressed user_reviews dump.
comprimido_reviews = "C:/Users/57315/OneDrive/Documentos/Phyton_Henry/proyecto individual 1/PI MLOps - STEAM/datasets_iniciales/user_reviews.json.gz"

# Parsed records, one per input line.
reviews_data_list = []

# Decompress the file and process it one line at a time.
with gzip.open(comprimido_reviews, 'rb') as exp_reviews:
    for numero_linea, line in enumerate(exp_reviews, start=1):
        # Reset on every iteration: if decoding fails, the handler below must
        # not report the previous line's text (or an undefined name on line 1).
        linea_decodificada = None
        try:
            # Decode the raw bytes as UTF-8 text.
            linea_decodificada = line.decode('utf-8').strip()

            # Evaluate the line as a Python data structure.
            reviews_data = ast.literal_eval(linea_decodificada)

            reviews_data_list.append(reviews_data)

        # Report the failing line (number and content) and keep going.
        except Exception as e:
            contenido = line if linea_decodificada is None else linea_decodificada
            print(f"Error al evaluar la línea {numero_linea} como estructura de datos: {contenido}")
            print(f"Detalles del error: {e}")

# Build a Dask DataFrame from the parsed records (tune npartitions as needed).
df_reviews = dd.from_pandas(pd.DataFrame(reviews_data_list), npartitions=10)

# Drop the rows in which every column is null.
df_reviews = df_reviews.dropna(how='all')

# Compute and show the first rows.
df_reviews.compute().head(2)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."


In [19]:
# Drop the columns that are not needed to answer the queries or to prepare
# the machine-learning models.
df_reviews = df_reviews.drop(columns=['user_url'])

In [20]:
# Boolean mask: True wherever a cell is null.
nulos_por_celda_reviews = df_reviews.isnull()

# One flag per column: True only when every cell of that column is null.
columnas_con_nulos_reviews = nulos_por_celda_reviews.all()

# Keep just the flagged columns and materialize their names.
solo_nulas_reviews = columnas_con_nulos_reviews[columnas_con_nulos_reviews]
columnas_con_todos_nulos_reviews = solo_nulas_reviews.index.compute()

# Report the columns that contain nothing but nulls.
print("Columnas con todos los valores nulos:", columnas_con_todos_nulos_reviews)

Columnas con todos los valores nulos: Index([], dtype='string')


In [21]:
# Drop rows that are duplicates of another row across all columns.
df_reviews = df_reviews.drop_duplicates()

In [22]:
# Keep a single row per user: rows sharing the same 'user_id' are collapsed.
df_reviews = df_reviews.drop_duplicates(subset=['user_id'])

# Materialize the de-duplicated frame and print it.
reviews_sin_duplicados = df_reviews.compute()
print(reviews_sin_duplicados)

                 user_id                                            reviews
0      76561197970982479  [{'funny': '', 'posted': 'Posted November 5, 2...
1                js41637  [{'funny': '', 'posted': 'Posted June 24, 2014...
2              evcentric  [{'funny': '', 'posted': 'Posted February 3.',...
3                  doctr  [{'funny': '', 'posted': 'Posted October 14, 2...
4              maplemage  [{'funny': '3 people found this review funny',...
...                  ...                                                ...
25794  76561198306599751  [{'funny': '', 'posted': 'Posted May 31.', 'la...
25795           Ghoustik  [{'funny': '', 'posted': 'Posted June 17.', 'l...
25796  76561198310819422  [{'funny': '1 person found this review funny',...
25797  76561198312638244  [{'funny': '', 'posted': 'Posted July 21.', 'l...
25798        LydiaMorley  [{'funny': '1 person found this review funny',...

[25485 rows x 2 columns]


In [23]:
# Materialize the Dask DataFrame as a pandas DataFrame.
df_reviews_pandas = df_reviews.compute()

In [24]:
# Extract the 'reviews' column as a Series.
col_reviews = df_reviews_pandas['reviews']

In [25]:
# Report the type of the column and of its element labelled 0, one per line.
print(type(col_reviews), type(col_reviews[0]), sep='\n')

<class 'pandas.core.series.Series'>
<class 'str'>


In [26]:
# NOTE: convertir_list is already defined earlier in this notebook; this cell
# re-declares it, so the two definitions must stay in sync.
def convertir_list(cadena):
    """Turn the text of a Python literal (normally a list) into an object.

    Args:
        cadena: Text such as "[{'item_id': '10'}]". A value that is already
            a list or tuple is accepted too.

    Returns:
        The object produced by ``ast.literal_eval``; a list copy when
        ``cadena`` is already a list or tuple; ``[]`` when the value is
        missing or cannot be parsed.
    """
    # Already-parsed sequences must pass through untouched: literal_eval
    # rejects non-string input, which used to turn real lists into [].
    if isinstance(cadena, (list, tuple)):
        return list(cadena)
    try:
        return ast.literal_eval(cadena)
    except (ValueError, SyntaxError, TypeError):
        # Missing values and malformed text are treated as "no elements".
        return []

In [27]:
# Parse every reviews cell with convertir_list, element by element.
col_reviews_exp = col_reviews.map(convertir_list)
col_reviews_exp

0        [{'funny': '', 'posted': 'Posted November 5, 2...
1        [{'funny': '', 'posted': 'Posted June 24, 2014...
2        [{'funny': '', 'posted': 'Posted February 3.',...
3        [{'funny': '', 'posted': 'Posted October 14, 2...
4        [{'funny': '3 people found this review funny',...
                               ...                        
25794    [{'funny': '', 'posted': 'Posted May 31.', 'la...
25795    [{'funny': '', 'posted': 'Posted June 17.', 'l...
25796    [{'funny': '1 person found this review funny',...
25797    [{'funny': '', 'posted': 'Posted July 21.', 'l...
25798    [{'funny': '1 person found this review funny',...
Name: reviews, Length: 25485, dtype: object

In [28]:
# Show the nesting of the parsed column: the Series itself, the element
# labelled 0, and the first item inside that element (one type per line).
primer_elemento_reviews = col_reviews_exp[0]
print(
    type(col_reviews_exp),
    type(primer_elemento_reviews),
    type(primer_elemento_reviews[0]),
    sep='\n',
)

<class 'pandas.core.series.Series'>
<class 'list'>
<class 'dict'>


In [29]:
# Expand each list into one row per element; the index label of the source
# row is repeated for every element.
explode_col_reviews = col_reviews_exp.explode()
explode_col_reviews.head()

0    {'funny': '', 'posted': 'Posted November 5, 20...
0    {'funny': '', 'posted': 'Posted July 15, 2011....
0    {'funny': '', 'posted': 'Posted April 21, 2011...
1    {'funny': '', 'posted': 'Posted June 24, 2014....
1    {'funny': '', 'posted': 'Posted September 8, 2...
Name: reviews, dtype: object

In [30]:
# Expand each review dict into columns: keys become column names and the
# values fill the row.
# NOTE(review): the displayed result also carries a column labelled 0;
# presumably it comes from rows whose exploded value is NaN (users with an
# empty review list) — confirm.
dicc_explode_col_reviews = explode_col_reviews.apply(pd.Series)
dicc_explode_col_reviews

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
0,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,
0,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,
1,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,
1,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,
...,...,...,...,...,...,...,...,...
25797,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...,
25797,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...,
25798,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,
25798,,Posted July 20.,,730,No ratings yet,True,:D,


In [31]:
# Column 0 is dropped in the following cell, so it must not hold any data.
# A null count alone does not prove that; require every cell to be null.
assert dicc_explode_col_reviews[0].isna().all(), "column 0 contains non-null values"

# Number of null cells in column 0 (equals the row count once the guard passes).
dicc_explode_col_reviews[0].isna().sum()

58458

In [32]:
# Columns that are not needed for the later calculations.
columnas_para_eliminar_reviews = [0]
dicc_explode_col_reviews = dicc_explode_col_reviews.drop(columnas_para_eliminar_reviews, axis='columns')
dicc_explode_col_reviews

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
0,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
0,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
1,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
1,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...
25797,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
25797,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
25798,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
25798,,Posted July 20.,,730,No ratings yet,True,:D


In [33]:
# Remove the original 'reviews' column from the per-user frame.
df_reviews_pandas = df_reviews_pandas.drop(columns = ['reviews'])

In [34]:
# Bring the user columns back next to each expanded review by matching both
# frames on their index (inner join, the merge default).
reviews_combined = pd.merge(
    dicc_explode_col_reviews,
    df_reviews_pandas,
    how='inner',
    left_index=True,
    right_index=True,
)
reviews_combined

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,user_id
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,76561197970982479
0,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,76561197970982479
0,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479
1,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,js41637
1,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,js41637
...,...,...,...,...,...,...,...,...
25797,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...,76561198312638244
25797,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...,76561198312638244
25798,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,LydiaMorley
25798,,Posted July 20.,,730,No ratings yet,True,:D,LydiaMorley


### user_items

In [36]:
# Path of the gzip-compressed users_items dump.
comprimido_items = "C:/Users/57315/OneDrive/Documentos/Phyton_Henry/proyecto individual 1/PI MLOps - STEAM/datasets_iniciales/users_items.json.gz"

# Parsed records, one per input line.
items_data_list = []

# Decompress the file and process it one line at a time.
with gzip.open(comprimido_items, 'rb') as exp_items:
    for numero_linea, line in enumerate(exp_items, start=1):
        # Reset on every iteration: if decoding fails, the handler below must
        # not report the previous line's text (or an undefined name on line 1).
        linea_decodificada = None
        try:
            # Decode the raw bytes as UTF-8 text.
            linea_decodificada = line.decode('utf-8').strip()

            # Evaluate the line as a Python data structure.
            items_data = ast.literal_eval(linea_decodificada)

            items_data_list.append(items_data)

        # Report the failing line (number and content) and keep going.
        except Exception as e:
            contenido = line if linea_decodificada is None else linea_decodificada
            print(f"Error al evaluar la línea {numero_linea} como estructura de datos: {contenido}")
            print(f"Detalles del error: {e}")

# Build a Dask DataFrame from the parsed records (tune npartitions as needed).
df_items = dd.from_pandas(pd.DataFrame(items_data_list), npartitions=10)

# Drop the rows in which every column is null.
df_items = df_items.dropna(how='all')

# Compute and show the first rows.
df_items.compute().head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [37]:
# Columns that are not needed to answer the queries or to prepare the
# machine-learning models.
columnas_descartadas_items = ['steam_id', 'user_url']
df_items = df_items.drop(columns=columnas_descartadas_items)

In [38]:
# Boolean mask: True wherever a cell is null.
nulos_por_celda_items = df_items.isnull()

# One flag per column: True only when every cell of that column is null.
columnas_con_nulos_items = nulos_por_celda_items.all()

# Keep just the flagged columns and materialize their names.
solo_nulas_items = columnas_con_nulos_items[columnas_con_nulos_items]
columnas_con_todos_nulos_items = solo_nulas_items.index.compute()

# Report the columns that contain nothing but nulls.
print("Columnas con todos los valores nulos:", columnas_con_todos_nulos_items)

Columnas con todos los valores nulos: Index([], dtype='string')


In [39]:
# Drop rows that are duplicates of another row across all columns.
df_items = df_items.drop_duplicates()

In [40]:
# Keep a single row per user: rows sharing the same 'user_id' are collapsed.
df_items = df_items.drop_duplicates(subset=['user_id'])

In [41]:
# Materialize the Dask DataFrame as a pandas DataFrame.
df_items_pandas = df_items.compute()

In [42]:
# Extract the 'items' column as a Series.
col_items = df_items_pandas['items']

In [43]:
# Report the type of the column and of its element labelled 0, one per line.
print(type(col_items), type(col_items[0]), sep='\n')

<class 'pandas.core.series.Series'>
<class 'str'>


In [44]:
# Parse every items cell with convertir_list, element by element.
col_items_exp = col_items.map(convertir_list)
col_items_exp

0        [{'item_id': '10', 'item_name': 'Counter-Strik...
1        [{'item_id': '10', 'item_name': 'Counter-Strik...
2        [{'item_id': '1200', 'item_name': 'Red Orchest...
3        [{'item_id': '10', 'item_name': 'Counter-Strik...
4        [{'item_id': '300', 'item_name': 'Day of Defea...
                               ...                        
88305    [{'item_id': '413850', 'item_name': 'CS:GO Pla...
88306    [{'item_id': '11020', 'item_name': 'TrackMania...
88307                                                   []
88308    [{'item_id': '304930', 'item_name': 'Unturned'...
88309                                                   []
Name: items, Length: 87626, dtype: object

In [45]:
# Show the nesting of the parsed column: the Series itself, the element
# labelled 0, and the first item inside that element (one type per line).
primer_elemento_items = col_items_exp[0]
print(
    type(col_items_exp),
    type(primer_elemento_items),
    type(primer_elemento_items[0]),
    sep='\n',
)

<class 'pandas.core.series.Series'>
<class 'list'>
<class 'dict'>


In [46]:
# Expand each list into one row per element; the index label of the source
# row is repeated for every element.
explode_col_items = col_items_exp.explode()
explode_col_items

0        {'item_id': '10', 'item_name': 'Counter-Strike...
0        {'item_id': '20', 'item_name': 'Team Fortress ...
0        {'item_id': '30', 'item_name': 'Day of Defeat'...
0        {'item_id': '40', 'item_name': 'Deathmatch Cla...
0        {'item_id': '50', 'item_name': 'Half-Life: Opp...
                               ...                        
88308    {'item_id': '373330', 'item_name': 'All Is Dus...
88308    {'item_id': '388490', 'item_name': 'One Way To...
88308    {'item_id': '521570', 'item_name': 'You Have 1...
88308    {'item_id': '519140', 'item_name': 'Minds Eyes...
88309                                                  NaN
Name: items, Length: 5110796, dtype: object

In [47]:
# Summarize the exploded Series (entry count, non-null count, memory usage).
explode_col_items.info()

<class 'pandas.core.series.Series'>
Index: 5110796 entries, 0 to 88309
Series name: items
Non-Null Count    Dtype 
--------------    ----- 
5094082 non-null  object
dtypes: object(1)
memory usage: 78.0+ MB


In [48]:
# Expand the exploded item dicts into one column per key.
# pd.json_normalize is reached through the pandas alias imported at the top
# of the notebook, so this cell carries no imports of its own.
dicc_explode_col_items = pd.json_normalize(explode_col_items)

# Replace the missing values with empty strings.
dicc_explode_col_items = dicc_explode_col_items.fillna(value='')

# Show the first records of the result.
print(dicc_explode_col_items.head())

  item_id                  item_name playtime_forever playtime_2weeks
0      10             Counter-Strike              6.0             0.0
1      20      Team Fortress Classic              0.0             0.0
2      30              Day of Defeat              7.0             0.0
3      40         Deathmatch Classic              0.0             0.0
4      50  Half-Life: Opposing Force              0.0             0.0


In [49]:
# Keep a reference to the index of the exploded Series.
indice_original = explode_col_items.index

In [50]:
# Store the saved index as a new 'indice_original' column of the expanded
# items frame.
dicc_explode_col_items['indice_original'] = indice_original

In [51]:
# Inspect the last rows to check the newly added 'indice_original' column.
dicc_explode_col_items.tail()

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,indice_original
5110791,373330.0,All Is Dust,0.0,0.0,88308
5110792,388490.0,One Way To Die: Steam Edition,3.0,3.0,88308
5110793,521570.0,You Have 10 Seconds 2,4.0,4.0,88308
5110794,519140.0,Minds Eyes,3.0,3.0,88308
5110795,,,,,88309


In [52]:
# Remove the original 'items' column from the per-user frame and preview it.
df_items_pandas = df_items_pandas.drop(['items'], axis='columns')
df_items_pandas.head()

Unnamed: 0,user_id,items_count
0,76561197970982479,277
1,js41637,888
2,evcentric,137
3,Riot-Punch,328
4,doctr,541


In [53]:
# Right join: keep every expanded item row and pull in the user columns whose
# index label equals that row's 'indice_original'.
items_combined = df_items_pandas.merge(
    dicc_explode_col_items,
    how='right',
    left_index=True,
    right_on='indice_original',
)


In [54]:

# Show the last rows of the merged result.
items_combined.tail(10)


Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever,playtime_2weeks,indice_original
5110786,76561198326700687,177,521570.0,You Have 10 Seconds 2,0.0,0.0,88306
5110787,XxLaughingJackClown77xX,0,,,,,88307
5110788,76561198329548331,7,304930.0,Unturned,677.0,677.0,88308
5110789,76561198329548331,7,227940.0,Heroes & Generals,43.0,43.0,88308
5110790,76561198329548331,7,346330.0,BrainBread 2,0.0,0.0,88308
5110791,76561198329548331,7,373330.0,All Is Dust,0.0,0.0,88308
5110792,76561198329548331,7,388490.0,One Way To Die: Steam Edition,3.0,3.0,88308
5110793,76561198329548331,7,521570.0,You Have 10 Seconds 2,4.0,4.0,88308
5110794,76561198329548331,7,519140.0,Minds Eyes,3.0,3.0,88308
5110795,edward_tremethick,0,,,,,88309


In [55]:
# Use 'indice_original' as the index of the frame, replacing the current index.
items_combined = items_combined.set_index('indice_original')

In [56]:
# Coerce both playtime columns to a numeric dtype; values that cannot be
# parsed become NaN.
for columna_tiempo in ('playtime_forever', 'playtime_2weeks'):
    items_combined[columna_tiempo] = pd.to_numeric(
        items_combined[columna_tiempo], errors='coerce'
    )


## Cambio de tipo de archivo

steam_games

In [57]:
# Write the cleaned games dataset as gzip-compressed Parquet.
ruta_parquet_games = 'C:/Users/57315/OneDrive/Documentos/Phyton_Henry/proyecto individual 1/PI MLOps - STEAM/datasets_limpios/df_games.parquet.gzip'
games_combined.to_parquet(ruta_parquet_games, engine='pyarrow', compression='gzip')

user_reviews

In [58]:
# Write the cleaned reviews dataset as gzip-compressed Parquet.
ruta_parquet_reviews = 'C:/Users/57315/OneDrive/Documentos/Phyton_Henry/proyecto individual 1/PI MLOps - STEAM/datasets_limpios/df_reviews.parquet.gzip'
reviews_combined.to_parquet(ruta_parquet_reviews, engine='pyarrow', compression='gzip')

user_items

In [59]:
# Write the cleaned items dataset as gzip-compressed Parquet.
ruta_parquet_items = 'C:/Users/57315/OneDrive/Documentos/Phyton_Henry/proyecto individual 1/PI MLOps - STEAM/datasets_limpios/df_items.parquet.gzip'
items_combined.to_parquet(ruta_parquet_items, engine='pyarrow', compression='gzip')