In [1]:
import pandas as pd
import ast
import json
import numpy as np
import nltk

In [2]:
import os

# Obteniendo  el directorio actual (donde se está ejecutando el script)
current_directory = os.getcwd()
print(f'Directorio actual: {current_directory}')

# Combinar la ruta con el subdirectorio y el nombre del archivo
file_path = os.path.join(current_directory, 'users_items.json', 'australian_users_items.json')

if os.path.exists(file_path):
    print(f'El archivo se encuentra en la ruta correcta: {file_path}')
else:
    print('El archivo no se encuentra en la ruta correcta.')


Directorio actual: c:\Users\maria\OneDrive\PROYECTO_MLOPS
El archivo se encuentra en la ruta correcta: c:\Users\maria\OneDrive\PROYECTO_MLOPS\users_items.json\australian_users_items.json


In [3]:
data_list = []

# Ruta del archivo JSON
file_path = 'users_items.json/australian_users_items.json' # la ruta del archivo'

# Abriendo el archivo y procesando cada línea
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            # Uso ast.literal_eval para convertir la línea en un diccionario
            json_data = ast.literal_eval(line)
            data_list.append(json_data)
        except ValueError as e:
            print(f"Error en la línea: {line}")
            continue

# Creo un DataFrame a partir de la lista de diccionarios
df_user_items = pd.DataFrame(data_list)


In [4]:
df_user_items

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323...,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326...,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"[{'item_id': '304930', 'item_name': 'Unturned'..."


In [5]:
df_user_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  object
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB


In [6]:
df_user_items_explode = df_user_items.explode('items')

df_user_items_explode.reset_index(drop=True, inplace=True)

df_user_items_explode = pd.concat([df_user_items_explode, pd.json_normalize(df_user_items_explode['items'])], axis=1)

df_user_items_explode.drop(columns=['items'], inplace=True)

df_user_items_explode.head(10)

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,50,Half-Life: Opposing Force,0.0,0.0
5,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,60,Ricochet,0.0,0.0
6,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,70,Half-Life,0.0,0.0
7,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,130,Half-Life: Blue Shift,0.0,0.0
8,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,300,Day of Defeat: Source,4733.0,0.0
9,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,240,Counter-Strike: Source,1853.0,0.0


In [7]:
df_user_items_explode.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170015 entries, 0 to 5170014
Data columns (total 8 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int64  
 2   steam_id          object 
 3   user_url          object 
 4   item_id           object 
 5   item_name         object 
 6   playtime_forever  float64
 7   playtime_2weeks   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 315.6+ MB


In [8]:
# ELIMINANDO COLUMNAS QUE NO SON NECESARIS:

df_user_items_explode.drop(['user_url'], axis=1, inplace=True)
df_user_items_explode.drop(['playtime_2weeks'], axis=1, inplace=True)
df_user_items_explode.drop(['steam_id'], axis=1, inplace=True)


In [10]:
df_user_items_explode.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170015 entries, 0 to 5170014
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int64  
 2   item_id           object 
 3   item_name         object 
 4   playtime_forever  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 197.2+ MB


In [11]:
df_user_items_explode.head(5)

Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever
0,76561197970982479,277,10,Counter-Strike,6.0
1,76561197970982479,277,20,Team Fortress Classic,0.0
2,76561197970982479,277,30,Day of Defeat,7.0
3,76561197970982479,277,40,Deathmatch Classic,0.0
4,76561197970982479,277,50,Half-Life: Opposing Force,0.0


In [12]:
#verificando existencia de valores nulos
df_user_items_explode.isnull().sum()


user_id                 0
items_count             0
item_id             16806
item_name           16806
playtime_forever    16806
dtype: int64

In [13]:
#eliminando valores nulos:
df_user_items_explode.dropna(subset=['item_id', 'item_name', 'playtime_forever'], inplace=True)


In [14]:
# verificando si hay valores duplicados
df_user_items_explode.duplicated().sum()


59117

In [15]:
#eliminando valores duplicados:
df_user_items_explode.drop_duplicates(inplace=True)


In [16]:
df_user_items_explode.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5094092 entries, 0 to 5170013
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int64  
 2   item_id           object 
 3   item_name         object 
 4   playtime_forever  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 233.2+ MB


In [19]:
#Convertir la columna 'item_id' a int32
df_user_items_explode['item_id'] = df_user_items_explode['item_id'].astype('int32')



In [18]:
df_user_items_explode.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5094092 entries, 0 to 5170013
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int64  
 2   item_id           int32  
 3   item_name         object 
 4   playtime_forever  float64
dtypes: float64(1), int32(1), int64(1), object(2)
memory usage: 213.8+ MB


In [20]:
df_user_items_explode

Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever
0,76561197970982479,277,10,Counter-Strike,6.0
1,76561197970982479,277,20,Team Fortress Classic,0.0
2,76561197970982479,277,30,Day of Defeat,7.0
3,76561197970982479,277,40,Deathmatch Classic,0.0
4,76561197970982479,277,50,Half-Life: Opposing Force,0.0
...,...,...,...,...,...
5170009,76561198329548331,7,346330,BrainBread 2,0.0
5170010,76561198329548331,7,373330,All Is Dust,0.0
5170011,76561198329548331,7,388490,One Way To Die: Steam Edition,3.0
5170012,76561198329548331,7,521570,You Have 10 Seconds 2,4.0


In [21]:
import pyarrow.parquet as pq
import gzip
import pyarrow as pa



# Ruta del archivo Parquet
parquet_file_path = 'df_user_items_explode.parquet'

# Guardar el DataFrame en formato Parquet
table = pa.Table.from_pandas(df_user_items_explode)
pq.write_table(table, parquet_file_path, compression='gzip')


In [22]:
# Ruta del archivo Parquet
parquet_file_path = 'df_user_items_explode.parquet'

# Cargar el archivo Parquet en un DataFrame de Pandas
df_items = pq.read_table(parquet_file_path).to_pandas()

# Ahora, df contiene los datos del archivo Parquet
df_items


Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever
0,76561197970982479,277,10,Counter-Strike,6.0
1,76561197970982479,277,20,Team Fortress Classic,0.0
2,76561197970982479,277,30,Day of Defeat,7.0
3,76561197970982479,277,40,Deathmatch Classic,0.0
4,76561197970982479,277,50,Half-Life: Opposing Force,0.0
...,...,...,...,...,...
5170009,76561198329548331,7,346330,BrainBread 2,0.0
5170010,76561198329548331,7,373330,All Is Dust,0.0
5170011,76561198329548331,7,388490,One Way To Die: Steam Edition,3.0
5170012,76561198329548331,7,521570,You Have 10 Seconds 2,4.0


In [23]:
csv_gzip_file_path = 'df_user_items_explode.csv.gz'

# Guardar el DataFrame en formato CSV comprimido con gzip
df_user_items_explode.to_csv(csv_gzip_file_path, index=False, compression='gzip')
