In [1]:
import json
import pandas as pd

In [2]:

file_json = 'C:/Users/Mauro/Desktop/HENRY/Proyecto_individual/PI_1_MLOps/PI_MLOps-STEAM/users_items.json/australian_users_items.json'

# Listas para almacenar los diccionarios y los números de línea con errores
data_list = []
lineas_con_errores = []

# Contadores para el total de líneas y registros leídos correctamente
total_lineas = 0
registros_correctos = 0

# Abrir el archivo y leerlo línea por línea
with open(file_json, 'r', encoding='utf-8') as archivo:
    for num_linea, linea in enumerate(archivo, start=1):
        total_lineas += 1
        try:
            # Intentar cargar la línea como JSON
            data = json.loads(linea)
            data_list.append(data)
            registros_correctos += 1
        except json.JSONDecodeError:
            try:
                # Si falla como JSON, intentar cargarlo como diccionario
                data = eval(linea)  # Usa eval para interpretar la línea como un diccionario
                if isinstance(data, dict):
                    data_list.append(data)
                    registros_correctos += 1
                else:
                    lineas_con_errores.append(num_linea)
            except Exception as e:
                lineas_con_errores.append(num_linea)

# Crear un DataFrame a partir de la lista de diccionarios
df_users_items = pd.DataFrame(data_list)

# Guarda el DataFrame en un archivo CSV
df_users_items.to_csv('users_items.csv', index=False)

In [3]:
# Se importa el dataset a analizar (previamente transformado)
df_users_items = pd.read_csv('users_items.csv')

In [4]:
# Se observa la cantidad de registros y columnas en el dataset
print("Número de registros:", len(df_users_items))
print("Número de columnas:", len(df_users_items.columns))

Número de registros: 88310
Número de columnas: 5


In [5]:
# Se verifica la información de tipos de datos en el dataset
df_users_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  int64 
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.4+ MB


In [6]:
df_users_items.head(5)

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [7]:
# Manejo de Datos Nulos: Eliminar filas que contienen todos los casilleros vacíos (NaN)
df_users_items = df_users_items.dropna(how='all')
df_users_items.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  int64 
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.4+ MB


In [8]:
# Eliminar duplicados, si los hubiera
df_users_items.drop_duplicates(inplace=True)
df_users_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87653 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      87653 non-null  object
 1   items_count  87653 non-null  int64 
 2   steam_id     87653 non-null  int64 
 3   user_url     87653 non-null  object
 4   items        87653 non-null  object
dtypes: int64(2), object(3)
memory usage: 4.0+ MB


In [9]:
# Obtener la información completa de la columna "items"
user_id_column = df_users_items["items"]

# Imprimir los primeros 10 valores de la columna (puedes ajustar el número según tu necesidad)
print(user_id_column.head(10))

# Imprimir estadísticas resumidas de la columna
print(user_id_column.describe())


0    [{'item_id': '10', 'item_name': 'Counter-Strik...
1    [{'item_id': '10', 'item_name': 'Counter-Strik...
2    [{'item_id': '1200', 'item_name': 'Red Orchest...
3    [{'item_id': '10', 'item_name': 'Counter-Strik...
4    [{'item_id': '300', 'item_name': 'Day of Defea...
5    [{'item_id': '50', 'item_name': 'Half-Life: Op...
6    [{'item_id': '240', 'item_name': 'Counter-Stri...
7    [{'item_id': '220', 'item_name': 'Half-Life 2'...
8    [{'item_id': '240', 'item_name': 'Counter-Stri...
9                                                   []
Name: items, dtype: object
count     87653
unique    68902
top          []
freq      16714
Name: items, dtype: object


In [10]:
# Desanidar la columna "items" usando pd.json_normalize
df_desanidado = pd.json_normalize(data_list, record_path=['items'], meta=['steam_id','items_count','user_id', 'user_url'] )

# Define el nuevo orden de las columnas
data_items_resultante = df_desanidado[['user_id', 'items_count', 'steam_id', 'user_url', 'item_id', 'item_name', 'playtime_forever', 'playtime_2weeks']]


In [11]:
data_items_resultante.head(5)

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6,0
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,20,Team Fortress Classic,0,0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,30,Day of Defeat,7,0
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,40,Deathmatch Classic,0,0
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,50,Half-Life: Opposing Force,0,0


In [12]:
# Guarda el DataFrame en un archivo CSV
data_items_resultante.to_csv('users_items_final.csv', index=False)

In [13]:
data_items_resultante.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5153209 entries, 0 to 5153208
Data columns (total 8 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_id           object
 1   items_count       object
 2   steam_id          object
 3   user_url          object
 4   item_id           object
 5   item_name         object
 6   playtime_forever  int64 
 7   playtime_2weeks   int64 
dtypes: int64(2), object(6)
memory usage: 314.5+ MB


In [14]:
# Identificar valores nulos
null_columns = data_items_resultante.columns[data_items_resultante.isnull().any()]
missing_data = data_items_resultante[null_columns].isnull().sum()
print(missing_data)


Series([], dtype: float64)


In [15]:
#Tratar los valores nulos
data_items_resultante = data_items_resultante.dropna()  # Eliminar filas con valores nulos
data_items_resultante['items_count'].fillna(0, inplace=True)  # Reemplazar valores nulos con 0


In [16]:
#Revisar el tipo de datos
data_items_resultante['items_count'] = pd.to_numeric(data_items_resultante['items_count'], errors='coerce')


In [17]:
#Verificar los datos limpios
print(data_items_resultante.isnull().sum())


user_id             0
items_count         0
steam_id            0
user_url            0
item_id             0
item_name           0
playtime_forever    0
playtime_2weeks     0
dtype: int64


In [18]:
data_items_resultante.head(5)

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6,0
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,20,Team Fortress Classic,0,0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,30,Day of Defeat,7,0
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,40,Deathmatch Classic,0,0
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,50,Half-Life: Opposing Force,0,0


In [19]:
# Guarda el DataFrame en un archivo CSV
data_items_resultante.to_parquet('users_items_final.parquet', index=False)