# Librerías

In [79]:
import pandas as pd
import numpy as np
import ast
import json
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import matplotlib.pyplot as plt
sns.set()

# _________________________________________

# EDA y normalización inicial

### Funciones auxiliares

In [4]:
# Función que carga de los archivos JSON

def cargar_json(ruta_json):
    data = []
    with open(ruta_json, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(ast.literal_eval(line))
    return pd.DataFrame(data)

### Archivo _'user_reviews'_

In [9]:
# Cargar el archivo 'user_reviews' y mostrarlo.

df_user_reviews = cargar_json('/Users/mlucchesi/Henry/PI/data/user_reviews.json')
df_user_reviews.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


In [6]:
# Como la columna 'reviews' está anidada, se usa la función normalize.

df_reviews = pd.json_normalize(df_user_reviews.reviews)
df_reviews.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,"{'funny': '', 'posted': 'Posted November 5, 20...","{'funny': '', 'posted': 'Posted July 15, 2011....","{'funny': '', 'posted': 'Posted April 21, 2011...",,,,,,,
1,"{'funny': '', 'posted': 'Posted June 24, 2014....","{'funny': '', 'posted': 'Posted September 8, 2...","{'funny': '', 'posted': 'Posted November 29, 2...",,,,,,,
2,"{'funny': '', 'posted': 'Posted February 3.', ...","{'funny': '', 'posted': 'Posted December 4, 20...","{'funny': '', 'posted': 'Posted November 3, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...",,,,
3,"{'funny': '', 'posted': 'Posted October 14, 20...","{'funny': '', 'posted': 'Posted July 28, 2012....","{'funny': '', 'posted': 'Posted June 2, 2012.'...","{'funny': '', 'posted': 'Posted June 29, 2014....","{'funny': '', 'posted': 'Posted November 22, 2...","{'funny': '', 'posted': 'Posted February 23, 2...",,,,
4,"{'funny': '3 people found this review funny', ...","{'funny': '1 person found this review funny', ...","{'funny': '2 people found this review funny', ...","{'funny': '', 'posted': 'Posted July 11, 2013....",,,,,,


In [16]:
# Las columnas 0-5 siguen anidadas, se vuelven a normalizar

reviews_cols = []

for i in range(6):
    reviews_col = pd.json_normalize(df_reviews[i])
    reviews_cols.append(reviews_col)

In [19]:
# Se concatenan dichas columnas

df_reviews_rows = pd.concat([reviews_cols[0], reviews_cols[1], reviews_cols[2], reviews_cols[3], reviews_cols[4], reviews_cols[5]], ignore_index=True)
df_reviews_rows.head()

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,,Posted February 3.,,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,,"Posted October 14, 2013.",,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,3 people found this review funny,"Posted April 15, 2014.",,211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [None]:
# Se dropea la columna original de 'reviews'

df_user_reviews.drop(columns='reviews', inplace=True)

In [73]:
# Se concatenan las nuevas columnas

df_user_reviews_fixed = pd.concat([df_user_reviews, df_reviews_rows],axis=1)
df_user_reviews_fixed.head(1)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...


In [25]:
df_user_reviews_fixed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154794 entries, 0 to 154793
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      25799 non-null  object
 1   user_url     25799 non-null  object
 2   funny        55832 non-null  object
 3   posted       55832 non-null  object
 4   last_edited  55832 non-null  object
 5   item_id      55832 non-null  object
 6   helpful      55832 non-null  object
 7   recommend    55832 non-null  object
 8   review       55832 non-null  object
dtypes: object(9)
memory usage: 10.6+ MB


In [75]:
# Se calcula los porcentajes de nulos por columna

nulos_user_reviews = (df_user_reviews_fixed.isnull().mean() * 100).round(2)
nulos_user_reviews

user_id        83.33
user_url       83.33
funny          63.93
posted         63.93
last_edited    63.93
item_id        63.93
helpful        63.93
recommend      63.93
review         63.93
dtype: float64

In [27]:
# Se exporta a CSV

df_user_reviews_fixed.to_csv('/Users/mlucchesi/Henry/PI/data/csv/df_user_reviews_fixed.csv')

### Archivo _'steam_games'_

In [None]:
# Cargar el archivo 'steam_games' y mostrarlo.

df_steam_games = pd.read_json('/Users/mlucchesi/Henry/PI/data/json/steam_games.json', lines=True)
df_steam_games.tail() # Se muestran los ultimos registros ya que los primeros son nulos

In [33]:
df_steam_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 11.9+ MB


In [34]:
# Se exporta a CSV

df_steam_games.to_csv('/Users/mlucchesi/Henry/PI/data/csv/df_steam_games.csv')

### Archivo _'users_items'_

In [None]:
# Cargar el archivo

df_users_items = cargar_json('/Users/mlucchesi/Henry/PI/data/json/users_items.json')

In [55]:
# Mostrarlo 

df_users_items.head(1)


Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."


In [63]:
# Se normaliza la última columna de 'items' y se guarda en un dataframe separado

lista_df = []

for row in df_users_items['items']:
    item_normalizado = pd.json_normalize(row)
    lista_df.append(item_normalizado)

df_items = pd.concat(lista_df, ignore_index=True)

In [64]:
# Se muestra el dataframe de los items

df_items.head()

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks
0,10,Counter-Strike,6,0
1,20,Team Fortress Classic,0,0
2,30,Day of Defeat,7,0
3,40,Deathmatch Classic,0,0
4,50,Half-Life: Opposing Force,0,0


In [None]:
# Se elimina la columna original de 'items' para concatenar el nuevo dataframe con la información normalizada

df_users_items.drop(columns=['items'], inplace=True)

In [71]:
df_users_items_fixed = pd.concat([df_users_items, df_items], axis=1)

In [3]:
# Se muestra el dataframe completo

df_users_items_fixed.info()

NameError: name 'df_users_items_fixed' is not defined

In [76]:
nulos_users_items = (df_users_items_fixed.isnull().mean() * 100).round(2)
nulos_users_items

user_id             98.29
items_count         98.29
steam_id            98.29
user_url            98.29
item_id              0.00
item_name            0.00
playtime_forever     0.00
playtime_2weeks      0.00
dtype: float64

In [86]:
# Exporto los datos en un CSV

df_users_items_fixed.to_csv('/Users/mlucchesi/Henry/PI/data/csv/df_users_items_fixed.csv')