reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B

asin - ID of the product, e.g. 0000013714

reviewerName - name of the reviewer

vote - helpful votes of the review

style - a dictionary of the product metadata, e.g., "Format" is "Hardcover"

reviewText - text of the review

overall - rating of the product

summary - summary of the review

unixReviewTime - time of the review (unix time)

reviewTime - time of the review (raw)

image - images that users post after they have received the product

In [None]:
import pandas as pd
import io
import json
import gzip
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Configure pandas to show every column when displaying a DataFrame
pd.set_option('display.max_columns', None)


In [None]:
# Install pandas and openpyxl into the notebook environment.
# NOTE(review): this bare `pip install` line is not Python syntax; it presumably
# relies on the notebook front end handling it (e.g. `%pip`) — confirm.
pip install pandas openpyxl

In [None]:
### Load the review data

# The file holds one JSON object per line. It is opened in text mode with an
# explicit encoding, and blank lines are skipped so that a stray empty line
# does not make json.loads fail.
with gzip.open('Clothing_Shoes_and_Jewelry_5.json.gz', 'rt', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Total length of the list, i.e. the number of review records loaded
print(len(data))

# First record in the list
print(data[0])

In [None]:
# Build a pandas DataFrame from the list of parsed review records
df1 = pd.DataFrame(data)

# Number of rows in the reviews DataFrame
print(df1.shape[0])

In [None]:
# Preview the first five rows of the reviews DataFrame
df1.head(5)

In [None]:
### Load the product metadata

# The file holds one JSON object per line. It is opened in text mode with an
# explicit encoding, and blank lines are skipped so that a stray empty line
# does not make json.loads fail.
with gzip.open('meta_Clothing_Shoes_and_Jewelry.json.gz', 'rt', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Total length of the list, i.e. the number of metadata records loaded
print(len(data))

# First record in the list
print(data[0])

In [None]:
# Build a pandas DataFrame from the list of parsed metadata records
df2 = pd.DataFrame(data)

# Number of rows in the metadata DataFrame
print(df2.shape[0])

In [None]:
# Optionally save the metadata DataFrame in Excel format.
# The export is disabled by default. The success message is printed only when
# the file has actually been written, so the output never reports a save that
# did not happen.
SAVE_META_TO_EXCEL = False

if SAVE_META_TO_EXCEL:
    df2.to_excel('nombre_del_archivo_salida.xlsx', index=False, engine='openpyxl')
    print("Archivo convertido y guardado con éxito.")

In [None]:
# Join reviews with product metadata on the shared `asin` column
# (no `how` is given, so pandas' default inner join applies).
result = df1.merge(df2, on="asin")

In [None]:
# Preview the first two rows of the merged DataFrame
result.head(2)

In [None]:
# Print a summary of the merged DataFrame
result.info()

In [None]:
# Bind `data` to the merged DataFrame (a second name for the same object, not a copy)
data = result

In [None]:
# Columns to remove from the DataFrame
columns_to_drop = [
    'feature', 'vote', 'style', 'reviewerName',
    'image', 'title', 'date', 'imageURL',
    'imageURLHighRes', 'also_view', 'also_buy', 'fit',
    'details', 'similar_item', 'tech1', 'description',
]

# Drop them; errors='ignore' skips any listed column that is not present
data = data.drop(columns_to_drop, axis=1, errors='ignore')

In [None]:
# Remove, in place, every row of `data` that has at least one missing value
data.dropna(axis=0, how='any', inplace=True)

In [None]:
# Extract the first number (digits with optional thousands separators) from each
# `rank` string. The pattern is not anchored, so the number is matched wherever
# it first appears; it must start with a digit, so a lone comma cannot match.
# Rows without a match yield NaN.
extracted_series = data['rank'].str.extract(r'(\d[\d,]*)', expand=False)

# Remove the commas, treat missing ranks as 0, and convert to int
data['extracted_rank'] = (
    extracted_series.str.replace(',', '', regex=False).fillna('0').astype(int)
)

In [None]:
# Extract the first number from each price string; for a price range such as
# "$5.99 - $12.99" this keeps the first value. The pattern allows optional
# thousands separators and an optional decimal part, and the dot is escaped so
# it only matches a literal decimal point. The currency symbol is never part
# of the match, so no separate "$" removal is needed.
first_price = data['price'].str.extract(r'(\d[\d,]*(?:\.\d+)?)', expand=False)

# Remove the commas, then convert to float; rows with no number stay NaN
data['price'] = first_price.str.replace(',', '', regex=False).astype(float)

In [None]:
# Interpret unixReviewTime as seconds since the epoch, then keep only the date part
review_timestamps = pd.to_datetime(data['unixReviewTime'], unit='s')
data['reviewDate'] = review_timestamps.dt.date

In [None]:
# Keep only rows whose `verified` value equals True. The element-wise
# `== True` comparison is intentional. `.copy()` makes the result an
# independent DataFrame so that later column assignments on `data` do not
# trigger pandas' SettingWithCopyWarning.
data = data[data['verified'] == True].copy()  # noqa: E712

In [None]:
# Print a summary of the filtered DataFrame
data.info()

In [None]:
# Preview the first two rows of the filtered DataFrame
data.head(2)

In [None]:
# Columns to remove from the DataFrame
columns_to_drop = [
    'reviewTime',
    'unixReviewTime',
    'rank',
    'verified',
]

# Drop them; errors='ignore' skips any listed column that is not present
data = data.drop(columns_to_drop, axis=1, errors='ignore')

In [None]:
# Convert the reviewDate column to pandas datetime values
review_dates = pd.to_datetime(data['reviewDate'])
data['reviewDate'] = review_dates

In [None]:
# Preview the first two rows after the date conversion
data.head(2)

In [None]:
# Print a summary of the DataFrame after the date conversion
data.info()

In [None]:
# 1. Basic understanding: print the (rows, columns) shape of the DataFrame
print(data.shape)
#print(data.info())
#print(data.head())
#print(data.tail())

In [None]:
#unique_values_freq = data['verified'].value_counts()
#print(unique_values_freq)

In [None]:
# 2. Descriptive statistics
# Summary of the numeric columns
print(data.describe())
# Summary of the object-dtype (e.g. string) columns. The builtin `object` is
# used because the `np.object` alias no longer exists in current NumPy releases.
print(data.describe(include=[object]))

In [None]:
# 3. Missing-value handling: count the missing entries in each column
missing_values = data.isna().sum()
print(missing_values)
# How to handle these values can be decided later.

In [None]:
# Guardar el DataFrame en un archivo CSV
#data.to_csv('Clothing_Shoes_and_Jewelry.csv', index=False)  # 'index=False' para no guardar el índice del DataFrame

In [None]:
# Guardar el DataFrame en un archivo Excel
#data.to_excel('Clothing_Shoes_and_Jewelry.xlsx', index=False, engine='openpyxl')