In [44]:
import json
import pandas as pd
import os

# Paths: raw line-delimited JSON review files in, cleaned Parquet files out.
input_dir = r"C:\Users\yanin\OneDrive\Desktop\etl\data_lake\raw\google\review-California-20250406T172041Z-001\review-California"
output_dir = r"C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google"
os.makedirs(output_dir, exist_ok=True)

# Fields kept from every raw review record. They are passed explicitly to
# pd.DataFrame so the columns exist even when a file yields no valid rows;
# otherwise the type conversions below would fail with a KeyError.
review_columns = ["user_id", "name", "time", "rating", "text", "gmap_id"]

# Accumulates the cleaned DataFrame of every input file.
dataframes = []

for i in range(1, 19):  # files 1.json through 18.json inclusive
    file_path = os.path.join(input_dir, f"{i}.json")

    reviews = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no record; skip them instead of
                # reporting them as JSON parse errors.
                continue
            try:
                review = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error leyendo línea en archivo {i}.json: {e}")
                continue
            if not isinstance(review, dict):
                # Valid JSON that is not an object (e.g. null or a list) has
                # no .get(); skip it like any other unreadable line.
                print(f"Error leyendo línea en archivo {i}.json: se esperaba un objeto JSON")
                continue

            reviews.append({col: review.get(col) for col in review_columns})

    df = pd.DataFrame(reviews, columns=review_columns)

    # Type normalisation. Values that cannot be converted become NaT / NaN.
    df["time"] = pd.to_datetime(df["time"], unit='ms', errors='coerce')
    # NOTE(review): if user_id values exceed what float64 can represent exactly
    # (about 2**53), to_numeric may silently lose precision and merge distinct
    # users -- confirm against the raw data; keeping them as strings avoids it.
    df["user_id"] = pd.to_numeric(df["user_id"], errors='coerce')
    df["name"] = df["name"].astype("string")
    df["text"] = df["text"].astype("string")
    df["gmap_id"] = df["gmap_id"].astype("string")
    df = df.drop_duplicates()

    # Save the cleaned data of this single input file.
    output_path = os.path.join(output_dir, f"review-California-{i}.parquet")
    df.to_parquet(output_path, index=False)
    print(f"✅ Guardado individual: {output_path}")

    # Keep it for the combined output.
    dataframes.append(df)

# Combine all files; duplicates can also occur across files, so drop again.
df_total = pd.concat(dataframes, ignore_index=True)
df_total = df_total.drop_duplicates()

# Save the combined DataFrame.
final_output_path = os.path.join(output_dir, "review-California.parquet")
df_total.to_parquet(final_output_path, index=False)
print(f"\n🎉 Archivo final guardado: {final_output_path}")

# Final summary: shape and percentage of nulls per column.
print("\n📊 Dimensiones del archivo final:", df_total.shape)
print("\n❗ Nulos por columna (%):")
print((df_total.isnull().sum() / len(df_total) * 100).round(2))


✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-1.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-2.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-3.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-4.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-5.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-6.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-7.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-8.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-9.parquet
✅ Guardado

In [1]:
import json 
import pandas as pd
import os

# Paths: raw line-delimited JSON review files in, cleaned Parquet files out.
# Note: the output file names below are the same ones the earlier cell in this
# notebook writes (without the "state" column), so running this cell
# overwrites those files.
input_dir = r"C:\Users\yanin\OneDrive\Desktop\etl\data_lake\raw\google\review-California-20250406T172041Z-001\review-California"
output_dir = r"C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google"
os.makedirs(output_dir, exist_ok=True)

# Fields copied from every raw review record, plus the constant "state" column.
# The full column list is passed explicitly to pd.DataFrame so the columns
# exist even when a file yields no valid rows; otherwise the type conversions
# below would fail with a KeyError.
raw_fields = ["user_id", "name", "time", "rating", "text", "gmap_id"]
output_columns = raw_fields + ["state"]

# Accumulates the cleaned DataFrame of every input file.
dataframes = []

# Iterate over files 1.json through 18.json inclusive.
for i in range(1, 19):
    file_path = os.path.join(input_dir, f"{i}.json")
    reviews = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no record; skip them instead of
                # reporting them as JSON parse errors.
                continue
            try:
                review = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"❌ Error leyendo línea en archivo {i}.json: {e}")
                continue
            if not isinstance(review, dict):
                # Valid JSON that is not an object (e.g. null or a list) has
                # no .get(); skip it like any other unreadable line.
                print(f"❌ Error leyendo línea en archivo {i}.json: se esperaba un objeto JSON")
                continue

            record = {field: review.get(field) for field in raw_fields}
            record["state"] = "CA"  # every row in this dataset is tagged as California
            reviews.append(record)

    df = pd.DataFrame(reviews, columns=output_columns)

    # Type normalisation. Values that cannot be converted become NaT / NaN / <NA>.
    df["time"] = pd.to_datetime(df["time"], unit='ms', errors='coerce')
    # NOTE(review): if user_id values exceed what float64 can represent exactly
    # (about 2**53), to_numeric may silently lose precision and merge distinct
    # users -- confirm against the raw data; keeping them as strings avoids it.
    df["user_id"] = pd.to_numeric(df["user_id"], errors='coerce')
    df["name"] = df["name"].astype("string")
    df["text"] = df["text"].astype("string")
    df["gmap_id"] = df["gmap_id"].astype("string")
    df["state"] = df["state"].astype("string")
    # Nullable integer dtype so missing ratings stay <NA> instead of forcing floats.
    df["rating"] = pd.to_numeric(df["rating"], errors='coerce').astype("Int64")

    df = df.drop_duplicates()

    # Save the cleaned data of this single input file.
    output_path = os.path.join(output_dir, f"review-California-{i}.parquet")
    df.to_parquet(output_path, index=False)
    print(f"✅ Guardado individual: {output_path}")

    dataframes.append(df)

# Combine all files; duplicates can also occur across files, so drop again.
df_total = pd.concat(dataframes, ignore_index=True).drop_duplicates()

# Save the combined DataFrame.
final_output_path = os.path.join(output_dir, "review-California.parquet")
df_total.to_parquet(final_output_path, index=False)
print(f"\n🎉 Archivo final guardado: {final_output_path}")

# Final summary: shape and percentage of nulls per column.
print("\n📏 Dimensiones:", df_total.shape)
print("\n🧼 Nulos por columna (%):")
print((df_total.isnull().sum() / len(df_total) * 100).round(2))


✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-1.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-2.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-3.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-4.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-5.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-6.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-7.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-8.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-California-9.parquet
✅ Guardado