In [7]:
import json
import pandas as pd
import os

# Paths: raw Wyoming review dumps (input) and the cleaned parquet folder (output).
input_dir = r"C:\Users\yanin\OneDrive\Desktop\etl\data_lake\raw\google\review-Wyoming-20250406T172136Z-001\review-Wyoming"
output_dir = r"C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google"

# Output columns, in order. Every column except "state" is copied from the raw
# review object; "state" is filled with the constant "WY".
REVIEW_COLUMNS = ["user_id", "name", "time", "rating", "text", "gmap_id", "state"]


def parse_review_lines(lines, source_name):
    """Parse JSON-lines review records into a list of flat dicts.

    Args:
        lines: Iterable of text lines, each expected to hold one JSON object.
        source_name: Label used in error messages (e.g. ``"1.json"``).

    Returns:
        A list of dicts keyed by ``REVIEW_COLUMNS``. Fields missing from a
        record are ``None``. Blank lines are skipped silently; malformed lines
        and JSON values that are not objects are reported with ``print`` and
        skipped.
    """
    rows = []
    for line in lines:
        payload = line.strip()
        if not payload:
            # A blank (e.g. trailing) line is not a broken record.
            continue
        try:
            review = json.loads(payload)
        except json.JSONDecodeError as e:
            print(f"❌ Error en línea del archivo {source_name}: {e}")
            continue
        if not isinstance(review, dict):
            # Valid JSON but not an object (list, number, null...): it has no
            # fields to read, so report it instead of crashing on ``.get``.
            print(f"❌ Error en línea del archivo {source_name}: se esperaba un objeto JSON")
            continue
        rows.append({
            "user_id": review.get("user_id"),
            "name": review.get("name"),
            "time": review.get("time"),
            "rating": review.get("rating"),
            "text": review.get("text"),
            "gmap_id": review.get("gmap_id"),
            "state": "WY",
        })
    return rows


def build_clean_frame(rows):
    """Build a typed, de-duplicated DataFrame from parsed review rows.

    Args:
        rows: List of dicts as produced by ``parse_review_lines``.

    Returns:
        A DataFrame with the ``REVIEW_COLUMNS`` columns, converted to their
        target dtypes and with exact duplicate rows dropped. An empty ``rows``
        list yields an empty frame that still has every column.
    """
    # Passing the column list keeps the schema stable even when ``rows`` is
    # empty; otherwise the column assignments below would raise KeyError.
    df = pd.DataFrame(rows, columns=REVIEW_COLUMNS)

    # "time" is interpreted as a millisecond timestamp; unparseable values
    # become NaT.
    df["time"] = pd.to_datetime(df["time"], unit='ms', errors='coerce')
    # NOTE(review): if user ids are wider than 64 bits this coercion falls back
    # to float64 and loses precision -- confirm the id range, and consider
    # storing ids as strings if so.
    df["user_id"] = pd.to_numeric(df["user_id"], errors='coerce')
    df["name"] = df["name"].astype("string")
    df["text"] = df["text"].astype("string")
    df["gmap_id"] = df["gmap_id"].astype("string")
    df["rating"] = pd.to_numeric(df["rating"], errors='coerce').astype("Int64")
    df["state"] = df["state"].astype("string")
    return df.drop_duplicates()


# The run stays at script level (not inside a function) so that ``df``,
# ``dataframes``, ``df_total`` etc. remain available as globals afterwards.
if __name__ == "__main__":
    os.makedirs(output_dir, exist_ok=True)

    # Accumulates the cleaned DataFrame of every input file.
    dataframes = []
    # Rows parsed across all files BEFORE any de-duplication, so the summary
    # can report how many duplicates were actually removed.
    rows_read = 0

    # Process 1.json, 2.json and 3.json.
    for i in range(1, 4):
        file_path = os.path.join(input_dir, f"{i}.json")

        with open(file_path, 'r', encoding='utf-8') as f:
            reviews = parse_review_lines(f, f"{i}.json")
        rows_read += len(reviews)

        df = build_clean_frame(reviews)

        # Save the per-file result.
        output_path = os.path.join(output_dir, f"review-Wyoming-{i}.parquet")
        df.to_parquet(output_path, index=False)
        print(f"✅ Guardado individual: {output_path}")

        dataframes.append(df)

    # Merge all files; this second pass removes duplicates that span files.
    df_total = pd.concat(dataframes, ignore_index=True).drop_duplicates()

    # Save the merged result.
    final_output_path = os.path.join(output_dir, "review-Wyoming.parquet")
    df_total.to_parquet(final_output_path, index=False)
    print(f"\n🎉 Archivo final guardado: {final_output_path}")

    # Summary.
    print("\n📏 Dimensiones:", df_total.shape)
    print("\n🧼 Nulos por columna (%):")
    print((df_total.isnull().sum() / len(df_total) * 100).round(2))
    # Calling duplicated() on the already de-duplicated frame is always 0;
    # report the number of rows dropped instead.
    print("\n🔁 Duplicados eliminados antes de guardar:", rows_read - len(df_total))


✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-Wyoming-1.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-Wyoming-2.parquet
✅ Guardado individual: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-Wyoming-3.parquet

🎉 Archivo final guardado: C:\Users\yanin\OneDrive\Desktop\etl\data_lake\clean\google\review-Wyoming.parquet

📏 Dimensiones: (423818, 7)

🧼 Nulos por columna (%):
user_id     0.00
name        0.00
time        0.00
rating      0.00
text       45.55
gmap_id     0.00
state       0.00
dtype: float64

🔁 Duplicados eliminados antes de guardar: 0
