In [1]:
import boto3, json, os
import pandas as pd
from botocore.config import Config

In [2]:
# Bucket used both for listing/downloading the input objects and for
# uploading the merged result.
S3_BUCKET = "hab-ree-data-json"
# Key prefix handed to the S3 listing call to select the input objects.
# NOTE(review): there is no trailing "/", so keys under any sibling prefix
# that merely starts with this text would also be listed — confirm intended.
S3_PREFIX = "demanda-real"
# Object key under which the merged JSON document is uploaded.
OUTPUT_KEY = "merged/ree-data_full.json"

In [3]:
# Read the AWS settings from a local JSON file and export each one as an
# environment variable of the same name. A missing entry raises KeyError.
with open("credentials/aws_keys.json") as f:
    keys = json.load(f)

for env_name in (
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "AWS_DEFAULT_REGION",
):
    os.environ[env_name] = keys[env_name]

In [4]:
# S3 client created with boto3 defaults: no credentials or region are passed
# explicitly, so boto3 is expected to resolve them from the AWS_* environment
# variables populated from the credentials file.
s3 = boto3.client("s3")

In [5]:
def _extract_records(data):
    """Return the records held by one decoded JSON document.

    Two layouts are recognised:

    * a dict carrying the records under ``data["indicator"]["values"]``;
    * any other document, which is used as-is when it is a list.

    Documents that cannot supply records return an empty list so the caller
    can skip them instead of aborting the whole merge. This covers a missing
    or null ``indicator``, an ``indicator`` that is not an object, a missing
    or empty ``values``, and a top-level scalar.
    """
    if isinstance(data, dict):
        indicator = data.get("indicator")
        # A default of {} only covers a *missing* key; "indicator": null would
        # still hand back None and make a chained .get() raise AttributeError.
        if not isinstance(indicator, dict):
            return []
        return indicator.get("values") or []
    # Non-dict JSON: only a list can be turned into rows.
    return data if isinstance(data, list) else []


dfs = []
skipped_keys = []  # JSON objects that were read but yielded no records
paginator = s3.get_paginator("list_objects_v2")

for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_PREFIX):
    # "Contents" is absent on pages without matches; `or []` also covers None.
    for obj in page.get("Contents", []) or []:
        key = obj["Key"]
        size = obj.get("Size", 0)

        if size == 0 or not key.endswith(".json"):
            continue  # skip folder placeholders and non-JSON files

        body = s3.get_object(Bucket=S3_BUCKET, Key=key)["Body"].read()
        data = json.loads(body)

        values = _extract_records(data)
        if values:
            dfs.append(pd.DataFrame(values))
        else:
            skipped_keys.append(key)

# Concatenate once at the end; fall back to an empty frame when nothing was read.
full_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
print("Filas totales:", len(full_df))
if skipped_keys:
    print("Ficheros sin registros (omitidos):", skipped_keys)

# Serialise as a JSON list of records (orient="records") rather than the
# "split" layout; force_ascii=False keeps non-ASCII characters unescaped.
payload = full_df.to_json(orient="records", force_ascii=False)

s3.put_object(
    Bucket=S3_BUCKET,
    Key=OUTPUT_KEY,
    Body=payload,
    ContentType="application/json",
    # Row count stored as user metadata on the uploaded object.
    Metadata={"row_count": str(len(full_df))}
)

print(f"Merged listo: s3://{S3_BUCKET}/{OUTPUT_KEY}")

Filas totales: 4276
Merged listo: s3://hab-ree-data-json/merged/ree-data_full.json
