# **Instalar dependencias**

In [1]:
# Ejecutar en Colab: instala todo lo necesario
!apt-get update -qq
!apt-get install -y -qq default-jdk graphviz

# Python packages
!pip install -q kaggle pyspark pandas pyarrow numpy scikit-learn tensorflow streamlit pyngrok folium streamlit-folium requests matplotlib

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


# **Subir kaggle.json**

In [2]:
# Ejecutar en Colab y subir el archivo kaggle.json mediante el di√°logo de archivos
from google.colab import files
print("Sube tu kaggle.json desde tu m√°quina cuando aparezca la ventana.")
uploaded = files.upload()

Sube tu kaggle.json desde tu m√°quina cuando aparezca la ventana.


Saving kaggle.json to kaggle.json


# **Configurar Kaggle y descargar dataset**

In [3]:
# Configurar Kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Descargar dataset (es grande). Se descarga el ZIP completo.
!kaggle datasets download -d elemento/nyc-yellow-taxi-trip-data -p /content --unzip

Dataset URL: https://www.kaggle.com/datasets/elemento/nyc-yellow-taxi-trip-data
License(s): U.S. Government Works
Downloading nyc-yellow-taxi-trip-data.zip to /content
 99% 1.77G/1.78G [00:26<00:00, 14.0MB/s]
100% 1.78G/1.78G [00:26<00:00, 71.3MB/s]


# **Revisar archivos y listar archivos CSV**

In [4]:
import glob, os
files = glob.glob("/content/*.csv") + glob.glob("/content/data/*.csv") + glob.glob("/content/nyc_taxi_data/*.csv")
files = sorted(list(set(files)))
print(f"Archivos CSV encontrados ({len(files)}):")
for f in files[:50]:
    print(f)

Archivos CSV encontrados (4):
/content/yellow_tripdata_2015-01.csv
/content/yellow_tripdata_2016-01.csv
/content/yellow_tripdata_2016-02.csv
/content/yellow_tripdata_2016-03.csv


# **Crear sample manejable y preparar para PySpark**

In [5]:
# Ajustar SAMPLE_ROWS si tu Colab tiene suficiente RAM
SAMPLE_ROWS = 300000   # recomendable 200k-500k para Colab

import pandas as pd
import glob
import os

# Lista de csv de trips (cambiar patr√≥n si difiere)
csv_files = [f for f in files if "zone" not in os.path.basename(f).lower()]

print("Archivos de trips a procesar:", len(csv_files))

rows_needed = SAMPLE_ROWS
dfs = []
for f in csv_files:
    if rows_needed <= 0:
        break
    try:
        df_chunk = pd.read_csv(f, nrows=rows_needed)
    except Exception as e:
        print("Error leyendo", f, e)
        continue
    dfs.append(df_chunk)
    rows_needed -= len(df_chunk)
    print(f"Le√≠do {len(df_chunk)} filas desde {os.path.basename(f)}; resto {rows_needed}")

if len(dfs)==0:
    raise SystemExit("No se pudieron leer archivos CSV ‚Äî revisa paths")

df_sample = pd.concat(dfs, ignore_index=True)
print("Total filas en sample:", len(df_sample))
df_sample.head()
# Guardar parquet para uso r√°pido
df_sample.to_parquet("nyc_sample.parquet", index=False)
print("Guardado: nyc_sample.parquet")

Archivos de trips a procesar: 4
Le√≠do 300000 filas desde yellow_tripdata_2015-01.csv; resto 0
Total filas en sample: 300000
Guardado: nyc_sample.parquet


# **Descargar taxi_zone_lookup**

In [6]:
# Descarga de taxi+_zone_lookup
!wget https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv -O taxi_zone_lookup.csv  # Fuente: repositorio oficial de NYC TLC

--2025-11-16 04:25:14--  https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.154.99.47, 18.154.99.220, 18.154.99.225, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.154.99.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12331 (12K) [text/csv]
Saving to: ‚Äòtaxi_zone_lookup.csv‚Äô


2025-11-16 04:25:14 (13.0 MB/s) - ‚Äòtaxi_zone_lookup.csv‚Äô saved [12331/12331]



In [7]:
zone_df = pd.read_csv("taxi_zone_lookup.csv")
print("Guardado taxi_zone_lookup.csv")

Guardado taxi_zone_lookup.csv


# **Iniciar PySpark (para ETL grande)**

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("NYC_Taxi_ETL") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x7ac92a2efd70>


# **Preprocesamiento (limpieza y features)**

In [9]:
import pandas as pd
import numpy as np

df = pd.read_parquet("nyc_sample.parquet")
print("Columnas disponibles:", df.columns.tolist())

# Normalizar nombres m√°s comunes de columnas (var√≠an seg√∫n a√±o)
# Intentar detectar nombres para pickup datetime, location ids y coords
possible_pickup_cols = [c for c in df.columns if 'pickup' in c.lower() and 'datetime' in c.lower()]
possible_dropoff_cols = [c for c in df.columns if 'dropoff' in c.lower() and 'datetime' in c.lower()]

print("Pickup cols candidate:", possible_pickup_cols)
pickup_col = possible_pickup_cols[0] if possible_pickup_cols else None
dropoff_col = possible_dropoff_cols[0] if possible_dropoff_cols else None

# Location ID cols (PULocationID, PULocationID etc.)
# Note: These older files might not have PULocationID, but use lat/lon directly.
loc_cols = [c for c in df.columns if 'pulocationid' in c.lower() or 'pickup_locationid' in c.lower() or 'locationid' in c.lower()]
print("Location ID candidates:", loc_cols)
pu_loc_id_col = loc_cols[0] if loc_cols else None

# Coordinates
lat_cols = [c for c in df.columns if 'pickup_lat' in c.lower() or 'pickup_latitude' in c.lower()]
lon_cols = [c for c in df.columns if 'pickup_lon' in c.lower() or 'pickup_longitude' in c.lower()]
pu_lat = lat_cols[0] if lat_cols else None
pu_lon = lon_cols[0] if lon_cols else None

# Basic rename to canonical names
if pickup_col:
    df.rename(columns={pickup_col: "tpep_pickup_datetime"}, inplace=True)
if dropoff_col:
    df.rename(columns={dropoff_col: "tpep_dropoff_datetime"}, inplace=True)

# If a location ID column was found, rename it to PULocationID
if pu_loc_id_col:
    df.rename(columns={pu_loc_id_col: "PULocationID"}, inplace=True)
    # Convert location id to int if possible
    try:
        df["PULocationID"] = df["PULocationID"].astype(int)
    except Exception as e:
        print(f"Warning: Could not convert 'PULocationID' to int: {e}, keeping original type.")

# If PULocationID is still not in df.columns, create a synthetic one from lat/lon
if "PULocationID" not in df.columns and pu_lat and pu_lon:
    print("Warning: 'PULocationID' not found in the dataset. Creating synthetic 'PULocationID' from binned latitude and longitude.")
    # Binning latitude and longitude to create a pseudo-zone ID
    # A bin size of 0.01 degree latitude is roughly 1.1 km.
    # Adjust `round_to` for desired granularity. 2 means 2 decimal places.
    round_to = 2
    df["PULocationID"] = (df["pickup_latitude"].round(round_to)).astype(str) + "_" + \
                         (df["pickup_longitude"].round(round_to)).astype(str)
    print(f"Synthetic 'PULocationID' created using binned lat/lon (rounded to {round_to} decimal places).")
elif "PULocationID" not in df.columns:
    print("Warning: Neither 'PULocationID' nor valid latitude/longitude columns found to create a location identifier for grouping.")

# If coordinates were found, rename them to canonical names
if pu_lat:
    df.rename(columns={pu_lat: "pickup_latitude"}, inplace=True)
if pu_lon:
    df.rename(columns={pu_lon: "pickup_longitude"}, inplace=True)


# Convert datetimes y crear features temporales
df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"], errors="coerce")
df = df.dropna(subset=["tpep_pickup_datetime"])

df["pickup_hour"] = df["tpep_pickup_datetime"].dt.hour
df["pickup_day"] = df["tpep_pickup_datetime"].dt.dayofweek  # 0=Monday
df["pickup_date"] = df["tpep_pickup_datetime"].dt.date
df["pickup_month"] = df["tpep_pickup_datetime"].dt.month

# Trip duration (min) if dropoff exists
if "tpep_dropoff_datetime" in df.columns:
    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"], errors="coerce")
    df["trip_duration_min"] = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60.0

# Trip distance fallback
if "trip_distance" not in df.columns and "Trip_distance" in df.columns:
    df.rename(columns={"Trip_distance":"trip_distance"}, inplace=True)

# Filtrar viajes v√°lidos
if "trip_duration_min" in df.columns:
    df = df[(df["trip_duration_min"]>0) & (df["trip_duration_min"]<180)]

# Handle location null filtering based on what's available
if "PULocationID" in df.columns:
    df = df[df["PULocationID"].notnull()]
elif "pickup_latitude" in df.columns and "pickup_longitude" in df.columns:
    print("No 'PULocationID' found, filtering based on 'pickup_latitude' and 'pickup_longitude' not being null.")
    df = df[df["pickup_latitude"].notnull() & df["pickup_longitude"].notnull()]
else:
    print("Warning: No 'PULocationID' or valid latitude/longitude columns found for location filtering.")


print("After preprocessing rows:", len(df))
df.head()
# Guardar el preprocesado (opcional)
df.to_parquet("nyc_preprocessed.parquet", index=False)

Columnas disponibles: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'pickup_longitude', 'pickup_latitude', 'RateCodeID', 'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount']
Pickup cols candidate: ['tpep_pickup_datetime']
Location ID candidates: []
Synthetic 'PULocationID' created using binned lat/lon (rounded to 2 decimal places).
After preprocessing rows: 299427


# **Crear variable objetivo: demanda por zona/hora**

In [10]:
# Agrupar para obtener demanda (viajes) por PULocationID, hora y dia
demand = df.groupby(["PULocationID", "pickup_date", "pickup_hour"]).size().reset_index(name="trips")
print("Total rows demand:", len(demand))
# Agregar agregados por zona+hora (media hist√≥rica) - √∫til para predecir demanda promedio por hora
demand_hourly = demand.groupby(["PULocationID", "pickup_hour"]).agg(
    demand_mean=("trips", "mean"),
    demand_median=("trips", "median"),
    demand_std=("trips", "std"),
    samples=("trips", "count")
).reset_index()
demand_hourly.head()
demand_hourly.to_parquet("demand_hourly.parquet", index=False)

Total rows demand: 40998


# **Preparar features para ML (sk-learn + RandomForest)**

In [11]:
# Preparar la tabla de entrenamiento de la demanda original (zone+date+hour)
train_tbl = demand.copy()
# Agregar el atributo month de pickup_date (convertir)
train_tbl["month"] = pd.to_datetime(train_tbl["pickup_date"]).dt.month
train_tbl["dow"] = pd.to_datetime(train_tbl["pickup_date"]).dt.dayofweek

# Para el modelado, agregar por zone+hour+month+dow -> predecir viajes (contar)
X = train_tbl[["PULocationID", "pickup_hour", "month", "dow"]]
y = train_tbl["trips"]

# Codificar PULocationID como ordinal para mantener un pipeline simple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_enc = X.copy()
X_enc["PULocationID_enc"] = enc.fit_transform(X[["PULocationID"]])
X_enc = X_enc[["PULocationID_enc", "pickup_hour", "month", "dow"]]

X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(n_estimators=100, max_depth=12, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print(f"RF RMSE: {rmse:.3f}  MAE: {mae:.3f}")

# Guadar el modelo y el encoder
import joblib
joblib.dump(rf, "rf_demand_model.joblib")
joblib.dump(enc, "ordinal_encoder.joblib")
print("Model and encoder saved.")

RF RMSE: 6.029  MAE: 3.271
Model and encoder saved.


# **(Opcional) Modelo TensorFlow ligero**

(Opcional dado que RandomForest suele ser suficiente y m√°s interpretable).

In [12]:
# Versi√≥n ligera con TF (opcional) ‚Äî entrenar una red densa peque√±a sobre mismas features
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_enc)

Xtr, Xte, ytr, yte = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(Xtr.shape[1],)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(1)
])
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.fit(Xtr, ytr, validation_data=(Xte, yte), epochs=5, batch_size=1024)

# Guardar modelo TF y scaler
model.save("tf_demand_model.keras")
joblib.dump(scaler, "scaler.joblib")
print("TF model and scaler saved.")

Epoch 1/5
[1m33/33[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m3s[0m 19ms/step - loss: 151.8728 - mae: 7.2052 - val_loss: 138.1688 - val_mae: 6.2627
Epoch 2/5
[1m33/33[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 13ms/step - loss: 134.0000 - mae: 6.2640 - val_loss: 118.0121 - val_mae: 6.0151
Epoch 3/5
[1m33/33[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 12ms/step - loss: 115.1368 - mae: 6.2334 - val_loss: 105.2029 - val_mae: 6.4898
Epoch 4/5
[1m33/33[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 10ms/step - loss: 102.1273 - mae: 6.6404 - val_loss: 101.7248 - val_mae: 6.6185
Epoch 5/5
[1m33/33[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 12ms/step - loss: 101.4446 - mae: 6.6751 - val_loss: 100.0667 - val_mae: 6.5467
TF model and scaler saved.


# **Preparar archivos necesarios para el Dashboard**

In [13]:
# Guardar demand_hourly y zone lookup en csv para lectura por Streamlit
demand_hourly.to_csv("demand_hourly.csv", index=False)
zone_df.to_csv("taxi_zone_lookup.csv", index=False)
# Guardar sample preprocessed (reducido) para heatmap
df[['PULocationID','pickup_date','pickup_hour','pickup_latitude','pickup_longitude']].to_parquet("nyc_for_dashboard.parquet", index=False)
print("Archivos para dashboard listos.")

Archivos para dashboard listos.


# **Configurar clave OpenWeather y ngrok**

In [14]:
# Si tienes OpenWeather API key, p√©gala aqu√≠ (opcional). Si no, el app usar√° Open-Meteo.
OPENWEATHER_API_KEY = ""  # <--- pega aqu√≠ tu key (si no, deja vac√≠o)

# Ngrok token (opcional para exponer)
NGROK_TOKEN = "35XbxdR5DiJ1qqoI9IWiDr0iCAT_4V2W1m53QWEtmKP3Vjrsu"  # <--- pega tu ngrok authtoken si quieres enlace p√∫blico estable

# **Escribir el Streamlit app completo (app.py)**

In [27]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import folium
from folium.plugins import HeatMap
from streamlit_folium import st_folium
import joblib
import requests
import os

st.set_page_config(page_title="TaxiVision - Predicci√≥n de Demanda NYC", layout="wide")

# Cargar assets
@st.cache_data(show_spinner=True)
def load_assets():
    demand_hourly = pd.read_csv("demand_hourly.csv")
    zone_lookup = pd.read_csv("taxi_zone_lookup.csv")
    df_points = pd.read_parquet("nyc_for_dashboard.parquet")
    model_rf = joblib.load("rf_demand_model.joblib")
    encoder = joblib.load("ordinal_encoder.joblib")
    # scaler/model TF optional
    return demand_hourly, zone_lookup, df_points, model_rf, encoder

demand_hourly, zone_lookup, df_points, model_rf, encoder = load_assets()

st.title("TaxiVision - Predicci√≥n de Demanda NYC")

# Top layout: selector zona
st.sidebar.header("Configuraci√≥n")
boroughs = ["Todos"] + sorted(zone_lookup["Borough"].dropna().unique().tolist())
sel_borough = st.sidebar.selectbox("Filtrar Borough", boroughs)

if sel_borough != "Todos":
    zones_available = zone_lookup[zone_lookup["Borough"]==sel_borough]["Zone"].tolist()
else:
    zones_available = zone_lookup["Zone"].tolist()

zone_selected = st.selectbox("Seleccione Zona (por nombre):", zones_available, index=0)
zone_row = zone_lookup[zone_lookup["Zone"]==zone_selected].iloc[0]
zone_id = int(zone_row["LocationID"])

st.markdown(f"**Zona seleccionada:** {zone_selected} ‚Äî LocationID: {zone_id} ‚Äî Borough: {zone_row['Borough']}")

# -----------------------------
# Heatmap por hora
# -----------------------------
st.header("Mapa de calor (pickups)")

hour_sel = st.slider("Selecciona la hora (pickup)", 0, 23, 12)
df_hour = df_points[df_points["pickup_hour"]==hour_sel].dropna(subset=["pickup_latitude","pickup_longitude"])
heat_data = df_hour[["pickup_latitude","pickup_longitude"]].values.tolist()

m = folium.Map(location=[40.75, -73.98], zoom_start=11)
if len(heat_data) > 0:
    HeatMap(heat_data, radius=8, blur=10, max_zoom=13).add_to(m)
st_folium(m, width=1000, height=550)

# -----------------------------
# Predicci√≥n por hora para zona seleccionada
# -----------------------------
st.header("Demanda Esperada por Hora (Predicci√≥n)")

# Construir input para las 24 horas
hours = np.arange(0,24)
# Usamos month & dow as median from dataset date selection ‚Äî fex: take most common month and dow
# But simplest: use month=6, dow=2 as placeholders OR compute from dataset distribution
common_month = int(demand_hourly.get('pickup_hour', pd.Series([1])).index[0]) if False else 6
# Build table
input_df = pd.DataFrame({
    "PULocationID": [zone_id]*24,
    "pickup_hour": hours,
    "month": [6]*24,
    "dow": [2]*24
})
# Encode and predict with RF
# Copiar entrada
X_enc = input_df[["PULocationID"]].copy()

# Convertir todo a string, incluso NaN
X_enc["PULocationID"] = X_enc["PULocationID"].astype(str)

# Reemplazar valores inv√°lidos por "-1" (string)
X_enc["PULocationID"] = X_enc["PULocationID"].replace(["nan", "None", ""], "-1")

# Transformar
X_enc["PULocationID_enc"] = encoder.transform(X_enc[["PULocationID"]])

X_enc = pd.DataFrame({
    "PULocationID_enc": X_enc["PULocationID_enc"],
    "pickup_hour": input_df["pickup_hour"],
    "month": input_df["month"],
    "dow": input_df["dow"]
})

preds = model_rf.predict(X_enc)
preds = np.clip(preds, a_min=0, a_max=None)

pred_df = pd.DataFrame({"pickup_hour": hours, "prediction": preds})
# Smooth with rolling mean for visualization
pred_df["prediction_smooth"] = pred_df["prediction"].rolling(3, center=True, min_periods=1).mean()

# Plot
st.line_chart(pred_df.set_index("pickup_hour")["prediction_smooth"])

# Show table
st.dataframe(pred_df.style.format({"prediction":"{:.1f}", "prediction_smooth":"{:.1f}"}))

# -----------------------------
# Comparar con hist√≥rico (demand_hourly)
# -----------------------------
st.header("Comparaci√≥n con demanda hist√≥rica (media por hora)")

hist = demand_hourly[demand_hourly["PULocationID"]==zone_id].sort_values("pickup_hour")
if hist.empty:
    st.info("No hay datos hist√≥ricos suficientes para esta zona en el sample.")
else:
    merged = pred_df.merge(hist[["pickup_hour","demand_mean"]], on="pickup_hour", how="left")
    st.line_chart(merged.set_index("pickup_hour")[["prediction_smooth","demand_mean"]])

# -----------------------------
# Integraci√≥n m√≠nima de Clima
# -----------------------------
st.header("üå§ Clima (hist√≥rico) para la fecha seleccionada")

# Fecha selector basado en df_points
dates = sorted(df_points["pickup_date"].unique())
sel_date = st.selectbox("Selecciona fecha (para consultar clima):", dates, index=0)

use_openweather = os.environ.get("OPENWEATHER_API_KEY", "") != "" or False
OW_KEY = os.environ.get("OPENWEATHER_API_KEY","")

if OW_KEY:
    st.write("Usando OpenWeather (API key encontrada en variable de entorno).")
else:
    st.write("Usando Open-Meteo (no requiere key).")

def get_weather_openmete(date):
    # usa Open-Meteo (latitude/longitude de NYC)
    url = (
        "https://api.open-meteo.com/v1/forecast?"
        "latitude=40.7128&longitude=-74.0060&hourly=temperature_2m,precipitation,"
        f"timezone=America%2FNew_York&start_date={date}&end_date={date}"
    )
    r = requests.get(url, timeout=10)
    return r.json()

def get_weather_openweather(date, key):
    # OpenWeather One Call (historical) requiere timestamp unix por hora y cuenta PRO para historico extendido.
    # Aqu√≠ hacemos una simple consulta de forecast para la fecha cercana (si no est√° disponible, fallback)
    url = f"https://api.openweathermap.org/data/2.5/onecall?lat=40.7128&lon=-74.0060&exclude=minutely,daily,alerts&appid={key}&units=metric"
    r = requests.get(url, timeout=10)
    return r.json()

weather_json = None
if OW_KEY:
    try:
        weather_json = get_weather_openweather(sel_date, OW_KEY)
        # Extract hourly temp if available
        if 'hourly' in weather_json:
            hrs = weather_json['hourly']
            wdf = pd.DataFrame(hrs)[['dt','temp','rain']] if 'rain' in pd.DataFrame(hrs).columns else pd.DataFrame(hrs)[['dt','temp']]
            # convert dt
            wdf['Hora'] = pd.to_datetime(wdf['dt'], unit='s').dt.strftime("%Y-%m-%dT%H:%M")
            wdf = wdf.rename(columns={'temp':'Temperatura (¬∞C)'}).fillna(0)
        else:
            weather_json = get_weather_openmete(sel_date)
            hours = weather_json["hourly"]["time"]
            temps = weather_json["hourly"]["temperature_2m"]
            precip = weather_json["hourly"]["precipitation"]
            wdf = pd.DataFrame({"Hora":hours, "Temperatura (¬∞C)":temps, "Precipitaci√≥n (mm)":precip})
    except Exception as e:
        st.warning("OpenWeather falla: usando Open-Meteo como fallback. Error: "+str(e))
        weather_json = get_weather_openmete(sel_date)
        hours = weather_json["hourly"]["time"]
        temps = weather_json["hourly"]["temperature_2m"]
        precip = weather_json["hourly"]["precipitation"]
        wdf = pd.DataFrame({"Hora":hours, "Temperatura (¬∞C)":temps, "Precipitaci√≥n (mm)":precip})
else:
    try:
        weather_json = get_weather_openmete(sel_date)

        # Validaci√≥n robusta de clave "hourly"
        if ("hourly" not in weather_json or
            "time" not in weather_json["hourly"] or
            "temperature_2m" not in weather_json["hourly"]):

            st.warning("Open-Meteo no devolvi√≥ datos horarios para esta fecha.")
            wdf = pd.DataFrame()

        else:
            hours = weather_json["hourly"]["time"]
            temps = weather_json["hourly"]["temperature_2m"]
            precip = weather_json["hourly"].get("precipitation", [0]*len(hours))

            wdf = pd.DataFrame({
                "Hora": hours,
                "Temperatura (¬∞C)": temps,
                "Precipitaci√≥n (mm)": precip
            })

    except Exception as e:
        st.error("Error consultando Open-Meteo: " + str(e))
        wdf = pd.DataFrame()

if not wdf.empty:
    st.subheader("Valores horarios del clima")
    st.dataframe(wdf)
    st.line_chart(wdf.set_index("Hora")["Temperatura (¬∞C)"])

st.markdown("---")
st.caption("Dashboard generado con sample del dataset. Para producci√≥n, usar todos los datos y re-entrenar modelo con features extendidos (clima, eventos, holidays).")

Overwriting app.py


In [24]:
print("CATEGOR√çAS DEL ENCODER:", enc.categories_)

CATEGOR√çAS DEL ENCODER: [array(['0.0_0.0', '18.63_-76.66', '19.47_-87.45', '30.13_-77.97',
       '35.03_-70.05', '35.87_-71.06', '4.79_-73.95', '40.06_-73.34',
       '40.12_-72.45', '40.17_-75.5', '40.27_-74.03', '40.47_-74.46',
       '40.54_-74.16', '40.55_-74.2', '40.55_-74.3', '40.56_-74.05',
       '40.57_-74.23', '40.58_-73.74', '40.58_-73.96', '40.58_-73.99',
       '40.59_-73.75', '40.59_-73.79', '40.59_-73.93', '40.59_-73.97',
       '40.59_-73.99', '40.59_-74.25', '40.61_-73.76', '40.61_-73.78',
       '40.61_-73.91', '40.61_-73.96', '40.61_-73.98', '40.61_-74.0',
       '40.61_-74.01', '40.61_-74.03', '40.61_-74.04', '40.61_-74.1',
       '40.62_-73.78', '40.62_-73.85', '40.62_-73.94', '40.62_-73.97',
       '40.62_-74.0', '40.62_-74.02', '40.62_-74.03', '40.62_-74.04',
       '40.62_-74.27', '40.63_-73.77', '40.63_-73.78', '40.63_-73.79',
       '40.63_-73.8', '40.63_-73.88', '40.63_-73.9', '40.63_-73.93',
       '40.63_-73.94', '40.63_-73.95', '40.63_-73.96', '40.63_-73

# **Ejecutar Streamlit y exponer con ngrok**

In [28]:
from pyngrok import ngrok, conf
import os, time, subprocess, sys

# NGROK_TOKEN = "35XbxdR5DiJ1qqoI9IWiDr0iCAT_4V2W1m53QWEtmKP3Vjrsu"  # Token ngrok
if NGROK_TOKEN:
    conf.get_default().auth_token = NGROK_TOKEN

# Terminar cualquier t√∫nel ngrok existente para evitar conflictos
ngrok.kill()
time.sleep(1) # Add a small delay after killing tunnels

# Iniciar streamlit en background
print("Iniciando Streamlit...")
# Ejecutar de forma que no bloquee la celda
get_ipython().system_raw("streamlit run app.py --server.port 8501 &\n")

# Abrir t√∫nel ngrok y mostrar la URL p√∫blica
time.sleep(3)
public_url = ngrok.connect(8501)
print("T√∫nel ngrok creado en:", public_url)
print("Abre la URL en tu navegador (puede tardar unos segundos en estar disponible).")

Iniciando Streamlit...




T√∫nel ngrok creado en: NgrokTunnel: "https://illa-cerebrational-appellatively.ngrok-free.dev" -> "http://localhost:8501"
Abre la URL en tu navegador (puede tardar unos segundos en estar disponible).
