In [None]:
# %% [markdown]
# 0 ——————————————————————————————————————————————
#  Configuración global + lectura
# ---------------------------------------------------------

# %%
import sys, pathlib, warnings, json, joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# ——— rutas ———
# Starting from the current working directory, walk up the directory tree and
# take the first folder that contains a "memebot3" sub-directory as the repo root.
ROOT = pathlib.Path().resolve()
REPO = next(
    (folder for folder in (ROOT, *ROOT.parents) if (folder / "memebot3").is_dir()),
    None,
)
if REPO is None:
    raise RuntimeError("✗ No encuentro la carpeta ‘memebot3’")

# Parquet file with the feature table analysed in this notebook.
PARQUET = REPO.joinpath("memebot3", "data", "features", "features_202507.parquet")
assert PARQUET.exists(), PARQUET

# Hide FutureWarning noise and switch to seaborn's default theme.
warnings.simplefilter("ignore", category=FutureWarning)
sns.set_theme()

df = pd.read_parquet(PARQUET)
print(f"Shape: {df.shape}")

In [None]:
# %%
# Column dtypes, non-null counts and memory usage.
df.info()

# Percentage of missing values per column, highest first (15 worst shown).
null_pct = df.isna().mean().mul(100).sort_values(ascending=False)
display(null_pct.iloc[:15].to_frame(name="null_%"))

# Extended summary statistics, transposed so each described column is a row.
SUMMARY_PERCENTILES = [.01, .1, .25, .5, .75, .9, .99]
display(df.describe(percentiles=SUMMARY_PERCENTILES).transpose())

In [None]:
# %%
# Class balance of the target; NaN labels (if any) are counted as their own bucket.
label_counts = df["label"].value_counts(dropna=False)
positive_share = label_counts.get(1, 0) / label_counts.sum()
print(label_counts, "\nProporción positiva:", positive_share)

sns.countplot(data=df, x="label")
plt.title("Distribución de la etiqueta")
plt.show()

In [None]:
# %%
# Every numeric column except the target itself.
num_cols = df.select_dtypes(include=[np.number]).columns.drop("label")


def _label_pbiserial(feature):
    """Return the point-biserial r between df["label"] and one feature column.

    Missing feature values are replaced by that feature's median first.
    """
    filled = feature.fillna(feature.median())
    return stats.pointbiserialr(df["label"], filled)[0]


pbiserial = {name: _label_pbiserial(df[name]) for name in num_cols}

# Rank the features by absolute correlation with the label, strongest first.
corr_ser = pd.Series(pbiserial).sort_values(key=lambda s: s.abs(), ascending=False)
display(corr_ser.head(25).to_frame("pbiserial"))

In [None]:
# %%
# Pairwise correlations between the numeric columns. The upper triangle and
# the diagonal are masked so each pair is drawn only once.
corr = df[num_cols].corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))

plt.figure(figsize=(12, 10))
heatmap_style = dict(vmin=-.8, vmax=.8, center=0,
                     cmap="vlag", square=True, linewidths=.3)
sns.heatmap(corr, mask=mask, **heatmap_style)
plt.title("Correlación entre variables numéricas")
plt.show()

In [None]:
# %%
def outlier_summary(series):
    """Count outliers in a numeric series with Tukey's IQR fences and z-scores.

    Missing values are ignored when deriving the quartiles and the z-scores.

    Parameters
    ----------
    series : pandas.Series
        Numeric values; may contain NaN.

    Returns
    -------
    dict
        ``"iqr_outliers"``: number of values outside
        ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.
        ``"z>3"``: number of non-missing values whose ``|z-score|`` exceeds 3.
        ``"pct_outliers"``: IQR outliers as a percentage of all rows of
        ``series`` (rows with NaN count in the denominator).
        A series without any non-missing value yields zeros for all three.
    """
    values = series.dropna().to_numpy(dtype=float)
    if values.size == 0:
        # Nothing to take percentiles or z-scores of (empty / all-NaN column).
        return {"iqr_outliers": 0, "z>3": 0, "pct_outliers": 0.0}

    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Comparisons against NaN are False, so missing rows are never flagged.
    outside = (series < low) | (series > high)

    if values.min() != values.max():
        n_extreme = int((np.abs(stats.zscore(values)) > 3).sum())
    else:
        # Constant column: the standard deviation is 0 and z-scores are undefined.
        n_extreme = 0

    return {
        "iqr_outliers": int(outside.sum()),
        "z>3": n_extreme,
        "pct_outliers": float(outside.mean() * 100),
    }

# One outlier summary per numeric column, shown as a table with the columns
# holding the highest share of IQR outliers at the top.
outlier_rows = {name: outlier_summary(df[name]) for name in num_cols}
out_df = (
    pd.DataFrame(outlier_rows)
    .transpose()
    .sort_values(by="pct_outliers", ascending=False)
)
display(out_df.iloc[:15])

In [None]:
# Boxplot of liquidity_usd drawn on a logarithmic x-axis.
ax = sns.boxplot(x=df["liquidity_usd"])
ax.set_xscale("log")
ax.set_title("Boxplot liquidity_usd (log)")
plt.show()

In [None]:
# %%
# Baseline: logistic regression on the 30 features with the largest absolute
# point-biserial correlation. Missing values are replaced by 0 before scaling.
TOP_FEATS = corr_ser.head(30).index
X = df.loc[:, TOP_FEATS].fillna(0)
y = df["label"].astype(int)

scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

logreg = LogisticRegression(class_weight="balanced", max_iter=1000).fit(X_scaled, y)

# Scored on the same rows the model was fitted on, i.e. a training AUC.
proba_lr = logreg.predict_proba(X_scaled)[:, 1]
auc_lr = roc_auc_score(y, proba_lr)
print(f"AUC entrenamiento Logistic Reg = {auc_lr:.4f}")

In [None]:
# %%
# Chronological split: oldest 70 % -> train, next 10 % -> validation,
# newest 20 % -> hold-out test.
# The validation slice exists only to drive early stopping. Stopping on the
# hold-out itself would tune the number of trees on the very rows the final
# AUC is reported on and make that figure optimistic.
# A stable sort keeps rows with equal timestamps in a reproducible order.
df_sorted = df.sort_values("timestamp", kind="stable")
n_rows = len(df_sorted)
valid_start = int(n_rows * .7)
split = int(n_rows * .8)            # first row of the hold-out
train_df = df_sorted.iloc[:valid_start]
valid_df = df_sorted.iloc[valid_start:split]
test_df = df_sorted.iloc[split:]
assert len(train_df) and len(valid_df) and len(test_df), \
    "not enough rows for a 70/10/20 split"

# All numeric columns except the target. "timestamp" is also left out: if it
# is stored as a number it would otherwise become a feature of a model that is
# evaluated on strictly later rows.
FEATURES = [c for c in num_cols if c not in ("label", "timestamp")]

lgb_train = lgb.Dataset(train_df[FEATURES], train_df["label"])
lgb_valid = lgb.Dataset(valid_df[FEATURES], valid_df["label"], reference=lgb_train)

params = dict(
    objective="binary",
    metric="auc",
    learning_rate=0.05,
    num_leaves=64,
    min_data_in_leaf=100,
    subsample=.8,
    subsample_freq=1,       # row subsampling is ignored unless a bagging frequency is set
    colsample_bytree=.8,
    seed=42,
    verbosity=-1,
)

model = lgb.train(
    params, lgb_train, num_boost_round=800,
    valid_sets=[lgb_valid],
    callbacks=[lgb.early_stopping(50, verbose=False)]
)

# Final score on rows that were used neither for fitting nor for early stopping.
pred = model.predict(test_df[FEATURES], num_iteration=model.best_iteration)
auc_lgb = roc_auc_score(test_df["label"], pred)
print(f"AUC hold-out LightGBM = {auc_lgb:.4f}")

In [None]:
# %%
# ROC curve of the LightGBM predictions on the hold-out set.
fpr, tpr, thr = roc_curve(test_df["label"], pred)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], "--", alpha=.4)     # chance diagonal
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.set_title("ROC – LightGBM")
plt.show()

# Youden's J statistic (TPR - FPR); the threshold that maximises it is reported.
g = tpr - fpr
best_idx = g.argmax()
best_thr = thr[best_idx]
print(f"Umbral óptimo (Youden): {best_thr:.3f}")

In [None]:
# %%
# Rank the features by total split gain. feature_importance() defaults to
# importance_type="split" (number of times a feature is used), which would not
# match the "gain" column label, so the type is requested explicitly.
imp = model.feature_importance(importance_type="gain")
assert len(imp) == len(FEATURES), "FEATURES is out of sync with the trained model"
imp_df = (
    pd.DataFrame({"feature": FEATURES, "gain": imp})
    .sort_values("gain", ascending=False)
)
display(imp_df.head(25))

# Persist the names of the 40 most important features, one per line.
TOP40 = imp_df.head(40)["feature"].tolist()
path_top = REPO / "memebot3" / "ml" / "top_features.txt"
path_top.parent.mkdir(parents=True, exist_ok=True)   # the ml/ folder may not exist yet
path_top.write_text("\n".join(TOP40), encoding="utf-8")
print("✔ top_features.txt guardado en", path_top)

In [None]:
# %%
# Destination folder for the trained model and its metadata.
MODEL_DIR = REPO / "memebot3" / "ml"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# The booster is pickled via joblib; the feature list and the hold-out AUC go
# into a JSON file next to it.
joblib.dump(model, MODEL_DIR / "model.pkl")

model_meta = {"features": list(FEATURES), "auc": float(auc_lgb)}
# A context manager guarantees the file is flushed and closed once written.
with open(MODEL_DIR / "model.meta.json", "w", encoding="utf-8") as meta_file:
    json.dump(model_meta, meta_file, indent=2)
print("Modelo + metadatos guardados en", MODEL_DIR)