In [None]:
# ============================================================
# TELECOM X - PARTE 2 (PREDICCIÓN DE CHURN) - NOTEBOOK COLAB
# ============================================================

# (0) OPCIONAL: instala librerías extra si las necesitas
# Quita los "!" si ya las tienes en tu entorno local
!pip install -q imbalanced-learn xgboost

# (1) IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# XGBoost is optional: XGB_AVAILABLE tells the later cells whether to use it.
try:
    from xgboost import XGBClassifier
    XGB_AVAILABLE = True
except Exception:
    # `Exception` rather than a bare `except` so KeyboardInterrupt/SystemExit
    # still propagate. ImportError alone could be too narrow if the package is
    # installed but fails while loading (assumption - narrow further if the
    # exact failure modes are known).
    XGB_AVAILABLE = False

# Apply one seaborn look to every figure produced below.
sns.set(context="notebook", style="whitegrid")

# Report whether the optional XGBoost import succeeded.
print(f"✅ Librerías cargadas. XGBoost disponible: {XGB_AVAILABLE}")


In [None]:
# (2) DATA LOADING
# Pick ONE of these options:

# Option A: load from a CSV hosted on GitHub (RAW link)
# data_url = "https://raw.githubusercontent.com/TU_USUARIO/TU_REPO/main/data/processed/telecom_churn_clean.csv"
# df = pd.read_csv(data_url)

# Option B: upload the CSV from your PC to Colab
from google.colab import files
print("Sube tu CSV (ej: telecom_churn_clean.csv) con la columna objetivo 'Churn'")
uploaded = files.upload()
# Fail with a clear message instead of an IndexError when nothing was uploaded
# (e.g. the upload dialog was dismissed).
if not uploaded:
    raise ValueError("No se subió ningún archivo. Vuelve a ejecutar la celda y selecciona un CSV.")
# Only the first uploaded file is used.
csv_name = next(iter(uploaded))
df = pd.read_csv(csv_name)

print("✅ Datos cargados con forma:", df.shape)
display(df.head())
# sample() raises if asked for more rows than the frame has, so cap it.
display(df.sample(min(5, len(df))))


In [None]:
# (3) LIMPIEZA BÁSICA Y AJUSTES COMUNES DEL DATASET TELCO

# 3.1. Locate the target column whatever its casing ("Churn", "churn", ...).
target_candidates = list(filter(lambda name: name.lower() == "churn", df.columns))
if not target_candidates:
    raise ValueError("No se encontró la columna objetivo 'Churn'. Renómbrala o ajusta el código.")
# Keep the column name exactly as it appears in df; the first match wins.
TARGET = target_candidates[0]

# 3.2. Convert 'Churn' to 0/1 when it comes as text (Yes/No, Sí/No, True/False)
def to_binary(x):
    """Map a raw churn label to 1, 0 or NaN.

    Parameters
    ----------
    x : any
        Raw cell value (string, bool, number or missing value).

    Returns
    -------
    int or float
        1 for "yes", "si", "sí", "true", "1" and 0 for "no", "false", "0".
        The comparison is case-insensitive and ignores surrounding
        whitespace. Any other value is passed to ``int(x)``; when that
        conversion is not possible the result is ``np.nan``.
    """
    # Normalise once so " Yes", "YES" and "yes" are all recognised.
    label = str(x).strip().lower()
    if label in ("yes", "si", "sí", "true", "1"):
        return 1
    if label in ("no", "false", "0"):
        return 0
    # Fallback: try a plain integer conversion of the original value.
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and free text cannot be converted.
        return np.nan

df[TARGET] = df[TARGET].apply(to_binary)

# 3.3. Common Telco fix: cast TotalCharges to numeric when the column exists
for col in df.columns:
    if col.lower() == "totalcharges":
        # errors="coerce" turns entries that cannot be parsed into NaN.
        df[col] = pd.to_numeric(df[col], errors="coerce")

# 3.4. Drop rows whose target is missing
df = df.dropna(subset=[TARGET]).copy()

# 3.5. Drop identifier columns if present (not informative)
def _is_id_column(name):
    """Return True when a column name looks like an identifier.

    Matches "id" / "customerid" (any casing), names ending in a separated
    "id" ("user_id", "account.id") and camelCase suffixes ("customerID",
    "accountId"). A plain substring test is avoided on purpose: it would also
    drop unrelated columns such as "Paid" or "Provider".
    """
    text = str(name)
    lowered = text.lower()
    if lowered in ("id", "customerid"):
        return True
    if lowered.endswith(("_id", ".id", "-id", " id")):
        return True
    # camelCase suffix: an upper-case "I" right after a lower-case letter.
    return len(text) > 2 and text.endswith(("ID", "Id")) and text[-3].islower()

# The target is never treated as an identifier.
id_like = [c for c in df.columns if c != TARGET and _is_id_column(c)]
df = df.drop(columns=id_like, errors="ignore")

print("✅ Limpieza básica hecha. Forma:", df.shape)
display(df.head())


In [None]:
# (4) EDA RÁPIDO (Exploratorio)
fig, axes = plt.subplots(1, 2, figsize=(12,4))
ax_target, ax_charges = axes

# Left panel: how many rows fall in each target class
df[TARGET].value_counts().plot.bar(ax=ax_target)
ax_target.set(
    title="Distribución de Churn (0=No, 1=Sí)",
    xlabel="Churn",
    ylabel="Conteo",
)

# Right panel: histogram of monthly charges, shown only if the column exists
mch = [c for c in df.columns if c.lower() == "monthlycharges"]
if not mch:
    # Nothing to draw: hide the empty panel
    ax_charges.axis("off")
else:
    df[mch[0]].hist(ax=ax_charges, bins=30)
    ax_charges.set(
        title="Distribución de Cargos Mensuales",
        xlabel="Cargos Mensuales",
        ylabel="Frecuencia",
    )

plt.tight_layout()
plt.show()

# Correlation of every numeric column with the target.
# A dedicated name is used so this cell does not overwrite `num_cols`, which
# the modelling cells define WITHOUT the target column; re-running this cell
# out of order would otherwise feed the target name to the preprocessor.
eda_num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# The target itself must be numeric, and at least one other numeric column
# is needed for the correlation to be informative.
if TARGET in eda_num_cols and len(eda_num_cols) > 1:
    corr = df[eda_num_cols].corr()[TARGET].sort_values(ascending=False)
    print("🔎 Correlación con 'Churn' (numéricas):")
    display(corr)


In [None]:
# (5) SEPARAR VARIABLES Y TIPO DE DATOS
# Target as integers, features as an independent copy of everything else
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET]).copy()

# Column groups are inferred from dtypes: text/category/bool vs. numeric
cat_cols = list(X.select_dtypes(include=["object", "category", "bool"]).columns)
num_cols = list(X.select_dtypes(include=[np.number]).columns)

# Show at most ten names per group, with an ellipsis marker when truncated
cat_suffix = "..." if len(cat_cols) > 10 else ""
num_suffix = "..." if len(num_cols) > 10 else ""
print("📎 Categóricas:", cat_cols[:10], cat_suffix)
print("🔢 Numéricas:", num_cols[:10], num_suffix)

# 70/30 split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)
print("✅ Split listo. Train:", X_train.shape, "Test:", X_test.shape)


In [None]:
# (6) PREPROCESAMIENTO (imputación + encoding + escalado)
# Numeric branch: fill gaps with the median, then standardise
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode.
# handle_unknown="ignore" keeps transform from failing on unseen categories.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch; the step names "num" and "cat"
# are looked up again when feature names are rebuilt.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])


In [None]:
# (7) ENTRENAR MODELOS BASE (sin tuning), con SMOTE para desbalance
# Baseline estimators keyed by display name; the same keys are used later to
# choose the tuning grid.
models = {
    "LogisticRegression": LogisticRegression(max_iter=200),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
}
# XGBoost is added only when its import succeeded.
if XGB_AVAILABLE:
    models["XGBoost"] = XGBClassifier(
        n_estimators=400, max_depth=4, learning_rate=0.08, subsample=0.9, colsample_bytree=0.9,
        eval_metric="logloss", random_state=42
    )

# Per-model outputs: fitted pipeline, report dict, AUC, predictions, probabilities
results = {}

for name, clf in models.items():
    # SMOTE sits inside the imblearn pipeline, after preprocessing and before the model
    pipe = ImbPipeline([
        ("pre", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("model", clf),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Without predict_proba there is no score to rank by: AUC stays NaN
    y_proba, auc = None, np.nan
    if hasattr(pipe.named_steps["model"], "predict_proba"):
        # Column 1 holds the probability of the second class
        y_proba = pipe.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)

    rpt = classification_report(y_test, y_pred, output_dict=True)
    results[name] = dict(pipeline=pipe, report=rpt, auc=auc, pred=y_pred, proba=y_proba)

    print(f"\n🧠 Modelo: {name}")
    print(pd.DataFrame(rpt).round(3))
    print("AUC:", round(auc, 4))


In [None]:
# (8) MATRICES DE CONFUSIÓN Y ROC
n_models = len(results)
fig, axes = plt.subplots(1, n_models, figsize=(6*n_models, 4))

# With a single panel plt.subplots returns one Axes, not a sequence: wrap it
if n_models == 1:
    axes = [axes]

# One heatmap per trained model, in the order the models were fitted
for (name, res), ax in zip(results.items(), axes):
    cm = confusion_matrix(y_test, res["pred"])
    sns.heatmap(cm, annot=True, fmt="d", cbar=False, ax=ax)
    ax.set(
        title=f"Matriz de Confusión - {name}",
        xlabel="Predicción",
        ylabel="Real",
    )

plt.tight_layout()
plt.show()

# ROC curves for every model that produced probabilities
roc_fig, roc_ax = plt.subplots(figsize=(6,4))
for name, res in results.items():
    if res["proba"] is None:
        # No probabilities stored for this model: nothing to draw
        continue
    fpr, tpr, _ = roc_curve(y_test, res["proba"])
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC={res['auc']:.3f})")
# Diagonal reference line (chance level)
roc_ax.plot([0,1], [0,1], "--", alpha=0.5)
roc_ax.set_title("Curvas ROC")
roc_ax.set_xlabel("FPR")
roc_ax.set_ylabel("TPR")
roc_ax.legend()
plt.show()


In [None]:
# (9) TUNING RÁPIDO DEL MEJOR MODELO (opcional)
# Pick the baseline with the highest test AUC
def _auc_rank(model_name):
    """Sorting key: the stored AUC, or -1 when it is NaN."""
    score = results[model_name]["auc"]
    return -1 if np.isnan(score) else score

best_name = max(results, key=_auc_rank)
print("🏆 Mejor modelo inicial por AUC:", best_name)

# Already-fitted pipeline of the winner; replaced by the tuned one if tuning runs
best_pipe = results[best_name]["pipeline"]

# Small search space per model; keys are "<pipeline step>__<parameter>"
_grids_by_model = {
    "LogisticRegression": {
        "model__C": [0.1, 1.0, 2.0],
        "model__penalty": ["l2"],
        "model__solver": ["lbfgs"],
    },
    "RandomForest": {
        "model__n_estimators": [300, 500],
        "model__max_depth": [None, 8, 12],
        "model__min_samples_split": [2, 5],
    },
}
# The XGBoost grid exists only when the library could be imported
if XGB_AVAILABLE:
    _grids_by_model["XGBoost"] = {
        "model__n_estimators": [300, 500],
        "model__max_depth": [3, 4, 6],
        "model__learning_rate": [0.05, 0.1],
        "model__subsample": [0.8, 1.0],
    }

# An empty dict means there is nothing to tune for the selected model
param_grid = _grids_by_model.get(best_name, {})

if not param_grid:
    print("Sin grid de parámetros para este modelo o XGBoost no disponible.")
else:
    # 3-fold search over the whole pipeline, scored by F1
    grid = GridSearchCV(
        estimator=best_pipe,
        param_grid=param_grid,
        scoring="f1",  # "roc_auc" is an alternative
        cv=3,
        n_jobs=-1,
        verbose=0,
    )
    grid.fit(X_train, y_train)
    tuned = grid.best_estimator_

    # Evaluate the refitted best estimator on the held-out test split
    y_pred = tuned.predict(X_test)
    if hasattr(tuned.named_steps["model"], "predict_proba"):
        y_proba = tuned.predict_proba(X_test)[:, 1]
    else:
        y_proba = None
    if y_proba is None:
        auc = np.nan
    else:
        auc = roc_auc_score(y_test, y_proba)

    print("🔧 Mejores parámetros:", grid.best_params_)
    print(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).round(3))
    print("AUC (tuned):", round(auc, 4))

    # Later cells work with the tuned pipeline from here on
    best_pipe = tuned


In [None]:
# (10) IMPORTANCIA DE VARIABLES (si el modelo lo permite)
# Nota: para modelos con OneHotEncoder, las columnas se expanden.
# Aquí mostramos importancias para RandomForest/XGBoost si están disponibles.

def get_feature_names(preprocessor, num_cols, cat_cols):
    """Rebuild the feature names produced by the fitted preprocessor.

    Parameters
    ----------
    preprocessor : fitted transformer
        Must expose ``named_transformers_["cat"].named_steps["onehot"]``
        when ``cat_cols`` is not empty.
    num_cols : sequence of str
        Numeric column names; they keep their names after preprocessing.
    cat_cols : sequence of str
        Categorical column names, expanded through the one-hot encoder's
        ``get_feature_names_out``.

    Returns
    -------
    list of str
        Numeric names followed by the expanded categorical names. This
        assumes the "num" transformer comes before "cat" in the preprocessor.
    """
    # Copy into a fresh list so the caller's sequence is never mutated and
    # non-list inputs (e.g. a pandas Index) are handled too.
    feature_names = list(num_cols)
    # With no categorical columns there is nothing to expand, and the encoder
    # may not be usable at all, so skip the lookup.
    if len(cat_cols) == 0:
        return feature_names
    cat_encoder = preprocessor.named_transformers_["cat"].named_steps["onehot"]
    feature_names.extend(cat_encoder.get_feature_names_out(cat_cols).tolist())
    return feature_names

# Final estimator and fitted preprocessor of the selected pipeline
model = best_pipe.named_steps["model"]
pre = best_pipe.named_steps["pre"]

if not hasattr(model, "feature_importances_"):
    print("El modelo seleccionado no expone 'feature_importances_' (prueba con RandomForest o XGBoost).")
else:
    # Label each importance with its post-encoding feature name
    feat_names = get_feature_names(pre, num_cols, cat_cols)
    importances = pd.Series(model.feature_importances_, index=feat_names).sort_values(ascending=False)
    top20 = importances.head(20)

    plt.figure(figsize=(8,6))
    # Reverse so the most important feature ends up at the top of the chart
    top20.iloc[::-1].plot.barh()
    plt.title("Top 20 Importancias de Variables")
    plt.xlabel("Importancia")
    plt.tight_layout()
    plt.show()


In [None]:
# (11) GUARDAR DATASET TRATADO Y MODELO
# Export the cleaned dataset (features and target in one CSV) for later reuse
processed = df.copy()
processed.to_csv("telecom_churn_processed.csv", index=False)
print("💾 Guardado telecom_churn_processed.csv")

# Persist the complete fitted pipeline (preprocessing + SMOTE + model) with joblib
import joblib
joblib.dump(best_pipe, "modelo_churn_pipeline.joblib")
print("💾 Guardado modelo_churn_pipeline.joblib")

# Trigger browser downloads only when running inside Colab; elsewhere the
# import is unavailable and the files simply stay in the working directory.
try:
    from google.colab import files
except ImportError:
    print("ℹ️ No estás en Colab: los archivos quedaron guardados en el directorio de trabajo.")
else:
    files.download("telecom_churn_processed.csv")
    files.download("modelo_churn_pipeline.joblib")
