### 🔹 Paso 1: Importar bibliotecas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

### 🔹 Paso 2: Cargar y explorar los datos

In [None]:
# Load the raw Waze dataset from the working directory and take a first look.
waze = pd.read_csv("waze_dataset.csv")
print(waze.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
waze.info()

### 🔹 Paso 3: Preprocesamiento de datos

In [None]:
# Rows without a label cannot be used for supervised training. The explicit
# copy makes the column assignment below unambiguous for pandas.
waze = waze.dropna(subset=['label']).copy()
# Binary target: 1 = churned, 0 = retained. Any other label value would be
# mapped to NaN by Series.map.
waze['churned'] = waze['label'].map({'retained': 0, 'churned': 1})
# One-hot encode the device column, dropping the first level to avoid a
# redundant indicator.
waze = pd.get_dummies(waze, columns=['device'], drop_first=True)
# 'ID' is an identifier and 'label' is now encoded in 'churned'; reassign
# instead of mutating in place.
waze = waze.drop(columns=['ID', 'label'])
print(waze.head())
# info() prints by itself and returns None, so it must not be wrapped in
# print() (that would emit an extra "None").
waze.info()

### 🔹 Paso 4: Separar variables predictoras y objetivo

In [None]:
# Separate the predictors from the binary target built during preprocessing.
X = waze.drop(columns="churned")
y = waze["churned"]
# Hold out 20% of the rows for evaluation with a fixed seed. stratify=y keeps
# the churned/retained proportion identical in both partitions, so a skewed
# churn rate cannot leave the test set with a different class mix than the
# training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

### 🔹 Paso 5: Modelo Random Forest

In [None]:
# Baseline model: Random Forest with library defaults and a fixed seed.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions for the report; column 1 of predict_proba (the
# second entry of rf.classes_) is the score used for ROC-AUC.
y_pred_rf = rf.predict(X_test)
rf_churn_proba = rf.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_churn_proba)

print("Random Forest Report:")
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", rf_auc)

### 🔹 Paso 6: Random Forest con balanceo

In [None]:
# Same Random Forest, but with class weights set to 'balanced' so the
# classes are reweighted during training.
rf_balanced = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Class predictions for the report and the column-1 probabilities for ROC-AUC.
y_pred_rf_balanced = rf_balanced.predict(X_test)
rf_balanced_churn_proba = rf_balanced.predict_proba(X_test)[:, 1]
rf_balanced_auc = roc_auc_score(y_test, rf_balanced_churn_proba)

print("Random Forest con class_weight='balanced'")
print(classification_report(y_test, y_pred_rf_balanced))
print("ROC-AUC:", rf_balanced_auc)

### 🔹 Paso 7: Modelo XGBoost

In [None]:
# Gradient-boosted trees evaluated with log-loss, seeded for reproducibility.
xgb_params = {'eval_metric': 'logloss', 'random_state': 42}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# y_pred_xgb is reused by the later error-analysis cells.
y_pred_xgb = xgb.predict(X_test)
xgb_churn_proba = xgb.predict_proba(X_test)[:, 1]
xgb_auc = roc_auc_score(y_test, xgb_churn_proba)

print("XGBoost Report:")
print(classification_report(y_test, y_pred_xgb))
print("ROC-AUC:", xgb_auc)

### 🔹 Paso 8: Importancia de variables

In [None]:
# Label the baseline Random Forest's feature importances with column names
# and chart the ten largest as horizontal bars.
feat_importances = pd.Series(rf.feature_importances_, index=X.columns)
top_10_features = feat_importances.nlargest(10)
ax = top_10_features.plot.barh()
ax.set_title("Top 10 Features - Random Forest")
plt.tight_layout()
plt.show()

### 🔹 Paso 9: Análisis de usuarios recientes

In [None]:
# Users with fewer than this many days since onboarding count as "recent".
RECENT_DAYS_THRESHOLD = 500

# X_test comes out of train_test_split; adding a column to it in place can
# raise pandas' SettingWithCopyWarning, so work on an explicit copy.
X_test = X_test.copy()
# NOTE: the flag is stored as a column because the following cells read
# X_test['es_reciente']. It is not a model feature: re-running the earlier
# predict cells after this one would pass an extra column to the models.
X_test['es_reciente'] = X_test['n_days_after_onboarding'] < RECENT_DAYS_THRESHOLD
print(X_test['es_reciente'].value_counts())
recientes_idx = X_test['es_reciente']
# y_test is a Series and is filtered by label alignment; the XGBoost
# predictions are positional, so they are indexed with a plain boolean array
# rather than an index-labelled Series.
mask_recientes = recientes_idx.to_numpy()
print(classification_report(y_test[recientes_idx], y_pred_xgb[mask_recientes]))

### 🔹 Paso 10: Comparación de grupos

In [None]:
# Boolean pieces shared by the group definitions below.
es_reciente = X_test['es_reciente']
churn_real = y_test == 1

# Recent churners the XGBoost model missed (false negatives) versus the ones
# it caught (true positives), plus every non-recent user as a reference group.
fn_idx = churn_real & (y_pred_xgb == 0) & es_reciente
tp_idx = churn_real & (y_pred_xgb == 1) & es_reciente
resto_idx = ~es_reciente

variables = ['n_days_after_onboarding', 'drives', 'sessions', 'total_navigations_fav1']

# One column of per-variable means for each group, in this order.
grupos = {
    'Falsos Negativos': fn_idx,
    'Verdaderos Positivos': tp_idx,
    'Usuarios No Recientes': resto_idx,
}
comparacion = pd.DataFrame({
    nombre: X_test.loc[mascara, variables].mean()
    for nombre, mascara in grupos.items()
})
print(comparacion)

### 🔹 Paso 11: Visualización comparativa

In [None]:
# Group labels used both for tagging rows and for filtering the plot data.
ETIQUETA_FN = 'Falsos Negativos Recientes'
ETIQUETA_TP = 'Verdaderos Positivos Recientes'

# Tag every test row with its group on a separate frame so X_test itself is
# left untouched; rows in neither group keep the default 'Otro'.
X_test_plot = X_test.assign(grupo='Otro')
X_test_plot.loc[fn_idx, 'grupo'] = ETIQUETA_FN
X_test_plot.loc[tp_idx, 'grupo'] = ETIQUETA_TP
mask = X_test_plot['grupo'].isin([ETIQUETA_FN, ETIQUETA_TP])

# Box plot of favourite-place-1 navigations for the two recent-churner groups.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=X_test_plot[mask],
    x='grupo',
    y='total_navigations_fav1',
    hue='grupo',
    palette='Set2',
    legend=False,
    ax=ax,
)
ax.set_title("Distribución de 'total_navigations_fav1' en usuarios recientes")
ax.set_ylabel("Total de Navegaciones en Zona Favorita 1")
ax.set_xlabel("")
plt.xticks(rotation=15)
ax.grid(axis='y')
fig.tight_layout()
plt.show()