# Laboratorio 11 - Abandono de Clientes

----

Santiago Pereira 22318

Nancy Mazariegos 22513

----

### Librerías y carga de datos

In [4]:
%pip install pandas scikit-learn matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, roc_auc_score, classification_report
)

PATH_TRAIN = Path("abandono_clientes.csv")
PATH_NEW   = Path("clientes_nuevos.csv")
OUT_PRED   = Path("predicciones_clientes_nuevos.csv")

pd.set_option("display.max_columns", 100)

df = pd.read_csv(PATH_TRAIN)
df_new = pd.read_csv(PATH_NEW)

print("Train shape:", df.shape)
print("New shape:", df_new.shape)

display(df.head(3))
display(df_new.head(3))



Note: you may need to restart the kernel to use updated packages.
Train shape: (900, 10)
New shape: (6, 9)


Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn
0,Cameron Williams,42.0,11066.8,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1
1,Kevin Mueller,41.0,11916.22,0,6.5,11.0,2013-08-13 00:38:46,"6157 Frank Gardens Suite 019 Carloshaven, RI 1...",Wilson PLC,1
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,2016-06-29 06:20:07,"1331 Keith Court Alyssahaven, DE 90114","Miller, Johnson and Wallace",1


Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company
Andrew Mccall,37.0,9935.53,1,7.71,8.0,2011-08-29 18:37:54,"38612 Johnny Stravenue Nataliebury, WI 15717-8316",King Ltd,
Michele Wright,23.0,7526.94,1,9.28,15.0,2013-07-22 18:19:54,"21083 Nicole Junction Suite 332, Youngport, ME...",Cannon-Benson,
Jeremy Chang,65.0,100.0,1,1.0,15.0,2006-12-11 07:48:13,"085 Austin Views Lake Julialand, WY 63726-4298",Barron-Robertson,


----

### Columnas y tipos

In [5]:
print("Columnas TRAIN:\n", df.columns.tolist())
print("\nInfo TRAIN:")
print(df.dtypes)

print("\nColumnas NUEVOS:\n", df_new.columns.tolist())
print("\nInfo NUEVOS:")
print(df_new.dtypes)


Columnas TRAIN:
 ['Names', 'Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Onboard_date', 'Location', 'Company', 'Churn']

Info TRAIN:
Names               object
Age                float64
Total_Purchase     float64
Account_Manager      int64
Years              float64
Num_Sites          float64
Onboard_date        object
Location            object
Company             object
Churn                int64
dtype: object

Columnas NUEVOS:
 ['Names', 'Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Onboard_date', 'Location', 'Company']

Info NUEVOS:
Names              float64
Age                float64
Total_Purchase       int64
Account_Manager    float64
Years              float64
Num_Sites           object
Onboard_date        object
Location            object
Company            float64
dtype: object


----

### Normalización de datos

In [6]:
def normalize_cols(d):
    d = d.copy()
    d.columns = [c.strip().replace(" ", "_").replace("-", "_") for c in d.columns]
    if "Num_Sites" in d.columns and "Num_sites" not in d.columns:
        d = d.rename(columns={"Num_Sites": "Num_sites"})
    if "num_sites" in d.columns and "Num_sites" not in d.columns and "Num_sites" not in d.columns:
        d = d.rename(columns={"num_sites": "Num_sites"})
    if "Num_Sites" in d.columns:
        d = d.rename(columns={"Num_Sites": "Num_sites"})
    return d

df = normalize_cols(df)
df_new = normalize_cols(df_new)
print(df.columns.tolist())
print(df_new.columns.tolist())


['Names', 'Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_sites', 'Onboard_date', 'Location', 'Company', 'Churn']
['Names', 'Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_sites', 'Onboard_date', 'Location', 'Company']


In [None]:
feature_cols = ["Age", "Total_Purchase", "Account_Manager", "Years", "Num_sites"]
target_col = "Churn"

missing_cols = [c for c in feature_cols if c not in df.columns]
if missing_cols:
    raise ValueError(f"Faltan columnas en TRAIN: {missing_cols}")

if target_col not in df.columns:
    raise ValueError("No se encontró la columna 'Churn' en TRAIN.")

X = df[feature_cols].copy()
y = df[target_col].copy()

if X["Account_Manager"].dtype.kind not in "biu":
    X["Account_Manager"] = X["Account_Manager"].astype("int64")


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

X_train.shape, X_test.shape, y_train.value_counts(normalize=True).round(3), y_test.value_counts(normalize=True).round(3)


((630, 5),
 (270, 5),
 Churn
 0    0.833
 1    0.167
 Name: proportion, dtype: float64,
 Churn
 0    0.833
 1    0.167
 Name: proportion, dtype: float64)

In [9]:
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=1000, solver="lbfgs"))
])

pipe.fit(X_train, y_train)

print("Entrenado.")


Entrenado.
