Importación y Carga de Datos

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from xgboost import XGBClassifier

# Cargar el dataset
df = pd.read_csv('Telco Churn dataset 2.csv')

Preprocesamiento

In [2]:
# 1. Preprocesamiento
# a. Crear la columna 'Churn' y asignar 1 si 'Churn' es 'Yes', de lo contrario 0
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

# b. Eliminar columnas con más del 50% de datos faltantes
threshold = int(0.5 * len(df))
df = df.dropna(thresh=threshold, axis=1)

# c. Reemplazar valores atípicos por la media
for col in df.select_dtypes(include=['float64', 'int64']):
    if col != 'Churn':  # Asegurarse de no modificar la columna 'Churn'
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        df[col] = df[col].apply(lambda x: df[col].mean() if (x < (Q1 - 1.5 * IQR)) or (x > (Q3 + 1.5 * IQR)) else x)

# d. Convertir variables categóricas a numéricas y llenar valores faltantes
for column in df.columns:
    if df[column].dtype == 'object' and column != 'Churn':
        df[column].fillna(df[column].mode()[0], inplace=True)
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
    elif df[column].dtype in ['int64', 'float64']:
        df[column].fillna(df[column].median(), inplace=True)

# e. Normalización
cols_to_scale = df.columns.tolist()
cols_to_scale.remove('Churn')
scaler = StandardScaler()
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

# f. Asegurarse de que 'Churn' sea int
df['Churn'] = df['Churn'].astype(int)

Selección de características

In [3]:
# 2. Selección de características
xgb_for_feature_selection = XGBClassifier(
    objective='binary:logistic', 
    random_state=42, 
    use_label_encoder=False, 
    eval_metric='logloss'
)
xgb_for_feature_selection.fit(df.drop('Churn', axis=1), df['Churn'])
threshold = 0.01  
selected_features = df.drop('Churn', axis=1).columns[(xgb_for_feature_selection.feature_importances_ > threshold)].tolist()
print("Características seleccionadas:", selected_features)

Características seleccionadas: ['customerID', 'SeniorCitizen', 'tenure', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'InternationalPlan', 'VoiceMailPlan', 'NumbervMailMessages', 'TotalDayMinutes', 'TotalEveMinutes', 'TotalNightMinutes', 'TotalIntlMinutes', 'TotalIntlCalls', 'CustomerServiceCalls']


Balanceo

In [4]:
# 3. Balanceo con SMOTE
X = df[selected_features]
y = df['Churn']
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

División de Conjunto

In [5]:
# 4. División de Conjunto
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

Entrenamiento de LightGBM con hiperparámetros

In [6]:
# 5. Entrenamiento de LightGBM con búsqueda de hiperparámetros
lgb_model = lgb.LGBMClassifier(random_state=42, n_jobs=-1)

param_grid = {
    'num_leaves': [20, 31, 40, 50],
    'max_depth': [3, 5, 7, -1],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200, 300],
}

grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=5)
grid_search_result = grid_search.fit(X_train, y_train)

# Mostrar los resultados
print("Mejor: %f usando %s" % (grid_search_result.best_score_, grid_search_result.best_params_))



[LightGBM] [Info] Number of positive: 1828, number of negative: 1820
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3402
[LightGBM] [Info] Number of data points in the train set: 3648, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501096 -> initscore=0.004386
[LightGBM] [Info] Start training from score 0.004386
[LightGBM] [Info] Number of positive: 1828, number of negative: 1820
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3472
[LightGBM] [Info] Number of data points in the train set: 3648, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501096 -> initscore=0.004386
[LightGBM] 

Evaluación del modelo

In [7]:
# 6. Evaluación del modelo con los mejores hiperparámetros
y_pred = grid_search_result.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(conf_matrix)
print("Accuracy con LightGBM:", acc)
print(report)

[[565  10]
 [ 19 546]]
Accuracy con LightGBM: 0.974561403508772
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       575
           1       0.98      0.97      0.97       565

    accuracy                           0.97      1140
   macro avg       0.97      0.97      0.97      1140
weighted avg       0.97      0.97      0.97      1140

