In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df_train = pd.read_csv('../data/processed/train_clean.csv', sep=';')

df_train.columns

TARGET='churn'

In [None]:

sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)

TARGET = 'churn'

# Separamos los tipos de columnas para analizarlas de forma diferente
categorical_cols = df_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df_train.select_dtypes(include=['number']).columns.tolist()

# Nos aseguramos de que la variable objetivo no está en las listas de predictoras
if TARGET in numerical_cols:
    numerical_cols.remove(TARGET)
if TARGET in categorical_cols:
    categorical_cols.remove(TARGET)

#After checking there are no repeated Customer_ID values, that is not a relevant value even it is a number
df_train.remove['Customer_ID']

print(f"Variable Objetivo: {TARGET}")
print(f"\nVariables Numéricas ({len(numerical_cols)}): {numerical_cols}")
print(f"\nVariables Categóricas ({len(categorical_cols)}): {categorical_cols}")


Variable Objetivo: churn

Variables Numéricas (87): ['rev_Mean', 'mou_Mean', 'totmrc_Mean', 'da_Mean', 'ovrmou_Mean', 'ovrrev_Mean', 'vceovr_Mean', 'datovr_Mean', 'roam_Mean', 'change_mou', 'change_rev', 'drop_vce_Mean', 'drop_dat_Mean', 'blck_vce_Mean', 'blck_dat_Mean', 'unan_vce_Mean', 'unan_dat_Mean', 'plcd_vce_Mean', 'plcd_dat_Mean', 'recv_vce_Mean', 'recv_sms_Mean', 'comp_vce_Mean', 'comp_dat_Mean', 'custcare_Mean', 'ccrndmou_Mean', 'cc_mou_Mean', 'inonemin_Mean', 'threeway_Mean', 'mou_cvce_Mean', 'mou_cdat_Mean', 'mou_rvce_Mean', 'owylis_vce_Mean', 'mouowylisv_Mean', 'iwylis_vce_Mean', 'mouiwylisv_Mean', 'peak_vce_Mean', 'peak_dat_Mean', 'mou_peav_Mean', 'mou_pead_Mean', 'opk_vce_Mean', 'opk_dat_Mean', 'mou_opkv_Mean', 'mou_opkd_Mean', 'drop_blk_Mean', 'attempt_Mean', 'complete_Mean', 'callfwdv_Mean', 'callwait_Mean', 'months', 'uniqsubs', 'actvsubs', 'totcalls', 'totmou', 'totrev', 'adjrev', 'adjmou', 'adjqty', 'avgrev', 'avgmou', 'avgqty', 'avg3mou', 'avg3qty', 'avg3rev', 'avg6

In [19]:
for col in numerical_cols:
    unique_values = df_train[col].unique()
    if len(unique_values) == 2:
        #if 2 values, change from int or float to type category
        df_train[col] = df_train[col].astype(int).astype('category')
        print(f"Numerical column {col} has {len(unique_values)} unique values: {df_train[col].unique()}\n")

for col in categorical_cols:
    unique_values = df_train[col].unique()
    #Convert from object to category
    df_train[col] = df_train[col].astype('category')
    print(f"Categorical {col} has {len(unique_values)} unique values: {unique_values}\n")


Numerical column truck has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column rv has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column forgntvl has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column infobase_isnull has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column rev_Mean_isnull has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column hnd_price_isnull has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column change_rev_isnull has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column avg6rev_isnull has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column lor_isnull has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column adults_isnull has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column income_isnull has 2 unique values: [0, 1]
Categories (2, int64): [0, 1]

Numerical column numbcars

In [23]:
len(df_train['Customer_ID'].unique())

80000

### Reducción de dimensionalidad

In [20]:
X = df_train.drop(columns=[TARGET])
y = df_train[TARGET]

X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print("DataFrame original:", X.shape)
print("DataFrame codificado:", X_encoded.shape)

DataFrame original: (80000, 108)
DataFrame codificado: (80000, 236)


In [22]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_encoded, y)

feature_importance = pd.DataFrame({
    'feature': X_encoded.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("--- Top 20 Variables Más Importantes ---")
print(feature_importance.head(20))

--- Top 20 Variables Más Importantes ---
          feature  importance
76        eqpdays    0.029952
9      change_mou    0.023047
48         months    0.022735
77    Customer_ID    0.021578
1        mou_Mean    0.019169
54         adjrev    0.018955
10     change_rev    0.018652
53         totrev    0.018511
57         avgrev    0.018268
59         avgqty    0.018195
58         avgmou    0.017638
0        rev_Mean    0.017467
51       totcalls    0.017420
55         adjmou    0.017369
60        avg3mou    0.017221
56         adjqty    0.016977
52         totmou    0.016911
28  mou_cvce_Mean    0.016335
63        avg6mou    0.016152
41  mou_opkv_Mean    0.016010
