# Machine Learning

Importación de librerias

In [115]:
import pandas as pd # Librería de lectura de datos
import numpy as np # Librería de cálculo numérico

import matplotlib.pyplot as plt # Librería de visualización de datos

from sklearn.model_selection import train_test_split,GridSearchCV # Función para dividir los datos en entrenamiento y prueba
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier,plot_tree, export_text

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve # Funciones para evaluar el rendimiento del modelo

Carga del dataset

In [116]:
df = pd.read_csv('../clean_data/telco-customer.csv')

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  7043 non-null   object 
 1   seniorcitizen           7043 non-null   object 
 2   partner                 7043 non-null   object 
 3   dependents              7043 non-null   object 
 4   tenure                  7043 non-null   int64  
 5   phoneservice            7043 non-null   object 
 6   multiplelines           7043 non-null   object 
 7   internetservice         7043 non-null   object 
 8   onlinesecurity          7043 non-null   object 
 9   onlinebackup            7043 non-null   object 
 10  deviceprotection        7043 non-null   object 
 11  techsupport             7043 non-null   object 
 12  streamingtv             7043 non-null   object 
 13  streamingmovies         7043 non-null   object 
 14  contract                7043 non-null   

In [118]:
df_copy = df.copy()

Cambio necesario a variables objetos a categóricas

In [119]:
cols_object = df.select_dtypes(include=['object']).columns
df_copy[cols_object] = df_copy[cols_object].astype('category')
df_copy.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   gender                  7043 non-null   category
 1   seniorcitizen           7043 non-null   category
 2   partner                 7043 non-null   category
 3   dependents              7043 non-null   category
 4   tenure                  7043 non-null   int64   
 5   phoneservice            7043 non-null   category
 6   multiplelines           7043 non-null   category
 7   internetservice         7043 non-null   category
 8   onlinesecurity          7043 non-null   category
 9   onlinebackup            7043 non-null   category
 10  deviceprotection        7043 non-null   category
 11  techsupport             7043 non-null   category
 12  streamingtv             7043 non-null   category
 13  streamingmovies         7043 non-null   category
 14  contract                

# Codificación de variables categóricas

## One Hot Encoding

In [120]:
# Iniciliar one hot enconder
enconder = OneHotEncoder(sparse_output=False, dtype=int)

# Obtenemos las variables categóricas
cols_categoricas = df_copy.select_dtypes(include='category').columns
#print(cols_categoricas)

#Eliminamos la variable objetivo
cols_categoricas = cols_categoricas.drop('baja')
#print(cols_categoricas)

#Aplicar One Hot Encoding a las columnas categóricas
enconded = enconder.fit_transform(df_copy[cols_categoricas])

# Obtenemos el nombre de las nuevas columnas
column_names = enconder.get_feature_names_out(cols_categoricas)

# Creación del dataframe con valores codificados
df_encoded = pd.DataFrame(enconded,columns=column_names,index=df_copy.index)
df_encoded.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 43 columns):
 #   Column                                   Non-Null Count  Dtype
---  ------                                   --------------  -----
 0   gender_Female                            7043 non-null   int64
 1   gender_Male                              7043 non-null   int64
 2   seniorcitizen_SeniorCitizen              7043 non-null   int64
 3   seniorcitizen_noSeniorCitizen            7043 non-null   int64
 4   partner_No                               7043 non-null   int64
 5   partner_Yes                              7043 non-null   int64
 6   dependents_No                            7043 non-null   int64
 7   dependents_Yes                           7043 non-null   int64
 8   phoneservice_No                          7043 non-null   int64
 9   phoneservice_Yes                         7043 non-null   int64
 10  multiplelines_No                         7043 non-null   int64
 11  mult

## Label Enconding para variable objetivo

In [121]:
# Aplicar label enconder
label_encoder = LabelEncoder()
target_var = df_copy['baja']
#print(target_var)
target_var = label_encoder.fit_transform(target_var)
print(target_var)


[0 0 1 ... 0 1 0]


## Unir en un dataframe las codificaciones

In [122]:
# Obtenemos las variables numéricas
columns_num_int = df_copy.select_dtypes(include='int64').columns
print(columns_num_int)

columns_num_float = df_copy.select_dtypes(include='float64').columns
print(columns_num_float)

#Obtenemos las variables booleanas
colums_bool = df_copy.select_dtypes(include='bool').columns
print(colums_bool)

Index(['tenure'], dtype='object')
Index(['monthlycharges', 'totalcharges'], dtype='object')
Index(['cliente_larga_duracion', 'phone+internet'], dtype='object')


In [123]:
#Unimos todas los datos en un dataframe
column_target = pd.Series(target_var, name='baja', index=df_copy.index)


X_num_int   = df_copy[columns_num_int]
X_num_float = df_copy[columns_num_float]
X_bool      = df_copy[colums_bool]

df_final = pd.concat([column_target, X_num_int, X_num_float, X_bool, df_encoded], axis=1)

df_final.head()



Unnamed: 0,baja,tenure,monthlycharges,totalcharges,cliente_larga_duracion,phone+internet,gender_Female,gender_Male,seniorcitizen_SeniorCitizen,seniorcitizen_noSeniorCitizen,...,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year,paperlessbilling_No,paperlessbilling_Yes,paymentmethod_Bank transfer (automatic),paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0,1,29.85,29.85,False,False,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
1,0,34,56.95,1889.5,True,True,0,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,1,2,53.85,108.15,False,True,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0,45,42.3,1840.75,True,False,0,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,1,2,70.7,151.65,False,True,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0


## Algoritmos de clasificación

In [124]:
#Separación entre variable objetivo y las demás
#Demás variables
X = df_final.drop(columns='baja')
#X.info()
#Variable objetivo
Y = df_final['baja']

In [125]:
#Divisíon 70%/30%
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

## Optimización de hiperparámetros con optuna

### Regresión logística

In [126]:

#Crear modelo
modelo = LogisticRegression()
#Entrenar modelo
modelo.fit(X_train,y_train)

print("Intercepto (β0):", modelo.intercept_[0])
print("Coeficiente (β1):", modelo.coef_[0][0])

Intercepto (β0): -0.14077662042989167
Coeficiente (β1): -0.05737512511200509


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Evaluación y Predicción del modelo

In [127]:
from sklearn.metrics import classification_report

y_pred = modelo.predict(X_test)
y_pred_prob = modelo.predict_proba(X_test)[:, 1]

confusion_matrix = confusion_matrix(y_test,y_pred)
precision = precision_score(y_test,y_pred, zero_division = 1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
auc = roc_auc_score(y_test, y_pred_prob)

# Mostrar las métricas
print("Matriz de Confusión:\n", confusion_matrix)
print("Precisión:", precision)
print("Sensibilidad (Recall):", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)

Matriz de Confusión:
 [[1402  150]
 [ 250  311]]
Precisión: 0.6746203904555315
Sensibilidad (Recall): 0.5543672014260249
F1 Score: 0.6086105675146771
AUC-ROC: 0.8448548936913097


## SVM

In [128]:
from sklearn import svm
from sklearn.metrics import confusion_matrix as cm_func
model_svm = svm.SVC(kernel="rbf",
    probability=True,
    class_weight="balanced")
model_svm.fit(X_train,y_train)

y_pred = model_svm.predict(X_test)
y_pred_prob = model_svm.predict_proba(X_test)[:, 1]

cm = cm_func(y_test,y_pred)
precision = precision_score(y_test,y_pred, zero_division = 1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
auc = roc_auc_score(y_test, y_pred_prob)

# Mostrar las métricas
print("Matriz de Confusión:\n", cm)
print("Precisión:", precision)
print("Sensibilidad (Recall):", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)


Matriz de Confusión:
 [[1103  449]
 [ 245  316]]
Precisión: 0.4130718954248366
Sensibilidad (Recall): 0.5632798573975044
F1 Score: 0.4766214177978884
AUC-ROC: 0.7205038177407794


## Arboles de decision

In [129]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix as cm_func2

modelo_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
modelo_tree.fit(X, Y)

y_pred = modelo_tree.predict(X_test)
y_pred_prob = modelo_tree.predict_proba(X_test)[:, 1]

cm2 = cm_func2(y_test,y_pred)
precision = precision_score(y_test,y_pred, zero_division = 1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
auc = roc_auc_score(y_test, y_pred_prob)

# Mostrar las métricas
print("Matriz de Confusión:\n", cm2)
print("Precisión:", precision)
print("Sensibilidad (Recall):", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)

Matriz de Confusión:
 [[1551    1]
 [   6  555]]
Precisión: 0.9982014388489209
Sensibilidad (Recall): 0.9893048128342246
F1 Score: 0.9937332139659804
AUC-ROC: 0.9999833461969605


## K-nearest Neighbour(KNN)

In [130]:
from sklearn.metrics import confusion_matrix as cm_func3


knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
y_pred_prob = knn.predict_proba(X_test)[:, 1]

cm2 = cm_func3(y_test,y_pred)
precision = precision_score(y_test,y_pred, zero_division = 1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
auc = roc_auc_score(y_test, y_pred_prob)

# Mostrar las métricas
print("Matriz de Confusión:\n", cm2)
print("Precisión:", precision)
print("Sensibilidad (Recall):", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)

Matriz de Confusión:
 [[1357  195]
 [ 317  244]]
Precisión: 0.5558086560364465
Sensibilidad (Recall): 0.43493761140819964
F1 Score: 0.488
AUC-ROC: 0.7190227778084055
