# Machine Learning

Importación de librerías

In [310]:
import pandas as pd # Librería de lectura de datos
import numpy as np # Librería de cálculo numérico

import matplotlib.pyplot as plt # Librería de visualización de datos

from sklearn.model_selection import train_test_split,GridSearchCV # Función para dividir los datos en entrenamiento y prueba
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier,plot_tree, export_text

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_curve,roc_auc_score # Funciones para evaluar el rendimiento del modelo

Carga del dataset

In [311]:
# Read the Telco customer-churn CSV (path is relative to the notebook's folder).
df = pd.read_csv(filepath_or_buffer='../csv/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [312]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Modificaciones de valores

In [313]:
# Store the senior-citizen flag as a categorical column instead of an integer.
df['SeniorCitizen'] = df['SeniorCitizen'].astype('category')

# TotalCharges: cast to text and trim surrounding whitespace, parse to numbers
# (anything unparseable becomes NaN), then replace those NaN values with 0.
total_charges_text = df['TotalCharges'].astype(str).str.strip()
df['TotalCharges'] = pd.to_numeric(total_charges_text, errors='coerce').fillna(0)

# Re-check the dtypes after the conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   customerID        7043 non-null   object  
 1   gender            7043 non-null   object  
 2   SeniorCitizen     7043 non-null   category
 3   Partner           7043 non-null   object  
 4   Dependents        7043 non-null   object  
 5   tenure            7043 non-null   int64   
 6   PhoneService      7043 non-null   object  
 7   MultipleLines     7043 non-null   object  
 8   InternetService   7043 non-null   object  
 9   OnlineSecurity    7043 non-null   object  
 10  OnlineBackup      7043 non-null   object  
 11  DeviceProtection  7043 non-null   object  
 12  TechSupport       7043 non-null   object  
 13  StreamingTV       7043 non-null   object  
 14  StreamingMovies   7043 non-null   object  
 15  Contract          7043 non-null   object  
 16  PaperlessBilling  7043 n

# Codificación de variables categóricas

In [314]:
# Take an independent (deep) copy of the cleaned frame.
df_enconding = df.copy(deep=True)

## One Hot Encoding

In [315]:
# Initialise the one-hot encoder: dense array output with integer indicators.
enconder = OneHotEncoder(sparse_output=False, dtype=int)

# Select the categorical (object-dtype) columns from `df` itself.
# `df_enconding` is rebound to the encoded frame further down, so reading the
# input from it would make a second run of this cell fail: no object columns
# would be left and the drop() below would raise KeyError.
cols_categoricas = df.select_dtypes(include=['object']).columns

# Leave out the identifier and the target; neither is one-hot encoded here.
cols_categoricas = cols_categoricas.drop(['customerID','Churn'])

# Fit the encoder and transform the categorical columns.
enconded = enconder.fit_transform(df[cols_categoricas])

# Names of the generated indicator columns.
cols_names = enconder.get_feature_names_out(cols_categoricas)

# Build the encoded frame on df's own index so the concat below aligns row for
# row even when df does not carry a default RangeIndex.
df_enconding = pd.DataFrame(enconded, columns=cols_names, index=df.index)

# Append the indicator columns to the original data, then drop the source
# categorical columns and the customer identifier.
df_final = pd.concat([df, df_enconding], axis=1)
df_final = df_final.drop(columns=[*cols_categoricas, 'customerID'])
df_final.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 46 columns):
 #   Column                                   Non-Null Count  Dtype   
---  ------                                   --------------  -----   
 0   SeniorCitizen                            7043 non-null   category
 1   tenure                                   7043 non-null   int64   
 2   MonthlyCharges                           7043 non-null   float64 
 3   TotalCharges                             7043 non-null   float64 
 4   Churn                                    7043 non-null   object  
 5   gender_Female                            7043 non-null   int64   
 6   gender_Male                              7043 non-null   int64   
 7   Partner_No                               7043 non-null   int64   
 8   Partner_Yes                              7043 non-null   int64   
 9   Dependents_No                            7043 non-null   int64   
 10  Dependents_Yes                      

## Label Encoding para la variable objetivo (Churn)

In [316]:
# Fit a label encoder on the original Churn column, then store the resulting
# integer codes in the final frame in place of the text labels.
label_encoder = LabelEncoder().fit(df['Churn'])
df_final['Churn'] = label_encoder.transform(df['Churn'])
print(df_final)

     SeniorCitizen  tenure  MonthlyCharges  TotalCharges  Churn  \
0                0       1           29.85         29.85      0   
1                0      34           56.95       1889.50      0   
2                0       2           53.85        108.15      1   
3                0      45           42.30       1840.75      0   
4                0       2           70.70        151.65      1   
...            ...     ...             ...           ...    ...   
7038             0      24           84.80       1990.50      0   
7039             0      72          103.20       7362.90      0   
7040             0      11           29.60        346.45      0   
7041             1       4           74.40        306.60      1   
7042             0      66          105.65       6844.50      0   

      gender_Female  gender_Male  Partner_No  Partner_Yes  Dependents_No  ...  \
0                 1            0           0            1              1  ...   
1                 0            1 