# Práctico 03: Introducción al aprendizaje automático

## Importar librerias

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Opción para ver todas las columnas del dataset en el notebook
pd.set_option('display.max_columns', 50)

### Leer el dataset

In [3]:
df = pd.read_csv('./data.csv', sep=';')
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,334,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,383,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,189,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,442,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


### Análisis de las variables

#### Valores nulos

#### Estandarización
No estandarizaremos ninguna columna por el momento, ya que, implementaremos XGBoost como modelo.

#### Variables categóricas y numéricas

In [4]:
df["y"] = df["y"].replace({"no": 0, "yes":1})

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

def get_contactado(x):
    if x >= 999:
        return '0'
    elif x < 6:
        return '1'
    elif 5 < x < 11:
        return '2'   
    else:
        return '3'
    
def productos_financieros(x):
    if x.loan == 'yes' or x.housing == 'yes':
        return 'yes'
    else:
        return 'no'

class CatCustom(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        
        X["job"] = X.job.replace("unknown", X.job.mode()[0])
        X["marital"] = X.marital.replace("unknown", X.marital.mode()[0])
        X["education"] = X.education.replace("unknown", X.education.mode()[0])
        X["loan"] = X.loan.replace("unknown", X.loan.mode()[0])
        X["housing"] = X.housing.replace("unknown", X.housing.mode()[0])
        
        # Education
        X["education"] = X["education"].replace({
                            'illiterate': 'ninguno',
                            'basic.4y': 'primario',
                           'basic.6y':'primario',
                           'basic.9y': 'primario',
                           'high.school':'secundario',
                           'professional.course':'terciario',
                           'university.degree':'universitario'})
        # Creamos esta columna para ver si el usuario adquirio productos financieros
        X['productos_financieros'] = X.apply(lambda x: productos_financieros(x), axis=1)
        # Drop de columnas
        X = X.drop(['loan', 'housing'], axis=1)
        
        X = pd.get_dummies(X)
        return X
    
class NumCustom(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # Borrar outliers
        cols = ['age']
        for c in cols:
            z_scores = zscore(df[c])
            abs_z_scores = np.abs(z_scores)
            filtered_entries = (abs_z_scores < 3)
            X = X[filtered_entries]
        
        X_with_nan = X[cols][X[cols].isna().any(axis=1)]
        X.merge(X_with_nan)
        
        #Contactado
        X['contactado'] = X.apply(lambda x: get_contactado(x['pdays']), axis=1)    
        
        X = X.drop(["pdays", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "nr.employed", "duration"], axis=1)
        return X

In [6]:
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ("num_custom", NumCustom())
])

cat_pipeline = Pipeline([
    ("cat_custom", CatCustom()),
])
'''
pipeline_completo = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, obj_cols),
])
'''

pipeline_completo = Pipeline([
    ("num_custom", NumCustom()),
    ("cat_custom", CatCustom()),
])
df = pipeline_completo.fit_transform(X=df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['contactado'] = X.apply(lambda x: get_contactado(x['pdays']), axis=1)


In [7]:
X = df.drop(["y"], axis=1).copy()
y = df.y.copy()

# Desbalanceo de clases
## Usar SMOTETomek para desbalanceo de clases

SMOTE (Synthetic Minority Oversampling Technique) es una tecnica de oversampling y crea muestras sintéticas nuevas de la clase minoritaria. Tomek es usado para quitar las muestras que se encuentran cerca de la frontera de las dos clases, y con ello se puede aumentar la separacion entre las dos clases.

In [8]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42)
X, y = smt.fit_resample(X,y)

In [9]:
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_temp, y_temp, test_size=0.2)

# Modelo baseline

In [11]:
import xgboost as xgb

clf = xgb.XGBClassifier(objective="binary:logistic",
                        use_label_encoder=False,
                       random_state=42)

In [12]:
clf.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [13]:
y_pred = clf.predict(X_test)

In [14]:
from sklearn.metrics import f1_score

f1_score(y_test, y_pred)

0.937118599097904

Usando el dataset desbalanceado, el f1_score => 0.5428 \
Usando SMOTETomek para balancearlo, el f1_score => 0.9575

In [22]:
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict

kfold = KFold(n_splits=5)
results = cross_val_score(clf, X, y, cv=kfold, scoring="f1")



In [29]:
print(f"F1: {results.mean()*100}")

F1: 55.124992857604106


In [30]:
import pickle

pickle.dump(clf, open("xgb_modelo_1", 'wb'))