# Exercício 1: Classificação

Passos:

1. Selecionar os dados de origem
2. Quebrar as variáveis categóricas e numéricas
3. Tratar as variáveis categóricas
4. Juntar os dados e formar a ABT
5. Aplicar feature selection
6. Selecionar os dados para a ABT final de modelagem
7. Estruturar o dicionário para tunning
8. Instanciar os obj
9. Executar o train
10. Observar os resultados

In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

# configs do Pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

In [3]:
df = pd.read_csv('new_train.csv', sep=',')
df['difficulty'] = -1 # desconhecido
df.loc[(df['poutcome'] == 'success') & (df['previous'].between(0, 1)), 'difficulty'] = 0 # fácil
df.loc[(df['poutcome'] == 'success') & (df['previous'].between(2, 4)), 'difficulty'] = 1 # médio
df.loc[(df['poutcome'] == 'success') & (df['previous'].between(5, 7)), 'difficulty'] = 2 # difícil
df.loc[(df['poutcome'] == 'nonexistent') & (df['previous'] > 7), 'difficulty'] = 3 # muito difícil
df.loc[(df['poutcome'] == 'failure'), 'difficulty'] = 4 # impossível
df_exercicio = df.copy()
df_exercicio.dtypes

age             int64
job            object
marital        object
education      object
default        object
housing        object
loan           object
contact        object
month          object
day_of_week    object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
y              object
difficulty      int64
dtype: object

In [5]:
# Passo 2
df_exercicio_cat = df_exercicio.drop(columns=['age', 'duration', 'campaign', 'pdays', 'previous', 'difficulty', 'y'], axis=1)
df_exercicio_num = df_exercicio.drop(columns=['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y'], axis=1)
df_exercicio_var_resp = df_exercicio['y']

df_exercicio_var_resp = df_exercicio_var_resp.replace('no', 0)
df_exercicio_var_resp = df_exercicio_var_resp.replace('yes', 1)

# Passe 3
df_exercicio_cat_dum = pd.get_dummies(df_exercicio_cat, prefix_sep='_', columns=df_exercicio_cat.columns, drop_first=True)

# Passo 4
df_exerciocio_feature_selection = df_exercicio_cat_dum.merge(df_exercicio_num, left_index=True, right_index=True)

In [10]:
# Normalizando
df_exercicio_cat_dum_norm = MinMaxScaler().fit_transform(df_exercicio_cat_dum)

# Feature selection categóricas
selecao_chi = SelectKBest(chi2, k=5)
selecao_chi.fit(df_exercicio_cat_dum_norm, df_exercicio_var_resp)
suport_chi = selecao_chi.get_support()
features_chi = df_exercicio_cat_dum.loc[:, suport_chi].columns.tolist()
features_chi

['contact_telephone',
 'month_mar',
 'month_oct',
 'month_sep',
 'poutcome_success']

In [12]:
# Feature selection numéricas

selecao_rfe = RFE(estimator=LogisticRegression(random_state=42, max_iter=200), n_features_to_select=3, step=1)
selecao_rfe.fit(df_exercicio_num, df_exercicio_var_resp)
rfe_suporte = selecao_rfe.get_support()
rfe_features = df_exercicio_num.loc[:, rfe_suporte].columns.tolist()
rfe_features

['campaign', 'previous', 'difficulty']

## Modelagem

In [13]:
df_exerciocio_modelagem = df_exercicio_cat_dum.merge(df_exercicio_num, left_index=True, right_index=True)

# Separando em treino e teste
x_treino_exer, x_teste_exer, y_treino_exer, y_teste_exer = train_test_split(df_exerciocio_modelagem, df_exercicio_var_resp, test_size=0.3, random_state=42)

# Dicionário para tunning de hiperparâmetros
exercicioj_dict = {
    'n_estimators':[500, 100],
    'min_samples_leaf':[2, 100],
    'min_samples_split':[10, 200],
    'random_state':[42]
}

In [15]:
# Modelo
gb_exercicio = GradientBoostingClassifier(random_state=42)

# Tunning
gb_exercicio_tunning = GridSearchCV(gb_exercicio, exercicioj_dict, cv=2, scoring=make_scorer(accuracy_score))

gb_exercicio_tunning.fit(x_treino_exer, y_treino_exer)

GridSearchCV(cv=2, estimator=GradientBoostingClassifier(random_state=42),
             param_grid={'min_samples_leaf': [2, 100],
                         'min_samples_split': [10, 200],
                         'n_estimators': [500, 100], 'random_state': [42]},
             scoring=make_scorer(accuracy_score))

In [16]:
gb_exercicio_tunning.best_estimator_

GradientBoostingClassifier(min_samples_leaf=100, min_samples_split=10,
                           random_state=42)

In [17]:
gb_exercicio_tunning.best_score_

0.9088662361116161

In [18]:
accuracy_score(y_teste_exer, gb_exercicio_tunning.predict(x_teste_exer))

0.9095599393019727

In [19]:
accuracy_score(y_treino_exer, gb_exercicio_tunning.predict(x_treino_exer))

0.9141990028181227

In [20]:
import joblib
joblib.dump(gb_exercicio_tunning, 'gb_exercicio_tunning.pkl')

['gb_exercicio_tunning.pkl']

# Exercício 2: Regressão

Passos a serem seguidos:
1. Selecionar os dados de origem
2. Quebrar as categóricas e numéricas
3. Juntar os dados e formar a ABT
4. Aplicar feature selection
5. Selecionar os dados para a ABT de modelagem
6. Estruturar o dicionário para tunning
7. Instanciar os objetos
8. Executar o train
9. Observar os resultados
10. Serializar o modelo final

In [24]:
dados = pd.read_csv('new_train.csv', sep=',')
dados.drop(columns=['y'], axis=1, inplace=True)
dados['difficulty'] = -1 # desconhecido
dados.loc[(dados['poutcome'] == 'success') & (dados['previous'].between(0, 4)), 'difficulty'] = 0 # fácil e médio
dados.loc[(dados['poutcome'] == 'success') & (dados['previous'].between(5, 8)), 'difficulty'] = 1 # difícil e muito difícil
dados.loc[(dados['poutcome'] == 'failure'), 'difficulty'] = 2 # impossível
df_exercicio2 = dados.copy()
df_exercicio2.dtypes

age             int64
job            object
marital        object
education      object
default        object
housing        object
loan           object
contact        object
month          object
day_of_week    object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
difficulty      int64
dtype: object

In [25]:
df_exercicio2_cat = df_exercicio2.drop(columns=['age', 'duration', 'campaign', 'pdays', 'previous', 'difficulty'], axis=1)
df_exercicio2_cat.columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')

In [27]:
# Tira a variável 'age' porque ela vai ser nossa variável resposta
df_exercicio2_num = df_exercicio2.drop(columns=['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'age'])
df_exercicio2_num.columns

Index(['duration', 'campaign', 'pdays', 'previous', 'difficulty'], dtype='object')

In [29]:
df_exercicio2_cat_dum = pd.get_dummies(df_exercicio2_cat, prefix_sep='_', columns=df_exercicio2_cat.columns, drop_first=True)
df_exercicio2_cat_dum.head()

Unnamed: 0,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_married,marital_single,marital_unknown,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,default_unknown,default_yes,housing_unknown,housing_yes,loan_unknown,loan_yes,contact_telephone,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0
1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0


In [30]:
df_exercicio2_modelagem = df_exercicio2_cat_dum.merge(df_exercicio2_num, left_index=True, right_index=True)
df_exercicio2_modelagem.shape

(32950, 48)

In [31]:
# Variável resposta
df_exercicio2_var_resp = df_exercicio2['age']

# Seleção de variáveis categóricas

dt_teste = DecisionTreeRegressor(random_state=42)
dt_selecao = SelectFromModel(dt_teste, max_features=5)
dt_selecao.fit(df_exercicio2_cat_dum, df_exercicio2_var_resp)
dt_suporte = dt_selecao.get_support()
dt_feature = df_exercicio2_cat_dum.loc[:, dt_suporte].columns.tolist()
dt_feature

['job_retired', 'marital_single', 'default_unknown', 'housing_yes', 'loan_yes']

In [32]:
# Seleção das variáveis numéricas

rf_exercicio2 = RandomForestRegressor(random_state=42)
rf_selecao = RFE(rf_exercicio2, n_features_to_select=5, step=1)
rf_selecao.fit(df_exercicio2_num, df_exercicio2_var_resp)
rf_suporte = rf_selecao.get_support()
rf_feature = df_exercicio2_num.loc[:, rf_suporte].columns.tolist()
rf_feature

['duration', 'campaign', 'pdays', 'previous', 'difficulty']

In [33]:
# Vamos usar no exercício todas as features (essa parte de seleção acima foi só para rever os conceitos)

x_treino_exer, x_teste_exer, y_treino_exer, y_teste_exer = train_test_split(df_exercicio2_modelagem, df_exercicio2_var_resp, test_size=0.3, random_state=42)

In [39]:
exercicio2_dict = {
    'max_depth':[1, 50, 100],
    'n_estimators':[50, 200, 500],
    'random_state':[42]
}

## Modelo

In [40]:
gb_exercicio2 = GradientBoostingRegressor(random_state=42)

# tunning
gb_exercicio2_tunning = GridSearchCV(gb_exercicio2, exercicio2_dict, cv=2, scoring=make_scorer(mean_absolute_error, greater_is_better=False))

In [41]:
gb_exercicio2_tunning.fit(x_treino_exer, y_treino_exer)

GridSearchCV(cv=2, estimator=GradientBoostingRegressor(random_state=42),
             param_grid={'max_depth': [1, 50, 100],
                         'n_estimators': [50, 200, 500], 'random_state': [42]},
             scoring=make_scorer(mean_absolute_error, greater_is_better=False))

In [42]:
gb_exercicio2_tunning.best_estimator_

GradientBoostingRegressor(max_depth=1, n_estimators=500, random_state=42)

In [43]:
gb_exercicio2_tunning.best_score_

-6.59142518836844

In [44]:
mean_absolute_error(y_treino_exer, gb_exercicio2_tunning.predict(x_treino_exer))

6.576545274552671

In [45]:
mean_absolute_error(y_teste_exer, gb_exercicio2_tunning.predict(x_teste_exer))

6.5854359857213085

In [46]:
joblib.dump(gb_exercicio2_tunning, 'gb_exercicio2_tunning.pkl')

['gb_exercicio2_tunning.pkl']

# Exercício 3: Clustering

Os exercícios de clustering, devido à restrição computacional, estão no Google Colab