In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
data_train = pd.read_csv('https://raw.githubusercontent.com/alura-cursos/combina-classificadores/main/dados/train.csv')
data_test = pd.read_csv('https://raw.githubusercontent.com/alura-cursos/combina-classificadores/main/dados/test.csv')

In [3]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

In [4]:
data_train['satisfaction'].unique()

array(['neutral or dissatisfied', 'satisfied'], dtype=object)

In [5]:
def pre(df):
    df.drop(columns=['Unnamed: 0', 'id'], inplace=True)
    df.dropna(inplace=True)
    cate_columns = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
    df = pd.get_dummies(df, columns=cate_columns)
    x = df.drop(columns='satisfaction')
    y = df['satisfaction'].map({'neutral or dissatisfied': 0, 'satisfied': 1})
    return x, y

In [6]:
x_train, y_train = pre(data_train)
x_test, y_test = pre(data_test)

Decision Tree

In [7]:
dtc = DecisionTreeClassifier(random_state=42)

In [8]:
cv_results_dtc = cross_validate(dtc, x_train, y_train, cv=5)

In [9]:
cv_results_dtc['test_score'].mean()

0.9443500672495754

In [10]:
dtc.fit(x_train, y_train)

In [11]:
dtc.score(x_test, y_test)

0.94612443517553

Logistic Regression

In [12]:
lrc = LogisticRegression(random_state=42)

In [13]:
cv_results_lrc = cross_validate(lrc, x_train, y_train, cv=5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [14]:
cv_results_lrc['test_score'].mean()

0.8202308750585392

In [15]:
lrc.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
lrc.score(x_test, y_test)

0.7743019348858765

Pipeline

In [17]:
pipeline = Pipeline([
        ('Scaler', StandardScaler()),
        ('model', lrc)
])

In [18]:
cv_results_lrc_2 = cross_validate(pipeline, x_train, y_train, cv=5)

In [19]:
cv_results_lrc_2['test_score'].mean()

0.874934880970234

GaussianNB

In [20]:
gnb = GaussianNB()

### Voting Classifier

In [21]:
models_name = ['Tree', 'Logistic', 'Naive Bayes']

In [22]:
pipelines = []

for model, name in zip([dtc, lrc, gnb], models_name):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    pipelines.append(pipeline)
    validation = cross_validate(pipeline, x_train, y_train, cv=5)
    print(validation['test_score'].mean())

0.9443983339909126
0.874934880970234
0.8483406855152964


In [23]:
voting = VotingClassifier(estimators=[
    (models_name[0], pipelines[0]),
    (models_name[1], pipelines[1]),
    (models_name[2], pipelines[2])
], voting='hard')

In [24]:
voting

In [25]:
validation_voting = cross_validate(voting, x_train, y_train, cv=5)

In [26]:
validation_voting['test_score'].mean()

0.8917408753720582

In [27]:
voting_soft = VotingClassifier(estimators=[
    (models_name[0], pipelines[0]),
    (models_name[1], pipelines[1]),
    (models_name[2], pipelines[2])
], voting='soft')

In [28]:
validation_voting_soft = cross_validate(voting, x_train, y_train, cv=5)

In [29]:
validation_voting_soft['test_score'].mean()

0.8917408753720582

In [30]:
weight_params = {
    'voting': ['hard', 'soft'],
    'weights': [(1,1,1), (2,1,1), (1,2,1), (1,1,2)]
}

In [31]:
grid_search = GridSearchCV(voting, weight_params)

In [32]:
# grid_search.fit(x_train, y_train)

In [33]:
# grid_search.best_params_

In [34]:
# grid_search.best_score_

In [35]:
# grid_search.score(x_test, y_test)

### Bagging Classifier

In [36]:
bagging_classifier = BaggingClassifier(n_estimators=10, random_state=42)

In [37]:
bagging_classifier.fit(x_train, y_train)

In [38]:
bagging_classifier.score(x_test, y_test)

0.9586760900629514

In [39]:
base_model = pipelines[0]

In [40]:
grid_params = {
    'n_estimators': [10, 20, 30],
    'max_samples': [0.5, 0.7, 0.9],
    'max_features': [0.5, 0.7, 0.9],
}

In [41]:
bagging_grid = GridSearchCV(
    BaggingClassifier(),
    grid_params,
    cv=5
)

In [42]:
# bagging_grid.fit(x_train, y_train)
# bagging_grid.best_params_

In [43]:
best_params = {'max_features': 0.9, 'max_samples': 0.9, 'n_estimators': 30}

In [44]:
# bagging_classifier_best = BaggingClassifier(estimator=base_model, **best_params)

In [45]:
# bagging_classifier_best.fit(x_train, y_train)

In [46]:
# bagging_classifier_best.score(x_test, y_test)

ExtraTreesClassifier

In [47]:
extra_grid_params = {
    'n_estimators': [10, 20, 30],
    'max_features': [0.5, 0.7, 0.9],
}

In [48]:
extratrees_grid = GridSearchCV(ExtraTreesClassifier(), extra_grid_params, cv=5)

In [49]:
# extratrees_grid.fit(x_train, y_train)

In [50]:
# extra_best_params = {'max_features': 0.7, 'n_estimators': 30}

In [51]:
# extratrees_classifier = ExtraTreesClassifier(**extra_best_params)

In [52]:
# extratrees_classifier.fit(x_train, y_train)

In [53]:
# extratrees_classifier.score(x_test, y_test)

### Boosting

In [54]:
ada_model = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=42)

In [55]:
# ada_model.fit(x_train, y_train)

In [56]:
# ada_model.score(x_train, y_train)

In [57]:
ada_grid = {
'n_estimators': [50,100,200],
'learning_rate': [0.1,0.01,0.001]
}

In [58]:
ada_grid = GridSearchCV(AdaBoostClassifier(), ada_grid, cv=5)

In [59]:
# ada_grid.fit(x_train, y_train)

Stacking

In [60]:
base_models_stacking = [(models_name[0], pipelines[0]),
(models_name[1], pipelines[1]),
(models_name[2], pipelines[2])]

In [61]:
base_models_stacking

[('Tree',
  Pipeline(steps=[('scaler', StandardScaler()),
                  ('model', DecisionTreeClassifier(random_state=42))])),
 ('Logistic',
  Pipeline(steps=[('scaler', StandardScaler()),
                  ('model', LogisticRegression(random_state=42))])),
 ('Naive Bayes',
  Pipeline(steps=[('scaler', StandardScaler()), ('model', GaussianNB())]))]

In [62]:
meta_model = LogisticRegression(random_state=42)

In [63]:
stacking_model = StackingClassifier(estimators=base_models_stacking, final_estimator=meta_model)

In [64]:
stacking_model.fit(x_train, y_train)

In [65]:
stacking_model.score(x_test, y_test)

0.9462402966052601

Validando o Stacking

In [66]:
'''
from mlxtend.classifier import StackingCVClassifier
grid_search.best_params_
parametros_cat = grid_search.best_params_
base_models = [
    CatBoostClassifier(**parametros_cat),
    ExtraTreesClassifier(**parametros_trees),
    pipelines[2]
]
stacking_cv_classifier = StackingCVClassifier(classifiers=base_models, meta_classifier=modelo_meta)
mapeamento = {'neutral or dissatisfied': 0, 'satisfied': 1}
y_treino = y_treino.replace(mapeamento)
y_teste = y_teste.replace(mapeamento)
stacking_cv_classifier.fit(X_treino, y_treino)
'''


"\nfrom mlxtend.classifier import StackingCVClassifier\ngrid_search.best_params_\nparametros_cat = grid_search.best_params_\nbase_models = [\n    CatBoostClassifier(**parametros_cat),\n    ExtraTreesClassifier(**parametros_trees),\n    pipelines[2]\n]\nstacking_cv_classifier = StackingCVClassifier(classifiers=base_models, meta_classifier=modelo_meta)\nmapeamento = {'neutral or dissatisfied': 0, 'satisfied': 1}\ny_treino = y_treino.replace(mapeamento)\ny_teste = y_teste.replace(mapeamento)\nstacking_cv_classifier.fit(X_treino, y_treino)\n"