### Automatizando modelos com pipeline
  * https://stephenallwright.com/cross_val_score-sklearn/
  * https://stackoverflow.com/questions/76533127/get-the-best-model-after-cross-validation
  * https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer

In [2]:
# loading data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# columns to drop
drop_columns = ['Unnamed: 0','id','Gender','Gate location']
train.drop(columns=drop_columns,inplace=True)
test.drop(columns=drop_columns,inplace=True)

In [4]:
train.isnull().sum(), test.isnull().sum()

(Customer Type                          0
 Age                                    0
 Type of Travel                         0
 Class                                  0
 Flight Distance                        0
 Inflight wifi service                  0
 Departure/Arrival time convenient      0
 Ease of Online booking                 0
 Food and drink                         0
 Online boarding                        0
 Seat comfort                           0
 Inflight entertainment                 0
 On-board service                       0
 Leg room service                       0
 Baggage handling                       0
 Checkin service                        0
 Inflight service                       0
 Cleanliness                            0
 Departure Delay in Minutes             0
 Arrival Delay in Minutes             310
 satisfaction                           0
 dtype: int64,
 Customer Type                         0
 Age                                   0
 Type of Travel      

In [5]:
# Substituição dados faltantes
# Train data
media = train['Arrival Delay in Minutes'].mean()
train['Arrival Delay in Minutes'].fillna(media, inplace=True)

# Test data
media = test['Arrival Delay in Minutes'].mean()
test['Arrival Delay in Minutes'].fillna(media, inplace=True)

In [6]:
train['Arrival Delay in Minutes'].isnull().sum(), test['Arrival Delay in Minutes'].isnull().sum()

(0, 0)

In [7]:
# Separa os dataframes em previsores e classe
x_train = train.drop(['satisfaction'], axis=1)
y_train = train['satisfaction']
x_test = test.drop(['satisfaction'], axis=1)
y_test = test['satisfaction']

In [8]:
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((103904, 20), (103904,), (25976, 20), (25976,))

In [9]:
# Separa as colunas numéricas e categóricas
numerical_cols = x_train.select_dtypes(include=['number']).columns
categorical_cols = x_train.select_dtypes(include=['object']).columns

In [10]:
categorical_cols

Index(['Customer Type', 'Type of Travel', 'Class'], dtype='object')

### Machine Learning Pipeline

In [11]:
# Cria a normalização e transformação para as colunas
numerical_scaler = Pipeline(steps=[
    ('scaler',StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('onehot',OneHotEncoder())
])

In [12]:
# Combina dois pipelines em uma única coluna de execução
preprocessor = ColumnTransformer(
    [('num', numerical_scaler, numerical_cols),
     ('cat', categorical_transformer, categorical_cols)
    ])

In [78]:
# # Combina dois pipelines em uma única coluna de execução
# preprocessor = ColumnTransformer(
#     columns_trannformer=[
#         ('num', numerical_scaler, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

In [13]:
# Create the full pipeline
pipeline = Pipeline(steps=[
    ('preprocess',preprocessor),
    ('model',DecisionTreeClassifier())
])
pipeline

In [80]:
# # Fit the pipeline to the training data
# pipeline.fit(x_train,y_train)

In [81]:
# y_pred = pipeline.predict(x_test)

In [82]:
# report = classification_report(y_test,y_pred)
# print(report)

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [15]:
skf = StratifiedKFold(n_splits=10)

#  RandomForest params
# grid_params={'model__n_estimators':[10,30,50],
#             'model__criterion': ['gini', 'entropy', 'log_loss'],
#             'model__max_features': ['sqrt', 'log2', None]
#             }

grid_params={'model__criterion': ['gini', 'entropy', 'log_loss'],
            'model__max_features': ['sqrt', 'log2', None]
            }

search = GridSearchCV(pipeline, grid_params, cv=skf)
search.fit(x_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.947):
{'model__criterion': 'log_loss', 'model__max_features': None}


In [17]:
y_pred = search.predict(x_test)

In [18]:
report = classification_report(y_test,y_pred)
print(report)

                         precision    recall  f1-score   support

neutral or dissatisfied       0.95      0.95      0.95     14573
              satisfied       0.94      0.94      0.94     11403

               accuracy                           0.95     25976
              macro avg       0.95      0.95      0.95     25976
           weighted avg       0.95      0.95      0.95     25976

