### Base: Airline Passenger Satisfaction
* https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## 1) Resumo geral da base de dados

In [2]:
df=pd.read_csv('train.csv')
df

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,103899,94171,Female,disloyal Customer,23,Business travel,Eco,192,2,1,...,2,3,1,4,2,3,2,3,0.0,neutral or dissatisfied
103900,103900,73097,Male,Loyal Customer,49,Business travel,Business,2347,4,4,...,5,5,5,5,5,5,4,0,0.0,satisfied
103901,103901,68825,Male,disloyal Customer,30,Business travel,Business,1995,1,1,...,4,3,2,4,5,5,4,7,14.0,neutral or dissatisfied
103902,103902,54173,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,...,1,4,5,1,5,4,1,0,0.0,neutral or dissatisfied


## Limpeza dos Dados

### Dados Faltantes

In [3]:
df.isnull().sum()

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction    

In [4]:
# Remove id and Unnamed: 0
df.drop(['Unnamed: 0'],axis=1, inplace=True)
df.drop(['id'], axis=1, inplace=True)

In [5]:
df.columns

Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

## Substituição dados faltantes em Arrival Delay in Minutes

In [6]:
media = df['Arrival Delay in Minutes'].mean()
df['Arrival Delay in Minutes'].fillna(media, inplace=True)

In [7]:
df['Arrival Delay in Minutes'].isnull().sum()

0

## Transformação dados categóricos

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
categorical_columns = df.select_dtypes(include=['object']).columns

In [10]:
for categorical_feature in categorical_columns:
  print(f'{categorical_feature}: {df[categorical_feature].unique()}')

Gender: ['Male' 'Female']
Customer Type: ['Loyal Customer' 'disloyal Customer']
Type of Travel: ['Personal Travel' 'Business travel']
Class: ['Eco Plus' 'Business' 'Eco']
satisfaction: ['neutral or dissatisfied' 'satisfied']


In [11]:
df.head(5)

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [31]:
# for i in df.columns:
#   if(df[i].dtype=='object'):
#     print(f'{i}: {df[i].unique()}')

In [12]:
# Transforma os dados categóricos
labelencoder = LabelEncoder()
df['Gender'] = labelencoder.fit_transform(df['Gender'])
df['Customer Type'] = labelencoder.fit_transform(df['Customer Type'])
df['Type of Travel'] = labelencoder.fit_transform(df['Type of Travel'])
df['Class'] = labelencoder.fit_transform(df['Class'])
df['satisfaction'] = labelencoder.fit_transform(df['satisfaction'])

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  int64  
 1   Customer Type                      103904 non-null  int64  
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  int64  
 4   Class                              103904 non-null  int64  
 5   Flight Distance                    103904 non-null  int64  
 6   Inflight wifi service              103904 non-null  int64  
 7   Departure/Arrival time convenient  103904 non-null  int64  
 8   Ease of Online booking             103904 non-null  int64  
 9   Gate location                      103904 non-null  int64  
 10  Food and drink                     103904 non-null  int64  
 11  Online boarding                    1039

## Normalização de atributos numéricos

In [15]:
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1,0,13,1,2,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,0
1,1,1,25,0,0,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,0
2,0,0,26,0,0,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,1
3,0,0,25,0,0,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,0
4,1,0,61,0,0,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,1


In [16]:
scaler_cols = df.drop('satisfaction', axis=1).select_dtypes(include=['number']).columns
print(scaler_cols)

Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes',
       'Arrival Delay in Minutes'],
      dtype='object')


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
ajuste = scaler.fit(df[scaler_cols])
df[scaler_cols] = ajuste.transform(df[scaler_cols])

In [18]:
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1.015031,-0.472767,-1.745279,1.490614,2.264618,-0.731539,0.203579,0.616172,0.173776,-1.547323,...,1.231704,0.479403,-0.26684,0.311769,0.549799,1.156436,1.30587,0.266393,0.073014,0
1,1.015031,2.115208,-0.95136,-0.670865,-0.957054,-0.957184,0.203579,-0.695245,0.173776,0.018094,...,-1.769081,-1.849161,1.25338,-0.535045,-1.821012,0.305848,-1.742292,-0.361375,-0.237539,0
2,-0.985192,-0.472767,-0.8852,-0.670865,-0.957054,-0.047584,-0.549533,-0.695245,-0.54106,-0.764614,...,1.231704,0.479403,-0.26684,0.311769,0.549799,0.305848,1.30587,-0.387532,-0.392816,1
3,-0.985192,-0.472767,-0.95136,-0.670865,-0.957054,-0.629246,-0.549533,1.27188,1.603448,1.583511,...,-1.018885,-1.072973,1.25338,-0.535045,-1.821012,0.305848,-0.980251,-0.099805,-0.159901,0
4,1.015031,-0.472767,1.430397,-0.670865,-0.957054,-0.978244,0.203579,-0.039537,0.173776,0.018094,...,-0.268688,-0.296785,0.49327,0.311769,-0.240472,-0.54474,-0.218211,-0.387532,-0.392816,1


## 3) Classificação com Decision Tree

### Separação entre treino e teste (70% e 30%)

In [19]:
X = df.drop(['satisfaction'], axis=1).values
y = df['satisfaction'].values

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Decision Tree

In [21]:

DT = DecisionTreeClassifier(random_state = 10)
DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)

DT_score = DT.score(X_train, y_train)
DT_test = DT.score(X_test, y_test)

cm = confusion_matrix(y_test,y_pred)
print('Training Score',DT_score)
print('Testing Score \n',DT_test)
print(cm)

Training Score 1.0
Testing Score 
 0.945078916976774
[[16862   838]
 [  874 12598]]


In [22]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     17700
           1       0.94      0.94      0.94     13472

    accuracy                           0.95     31172
   macro avg       0.94      0.94      0.94     31172
weighted avg       0.95      0.95      0.95     31172



## 4) validação cruzada

- https://drigols.medium.com/introdu%C3%A7%C3%A3o-a-valida%C3%A7%C3%A3o-cruzada-k-fold-2a6bced32a90
- https://dataml.com.br/validacao-cruzada-aninhada-com-scikit-learn/

In [23]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

#### Decision Tree - Cross-Validation

In [24]:
DT = DecisionTreeClassifier(random_state=10)
skf = StratifiedKFold(n_splits=10)

clf =  cross_validate(
    DT,
    X_train,
    y_train,
    scoring='f1',
    return_train_score=True,
    return_estimator = True,
    cv=skf, # k-fold
)

np.mean(clf['test_score'])


0.9350006428962221

In [25]:
# Predict all models from cross-validation
f1_lst = []
for model in clf['estimator']:
  y_pred = model.predict(X_test)
  f1_lst.append(f1_score(y_test, y_pred))
  # print(f'Acc: {accuracy_score(y_test,y_pred)} - F1: {f1_score(y_test, y_pred)}')

print(np.mean(f1_lst))

0.9354031264724766


#### Teste com balanceamento de classes

In [26]:
df['satisfaction'].value_counts()

satisfaction
0    58879
1    45025
Name: count, dtype: int64

In [29]:
from imblearn.over_sampling import SMOTE

In [30]:
sm = SMOTE()
X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

In [31]:
DT = DecisionTreeClassifier(random_state=10)
skf = StratifiedKFold(n_splits=10)

clf =  cross_validate(
    DT,
    X_train_oversampled,
    y_train_oversampled,
    scoring='f1',
    return_train_score=True,
    return_estimator = True,
    cv=skf, # k-fold
)

np.mean(clf['test_score'])

0.9487020976779983

In [32]:
# Predict all models from cross-validation
f1_lst = []
for model in clf['estimator']:
  y_pred = model.predict(X_test)
  f1_lst.append(f1_score(y_test, y_pred))
  # print(f'Acc: {accuracy_score(y_test,y_pred)} - F1: {f1_score(y_test, y_pred)}')

print(np.mean(f1_lst))

0.9347891113280564


In [33]:
DT = DT.fit(X_train_oversampled, y_train_oversampled)
y_pred = DT.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     17700
           1       0.93      0.93      0.93     13472

    accuracy                           0.94     31172
   macro avg       0.94      0.94      0.94     31172
weighted avg       0.94      0.94      0.94     31172



## 5) Técnicas de ajuste de hiperparâmetros
- https://scikit-learn.org/stable/modules/grid_search.html#


- Abordagens disponíveis no scikit-learn:
    - GridSearchCV: considera exaustivamente todas as combinações de parâmetros;
    - RandomizedSearchCV: pesquisa aleatória de parâmetros, em que cada configuração é amostrada a partir de uma distribuição de possíveis valores de parâmetro.

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#### Árvore de Decisão

In [35]:
DT = DecisionTreeClassifier()
skf = StratifiedKFold(n_splits=10, shuffle=True)
param_grid = {'criterion': ['gini', 'entropy', 'log_loss'],
              'splitter': ['best', 'random'],
              'max_features': ['sqrt','log2', None]}

#### Exemplo com GridSearchCV
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- https://vitalflux.com/grid-search-explained-python-sklearn-examples/

In [36]:
g_search = GridSearchCV(estimator = DT,
                        param_grid = param_grid,
                        refit=True,
                        cv = skf,
                        return_train_score=True)

In [37]:
g_search.fit(X_train, y_train);
print(g_search.best_params_)

{'criterion': 'log_loss', 'max_features': None, 'splitter': 'best'}


In [38]:
print(g_search.best_score_)

0.9466534598023786


In [39]:
g_search.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_features', 'param_splitter', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'split5_train_score', 'split6_train_score', 'split7_train_score', 'split8_train_score', 'split9_train_score', 'mean_train_score', 'std_train_score'])

In [None]:
# g_results =  pd.DataFrame(g_search.cv_results_)

In [40]:
# model = g_search.best_estimator_
# model.score(X_test,y_test)
y_pred = g_search.predict(X_test)
f1_score(y_test,y_pred)

0.9392994953992283

In [41]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     17700
           1       0.94      0.94      0.94     13472

    accuracy                           0.95     31172
   macro avg       0.95      0.95      0.95     31172
weighted avg       0.95      0.95      0.95     31172



#### Exemplo com RandomizedSearchCV
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

In [42]:
from sklearn.model_selection import RandomizedSearchCV

In [43]:
r_search = RandomizedSearchCV(estimator = DT,
                              param_distributions = param_grid,
                              cv = skf,
                              refit=True,
                              return_train_score=True)

In [44]:
r_search.fit(X_train, y_train);
print(r_search.best_params_)

{'splitter': 'random', 'max_features': None, 'criterion': 'entropy'}


In [45]:
print(r_search.best_score_)

0.9465984467430146


In [46]:
# model = r_search.best_estimator_
# model.score(X_test,y_test)
y_pred = r_search.predict(X_test)
f1_score(y_test,y_pred)

0.9373682068735896

In [47]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     17700
           1       0.93      0.94      0.94     13472

    accuracy                           0.95     31172
   macro avg       0.94      0.95      0.94     31172
weighted avg       0.95      0.95      0.95     31172

