## Base: Airline passenger satisfaction

* https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [34]:
# Load the dataset from 'test.csv'. The author previously pointed this cell at
# 'train.csv'; change the file name below to switch back.
csv_path = 'test.csv'
df = pd.read_csv(csv_path)

In [3]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
df.shape

(25976, 25)

In [4]:
# Summarize every column: dtype and number of non-null entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         25976 non-null  int64  
 1   id                                 25976 non-null  int64  
 2   Gender                             25976 non-null  object 
 3   Customer Type                      25976 non-null  object 
 4   Age                                25976 non-null  int64  
 5   Type of Travel                     25976 non-null  object 
 6   Class                              25976 non-null  object 
 7   Flight Distance                    25976 non-null  int64  
 8   Inflight wifi service              25976 non-null  int64  
 9   Departure/Arrival time convenient  25976 non-null  int64  
 10  Ease of Online booking             25976 non-null  int64  
 11  Gate location                      25976 non-null  int

## Limpeza dos Dados

### Dados Faltantes

In [5]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0                            0
id                                    0
Gender                                0
Customer Type                         0
Age                                   0
Type of Travel                        0
Class                                 0
Flight Distance                       0
Inflight wifi service                 0
Departure/Arrival time convenient     0
Ease of Online booking                0
Gate location                         0
Food and drink                        0
Online boarding                       0
Seat comfort                          0
Inflight entertainment                0
On-board service                      0
Leg room service                      0
Baggage handling                      0
Checkin service                       0
Inflight service                      0
Cleanliness                           0
Departure Delay in Minutes            0
Arrival Delay in Minutes             83
satisfaction                          0


In [35]:
# Remove 'Unnamed: 0' and 'id' (presumably row/record identifiers) so they are
# not carried into the feature matrix built later from all remaining columns.
# Rebinding `df` instead of using inplace=True, combined with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for columns that were already removed.
df = df.drop(columns=['Unnamed: 0', 'id'], errors='ignore')

In [7]:
# List the current column names.
df.columns

Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

## Substituição de dados faltantes em Arrival Delay in Minutes

In [36]:
# Impute the missing 'Arrival Delay in Minutes' values with the column mean.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: that chained in-place form
# emits a FutureWarning in pandas 2.x and leaves `df` unchanged once
# copy-on-write is active (the default from pandas 3.0).
# NOTE(review): the mean is computed over all rows, before the train/test split
# made later in the notebook, so held-out rows contribute to the imputed value.
arrival_delay = df['Arrival Delay in Minutes']
media = arrival_delay.mean()
df['Arrival Delay in Minutes'] = arrival_delay.fillna(media)

In [37]:
# Number of missing values still present in the arrival-delay column.
df['Arrival Delay in Minutes'].isna().sum()

0

## Transformação de dados categóricos

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
# Print the distinct values of every column whose dtype is 'object',
# i.e. the columns that still hold text categories.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    print(f'{name}: {df[name].unique()}')

Gender: ['Female' 'Male']
Customer Type: ['Loyal Customer' 'disloyal Customer']
Type of Travel: ['Business travel' 'Personal Travel']
Class: ['Eco' 'Business' 'Eco Plus']
satisfaction: ['satisfied' 'neutral or dissatisfied']


In [38]:
# Integer-encode each categorical column in place of its text labels.
# One LabelEncoder instance is re-fit for every column, so once the loop ends
# it only retains the classes of the last column ('satisfaction').
labelencoder = LabelEncoder()
categorical_columns = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']
for column in categorical_columns:
    df[column] = labelencoder.fit_transform(df[column])

In [13]:
# Re-inspect the dtypes and non-null counts after the categorical encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             25976 non-null  int64  
 1   Customer Type                      25976 non-null  int64  
 2   Age                                25976 non-null  int64  
 3   Type of Travel                     25976 non-null  int64  
 4   Class                              25976 non-null  int64  
 5   Flight Distance                    25976 non-null  int64  
 6   Inflight wifi service              25976 non-null  int64  
 7   Departure/Arrival time convenient  25976 non-null  int64  
 8   Ease of Online booking             25976 non-null  int64  
 9   Gate location                      25976 non-null  int64  
 10  Food and drink                     25976 non-null  int64  
 11  Online boarding                    25976 non-null  int

## Normalização de atributos numéricos

In [14]:
# Preview the first five rows before scaling.
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,0,52,0,1,160,5,4,3,4,...,5,5,5,5,2,5,5,50,44.0,1
1,0,0,36,0,0,2863,1,1,3,1,...,4,4,4,4,3,4,5,0,0.0,1
2,1,1,20,0,1,192,2,0,2,4,...,2,4,1,3,2,2,2,0,0.0,0
3,1,0,44,0,0,3377,0,0,0,2,...,1,1,1,1,3,1,4,0,6.0,1
4,0,0,49,0,1,1182,2,3,4,3,...,2,2,2,2,4,2,4,0,20.0,1


In [39]:
# Names of the numeric columns that will be passed to the scaler.
# The label-encoded categorical columns and the target are deliberately absent.
scaler_cols = [
    'Age',
    'Flight Distance',
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]
print(scaler_cols)

['Age', 'Flight Distance', 'Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']


In [40]:
from sklearn.preprocessing import StandardScaler

# Standardize the selected numeric columns (zero mean, unit variance) and write
# the scaled values back into the DataFrame.
# NOTE(review): the scaler is fit on every row, before the train/test split
# performed later in the notebook — confirm this is intended.
scaler = StandardScaler()
numeric_values = df[scaler_cols]
ajuste = scaler.fit(numeric_values)  # fit() returns the scaler itself
df[scaler_cols] = ajuste.transform(numeric_values)

In [17]:
# Preview the first five rows after scaling.
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,0,0.817887,0,1,-1.035171,1.703853,0.621641,0.172143,0.797831,...,1.227138,1.25917,1.250974,1.161716,-1.035348,1.144063,1.298998,0.95381,0.781143,1
1,0,0,-0.239238,0,0,1.671443,-1.291598,-1.334871,0.172143,-1.542065,...,0.479907,0.479178,0.49273,0.311739,-0.247517,0.297077,1.298998,-0.382286,-0.393543,1
2,1,1,-1.296363,0,1,-1.003128,-0.542735,-1.987042,-0.535609,0.797831,...,-1.014556,0.479178,-1.782001,-0.538238,-1.035348,-1.396893,-0.974927,-0.382286,-0.393543,0
3,1,0,0.289325,0,0,2.186131,-2.04046,-1.987042,-1.951114,-0.7621,...,-1.761787,-1.8608,-1.782001,-2.238193,-0.247517,-2.243878,0.541023,-0.382286,-0.233358,1
4,0,0,0.619676,0,1,-0.011804,-0.542735,-0.03053,0.879895,0.017866,...,-1.014556,-1.080807,-1.023758,-1.388216,0.540315,-1.396893,0.541023,-0.382286,0.140405,1


## Modelos de Machine Learning

### Separação entre treino e teste

In [41]:
# Separate the target Series (y) from the feature DataFrame (X).
y = df['satisfaction']
X = df.drop(columns=['satisfaction'])

In [42]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

### Random Forest

In [43]:
# Train a 100-tree random forest on the training split.
# random_state is fixed (same seed as the train/test split) so the fitted model,
# and therefore the metrics reported from it, are reproducible across runs;
# without it every execution grows a different forest.
forest = RandomForestClassifier(n_estimators=100, random_state=10)
forest.fit(X_train, y_train)

# Mean accuracy on the training and held-out data (compare the two to spot overfitting).
forest_score = forest.score(X_train, y_train)
forest_test = forest.score(X_test, y_test)

# Class predictions for the held-out rows, consumed by the metrics cell.
y_pred = forest.predict(X_test)

### Métricas de avaliação

In [21]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import (recall_score,
                             accuracy_score,
                             precision_score,
                             f1_score)

In [44]:
# Report accuracy, recall, precision and F1 for the held-out predictions.
# Each entry pairs a print template with the scoring function that fills it.
metric_reports = (
    ("ACC: %.3f", accuracy_score),
    ("Recall :%.2f", recall_score),
    ("Precision :%.2f", precision_score),
    ("F1-score :%.2f", f1_score),
)
for template, scorer in metric_reports:
    print(template % scorer(y_test, y_pred))


ACC: 0.956
Recall :0.93
Precision :0.97
F1-score :0.95


In [23]:
from sklearn.decomposition import PCA

In [51]:
# Project the feature matrix onto its first 15 principal components.
# random_state pins the result whenever scikit-learn selects a randomized SVD
# solver (svd_solver='auto' may do so depending on data shape and library
# version), keeping the components reproducible between runs.
# NOTE(review): PCA is fit on all rows of X, before the train/test split that
# follows — confirm this is intended.
pca = PCA(n_components=15, random_state=10)
X_pca = pca.fit_transform(X)
X_pca

array([[-2.31932565, -1.17197812, -0.99397359, ..., -0.05212325,
         0.70711162, -0.44751429],
       [-1.84987085,  2.4556449 ,  0.63768507, ...,  0.12161403,
        -0.52114387,  0.326862  ],
       [ 3.20220723,  0.43567257, -0.02101258, ...,  0.82300915,
         0.26563863, -0.18414977],
       ...,
       [ 1.74349854, -0.58087294, -2.08316801, ...,  0.10596834,
         0.18406625, -0.82300638],
       [-1.30747706,  0.2283029 , -0.22661859, ..., -0.08000553,
        -0.41618368,  0.3172009 ],
       [ 4.21468805, -1.60015718,  2.26209903, ...,  0.19687426,
        -1.31014504, -0.94649025]])

In [52]:
# Wrap the PCA output array in a DataFrame; columns receive default integer labels.
pca_DF = pd.DataFrame(X_pca)

In [53]:
# Preview the first five rows of the principal-component DataFrame.
pca_DF.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-2.319326,-1.171978,-0.993974,1.340432,-1.023946,-0.232535,0.621192,2.078583,0.04086,-0.044525,0.123033,-0.242263,-0.052123,0.707112,-0.447514
1,-1.849871,2.455645,0.637685,-0.312121,1.097835,-1.293896,0.108767,-0.630972,0.196876,-0.471412,0.002512,0.05763,0.121614,-0.521144,0.326862
2,3.202207,0.435673,-0.021013,-0.702036,-0.910364,-1.096197,-0.391743,0.053618,1.00094,1.891195,1.371401,-0.048689,0.823009,0.265639,-0.18415
3,2.942881,2.650925,3.207204,-0.521071,2.763809,-0.390362,0.192377,-1.737856,0.301784,0.482527,0.01416,-0.498956,-0.05419,0.0588,-0.32067
4,2.032268,-0.254214,1.619954,-0.340706,-0.112118,0.7467,0.245164,-0.551839,-0.568231,0.281606,-0.023864,1.784713,0.194602,0.054617,0.728362


In [54]:
# Split again using the PCA components as features, with the same proportion
# and seed as the earlier split. This rebinds X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test = train_test_split(
    pca_DF,
    y,
    test_size=0.3,
    random_state=10,
)

### Random Forest

In [55]:
# Train a 100-tree random forest on the PCA-transformed training split.
# random_state is fixed (same seed as the train/test split) so the fitted model,
# and therefore the metrics reported from it, are reproducible across runs;
# without it every execution grows a different forest.
forest = RandomForestClassifier(n_estimators=100, random_state=10)
forest.fit(X_train, y_train)

# Mean accuracy on the training and held-out data (compare the two to spot overfitting).
forest_score = forest.score(X_train, y_train)
forest_test = forest.score(X_test, y_test)

# Class predictions for the held-out rows, consumed by the metrics cell.
y_pred = forest.predict(X_test)

### Métricas de avaliação

In [56]:
# Report accuracy, recall, precision and F1 for the held-out predictions
# of the model trained on the PCA components.
# Each entry pairs a print template with the scoring function that fills it.
metric_reports = (
    ("ACC: %.3f", accuracy_score),
    ("Recall :%.2f", recall_score),
    ("Precision :%.2f", precision_score),
    ("F1-score :%.2f", f1_score),
)
for template, scorer in metric_reports:
    print(template % scorer(y_test, y_pred))


ACC: 0.906
Recall :0.85
Precision :0.93
F1-score :0.89
