# Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.utils import resample

from imblearn.over_sampling import RandomOverSampler

from scipy import stats

# Criando o data frame

In [2]:
df = pd.read_csv('archive/train.csv')

Removendo coluna de id e coluna com nome nulo

In [3]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.drop(columns=['id'])

In [4]:
df.dtypes

Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness                            int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
satisfacti

# Separação do data frame
Separando por classe

In [5]:
class1_df = df[df['satisfaction'] == 'satisfied']
class2_df = df[df['satisfaction'] == 'neutral or dissatisfied']

In [6]:
print('Quantidade de dados na classe 1:', class1_df.shape[0])
print('Quantidade de dados na classe 2:', class2_df.shape[0])

Quantidade de dados na classe 1: 45025
Quantidade de dados na classe 2: 58879


Separação aleatória de cada classe

In [7]:
class1_50_percent = class1_df.sample(frac=0.5)
remaining_class1 = class1_df.drop(class1_50_percent.index)
class1_25_percent_1 = remaining_class1.sample(frac=0.5)
class1_25_percent_2 = remaining_class1.drop(class1_25_percent_1.index)

In [8]:
print('Quantidade de dados na classe 1 50%:', class1_50_percent.shape[0])
print('Quantidade de dados na classe 1 25% 1:', class1_25_percent_1.shape[0])
print('Quantidade de dados na classe 1 25% 2:', class1_25_percent_2.shape[0])

Quantidade de dados na classe 1 50%: 22512
Quantidade de dados na classe 1 25% 1: 11256
Quantidade de dados na classe 1 25% 2: 11257


In [9]:
class2_50_percent = class2_df.sample(frac=0.5)
remaining_class2 = class2_df.drop(class2_50_percent.index)
class2_25_percent_1 = remaining_class2.sample(frac=0.5)
class2_25_percent_2 = remaining_class2.drop(class2_25_percent_1.index)

In [10]:
print('Quantidade de dados na classe 2 50%:', class2_50_percent.shape[0])
print('Quantidade de dados na classe 2 25% 1:', class2_25_percent_1.shape[0])
print('Quantidade de dados na classe 2 25% 2:', class2_25_percent_2.shape[0])

Quantidade de dados na classe 2 50%: 29440
Quantidade de dados na classe 2 25% 1: 14720
Quantidade de dados na classe 2 25% 2: 14719


In [11]:
class1_50_percent = class1_50_percent.dropna()
class2_50_percent = class2_50_percent.dropna()

## Oversampling

In [12]:
size_difference = (len(class2_50_percent) + len(class2_25_percent_1)) - (len(class1_50_percent) + len(class1_25_percent_1))
size_difference

10357

In [13]:
to_be_oversampled = pd.concat([class2_50_percent, class2_25_percent_1, class1_50_percent, class1_25_percent_1])

In [14]:
oversampler = RandomOverSampler(random_state=42)

In [15]:
X_resampled, y_resampled = oversampler.fit_resample(to_be_oversampled.drop('satisfaction', axis=1), to_be_oversampled['satisfaction'])

In [16]:
df_resampled = pd.DataFrame(X_resampled, columns=df.drop('satisfaction', axis=1).columns)
merged_df = df_resampled.join(y_resampled, how='inner')
merged_df

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Female,Loyal Customer,37,Personal Travel,Eco Plus,1127,2,4,2,3,...,3,4,4,5,4,4,3,0,0.0,neutral or dissatisfied
1,Female,Loyal Customer,47,Personal Travel,Eco,173,1,4,1,2,...,4,4,1,2,5,4,5,0,0.0,neutral or dissatisfied
2,Male,Loyal Customer,31,Business travel,Business,1587,4,2,2,2,...,4,4,2,4,2,3,4,0,6.0,neutral or dissatisfied
3,Female,Loyal Customer,14,Personal Travel,Eco Plus,888,3,3,3,4,...,3,2,2,3,2,4,3,33,38.0,neutral or dissatisfied
4,Male,disloyal Customer,15,Business travel,Eco,845,2,2,2,3,...,4,2,1,4,1,1,4,0,0.0,neutral or dissatisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88125,Male,disloyal Customer,21,Business travel,Business,1121,4,5,4,2,...,1,5,2,5,3,4,1,0,0.0,satisfied
88126,Female,Loyal Customer,40,Business travel,Eco,404,4,4,1,4,...,4,3,1,2,1,2,4,32,26.0,satisfied
88127,Male,Loyal Customer,28,Business travel,Business,1739,1,1,1,1,...,2,4,3,5,4,5,2,0,0.0,satisfied
88128,Female,Loyal Customer,41,Business travel,Business,368,2,2,2,2,...,4,4,4,4,5,4,4,0,0.0,satisfied


In [17]:
class1_df = merged_df[merged_df['satisfaction'] == 'satisfied']
class2_df = merged_df[merged_df['satisfaction'] == 'neutral or dissatisfied']

In [18]:
class1_50_percent = class1_df.sample(frac=50/75)
class1_25_percent_1 = class1_df.drop(class1_50_percent.index)

In [19]:
class2_50_percent = class2_df.sample(frac=50/75)
class2_25_percent_1 = class2_df.drop(class2_50_percent.index)

## Tratamento

In [20]:
def treatment(df):
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df.select_dtypes(include=['object']).columns

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ])
        

    result = preprocessor.fit_transform(df)
    
    num_col_names = numeric_features.tolist()
    cat_col_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features).tolist()

    new_col_names = num_col_names + cat_col_names

    result = pd.DataFrame(result, columns=new_col_names)

    return result

In [21]:
class1_50_percent = treatment(class1_50_percent)
class2_50_percent = treatment(class2_50_percent)
class1_25_percent_1 = treatment(class1_25_percent_1)
class2_25_percent_1 = treatment(class2_25_percent_1)

In [22]:
class2_50_percent.head()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,...,Gender_Female,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction_neutral or dissatisfied
0,0.938928,-1.065948,-0.412446,0.574633,-0.451942,0.020261,0.026713,-0.568924,-0.03036,0.076014,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,-0.70366,-1.036661,0.622084,0.574633,0.375783,0.852722,1.514411,0.300744,1.503385,1.588425,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,-1.433699,2.233242,1.656614,-2.083373,1.203508,0.852722,0.770562,-0.568924,-0.797232,-0.680191,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,-1.616209,-0.114773,0.622084,-1.418871,0.375783,0.020261,-1.460984,0.300744,-1.564104,-1.436397,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.668967,-1.065948,1.656614,1.239135,1.203508,1.685183,0.770562,1.170413,0.736513,0.83222,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


## Fazendo oversampling no conjunto de treinamento

Calculando a classe majoritaria e a diferênça

In [24]:
size_difference = (len(class2_50_percent) + len(class2_25_percent_1)) - (len(class1_50_percent) + len(class1_25_percent_1))
size_difference

0

In [25]:
train_df = pd.concat([class1_50_percent, class2_50_percent], ignore_index=True)
train_df = train_df.sample(frac=1).reset_index(drop=True)

In [26]:
validation_df = pd.concat([class1_25_percent_1, class2_25_percent_1], ignore_index=True)
validation_df = validation_df.sample(frac=1).reset_index(drop=True)

In [27]:
test_df = pd.concat([class1_25_percent_2, class2_25_percent_2], ignore_index=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)

In [28]:
train_df.head()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,...,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction_satisfied,satisfaction_neutral or dissatisfied
0,-0.277804,-0.089306,-1.446976,-0.75437,-0.451942,-0.8122,0.026713,-0.568924,-0.797232,-1.436397,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,,1.0
1,-0.216967,-1.004828,-1.446976,-2.083373,-1.279667,-0.8122,-1.460984,-1.438593,-1.564104,-1.436397,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,,1.0
2,1.668967,-0.408911,1.656614,-0.75437,1.203508,0.852722,0.770562,1.170413,0.736513,0.83222,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,,1.0
3,0.999764,1.97603,0.622084,1.239135,0.375783,0.020261,0.026713,0.300744,1.503385,0.83222,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,,1.0
4,1.486457,-0.307045,-0.412446,-0.75437,-0.451942,0.852722,0.770562,-0.568924,0.736513,0.83222,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,,1.0


In [29]:
test_df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Male,Loyal Customer,47,Business travel,Eco Plus,788,1,1,1,1,...,1,2,1,4,1,4,1,18,4.0,neutral or dissatisfied
1,Male,Loyal Customer,56,Personal Travel,Business,189,1,5,1,2,...,2,2,1,4,5,2,5,26,15.0,neutral or dissatisfied
2,Male,Loyal Customer,18,Personal Travel,Eco,181,2,4,2,3,...,3,4,3,5,5,5,3,12,4.0,neutral or dissatisfied
3,Female,Loyal Customer,56,Business travel,Business,3714,1,1,1,1,...,4,4,4,4,5,4,3,0,0.0,satisfied
4,Male,Loyal Customer,34,Personal Travel,Eco,954,4,3,4,5,...,1,4,2,1,4,1,1,0,0.0,neutral or dissatisfied


In [30]:
validation_df.head()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,...,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction_satisfied,satisfaction_neutral or dissatisfied
0,-1.557568,-0.750854,1.652504,1.246276,1.20288,1.697924,0.761992,1.17458,0.72799,0.823985,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,,1.0
1,1.910388,0.085612,1.162599,1.305796,1.247893,1.472153,-1.229521,-0.878055,0.89764,0.04095,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,
2,1.006257,-0.805723,0.618069,0.579683,0.375227,-1.657941,1.501186,0.300253,1.492793,1.574866,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,,1.0
3,-1.313395,0.607149,-1.450801,-0.753502,-1.280078,0.019992,1.501186,-1.448403,1.492793,1.574866,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,,1.0
4,-0.947134,-0.520156,0.618069,-0.086909,0.375227,-0.818974,1.501186,0.300253,1.492793,1.574866,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,,1.0


In [31]:
train_df['satisfaction'] = train_df['satisfaction_satisfied'].apply(lambda x: 1 if x == 1.0 else 0)
validation_df['satisfaction'] = validation_df['satisfaction_satisfied'].apply(lambda x: 1 if x == 1.0 else 0)

In [32]:
train_df.drop(columns=['satisfaction_satisfied', 'satisfaction_neutral or dissatisfied'], inplace=True)
validation_df.drop(columns=['satisfaction_satisfied', 'satisfaction_neutral or dissatisfied'], inplace=True)

In [33]:
# Salvar o dataset atualizado
train_df.to_csv('dataset_tratado/train_df.csv', index=False)
validation_df.to_csv('dataset_tratado/validation_df.csv', index=False)