In [62]:
!pip install ydata_profiling



In [63]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from ydata_profiling import ProfileReport
import joblib

In [52]:
# Read the raw dataset; every later cell works from `data`.
data = pd.read_csv('data/raw_data.csv')

# Build the exploratory profile and write it out as an HTML report.
profile = ProfileReport(data, title="Invistico Airline Report")
profile.to_file(output_file='airline-report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/23 [00:00<?, ?it/s][A
  4%|▍         | 1/23 [00:00<00:15,  1.44it/s][A
 13%|█▎        | 3/23 [00:01<00:08,  2.41it/s][A
 22%|██▏       | 5/23 [00:02<00:08,  2.24it/s][A
 26%|██▌       | 6/23 [00:02<00:07,  2.21it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.54it/s][A
 43%|████▎     | 10/23 [00:03<00:02,  4.64it/s][A
 57%|█████▋    | 13/23 [00:03<00:01,  6.71it/s][A
 65%|██████▌   | 15/23 [00:03<00:01,  6.72it/s][A
 74%|███████▍  | 17/23 [00:03<00:00,  6.35it/s][A
 78%|███████▊  | 18/23 [00:04<00:00,  6.11it/s][A
 87%|████████▋ | 20/23 [00:04<00:00,  6.46it/s][A
 91%|█████████▏| 21/23 [00:04<00:00,  6.42it/s][A
100%|██████████| 23/23 [00:04<00:00,  4.77it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [53]:
# Map the raw CSV headers to short snake_case names used by the rest of the notebook.
rename_mapping = {
    # Target (identity entry: the name is kept as-is).
    "satisfaction": "satisfaction",

    # Passenger and trip attributes.
    "Gender": "gender",
    "Customer Type": "cust_type",
    "Age": "age",
    "Type of Travel": "travel_type",
    "Class": "class",
    "Flight Distance": "flight_dist",

    # Service ratings.
    "Seat comfort": "seat",
    "Departure/Arrival time convenient": "time_conv",
    "Food and drink": "food",
    "Gate location": "gate_loc",
    "Inflight wifi service": "wifi",
    "Inflight entertainment": "entertain",
    "Online support": "support",
    "Ease of Online booking": "booking",
    "On-board service": "onboard",
    "Leg room service": "legroom",
    "Baggage handling": "baggage",
    "Checkin service": "checkin",
    "Cleanliness": "clean",
    "Online boarding": "online_board",

    # Delays, in minutes.
    "Departure Delay in Minutes": "dep_delay",
    "Arrival Delay in Minutes": "arr_delay",
}

# `rename` returns a new frame; headers missing from the mapping are left unchanged.
data = data.rename(rename_mapping, axis="columns")

In [54]:
# Name of the label column; reused later when the processed splits are exported.
target = "satisfaction"

# Labels first, then every remaining column as the feature matrix.
y = data[target]
x = data.drop(columns=target)

In [55]:
# Split proportions -- train: 70% / validation: 15% / test: 15%.
# Both splits are stratified on the target so that every subset keeps the same
# satisfied/dissatisfied ratio as the data it was drawn from.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.30, random_state=42, stratify=y
)
# Halve the 30% hold-out into equally sized validation and test sets.
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

In [56]:
# Show the distinct values of every column to tell categorical fields from numeric ones.
for col, series in data.items():
    print(f"{col}: {series.unique()}")

# Per-column count of missing values; drives the imputation choices below.
print(data.isna().sum())

satisfaction: ['satisfied' 'dissatisfied']
gender: ['Female' 'Male']
cust_type: ['Loyal Customer' 'disloyal Customer']
age: [65 47 15 60 70 30 66 10 56 22 58 34 62 35 13 52 55 28  9 25 53 16 64 42
 21 20 26 48 57 31 17 33 32 38 29 24 37  7 39 11 49  8 40 45 67 59 44 69
 51 18 23 12 46 41 54 27 63 61 36 50 68 19 14 43 72 71 80 77 85 78 75 79
 74 73 76]
travel_type: ['Personal Travel' 'Business travel']
class: ['Eco' 'Business' 'Eco Plus']
flight_dist: [ 265 2464 2138 ... 5832 5120 4260]
seat: [0 1 4 5 2 3]
time_conv: [0 1 2 3 4 5]
food: [0 1 2 3 4 5]
gate_loc: [2 3 4 1 5 0]
wifi: [2 0 3 4 5 1]
entertain: [4 2 0 3 5 1]
support: [2 3 4 5 1 0]
booking: [3 2 1 5 4 0]
onboard: [3 4 1 2 5 0]
legroom: [0 4 3 2 5 1]
baggage: [3 4 1 2 5]
checkin: [5 2 4 3 1 0]
clean: [3 4 1 2 5 0]
online_board: [2 3 5 4 1 0]
dep_delay: [   0  310   17   30   47   40    5    2   34    4   13  427   15   10
   16   11    9    1   19   35   14    6   27   20    3   90   12   68
   93    7   29   66   97  151   64  

In [57]:
# Columns handled by the numeric pipeline (median imputation + scaling).
num_features = [
    # Continuous measurements.
    'age',
    'flight_dist',
    'dep_delay',
    'arr_delay',
    # Integer service ratings (0-5 in the inspected data).
    'seat',
    'time_conv',
    'food',
    'gate_loc',
    'wifi',
    'entertain',
    'support',
    'booking',
    'onboard',
    'legroom',
    'baggage',
    'checkin',
    'clean',
    'online_board',
]

# Unordered categorical columns handled by the one-hot pipeline.
nom_features = [
    'gender',
    'cust_type',
    'travel_type',
    'class',
]

In [58]:
# Numeric pipeline: fill gaps with the column median, then standardize
# to zero mean / unit variance.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Nominal pipeline: fill gaps with the most frequent category, then one-hot encode.
# handle_unknown='ignore' makes the fitted encoder emit an all-zero block for a
# category it never saw during fit instead of raising, so the persisted
# preprocessor keeps working on validation/test/inference data that contains a
# new label. Output for already-known categories is unchanged.
nom_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Route each column group through its pipeline; the output columns follow the
# order of this list (numeric block first, one-hot block second). Columns not
# listed in either group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(transformers=[
    ("num_feature", num_transformer, num_features),
    ("nom_feature", nom_transformer, nom_features),
])

In [59]:
# Learn imputation values, scaling statistics and categories from the training split only.
x_train_processed = preprocessor.fit_transform(x_train)

# Re-use the fitted preprocessor, without refitting, on the held-out splits.
x_val_processed, x_test_processed = (
    preprocessor.transform(held_out) for held_out in (x_val, x_test)
)

In [60]:
# Recover the one-hot column names generated by the fitted encoder.
fitted_encoder = preprocessor.named_transformers_['nom_feature'].named_steps['encoder']
nom_cols = fitted_encoder.get_feature_names_out(nom_features)

# Full header for the processed matrices: numeric names followed by one-hot names.
all_cols = np.concatenate([num_features, nom_cols])


def _assemble_split(features, labels):
    """Wrap a processed feature matrix in a DataFrame and append the target column.

    The label index is reset so the labels line up positionally with the new
    frame's default 0..n-1 index.
    """
    frame = pd.DataFrame(features, columns=all_cols)
    frame[target] = labels.reset_index(drop=True)
    return frame


data_train = _assemble_split(x_train_processed, y_train)
data_val = _assemble_split(x_val_processed, y_val)
data_test = _assemble_split(x_test_processed, y_test)

# Write each split to disk without the index column.
for split_frame, csv_path in (
    (data_train, "data/train_data.csv"),
    (data_val, "data/validation_data.csv"),
    (data_test, "data/test_data.csv"),
):
    split_frame.to_csv(csv_path, index=False)

In [61]:
# Serialize the fitted preprocessor so the same transformation can be re-applied later.
# The call returns the list of written file paths, which is shown as the cell output.
# NOTE(review): joblib files are pickle-based -- only load this artifact from a trusted location.
joblib.dump(value=preprocessor, filename="data/preprocessor.pkl")

['data/preprocessor.pkl']