In [25]:
from tpot import TPOTClassifier
import pandas as pd

train = pd.read_csv("../../datasets/classification/train.csv")
test = pd.read_csv("../../datasets/classification/test.csv")

train = train.dropna(axis = 0)
test = test.dropna(axis = 0)

y_train = train["satisfaction"]
X_train = train.drop("satisfaction", axis = 1)

y_test = test["satisfaction"]
X_test = test.drop("satisfaction", axis = 1)
X_train.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,5,4,3,4,4,5,5,25,18.0
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,1,5,3,1,4,1,1,6.0
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,5,4,3,4,4,4,5,0,0.0
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,2,5,3,1,4,2,11,9.0
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,5,3,3,4,4,3,3,3,0,0.0


In [26]:
X_train.dtypes

Unnamed: 0                             int64
id                                     int64
Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness                            int64
Departure 

### Preprocessing

TPOT library does not handle categorical data. We have to convert the values of some columns by defining a mapping with numeric values.

In [27]:
for cat in ["Gender", "Customer Type", "Type of Travel", "Class"]:
    print("Levels for catgeory '{0}': {1}".format(cat, X_train[cat].unique()))

Levels for catgeory 'Gender': ['Male' 'Female']
Levels for catgeory 'Customer Type': ['Loyal Customer' 'disloyal Customer']
Levels for catgeory 'Type of Travel': ['Personal Travel' 'Business travel']
Levels for catgeory 'Class': ['Eco Plus' 'Business' 'Eco']


In [28]:
X_train["Gender"] = X_train["Gender"].map({"Male": 0, "Female": 1})
X_train["Customer Type"] = X_train["Customer Type"].map({"Loyal Customer": 0, "disloyal Customer": 1})
X_train["Type of Travel"] = X_train["Type of Travel"].map({"Personal Travel": 0, "Business travel": 1})
X_train["Class"] = X_train["Class"].map({"Eco Plus": 0, "Business": 1, "Eco": 2})
y_train = y_train.map({"neutral or dissatisfied": 0, "satisfied": 1})

X_test["Gender"] = X_test["Gender"].map({"Male": 0, "Female": 1})
X_test["Customer Type"] = X_test["Customer Type"].map({"Loyal Customer": 0, "disloyal Customer": 1})
X_test["Type of Travel"] = X_test["Type of Travel"].map({"Personal Travel": 0, "Business travel": 1})
X_test["Class"] = X_test["Class"].map({"Eco Plus": 0, "Business": 1, "Eco": 2})
y_test = y_test.map({"neutral or dissatisfied": 0, "satisfied": 1})

### Training

In [29]:
pipeline_optimizer = TPOTClassifier(verbosity=2, max_time_mins = 60)
pipeline_optimizer.fit(X_train.to_numpy().astype("float"), y_train.to_numpy().astype("float"))

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


61.39 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.9000000000000001, min_samples_leaf=20, min_samples_split=18, n_estimators=100)


### Evaluation

In [30]:
print(pipeline_optimizer.score(X_test.to_numpy().astype("float"), y_test.to_numpy().astype("float")))



0.9608002162746688
