## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv("Invistico_Airline.csv")

In [3]:
dataset.head()

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0


In [4]:
X = dataset.iloc[:,1:-1].values
y = dataset.iloc[:,0].values

In [5]:
X

array([['Female', 'Loyal Customer', 65, ..., 3, 2, 0],
       ['Male', 'Loyal Customer', 47, ..., 3, 2, 310],
       ['Female', 'Loyal Customer', 15, ..., 4, 2, 0],
       ...,
       ['Male', 'disloyal Customer', 69, ..., 3, 2, 155],
       ['Male', 'disloyal Customer', 66, ..., 1, 2, 193],
       ['Female', 'disloyal Customer', 38, ..., 3, 3, 185]], dtype=object)

In [6]:
y

array(['satisfied', 'satisfied', 'satisfied', ..., 'dissatisfied',
       'dissatisfied', 'dissatisfied'], dtype=object)

## Data Preprocessing

### Label Encoding of Categorical variables

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

[1 1 1 ... 0 0 0]


In [8]:
X[:,0] = le.fit_transform(X[:,0])
print(X)

[[0 'Loyal Customer' 65 ... 3 2 0]
 [1 'Loyal Customer' 47 ... 3 2 310]
 [0 'Loyal Customer' 15 ... 4 2 0]
 ...
 [1 'disloyal Customer' 69 ... 3 2 155]
 [1 'disloyal Customer' 66 ... 1 2 193]
 [0 'disloyal Customer' 38 ... 3 3 185]]


In [9]:
X[:,1] = le.fit_transform(X[:,1])
print(X)

[[0 0 65 ... 3 2 0]
 [1 0 47 ... 3 2 310]
 [0 0 15 ... 4 2 0]
 ...
 [1 1 69 ... 3 2 155]
 [1 1 66 ... 1 2 193]
 [0 1 38 ... 3 3 185]]


In [10]:
X[:,3] = le.fit_transform(X[:,3])
print(X)

[[0 0 65 ... 3 2 0]
 [1 0 47 ... 3 2 310]
 [0 0 15 ... 4 2 0]
 ...
 [1 1 69 ... 3 2 155]
 [1 1 66 ... 1 2 193]
 [0 1 38 ... 3 3 185]]


### One Hot Encoding of categorical variable

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[4])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)

[[0.0 1.0 0.0 ... 3 2 0]
 [1.0 0.0 0.0 ... 3 2 310]
 [0.0 1.0 0.0 ... 4 2 0]
 ...
 [0.0 1.0 0.0 ... 3 2 155]
 [0.0 1.0 0.0 ... 1 2 193]
 [0.0 1.0 0.0 ... 3 3 185]]


## Splitting the dataset into test set and training set

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=35320)

## Training the XG Boost Classifier on the Training set

In [13]:
from xgboost import XGBClassifier
XGB = XGBClassifier(learning_rate=0.15, max_depth=12, min_child_weight=8, 
                    eta=3, n_estimators=350, 
                    penalty="l1", random_state=856) # 93.2
XGB.fit(X_train,y_train)



Parameters: { penalty } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=3, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=12,
              min_child_weight=8, missing=nan, monotone_constraints='()',
              n_estimators=350, n_jobs=4, num_parallel_tree=1,
              objective='binary:logistic', penalty='l1', random_state=856,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

## Predicting the test set results

In [14]:
y_pred = XGB.predict(X_test)

## Checking the Accuracy and f1_score for the predicted results

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))
from sklearn.metrics import f1_score
print(f1_score(y_test, y_pred, average='weighted'))

[[16861   688]
 [  988 20427]]
0.9569859357355508
0.9570165017606604
