# Import all the necessary libraries
#### Run this cell every time an import is added

In [None]:
# Prerequisite imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,ConfusionMatrixDisplay,classification_report
from sklearn.preprocessing import LabelEncoder


In [None]:
# Render every column when a DataFrame is displayed (None disables truncation)
pd.options.display.max_columns = None

# Read the Invistico airline CSV into a DataFrame and preview the first ten rows
df = pd.read_csv('Data/Invistico_Airline.csv')
df.head(10)

In [None]:
# Show the shape of the dataset as (number of rows, number of columns)
df.shape

In [None]:
# Print a summary of the dataset: column names, non-null counts and dtypes
df.info()

In [None]:
# Show descriptive statistics of the dataset (pandas' default describe() output)
df.describe()

In [None]:
# Count the missing (NaN) entries in each column
df.isna().sum()

> Here we can see that there are 393 missing values in the 'Arrival Delay in Minutes' column

In [None]:
# Plot a missingno matrix to visualize the missing values
# Note: Running this function raises a ValueError because of an update in matplotlib. It will still show the graph, but without the column labels.
#msno.matrix(df)

In [None]:
# Drop every row that contains at least one NaN value
# (DataFrame.dropna() works on rows by default, not on columns)
df = df.dropna()

In [None]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

> Here we can see that there are no duplicate values in the dataset

# Data understanding

### Check the unique values in the categorical columns

In [None]:
# Columns inspected here as categorical features (the target 'satisfaction' included)
columns_to_check = ['satisfaction', 'Customer Type', 'Type of Travel', 'Class']
# Print the distinct values of each of these columns
for column in columns_to_check:
    print(f"Unique values in '{column}' : {df[column].unique()}")

# Exploratory data analysis
### Plot all the categorical columns with 'satisfaction' as its hue so that we can get a sense of their relation

In [None]:
# Categorical columns to plot; each one gets its own subplot
categorical_columns = ['satisfaction', 'Customer Type', 'Type of Travel', 'Class']

# One row of subplots, one axis per categorical column
fig, axes = plt.subplots(1, len(categorical_columns), figsize=(20, 5))

# A different color palette for each plot, matched to the columns by position
color_palettes = ['Blues', 'Greens', 'Reds', 'Purples']

# Draw a count plot per column, split by 'satisfaction' via the hue
for i, col in enumerate(categorical_columns):
    # The original call ended with a stray trailing comma, which wrapped the
    # returned Axes in a throwaway tuple; it is removed here.
    sns.countplot(
        data=df,
        x=col,
        hue='satisfaction',
        palette=color_palettes[i],
        ax=axes[i],
    )
    axes[i].set_title(f'Distribution of {col}')
    axes[i].set_xlabel(col)
    axes[i].set_ylabel('Count')

plt.tight_layout()
plt.show()

> What we see per plot
+  Customer satisfaction is roughly split 60 / 40 in favour of satisfaction.
+  As expected, disloyal customers have a high dissatisfaction rate. Surprisingly, loyal customers also have a high dissatisfaction rate. More than half of the loyal customers.
+  When looking at the type of travel, the majority of travel is done as 'Business travel', where around 70% is satisfied and 30% is dissatisfied.
+  Here we can see that the 'eco' class has the highest dissatisfaction rate of all three travel classes. 'Business' class gets a high satisfaction rate. 'Eco plus' edges just towards a majority of dissatisfied customers.

In [None]:
# Encode the categorical columns as integers.
# A separate LabelEncoder is fitted per column and stored in `label_encoders`,
# so each column's label <-> integer mapping stays available afterwards
# (e.g. label_encoders['satisfaction'].classes_ or .inverse_transform(...)).
# Refitting one shared encoder, as before, kept only the last column's mapping.
label_encoders = {}
for column in ['satisfaction', 'Customer Type', 'Type of Travel', 'Class']:
    # `le` is rebound on every pass; after the loop it is the encoder fitted
    # on 'Class', which matches the state the previous code left behind.
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [None]:
# Correlation heatmap: pairwise correlations of the DataFrame columns,
# annotated with two decimals on a diverging blue-to-red color map
plt.figure(figsize=(20, 10))
correlation_matrix = df.corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
)
plt.show()

In [None]:
def train_test_split_data(df, target, random_state, test_size):
    """Split a DataFrame into stratified train and test features and labels.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        target: Name of the target column; it is dropped from the features.
        random_state: Seed passed to ``train_test_split``.
        test_size: Size of the test split, passed to ``train_test_split``
            (this argument was previously ignored in favor of a fixed 0.20).

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. Note that this order
        pairs features and labels per split and therefore differs from
        scikit-learn's ``(X_train, X_test, y_train, y_test)``; it is kept
        because existing callers unpack the result in this order.
    """
    features = df.drop([target], axis=1)
    labels = df[target]

    # train_test_split returns train/test pairs per input array:
    # features first (train, test), then labels (train, test).
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        random_state=random_state,
        test_size=test_size,
        stratify=labels,  # keep the class proportions equal in both splits
    )
    return X_train, y_train, X_test, y_test

# Stratified split on the 'satisfaction' target with a 20% test share.
# NOTE: train_test_split_data hands back (X_train, y_train, X_test, y_test) in
# that order, which is NOT scikit-learn's usual (X_train, X_test, y_train, y_test).
X_train, y_train, X_test, y_test=  train_test_split_data(df, target='satisfaction', random_state=65, test_size=0.20)

# Display the shapes of the four splits as a sanity check
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Oversample the minority class with SMOTE, using the training split only
# (X_test / y_test are not touched here)
sm= SMOTE(sampling_strategy='minority', random_state=42)
X_train_smote, y_train_smote= sm.fit_resample(X_train, y_train)
# Display the shapes of the resampled training data
X_train_smote.shape, y_train_smote.shape

In [None]:
# Logistic regression model, evaluated on the held-out test split.
# NOTE(review): this model is fitted on X_train / y_train, while the random
# forest and gradient boosting cells fit on the SMOTE-resampled data --
# confirm that the difference is intentional.
lr = LogisticRegression(max_iter=10000, random_state=56)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("----------------------")

# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped,
# as they were before, precision_score returns the recall and recall_score
# returns the precision, so the two printed values were exchanged.
accuracy_scores = [accuracy_score(y_test, y_pred)]
f1_scores = [f1_score(y_test, y_pred)]
precision_scores = [precision_score(y_test, y_pred)]
recall_scores = [recall_score(y_test, y_pred)]

# Print the Results
print(f"Accuracy:{accuracy_scores}")
print(f"F1-Score:{f1_scores}")
print(f"Precision:{precision_scores}") 
print(f"Recall:{recall_scores}")

print("-------------------------------")
print("Classifiaction Reoprt")
print("-------------------------------")
print(classification_report(y_test,y_pred,digits=3))
print("Confusion_Matrix")
ConfusionMatrixDisplay.from_predictions(y_test,y_pred,cmap="Reds")
plt.show()

In [None]:
# Random forest tuned with a randomized hyper-parameter search, fitted on the
# SMOTE-resampled training data and evaluated on the held-out test split.
rf=RandomForestClassifier(random_state=42)

# NOTE(review): this grid holds 2*2*2*2*1 = 16 combinations, fewer than
# n_iter=20 below; scikit-learn is expected to evaluate all 16 and emit a
# warning -- confirm whether n_iter should be lowered or the grid widened.
hyper_params = {"max_features": [3,10],
                "min_samples_split":[2, 10],
                "min_samples_leaf":[1, 10],
                "n_estimators":[100, 300],
                "criterion":["gini"]}

model = RandomizedSearchCV(
    rf, hyper_params, cv=3, scoring="f1_macro", n_jobs=-1, n_iter=20, random_state=42
)
model.fit(X_train_smote,y_train_smote)
y_pred= model.predict(X_test)
print(model.best_estimator_)

print("----------------------")

# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped,
# as they were before, precision_score returns the recall and recall_score
# returns the precision, so the two printed values were exchanged.
accuracy_scores = [accuracy_score(y_test, y_pred)]
f1_scores = [f1_score(y_test, y_pred)]
precision_scores = [precision_score(y_test, y_pred)]
recall_scores = [recall_score(y_test, y_pred)]

# Print the Results
print(f"Accuracy:{accuracy_scores}")
print(f"F1-Score:{f1_scores}")
print(f"Precision:{precision_scores}") 
print(f"Recall:{recall_scores}")

print("-------------------------------")
print("Classifiaction Reoprt")
print("-------------------------------")
print(classification_report(y_test,y_pred,digits=3))
print("Confusion_Matrix")
ConfusionMatrixDisplay.from_predictions(y_test,y_pred,cmap="Blues")
plt.show()

In [None]:
# Gradient boosting tuned with a randomized hyper-parameter search, fitted on
# the SMOTE-resampled training data and evaluated on the held-out test split.
gb=GradientBoostingClassifier(random_state=42)

hyper_params = {"max_features": [3,10],
                "min_samples_split":[2, 10],
                "min_samples_leaf":[1, 10],
                "n_estimators":[100, 300],
                "learning_rate":[0.05, 0.1, 0.2]}

# 20 sampled candidates, 3-fold cross-validation, ranked by macro-averaged F1
model = RandomizedSearchCV(
    gb, hyper_params, cv=3, scoring="f1_macro", n_jobs=-1, n_iter=20, random_state=42
)
model.fit(X_train_smote,y_train_smote)
y_pred= model.predict(X_test)
print(model.best_estimator_)

print("----------------------")

# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped,
# as they were before, precision_score returns the recall and recall_score
# returns the precision, so the two printed values were exchanged.
accuracy_scores = [accuracy_score(y_test, y_pred)]
f1_scores = [f1_score(y_test, y_pred)]
precision_scores = [precision_score(y_test, y_pred)]
recall_scores = [recall_score(y_test, y_pred)]

# Print the Results
print(f"Accuracy:{accuracy_scores}")
print(f"F1-Score:{f1_scores}")
print(f"Precision:{precision_scores}") 
print(f"Recall:{recall_scores}")

print("-------------------------------")
print("Classifiaction Reoprt")
print("-------------------------------")
print(classification_report(y_test,y_pred,digits=3))
print("Confusion_Matrix")
ConfusionMatrixDisplay.from_predictions(y_test,y_pred,cmap="Greens")
plt.show()