# Importing Libraries

In [87]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#pyclustering
from pyclustering.cluster.kmedians import kmedians
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.center_initializer import random_center_initializer

#scipy
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import ward, fcluster

#sklearn
import sklearn
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm, tree, neighbors
from sklearn import naive_bayes, ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt



#warnings
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any warnings raised while fitting the models below.
warnings.filterwarnings('ignore')  

# Data

In [None]:
# Read the passenger survey from disk and preview the first five rows
airline = pd.read_csv(filepath_or_buffer = 'Airline_Passenger.csv')
airline.head(n = 5)

In [None]:
#check columns
# Shows the column labels of the loaded frame as the cell output.
airline.columns

In [None]:
# Number of rows for each distinct value of the target column 'satisfaction'
airline['satisfaction'].value_counts()

In [None]:
#shape
# (number of rows, number of columns) of the loaded frame
airline.shape

# Cleaning data

In [None]:
# Discard the two columns that are not used anywhere in the analysis
unused_columns = ['Unnamed: 0', 'id']
airline = airline.drop(unused_columns, axis = 1)

- data is very clean
- missing values are checked in the next section; the missing entries in `Arrival Delay in Minutes` are treated as 0 (no delay) and filled accordingly

### Dealing with missing values

In [None]:
# Number of missing entries in each column
airline.isnull().sum()

In [None]:
# Missing values in 'Arrival Delay in Minutes' are treated as "no delay",
# so they are filled with 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace = True) on the selected column: that form is chained
# assignment, which pandas deprecates and which leaves `airline` unchanged
# when Copy-on-Write is enabled.
airline['Arrival Delay in Minutes'] = airline['Arrival Delay in Minutes'].fillna(0)

#check again
airline.isna().sum()

### Converting categorical data

In [None]:
# Cast Gender, Customer Type, Type of Travel and the target (satisfaction)
# to the pandas 'category' dtype. 'Class' is not converted here.
for column in ['Gender', 'Customer Type', 'Type of Travel', 'satisfaction']:
    airline[column] = airline[column].astype('category')
airline.dtypes

In [None]:
# Print the distinct values of each categorical-like column, one line per column
for column in ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']:
    print(airline[column].unique())

In [None]:
# Replace every category-typed column by its integer category codes
categorical_columns = airline.select_dtypes(include = ['category']).columns
airline[categorical_columns] = airline[categorical_columns].apply(lambda column: column.cat.codes)

In [None]:
# Encode Class as integers 0-2: Eco -> 0, Eco Plus -> 1, Business -> 2
category_mapping = dict(zip(['Eco', 'Eco Plus', 'Business'], range(3)))
airline['Class'] = airline['Class'].map(category_mapping)

In [None]:
# Print the distinct values again to confirm the columns are now encoded
for column in ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']:
    print(airline[column].unique())

In [None]:
# Preview the first rows of the frame after the encoding steps above
airline.head()

# EDA

## Distribution of age

In [None]:
# Histogram of passenger age (20 bins) with a KDE overlay
plt.figure(figsize = (8, 6))
ax = sns.histplot(airline['Age'], bins = 20, kde = True, color = 'purple')
ax.set(title = 'Distribution of Age', xlabel = 'Age', ylabel = 'Frequency')
plt.show()

## Count of Gender

In [None]:
# Bar chart of the number of rows per encoded Gender value,
# with the two ticks relabelled as Female / Male
plt.figure(figsize = (8, 5))
ax = sns.countplot(data = airline, x = 'Gender', palette = 'icefire')
ax.set(title = 'Count of Gender', xlabel = 'Gender', ylabel = 'Count')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Female', 'Male'])
plt.show()

## Age by Class & Gender

In [None]:
# Box plot of Age for each encoded Class, split by encoded Gender
plt.figure(figsize = (10, 6))
ax = sns.boxplot(data = airline, x = 'Class', y = 'Age', hue = 'Gender', palette = 'PuBuGn')
ax.set(title = 'Age by Class and Gender: Female:0, Male:1', xlabel = 'Class', ylabel = 'Age')
plt.show()

## Count of Customer Type

In [None]:
# Bar chart of the number of rows per encoded Customer Type value,
# with the two ticks relabelled as Loyal / Disloyal
plt.figure(figsize = (8, 5))
ax = sns.countplot(data = airline, x = 'Customer Type', palette = 'cividis')
ax.set(title = 'Count of Customer Type', xlabel = 'Customer Type', ylabel = 'Count')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Loyal', 'Disloyal'])
plt.show()

# Correlation

In [None]:
# Pairwise correlation between all columns of the encoded frame
correlation_matrix = airline.corr()

# Draw the correlation matrix as a heatmap
plt.figure(figsize = (25, 25))
ax = sns.heatmap(correlation_matrix, cmap = 'magma')
ax.set_title('Correlation Matrix')
plt.show()

# Overall model comparison for the entire data

### Test-Train Split (40:60)

In [None]:
# 'satisfaction' is the target; every other column is a feature
y = airline['satisfaction']
X = airline.drop(columns = ['satisfaction'])
# hold out 40% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

### Logistic Classifier

In [None]:
#logistic regression
# Fit exactly once on the training split. The previous version chained
# .fit(...) onto the constructor and then called .fit(...) again with the same
# data, which repeated the training without changing the resulting model.
log_model = LogisticRegression(random_state = 0)
log_model.fit(X_train, y_train)

#calculate accuracy on the held-out test split
y_pred_log = log_model.predict(X_test)
accuracy_log = accuracy_score(y_test, y_pred_log)
print("Accuracy of Logistic Regression:", accuracy_log)

### Decision Tree Classifier

In [None]:
# Sweep max_depth over 1..19 and record the mean 10-fold cross-validated
# accuracy on the training split for each depth
depth_list = np.arange(1, 20)
scores_list = [
    cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                    X_train, y_train, cv = 10, scoring = 'accuracy').mean()
    for depth in depth_list
]

# Accuracy as a function of max_depth
plt.plot(depth_list, scores_list, color = 'purple', markerfacecolor = 'black', label = 'Score')
plt.title('Accuracy Score vs max_depth')
plt.show()

In [None]:
# Depth with the highest mean cross-validated accuracy.
# scores_list[k] belongs to depth_list[k] and depth_list starts at 1, so the
# position of the best score is one less than the depth. The depth itself is
# displayed, so the list index is not mistaken for it.
max_value = max(scores_list)
max_index = scores_list.index(max_value)
best_depth = int(depth_list[max_index])
best_depth

In [None]:
# Use the depth selected by the cross-validation sweep instead of a
# hard-coded value, so the tree follows the data if the sweep result changes.
# depth_list[k] is the depth whose mean score is scores_list[k].
best_depth = int(depth_list[scores_list.index(max(scores_list))])
dt = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0)
dt_model = dt.fit(X_train, y_train)

#calculate accuracy on the held-out test split
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy of Decision Tree:", accuracy_dt)

### Bagging Classifier

In [None]:
# Standardise the features with statistics learned from the training split
# only, then apply the same transformation to the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Bagging ensemble of 10 estimators trained on the scaled features
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
y_pred_bag = clf_bag.predict(X_test_scaled)

# Accuracy on the held-out test split
accuracy_bag = accuracy_score(y_test, y_pred_bag)
print("Accuracy of Bagging Classifier:", accuracy_bag)

### Random Forest Classifier

In [None]:
# Random forest with 100 trees, trained on the training split
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)

# Accuracy on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
accuracy_rand_forest = accuracy_score(y_test, y_pred_rand_forest)
print("Accuracy of Random Forest:", accuracy_rand_forest)

### GB Classifier

In [None]:
# Gradient boosting: 100 stages of depth-3 trees with learning rate 0.1
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1,
                                 max_depth = 3, random_state = 42).fit(X_train, y_train)

# Accuracy on the held-out test split
y_pred_gbc = gbc.predict(X_test)
accuracy_gbc = accuracy_score(y_test, y_pred_gbc)
print("Accuracy of GB Classifier:", accuracy_gbc)

### KNN Classifier

In [None]:
#k-means
# Elbow curve: within-cluster sum of squares (inertia) for 2..49 clusters,
# fitted on the training features.
# A fixed random_state makes the centroid initialisation, and therefore the
# curve, reproducible between runs.
# NOTE(review): this curve concerns the number of K-Means clusters; the
# n_neighbors of the KNN classifier fitted afterwards is a different
# hyper-parameter and is not determined by this plot.
cluster_range = range(2,50)
cluster_wss = []

for num_cluster in cluster_range:
    clusters = KMeans(n_clusters = num_cluster, random_state = 42)
    clusters.fit(X_train)
    cluster_wss.append(clusters.inertia_)

plt.xlabel('# Clusters')
plt.ylabel('WSS')
plt.plot(cluster_range, cluster_wss, marker = 'o', color = 'purple')
plt.show()

In [None]:
# Fit on the training split only. The previous version fitted on the full
# X, y, which contains the test rows, so the test accuracy below was measured
# on data the model had already seen.
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train, y_train)

#calculate accuracy on the held-out test split
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy of KNN:", accuracy_knn)

### Comparing all the models on overall data

In [None]:
# Re-run the depth sweep (mean 10-fold CV accuracy for max_depth 1..19) and
# refit the decision tree at the best-scoring depth
depth_list = np.arange(1, 20, 1)
scores_list = []
for depth in depth_list:
    candidate = DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv = 10, scoring = 'accuracy')
    scores_list.append(fold_scores.mean())
# position of the first maximum, shifted by one because depths start at 1
best_depth = scores_list.index(max(scores_list)) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state =0)
dt_model = dt_model.fit(X_train, y_train)

#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#table to populate with performance results (all scores in percent)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation:
# The splitter is identical for every model, so it is built once.
# cross_validate computes accuracy and ROC AUC from the same fitted folds;
# two separate cross_val_score calls refitted every model once per metric.
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    cv_scores = model_selection.cross_validate(model, X_train, y_train, cv = kfold,
                                               scoring = ['accuracy', 'roc_auc'])
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                         round(cv_auc_results.mean()*100, 2),
                         round(cv_auc_results.std()*100, 2),
                         round(cv_acc_results.mean()*100, 2),
                         round(cv_acc_results.std()*100, 2)
                         ]

#results, best mean AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for overall data = Random Forest Classifier

# Subsetting data by Class

## Business Class

In [None]:
# Rows whose encoded Class is 2, the code assigned to 'Business'
is_business = airline['Class'] == 2
business = airline[is_business]
business.head()

### Test-Train Split (40:60)

In [None]:
# 'satisfaction' is the target; every other column is a feature
y = business['satisfaction']
X = business.drop(columns = ['satisfaction'])
# hold out 40% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

### Comparing all the models

In [None]:
# Fit all six classifiers on the Business-class training split.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: choose max_depth (1..19) by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
# depth_list[k] is the depth whose mean score is scores_list[k]
best_depth = int(depth_list[scores_list.index(max(scores_list))])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging, fitted on features standardised with training-split statistics
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
# Fit on the training split like every other model; the previous version
# fitted on the full X, y, which also contains the held-out test rows.
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#table to populate with performance results (all scores in percent)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation:
# The splitter is identical for every model, so it is built once.
# cross_validate computes accuracy and ROC AUC from the same fitted folds;
# two separate cross_val_score calls refitted every model once per metric.
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    cv_scores = model_selection.cross_validate(model, X_train, y_train, cv = kfold,
                                               scoring = ['accuracy', 'roc_auc'])
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                         round(cv_auc_results.mean()*100, 2),
                         round(cv_auc_results.std()*100, 2),
                         round(cv_acc_results.mean()*100, 2),
                         round(cv_acc_results.std()*100, 2)
                         ]

#results, best mean AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Business class data = Random Forest Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
class_labels = rand_forest_model.classes_
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
disp.plot(cmap = 'flare')
plt.show()

## Economy Class

In [None]:
# Rows whose encoded Class is 0, the code assigned to 'Eco'
is_economy = airline['Class'] == 0
economy = airline[is_economy]
economy.head()

### Test-Train Split (40:60)

In [None]:
# 'satisfaction' is the target; every other column is a feature
y = economy['satisfaction']
X = economy.drop(columns = ['satisfaction'])
# hold out 40% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

### Comparing all the models

In [None]:
# Fit all six classifiers on the Economy-class training split.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: choose max_depth (1..19) by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
# depth_list[k] is the depth whose mean score is scores_list[k]
best_depth = int(depth_list[scores_list.index(max(scores_list))])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging, fitted on features standardised with training-split statistics
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
# Fit on the training split like every other model; the previous version
# fitted on the full X, y, which also contains the held-out test rows.
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#table to populate with performance results (all scores in percent)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation:
# The splitter is identical for every model, so it is built once.
# cross_validate computes accuracy and ROC AUC from the same fitted folds;
# two separate cross_val_score calls refitted every model once per metric.
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    cv_scores = model_selection.cross_validate(model, X_train, y_train, cv = kfold,
                                               scoring = ['accuracy', 'roc_auc'])
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                         round(cv_auc_results.mean()*100, 2),
                         round(cv_auc_results.std()*100, 2),
                         round(cv_acc_results.mean()*100, 2),
                         round(cv_acc_results.std()*100, 2)
                         ]

#results, best mean AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Economy Class data = Random Forest Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
class_labels = rand_forest_model.classes_
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
disp.plot(cmap = 'flare')
plt.show()

## Economy Plus Class

In [None]:
# Rows whose encoded Class is 1, the code assigned to 'Eco Plus'
is_economy_plus = airline['Class'] == 1
economy_plus = airline[is_economy_plus]
economy_plus.head()

### Test-Train Split (40:60)

In [None]:
# 'satisfaction' is the target; every other column is a feature
y = economy_plus['satisfaction']
X = economy_plus.drop(columns = ['satisfaction'])
# hold out 40% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

### Comparing all the models

In [None]:
# Fit all six classifiers on the Economy Plus training split.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: choose max_depth (1..19) by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
# depth_list[k] is the depth whose mean score is scores_list[k]
best_depth = int(depth_list[scores_list.index(max(scores_list))])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging, fitted on features standardised with training-split statistics
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
# Fit on the training split like every other model; the previous version
# fitted on the full X, y, which also contains the held-out test rows.
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#table to populate with performance results (all scores in percent)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation:
# The splitter is identical for every model, so it is built once.
# cross_validate computes accuracy and ROC AUC from the same fitted folds;
# two separate cross_val_score calls refitted every model once per metric.
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    cv_scores = model_selection.cross_validate(model, X_train, y_train, cv = kfold,
                                               scoring = ['accuracy', 'roc_auc'])
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                         round(cv_auc_results.mean()*100, 2),
                         round(cv_auc_results.std()*100, 2),
                         round(cv_acc_results.mean()*100, 2),
                         round(cv_acc_results.std()*100, 2)
                         ]

#results, best mean AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Economy Plus Class data = Gradient Boosting Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the gradient boosting model on the held-out test split
y_pred_gbc = gbc.predict(X_test)
class_labels = gbc.classes_
cm = confusion_matrix(y_test, y_pred_gbc, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
disp.plot(cmap = 'flare')
plt.show()

# Subsetting data by Gender

## Female data

In [None]:
# Rows whose encoded Gender is 0 (labelled Female in the plots above)
is_female = airline['Gender'] == 0
female = airline[is_female]
female.head()

### Test-Train Split (40:60)

In [None]:
# 'satisfaction' is the target; every other column is a feature
y = female['satisfaction']
X = female.drop(columns = ['satisfaction'])
# hold out 40% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

### Comparing all the models

In [None]:
# Fit all six classifiers on the Female training split.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: choose max_depth (1..19) by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
# depth_list[k] is the depth whose mean score is scores_list[k]
best_depth = int(depth_list[scores_list.index(max(scores_list))])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging, fitted on features standardised with training-split statistics
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
# Fit on the training split like every other model; the previous version
# fitted on the full X, y, which also contains the held-out test rows.
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#table to populate with performance results (all scores in percent)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation:
# The splitter is identical for every model, so it is built once.
# cross_validate computes accuracy and ROC AUC from the same fitted folds;
# two separate cross_val_score calls refitted every model once per metric.
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    cv_scores = model_selection.cross_validate(model, X_train, y_train, cv = kfold,
                                               scoring = ['accuracy', 'roc_auc'])
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                         round(cv_auc_results.mean()*100, 2),
                         round(cv_auc_results.std()*100, 2),
                         round(cv_acc_results.mean()*100, 2),
                         round(cv_acc_results.std()*100, 2)
                         ]

#results, best mean AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Female data = Random Forest Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
class_labels = rand_forest_model.classes_
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
disp.plot(cmap = 'flare')
plt.show()

## Male data

In [None]:
# Rows whose encoded Gender is 1 (labelled Male in the plots above)
is_male = airline['Gender'] == 1
male = airline[is_male]
male.head()

### Test-Train Split (40:60)

In [None]:
# 'satisfaction' is the target; every other column is a feature
y = male['satisfaction']
X = male.drop(columns = ['satisfaction'])
# hold out 40% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

### Comparing all the models

In [None]:
# Fit all six classifiers on the Male training split.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: choose max_depth (1..19) by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
# depth_list[k] is the depth whose mean score is scores_list[k]
best_depth = int(depth_list[scores_list.index(max(scores_list))])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging, fitted on features standardised with training-split statistics
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
# Fit on the training split like every other model; the previous version
# fitted on the full X, y, which also contains the held-out test rows.
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#table to populate with performance results (all scores in percent)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation:
# The splitter is identical for every model, so it is built once.
# cross_validate computes accuracy and ROC AUC from the same fitted folds;
# two separate cross_val_score calls refitted every model once per metric.
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    cv_scores = model_selection.cross_validate(model, X_train, y_train, cv = kfold,
                                               scoring = ['accuracy', 'roc_auc'])
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                         round(cv_auc_results.mean()*100, 2),
                         round(cv_auc_results.std()*100, 2),
                         round(cv_acc_results.mean()*100, 2),
                         round(cv_acc_results.std()*100, 2)
                         ]

#results, best mean AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Male data = Random Forest Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
class_labels = rand_forest_model.classes_
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
disp.plot(cmap = 'flare')
plt.show()

# Subsetting data by Age

## Age group 1: 7 - 18

In [None]:
# Passengers older than 6 and at most 18
older_than_6 = airline['Age'] > 6
at_most_18 = airline['Age'] <= 18
age_1 = airline[older_than_6 & at_most_18]
age_1.head()

### Test-Train Split (40:60)

In [None]:
# 'satisfaction' is the target; every other column is a feature
y = age_1['satisfaction']
X = age_1.drop(columns = ['satisfaction'])
# hold out 40% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

### Comparing all the models

In [None]:
# Fit all six classifiers on the age-group-1 training split.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: choose max_depth (1..19) by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
# depth_list[k] is the depth whose mean score is scores_list[k]
best_depth = int(depth_list[scores_list.index(max(scores_list))])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging, fitted on features standardised with training-split statistics
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
# Fit on the training split like every other model; the previous version
# fitted on the full X, y, which also contains the held-out test rows.
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#table to populate with performance results (all scores in percent)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation:
# The splitter is identical for every model, so it is built once.
# cross_validate computes accuracy and ROC AUC from the same fitted folds;
# two separate cross_val_score calls refitted every model once per metric.
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    cv_scores = model_selection.cross_validate(model, X_train, y_train, cv = kfold,
                                               scoring = ['accuracy', 'roc_auc'])
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                         round(cv_auc_results.mean()*100, 2),
                         round(cv_auc_results.std()*100, 2),
                         round(cv_acc_results.mean()*100, 2),
                         round(cv_acc_results.std()*100, 2)
                         ]

#results, best mean AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Age group 1 data = Gradient Boosting Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the gradient boosting model on the held-out test split
y_pred_gbc = gbc.predict(X_test)
class_labels = gbc.classes_
cm = confusion_matrix(y_test, y_pred_gbc, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
disp.plot(cmap = 'flare')
plt.show()

## Age group 2: 19 - 24

In [None]:
# Passengers older than 18 and at most 24
older_than_18 = airline['Age'] > 18
at_most_24 = airline['Age'] <= 24
age_2 = airline[older_than_18 & at_most_24]
age_2.head()

### Test-Train Split (40:60)

In [None]:
# 'satisfaction' is the target; every other column is a feature
y = age_2['satisfaction']
X = age_2.drop(columns = ['satisfaction'])
# hold out 40% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

### Comparing all the models

In [None]:
# Fit all six classifiers on the age-group-2 training split.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: choose max_depth (1..19) by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
# depth_list[k] is the depth whose mean score is scores_list[k]
best_depth = int(depth_list[scores_list.index(max(scores_list))])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging, fitted on features standardised with training-split statistics
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
# Fit on the training split like every other model; the previous version
# fitted on the full X, y, which also contains the held-out test rows.
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#table to populate with performance results (all scores in percent)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation:
# The splitter is identical for every model, so it is built once.
# cross_validate computes accuracy and ROC AUC from the same fitted folds;
# two separate cross_val_score calls refitted every model once per metric.
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    cv_scores = model_selection.cross_validate(model, X_train, y_train, cv = kfold,
                                               scoring = ['accuracy', 'roc_auc'])
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                         round(cv_auc_results.mean()*100, 2),
                         round(cv_auc_results.std()*100, 2),
                         round(cv_acc_results.mean()*100, 2),
                         round(cv_acc_results.std()*100, 2)
                         ]

#results, best mean AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Age group 2 data = Random Forest Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
class_labels = rand_forest_model.classes_
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
# 'flare' is a colormap registered by seaborn (imported at the top of the file)
disp.plot(cmap = 'flare')
plt.show()

## Age group 3: 25 - 34

In [None]:
# Passengers aged 25-34. The lower bound is inclusive: group 2 stops at Age <= 24,
# so a strict "> 25" would leave 25-year-olds out of every age group.
age_3 = airline[(airline['Age'] >= 25) & (airline['Age'] <= 34)]
age_3.head()

### Test-Train Split (40:60)

In [None]:
# Target is 'satisfaction'; every other column of the subset is a feature
y = age_3['satisfaction']
X = age_3.drop(columns = ['satisfaction'])
#hold out 40% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.4,
    random_state = 42,
)

### Comparing all the models

In [None]:
# Fit each candidate classifier on the training split, then rank them by 3-fold CV AUC.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: score depths 1..19 by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
#take the depth from depth_list itself rather than assuming index + 1 == depth;
#argmax returns the first maximum, so ties still resolve to the shallowest tree
best_depth = int(depth_list[np.argmax(scores_list)])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging (this fitted instance is trained on standardized features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn: fit on the training split only, like every other model
#(fitting on the full X, y would put the held-out test rows into the model)
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#set up the table to populate with perf results (values are percentages)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation; the splitter is the same for every model, so build it once
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    # cross_val_score refits a clone of the estimator on each fold of the (unscaled)
    # X_train, so the already-fitted instances above are not what is scored here
    # accuracy scoring:
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    # roc_auc scoring:
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean()*100, 2),
                            round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2),
                            round(cv_acc_results.std()*100, 2)
                            ]

#results, best AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Age group 3 data = Random Forest Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
class_labels = rand_forest_model.classes_
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
# 'flare' is a colormap registered by seaborn (imported at the top of the file)
disp.plot(cmap = 'flare')
plt.show()

## Age group 4: 35 - 44

In [None]:
# Passengers aged 35-44. The lower bound is inclusive: group 3 stops at Age <= 34,
# so a strict "> 35" would leave 35-year-olds out of every age group.
age_4 = airline[(airline['Age'] >= 35) & (airline['Age'] <= 44)]
age_4.head()

### Test-Train Split (40:60)

In [None]:
# Target is 'satisfaction'; every other column of the subset is a feature
y = age_4['satisfaction']
X = age_4.drop(columns = ['satisfaction'])
#hold out 40% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.4,
    random_state = 42,
)

### Comparing all the models

In [None]:
# Fit each candidate classifier on the training split, then rank them by 3-fold CV AUC.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: score depths 1..19 by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
#take the depth from depth_list itself rather than assuming index + 1 == depth;
#argmax returns the first maximum, so ties still resolve to the shallowest tree
best_depth = int(depth_list[np.argmax(scores_list)])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging (this fitted instance is trained on standardized features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn: fit on the training split only, like every other model
#(fitting on the full X, y would put the held-out test rows into the model)
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#set up the table to populate with perf results (values are percentages)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation; the splitter is the same for every model, so build it once
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    # cross_val_score refits a clone of the estimator on each fold of the (unscaled)
    # X_train, so the already-fitted instances above are not what is scored here
    # accuracy scoring:
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    # roc_auc scoring:
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean()*100, 2),
                            round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2),
                            round(cv_acc_results.std()*100, 2)
                            ]

#results, best AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Age group 4 data = Random Forest Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
class_labels = rand_forest_model.classes_
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
# 'flare' is a colormap registered by seaborn (imported at the top of the file)
disp.plot(cmap = 'flare')
plt.show()

## Age group 5: 45 - 54

In [None]:
# Passengers aged 45-54. The lower bound is inclusive: group 4 stops at Age <= 44,
# so a strict "> 45" would leave 45-year-olds out of every age group.
age_5 = airline[(airline['Age'] >= 45) & (airline['Age'] <= 54)]
age_5.head()

### Test-Train Split (40:60)

In [None]:
# Target is 'satisfaction'; every other column of the subset is a feature
y = age_5['satisfaction']
X = age_5.drop(columns = ['satisfaction'])
#hold out 40% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.4,
    random_state = 42,
)

### Comparing all the models

In [None]:
# Fit each candidate classifier on the training split, then rank them by 3-fold CV AUC.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: score depths 1..19 by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
#take the depth from depth_list itself rather than assuming index + 1 == depth;
#argmax returns the first maximum, so ties still resolve to the shallowest tree
best_depth = int(depth_list[np.argmax(scores_list)])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging (this fitted instance is trained on standardized features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn: fit on the training split only, like every other model
#(fitting on the full X, y would put the held-out test rows into the model)
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#set up the table to populate with perf results (values are percentages)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation; the splitter is the same for every model, so build it once
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    # cross_val_score refits a clone of the estimator on each fold of the (unscaled)
    # X_train, so the already-fitted instances above are not what is scored here
    # accuracy scoring:
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    # roc_auc scoring:
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean()*100, 2),
                            round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2),
                            round(cv_acc_results.std()*100, 2)
                            ]

#results, best AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Age group 5 data = Random Forest Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
class_labels = rand_forest_model.classes_
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
# 'flare' is a colormap registered by seaborn (imported at the top of the file)
disp.plot(cmap = 'flare')
plt.show()

## Age group 6: 55 - 64

In [None]:
# Passengers aged 55-64. The lower bound is inclusive: group 5 stops at Age <= 54,
# so a strict "> 55" would leave 55-year-olds out of every age group.
age_6 = airline[(airline['Age'] >= 55) & (airline['Age'] <= 64)]
age_6.head()

### Test-Train Split (40:60)

In [None]:
# Target is 'satisfaction'; every other column of the subset is a feature
y = age_6['satisfaction']
X = age_6.drop(columns = ['satisfaction'])
#hold out 40% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.4,
    random_state = 42,
)

### Comparing all the models

In [None]:
# Fit each candidate classifier on the training split, then rank them by 3-fold CV AUC.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: score depths 1..19 by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
#take the depth from depth_list itself rather than assuming index + 1 == depth;
#argmax returns the first maximum, so ties still resolve to the shallowest tree
best_depth = int(depth_list[np.argmax(scores_list)])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging (this fitted instance is trained on standardized features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn: fit on the training split only, like every other model
#(fitting on the full X, y would put the held-out test rows into the model)
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#set up the table to populate with perf results (values are percentages)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation; the splitter is the same for every model, so build it once
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    # cross_val_score refits a clone of the estimator on each fold of the (unscaled)
    # X_train, so the already-fitted instances above are not what is scored here
    # accuracy scoring:
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    # roc_auc scoring:
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean()*100, 2),
                            round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2),
                            round(cv_acc_results.std()*100, 2)
                            ]

#results, best AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Age group 6 data = Gradient Boosting Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the gradient boosting model on the held-out test split
y_pred_gbc = gbc.predict(X_test)
class_labels = gbc.classes_
cm = confusion_matrix(y_test, y_pred_gbc, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
# 'flare' is a colormap registered by seaborn (imported at the top of the file)
disp.plot(cmap = 'flare')
plt.show()

## Age group 7: 65 and above

In [None]:
# Passengers aged 65 and over (the bound is inclusive)
is_age_7 = airline['Age'] >= 65
age_7 = airline[is_age_7]
age_7.head()

### Test-Train Split (40:60)

In [None]:
# Target is 'satisfaction'; every other column of the subset is a feature
y = age_7['satisfaction']
X = age_7.drop(columns = ['satisfaction'])
#hold out 40% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.4,
    random_state = 42,
)

### Comparing all the models

In [None]:
# Fit each candidate classifier on the training split, then rank them by 3-fold CV AUC.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: score depths 1..19 by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
#take the depth from depth_list itself rather than assuming index + 1 == depth;
#argmax returns the first maximum, so ties still resolve to the shallowest tree
best_depth = int(depth_list[np.argmax(scores_list)])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging (this fitted instance is trained on standardized features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn: fit on the training split only, like every other model
#(fitting on the full X, y would put the held-out test rows into the model)
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#set up the table to populate with perf results (values are percentages)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation; the splitter is the same for every model, so build it once
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    # cross_val_score refits a clone of the estimator on each fold of the (unscaled)
    # X_train, so the already-fitted instances above are not what is scored here
    # accuracy scoring:
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    # roc_auc scoring:
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean()*100, 2),
                            round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2),
                            round(cv_acc_results.std()*100, 2)
                            ]

#results, best AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Age group 7 data = Gradient Boosting Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the gradient boosting model on the held-out test split
y_pred_gbc = gbc.predict(X_test)
class_labels = gbc.classes_
cm = confusion_matrix(y_test, y_pred_gbc, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
# 'flare' is a colormap registered by seaborn (imported at the top of the file)
disp.plot(cmap = 'flare')
plt.show()

# Subsetting data by Type of Travel

## Business Travel

In [None]:
# Rows whose 'Type of Travel' code is 0 (business travel, per the section heading)
is_business_travel = airline['Type of Travel'] == 0
btravel = airline[is_business_travel]
btravel.head()

### Test-Train Split (40:60)

In [None]:
# Target is 'satisfaction'; every other column of the subset is a feature
y = btravel['satisfaction']
X = btravel.drop(columns = ['satisfaction'])
#hold out 40% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.4,
    random_state = 42,
)

### Comparing all the models

In [None]:
# Fit each candidate classifier on the training split, then rank them by 3-fold CV AUC.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: score depths 1..19 by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
#take the depth from depth_list itself rather than assuming index + 1 == depth;
#argmax returns the first maximum, so ties still resolve to the shallowest tree
best_depth = int(depth_list[np.argmax(scores_list)])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging (this fitted instance is trained on standardized features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn: fit on the training split only, like every other model
#(fitting on the full X, y would put the held-out test rows into the model)
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#set up the table to populate with perf results (values are percentages)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation; the splitter is the same for every model, so build it once
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    # cross_val_score refits a clone of the estimator on each fold of the (unscaled)
    # X_train, so the already-fitted instances above are not what is scored here
    # accuracy scoring:
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    # roc_auc scoring:
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean()*100, 2),
                            round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2),
                            round(cv_acc_results.std()*100, 2)
                            ]

#results, best AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Business Travel data = Random Forest Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
class_labels = rand_forest_model.classes_
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
# 'flare' is a colormap registered by seaborn (imported at the top of the file)
disp.plot(cmap = 'flare')
plt.show()

## Personal Travel

In [None]:
# Rows whose 'Type of Travel' code is 1 (personal travel, per the section heading)
is_personal_travel = airline['Type of Travel'] == 1
ptravel = airline[is_personal_travel]
ptravel.head()

### Test-Train Split (40:60)

In [None]:
# Target is 'satisfaction'; every other column of the subset is a feature
y = ptravel['satisfaction']
X = ptravel.drop(columns = ['satisfaction'])
#hold out 40% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.4,
    random_state = 42,
)

### Comparing all the models

In [None]:
# Fit each candidate classifier on the training split, then rank them by 3-fold CV AUC.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: score depths 1..19 by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
#take the depth from depth_list itself rather than assuming index + 1 == depth;
#argmax returns the first maximum, so ties still resolve to the shallowest tree
best_depth = int(depth_list[np.argmax(scores_list)])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging (this fitted instance is trained on standardized features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn: fit on the training split only, like every other model
#(fitting on the full X, y would put the held-out test rows into the model)
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#set up the table to populate with perf results (values are percentages)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation; the splitter is the same for every model, so build it once
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    # cross_val_score refits a clone of the estimator on each fold of the (unscaled)
    # X_train, so the already-fitted instances above are not what is scored here
    # accuracy scoring:
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    # roc_auc scoring:
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean()*100, 2),
                            round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2),
                            round(cv_acc_results.std()*100, 2)
                            ]

#results, best AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Personal Travel data = Gradient Boosting Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the gradient boosting model on the held-out test split
y_pred_gbc = gbc.predict(X_test)
class_labels = gbc.classes_
cm = confusion_matrix(y_test, y_pred_gbc, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
# 'flare' is a colormap registered by seaborn (imported at the top of the file)
disp.plot(cmap = 'flare')
plt.show()

# Subsetting by Customer Type

## Loyal Customer

In [None]:
# Rows whose 'Customer Type' code is 0 (loyal customers, per the section heading)
is_loyal = airline['Customer Type'] == 0
loyal = airline[is_loyal]
loyal.head()

### Test-Train Split (40:60)

In [None]:
# Target is 'satisfaction'; every other column of the subset is a feature
y = loyal['satisfaction']
X = loyal.drop(columns = ['satisfaction'])
#hold out 40% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.4,
    random_state = 42,
)

### Comparing all the models

In [None]:
# Fit each candidate classifier on the training split, then rank them by 3-fold CV AUC.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: score depths 1..19 by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
#take the depth from depth_list itself rather than assuming index + 1 == depth;
#argmax returns the first maximum, so ties still resolve to the shallowest tree
best_depth = int(depth_list[np.argmax(scores_list)])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging (this fitted instance is trained on standardized features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn: fit on the training split only, like every other model
#(fitting on the full X, y would put the held-out test rows into the model)
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#set up the table to populate with perf results (values are percentages)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation; the splitter is the same for every model, so build it once
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    # cross_val_score refits a clone of the estimator on each fold of the (unscaled)
    # X_train, so the already-fitted instances above are not what is scored here
    # accuracy scoring:
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    # roc_auc scoring:
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean()*100, 2),
                            round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2),
                            round(cv_acc_results.std()*100, 2)
                            ]

#results, best AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Loyal Customer data = Random Forest Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the held-out test split
y_pred_rand_forest = rand_forest_model.predict(X_test)
class_labels = rand_forest_model.classes_
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
# 'flare' is a colormap registered by seaborn (imported at the top of the file)
disp.plot(cmap = 'flare')
plt.show()

## Disloyal Customer

In [None]:
# Rows whose 'Customer Type' code is 1 (disloyal customers, per the section heading)
is_disloyal = airline['Customer Type'] == 1
disloyal = airline[is_disloyal]
disloyal.head()

### Test-Train Split (40:60)

In [None]:
# Target is 'satisfaction'; every other column of the subset is a feature
y = disloyal['satisfaction']
X = disloyal.drop(columns = ['satisfaction'])
#hold out 40% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.4,
    random_state = 42,
)

### Comparing all the models

In [None]:
# Fit each candidate classifier on the training split, then rank them by 3-fold CV AUC.
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree: score depths 1..19 by mean 10-fold CV accuracy
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
                               X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
#take the depth from depth_list itself rather than assuming index + 1 == depth;
#argmax returns the first maximum, so ties still resolve to the shallowest tree
best_depth = int(depth_list[np.argmax(scores_list)])
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging (this fitted instance is trained on standardized features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn: fit on the training split only, like every other model
#(fitting on the full X, y would put the held-out test rows into the model)
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)


#for comparing all the models
models = [
    ('Logistic Regression', log_model),
    ('Decision Tree Classifier', dt_model),
    ('Bagging Classifier', clf_bag),
    ('Random Forest Classifier', rand_forest_model),
    ('Gradient Boosting Classifier', gbc),
    ('KNN', knn),
]

#evaluating model results
acc_results = []
auc_results = []
names = []

#set up the table to populate with perf results (values are percentages)
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)

#using k-fold cross-validation; the splitter is the same for every model, so build it once
kfold = model_selection.KFold(n_splits = 3)
for i, (name, model) in enumerate(models):
    # cross_val_score refits a clone of the estimator on each fold of the (unscaled)
    # X_train, so the already-fitted instances above are not what is scored here
    # accuracy scoring:
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    # roc_auc scoring:
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
    #append
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean()*100, 2),
                            round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2),
                            round(cv_acc_results.std()*100, 2)
                            ]

#results, best AUC first
model_results.sort_values(by = ['AUC Mean'], ascending = False)

### Best model for Disloyal Customer data = Gradient Boosting Classifier

### Confusion Matrix

In [None]:
# Confusion matrix of the gradient boosting model on the held-out test split
y_pred_gbc = gbc.predict(X_test)
class_labels = gbc.classes_
cm = confusion_matrix(y_test, y_pred_gbc, labels = class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
# 'flare' is a colormap registered by seaborn (imported at the top of the file)
disp.plot(cmap = 'flare')
plt.show()

# Business Decision Performance

- For each subset, we take the model that performs best and select its top 3 most important features (as ranked by that model).
- We increase the satisfaction score of each of those features by 1 and, using the test data, predict how many passengers who were initially dissatisfied would now be predicted as satisfied. Since most of the model accuracies are above 97%, we assume the predictions are true.
- We then calculate the cost, the profit and the cost-to-profit ratio for each.

## Assumed costs for increasing a variable by 1 unit

In [None]:
# Assumed cost of raising each variable by one unit, keyed by column name
costs = {
    'Flight Distance': 400,
    'Inflight wifi service': 80,
    'Ease of Online booking': 60,
    'Gate location': 140,
    'Food and drink': 100,
    'Online boarding': 70,
    'Seat comfort': 110,
    'Inflight entertainment': 50,
    'On-board service': 30,
    'Leg room service': 500,
    'Baggage handling': 20,
    'Checkin service': 20,
    'Inflight service': 40,
    'Cleanliness': 20,
}

## Assumed profit per changed customer

In [None]:
# Assumed profit for each customer whose prediction changes (same unspecified units as `costs`)
profit_per_change: int = 500

## For Class

### For Business Class: Random Forest Classifier

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = business['satisfaction']
X = business.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline random forest trained on the current training split
rf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = rf, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Drop the target together with the columns rejected by feature elimination
y = business['satisfaction']
X = business.drop(columns = ['satisfaction', *no_common])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Refit the random forest on the re-split training data
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)

# Ascending feature importances of the refit model as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'k')
plt.show()

In [None]:
# Initial predictions on the unmodified test set
initial_predictions = rf.predict(X_test)

In [None]:
# What-if analysis: raise each of the 3 most important features by one unit
# and count the dissatisfied test passengers the model then predicts as satisfied.
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #columns removed by recursive feature elimination
increased_values = 1 #increase applied to each feature
profits = [] #profit of each scenario
table_data = []

# Baseline on the unmodified test set: passengers the model already predicts
# as satisfied are not a result of the change, so only rows that are truly
# dissatisfied AND predicted dissatisfied can count as converted.
baseline_pred = rf.predict(X_test)
convertible = (np.asarray(y_test) == 0) & (baseline_pred == 0)

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # NOTE(review): max_value is not defined in this cell - confirm it is
        # set earlier in the notebook and matches this feature's scale.
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        cost = costs.get(feature, 0) * increased_values
        total_cost += cost

    y_pred_increase = rf.predict(X_test_increase) #new predictions

    changed = int((convertible & (y_pred_increase == 1)).sum()) #dissatisfied -> satisfied
    profit = profit_per_change * changed #profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_business_class = pd.DataFrame(table_data, columns=table_head)
# Profit per dollar spent; a zero cost (feature missing from `costs`) gives NaN rather than inf
results_business_class["Cost Benefit ($)"] = (
    results_business_class["Total Profit ($)"]
    / results_business_class["Total Cost ($)"].replace(0, np.nan)
)
results_business_class

In [None]:
# NOTE(review): this cell repeats the setup lines of the preceding what-if cell
# without running the loop; executing it resets `profits` and `table_data`
# to empty lists.
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]] 
excluded = no_common #excluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []

In [None]:
# Bar chart of the cost-benefit ratio for each adjusted feature
plt.figure(figsize = (7, 7))
sns.barplot(data = results_business_class, x = 'Feature', y = 'Cost Benefit ($)', color = 'purple')

### For Economy Class: Random Forest Classifier 

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = economy['satisfaction']
X = economy.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline random forest trained on the current training split
rf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = rf, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Drop the target together with the columns rejected by feature elimination
y = economy['satisfaction']
X = economy.drop(columns = ['satisfaction', *no_common])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Refit the random forest on the re-split training data
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)

# Ascending feature importances of the refit model as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'k')
plt.show()

In [None]:
# Initial predictions on the unmodified test set
initial_predictions = rf.predict(X_test)

In [None]:
# What-if analysis: raise each of the 3 most important features by one unit
# and count the dissatisfied test passengers the model then predicts as satisfied.
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #columns removed by recursive feature elimination
increased_values = 1 #increase applied to each feature
profits = [] #profit of each scenario
table_data = []

# Baseline on the unmodified test set: passengers the model already predicts
# as satisfied are not a result of the change, so only rows that are truly
# dissatisfied AND predicted dissatisfied can count as converted.
baseline_pred = rf.predict(X_test)
convertible = (np.asarray(y_test) == 0) & (baseline_pred == 0)

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # NOTE(review): max_value is not defined in this cell - confirm it is
        # set earlier in the notebook and matches this feature's scale.
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        cost = costs.get(feature, 0) * increased_values
        total_cost += cost

    y_pred_increase_new = rf.predict(X_test_increase) #new predictions

    changed = int((convertible & (y_pred_increase_new == 1)).sum()) #dissatisfied -> satisfied
    profit = profit_per_change * changed #profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_economy_class = pd.DataFrame(table_data, columns=table_head)
# Profit per dollar spent; a zero cost (feature missing from `costs`) gives NaN rather than inf
results_economy_class["Cost Benefit ($)"] = (
    results_economy_class["Total Profit ($)"]
    / results_economy_class["Total Cost ($)"].replace(0, np.nan)
)
results_economy_class

In [None]:
# Bar chart of the cost-benefit ratio for each adjusted feature
plt.figure(figsize = (7, 7))
sns.barplot(data = results_economy_class, x = 'Feature', y = 'Cost Benefit ($)', color = 'purple')

### For Economy Plus Class: Gradient Boosting Classifier 

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = economy_plus['satisfaction']
X = economy_plus.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline gradient boosting model trained on the current training split
gb = GradientBoostingClassifier(random_state = 42, max_depth = 3, learning_rate = 0.1, n_estimators = 100).fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, gb.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = gb, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Drop the target together with the columns rejected by feature elimination
y = economy_plus['satisfaction']
X = economy_plus.drop(columns = ['satisfaction', *no_common])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Refit the gradient boosting model on the re-split training data
gb = GradientBoostingClassifier(random_state = 42, max_depth = 3, learning_rate = 0.1, n_estimators = 100)
gb.fit(X_train, y_train)

# Ascending feature importances of the refit model as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, gb.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'k')
plt.show()

In [None]:
# Initial predictions on the unmodified test set
initial_predictions = gb.predict(X_test)

In [None]:
# What-if analysis: raise each of the 3 most important features by one unit
# and count the dissatisfied test passengers the model then predicts as satisfied.
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #columns removed by recursive feature elimination
increased_values = 1 #increase applied to each feature
profits = [] #profit of each scenario
table_data = []

# Baseline on the unmodified test set: passengers the model already predicts
# as satisfied are not a result of the change, so only rows that are truly
# dissatisfied AND predicted dissatisfied can count as converted.
baseline_pred = gb.predict(X_test)
convertible = (np.asarray(y_test) == 0) & (baseline_pred == 0)

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # NOTE(review): max_value is not defined in this cell - confirm it is
        # set earlier in the notebook and matches this feature's scale.
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        cost = costs.get(feature, 0) * increased_values
        total_cost += cost

    y_pred_increase_new = gb.predict(X_test_increase) #new predictions

    changed = int((convertible & (y_pred_increase_new == 1)).sum()) #dissatisfied -> satisfied
    profit = profit_per_change * changed #profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_economy_plus_class = pd.DataFrame(table_data, columns=table_head)
# Profit per dollar spent; a zero cost (feature missing from `costs`) gives NaN rather than inf
results_economy_plus_class["Cost Benefit ($)"] = (
    results_economy_plus_class["Total Profit ($)"]
    / results_economy_plus_class["Total Cost ($)"].replace(0, np.nan)
)
results_economy_plus_class

In [None]:
# Bar chart of the cost-benefit ratio for each adjusted feature
plt.figure(figsize = (7, 7))
sns.barplot(data = results_economy_plus_class, x = 'Feature', y = 'Cost Benefit ($)', color = 'purple')

## For Gender

### For Female data: Random Forest Classifier 

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = female['satisfaction']
X = female.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline random forest trained on the current training split
rf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = rf, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Drop the target together with the columns rejected by feature elimination
y = female['satisfaction']
X = female.drop(columns = ['satisfaction', *no_common])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Refit the random forest on the re-split training data
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)

# Ascending feature importances of the refit model as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'k')
plt.show()

In [None]:
# Initial predictions on the unmodified test set
initial_predictions = rf.predict(X_test)

In [None]:
# What-if analysis: raise each of the 3 most important features by one unit
# and count the dissatisfied test passengers the model then predicts as satisfied.
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #columns removed by recursive feature elimination
increased_values = 1 #increase applied to each feature
profits = [] #profit of each scenario
table_data = []

# Baseline on the unmodified test set: passengers the model already predicts
# as satisfied are not a result of the change, so only rows that are truly
# dissatisfied AND predicted dissatisfied can count as converted.
baseline_pred = rf.predict(X_test)
convertible = (np.asarray(y_test) == 0) & (baseline_pred == 0)

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # NOTE(review): max_value is not defined in this cell - confirm it is
        # set earlier in the notebook and matches this feature's scale.
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        cost = costs.get(feature, 0) * increased_values
        total_cost += cost

    y_pred_increase_new = rf.predict(X_test_increase) #new predictions

    changed = int((convertible & (y_pred_increase_new == 1)).sum()) #dissatisfied -> satisfied
    profit = profit_per_change * changed #profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_female_class = pd.DataFrame(table_data, columns=table_head)
# Profit per dollar spent; a zero cost (feature missing from `costs`) gives NaN rather than inf
results_female_class["Cost Benefit ($)"] = (
    results_female_class["Total Profit ($)"]
    / results_female_class["Total Cost ($)"].replace(0, np.nan)
)
results_female_class

In [None]:
# Bar chart of the cost-benefit ratio for each adjusted feature
plt.figure(figsize = (7, 7))
sns.barplot(data = results_female_class, x = 'Feature', y = 'Cost Benefit ($)', color = 'purple')

### For Male data: Random Forest Classifier 

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = male['satisfaction']
X = male.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline random forest trained on the current training split
rf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = rf, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Drop the target together with the columns rejected by feature elimination
y = male['satisfaction']
X = male.drop(columns = ['satisfaction', *no_common])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Refit the random forest on the re-split training data
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)

# Ascending feature importances of the refit model as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'k')
plt.show()

In [None]:
# Initial predictions on the unmodified test set
initial_predictions = rf.predict(X_test)

In [None]:
# What-if analysis: raise each of the 3 most important features by one unit
# and count the dissatisfied test passengers the model then predicts as satisfied.
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #columns removed by recursive feature elimination
increased_values = 1 #increase applied to each feature
profits = [] #profit of each scenario
table_data = []

# Baseline on the unmodified test set: passengers the model already predicts
# as satisfied are not a result of the change, so only rows that are truly
# dissatisfied AND predicted dissatisfied can count as converted.
baseline_pred = rf.predict(X_test)
convertible = (np.asarray(y_test) == 0) & (baseline_pred == 0)

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # NOTE(review): max_value is not defined in this cell - confirm it is
        # set earlier in the notebook and matches this feature's scale.
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        cost = costs.get(feature, 0) * increased_values
        total_cost += cost

    y_pred_increase_new = rf.predict(X_test_increase) #new predictions

    changed = int((convertible & (y_pred_increase_new == 1)).sum()) #dissatisfied -> satisfied
    profit = profit_per_change * changed #profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_male_class = pd.DataFrame(table_data, columns=table_head)
# Profit per dollar spent; a zero cost (feature missing from `costs`) gives NaN rather than inf
results_male_class["Cost Benefit ($)"] = (
    results_male_class["Total Profit ($)"]
    / results_male_class["Total Cost ($)"].replace(0, np.nan)
)
results_male_class

In [None]:
# Bar chart of the cost-benefit ratio for each adjusted feature
plt.figure(figsize = (7, 7))
sns.barplot(data = results_male_class, x = 'Feature', y = 'Cost Benefit ($)', color = 'purple')

## For Age

### For Age group 1(6-18): Gradient Boosting Classifier 

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = age_1['satisfaction']
X = age_1.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline gradient boosting model trained on the current training split
gb = GradientBoostingClassifier(random_state = 42, max_depth = 3, learning_rate = 0.1, n_estimators = 100).fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, gb.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = gb, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Drop the target together with the columns rejected by feature elimination
y = age_1['satisfaction']
X = age_1.drop(columns = ['satisfaction', *no_common])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Refit the gradient boosting model on the re-split training data
gb = GradientBoostingClassifier(random_state = 42, max_depth = 3, learning_rate = 0.1, n_estimators = 100)
gb.fit(X_train, y_train)

# Ascending feature importances of the refit model as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, gb.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'k')
plt.show()

In [None]:
# Initial predictions on the unmodified test set
initial_predictions = gb.predict(X_test)

In [None]:
# What-if analysis: raise each of the 3 most important features by one unit
# and count the dissatisfied test passengers the model then predicts as satisfied.
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #columns removed by recursive feature elimination
increased_values = 1 #increase applied to each feature
profits = [] #profit of each scenario
table_data = []

# Baseline on the unmodified test set: passengers the model already predicts
# as satisfied are not a result of the change, so only rows that are truly
# dissatisfied AND predicted dissatisfied can count as converted.
baseline_pred = gb.predict(X_test)
convertible = (np.asarray(y_test) == 0) & (baseline_pred == 0)

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # NOTE(review): max_value is not defined in this cell - confirm it is
        # set earlier in the notebook and matches this feature's scale.
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        cost = costs.get(feature, 0) * increased_values
        total_cost += cost

    y_pred_increase_new = gb.predict(X_test_increase) #new predictions

    changed = int((convertible & (y_pred_increase_new == 1)).sum()) #dissatisfied -> satisfied
    profit = profit_per_change * changed #profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_1_class = pd.DataFrame(table_data, columns=table_head)
# Profit per dollar spent; a zero cost (feature missing from `costs`) gives NaN rather than inf
results_age_1_class["Cost Benefit ($)"] = (
    results_age_1_class["Total Profit ($)"]
    / results_age_1_class["Total Cost ($)"].replace(0, np.nan)
)
results_age_1_class

In [None]:
# Bar chart of the cost-benefit ratio for each adjusted feature
plt.figure(figsize = (7, 7))
sns.barplot(data = results_age_1_class, x = 'Feature', y = 'Cost Benefit ($)', color = 'purple')

### For Age group 2(19-24): Random Forest Classifier 

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = age_2['satisfaction']
X = age_2.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline random forest trained on the current training split
rf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = rf, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Drop the target together with the columns rejected by feature elimination
y = age_2['satisfaction']
X = age_2.drop(columns = ['satisfaction', *no_common])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Refit the random forest on the re-split training data
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)

# Ascending feature importances of the refit model as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'k')
plt.show()

In [None]:
# Initial predictions on the unmodified test set
initial_predictions = rf.predict(X_test)

In [None]:
# What-if analysis: raise each of the 3 most important features by one unit
# and count the dissatisfied test passengers the model then predicts as satisfied.
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #columns removed by recursive feature elimination
increased_values = 1 #increase applied to each feature
profits = [] #profit of each scenario
table_data = []

# Baseline on the unmodified test set: passengers the model already predicts
# as satisfied are not a result of the change, so only rows that are truly
# dissatisfied AND predicted dissatisfied can count as converted.
baseline_pred = rf.predict(X_test)
convertible = (np.asarray(y_test) == 0) & (baseline_pred == 0)

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # NOTE(review): max_value is not defined in this cell - confirm it is
        # set earlier in the notebook and matches this feature's scale.
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        cost = costs.get(feature, 0) * increased_values
        total_cost += cost

    y_pred_increase_new = rf.predict(X_test_increase) #new predictions

    changed = int((convertible & (y_pred_increase_new == 1)).sum()) #dissatisfied -> satisfied
    profit = profit_per_change * changed #profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_2_class = pd.DataFrame(table_data, columns=table_head)
# Profit per dollar spent; a zero cost (feature missing from `costs`) gives NaN rather than inf
results_age_2_class["Cost Benefit ($)"] = (
    results_age_2_class["Total Profit ($)"]
    / results_age_2_class["Total Cost ($)"].replace(0, np.nan)
)
results_age_2_class

In [None]:
# Bar chart of the cost-benefit ratio for each adjusted feature
plt.figure(figsize = (7, 7))
sns.barplot(data = results_age_2_class, x = 'Feature', y = 'Cost Benefit ($)', color = 'purple')

### For Age group 3(25-34): Random Forest Classifier 

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = age_3['satisfaction']
X = age_3.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline random forest trained on the current training split
rf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = rf, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Drop the target together with the columns rejected by feature elimination
y = age_3['satisfaction']
X = age_3.drop(columns = ['satisfaction', *no_common])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Refit the random forest on the re-split training data
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)

# Ascending feature importances of the refit model as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'k')
plt.show()

In [None]:
# Initial predictions on the unmodified test set
initial_predictions = rf.predict(X_test)

In [None]:
# What-if analysis: raise each of the 3 most important features by one unit
# and count the dissatisfied test passengers the model then predicts as satisfied.
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #columns removed by recursive feature elimination
increased_values = 1 #increase applied to each feature
profits = [] #profit of each scenario
table_data = []

# Baseline on the unmodified test set: passengers the model already predicts
# as satisfied are not a result of the change, so only rows that are truly
# dissatisfied AND predicted dissatisfied can count as converted.
baseline_pred = rf.predict(X_test)
convertible = (np.asarray(y_test) == 0) & (baseline_pred == 0)

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # NOTE(review): max_value is not defined in this cell - confirm it is
        # set earlier in the notebook and matches this feature's scale.
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        cost = costs.get(feature, 0) * increased_values
        total_cost += cost

    y_pred_increase_new = rf.predict(X_test_increase) #new predictions

    changed = int((convertible & (y_pred_increase_new == 1)).sum()) #dissatisfied -> satisfied
    profit = profit_per_change * changed #profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_3_class = pd.DataFrame(table_data, columns=table_head)
# Profit per dollar spent; a zero cost (feature missing from `costs`) gives NaN rather than inf
results_age_3_class["Cost Benefit ($)"] = (
    results_age_3_class["Total Profit ($)"]
    / results_age_3_class["Total Cost ($)"].replace(0, np.nan)
)
results_age_3_class

In [None]:
# Bar chart of the cost-benefit ratio for each adjusted feature
plt.figure(figsize = (7, 7))
sns.barplot(data = results_age_3_class, x = 'Feature', y = 'Cost Benefit ($)', color = 'purple')

### For Age group 4(35-44): Random Forest Classifier

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = age_4['satisfaction']
X = age_4.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline random forest trained on the current training split
rf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = rf, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Drop the target together with the columns rejected by feature elimination
y = age_4['satisfaction']
X = age_4.drop(columns = ['satisfaction', *no_common])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Refit the random forest on the re-split training data
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)

# Ascending feature importances of the refit model as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'k')
plt.show()

In [None]:
# Initial predictions on the unmodified test set
initial_predictions = rf.predict(X_test)

In [None]:
# What-if analysis: raise each of the 3 most important features by one unit
# and count the dissatisfied test passengers the model then predicts as satisfied.
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #columns removed by recursive feature elimination
increased_values = 1 #increase applied to each feature
profits = [] #profit of each scenario
table_data = []

# Baseline on the unmodified test set: passengers the model already predicts
# as satisfied are not a result of the change, so only rows that are truly
# dissatisfied AND predicted dissatisfied can count as converted.
baseline_pred = rf.predict(X_test)
convertible = (np.asarray(y_test) == 0) & (baseline_pred == 0)

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # NOTE(review): max_value is not defined in this cell - confirm it is
        # set earlier in the notebook and matches this feature's scale.
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        cost = costs.get(feature, 0) * increased_values
        total_cost += cost

    y_pred_increase_new = rf.predict(X_test_increase) #new predictions

    changed = int((convertible & (y_pred_increase_new == 1)).sum()) #dissatisfied -> satisfied
    profit = profit_per_change * changed #profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_4_class = pd.DataFrame(table_data, columns=table_head)
# Profit per dollar spent; a zero cost (feature missing from `costs`) gives NaN rather than inf
results_age_4_class["Cost Benefit ($)"] = (
    results_age_4_class["Total Profit ($)"]
    / results_age_4_class["Total Cost ($)"].replace(0, np.nan)
)
results_age_4_class

In [None]:
# Bar chart of the cost-benefit ratio for each adjusted feature
plt.figure(figsize = (7, 7))
sns.barplot(data = results_age_4_class, x = 'Feature', y = 'Cost Benefit ($)', color = 'purple')

### For Age group 5(45-54): Random Forest Classifier

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = age_5['satisfaction']
X = age_5.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline random forest trained on the current training split
rf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = rf, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Drop the target together with the columns rejected by feature elimination
y = age_5['satisfaction']
X = age_5.drop(columns = ['satisfaction', *no_common])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Refit the random forest on the re-split training data
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)

# Ascending feature importances of the refit model as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'k')
plt.show()

In [None]:
# Initial predictions on the unmodified test set
initial_predictions = rf.predict(X_test)

In [None]:
# What-if analysis: raise each of the 3 most important features by one unit
# and count the dissatisfied test passengers the model then predicts as satisfied.
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #columns removed by recursive feature elimination
increased_values = 1 #increase applied to each feature
profits = [] #profit of each scenario
table_data = []

# Baseline on the unmodified test set: passengers the model already predicts
# as satisfied are not a result of the change, so only rows that are truly
# dissatisfied AND predicted dissatisfied can count as converted.
baseline_pred = rf.predict(X_test)
convertible = (np.asarray(y_test) == 0) & (baseline_pred == 0)

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # NOTE(review): max_value is not defined in this cell - confirm it is
        # set earlier in the notebook and matches this feature's scale.
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        cost = costs.get(feature, 0) * increased_values
        total_cost += cost

    y_pred_increase_new = rf.predict(X_test_increase) #new predictions

    changed = int((convertible & (y_pred_increase_new == 1)).sum()) #dissatisfied -> satisfied
    profit = profit_per_change * changed #profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_5_class = pd.DataFrame(table_data, columns=table_head)
# Profit per dollar spent; a zero cost (feature missing from `costs`) gives NaN rather than inf
results_age_5_class["Cost Benefit ($)"] = (
    results_age_5_class["Total Profit ($)"]
    / results_age_5_class["Total Cost ($)"].replace(0, np.nan)
)
results_age_5_class

In [None]:
# Bar chart of the cost-benefit ratio for each adjusted feature
plt.figure(figsize = (7, 7))
sns.barplot(data = results_age_5_class, x = 'Feature', y = 'Cost Benefit ($)', color = 'purple')

### For Age group 6(55-64): Gradient Boosting Classifier

In [None]:
# Target is the 'satisfaction' column; every other column is a predictor
y = age_6['satisfaction']
X = age_6.drop(columns = ['satisfaction'])

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.4)

In [None]:
# Baseline gradient boosting model trained on the current training split
gb = GradientBoostingClassifier(random_state = 42, max_depth = 3, learning_rate = 0.1, n_estimators = 100)
gb.fit(X_train, y_train)

# Feature importances in ascending order, shown as a horizontal bar chart
features = X.columns
f_i = sorted(zip(features, gb.feature_importances_), key = lambda pair: pair[1])
plt.barh([col for col, _ in f_i], [imp for _, imp in f_i], color = 'pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation.
# NOTE(review): the scorer is a regression metric although the estimator is a classifier - confirm this is intended
rfe = RFECV(estimator = gb, scoring = "neg_mean_squared_error", cv = 2).fit(X_train, y_train)

# Columns retained by the selector, then the columns it rejected
selected_features = np.asarray(features)[rfe.get_support()]
features = X_test.columns
no_common = [col for col in features if col not in selected_features]
no_common

In [None]:
# Rebuild the train/test split without the columns rejected by RFE
y = age_6['satisfaction']
X = age_6.drop(columns=['satisfaction'] + no_common)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,  # 40% test
    random_state=42,
)

In [None]:
# Retrain the model on the reduced feature set
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gb.fit(X_train, y_train)
features = X.columns

# Plot the importances of the remaining features in ascending order
f_i = sorted(zip(features, gb.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='k')
plt.show()

In [None]:
# Initial (baseline) predictions of the retrained model on the unmodified test set
initial_predictions = gb.predict(X_test)

In [None]:
# What-if analysis for age group 6: raise each of the three most important
# features by `increased_values` and estimate the profit from customers whose
# prediction flips to "satisfied".
# Assumes `satisfaction` is encoded 0 = dissatisfied, 1 = satisfied, as the
# comparisons below imply.
top3 = [name for name, _ in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common  # columns removed by recursive feature elimination
increased_values = 1  # size of the simulated increase
profits = []  # profit obtained for each simulated feature
table_data = []

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # Vectorised "add, then cap at max_value" on the simulated column only
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        total_cost += costs.get(feature, 0) * increased_values

    y_pred_increase_new = gb.predict(X_test_increase)  # predictions after the increase

    # A customer counts as "changed" only if they are truly dissatisfied, were
    # predicted dissatisfied before the increase and are predicted satisfied
    # after it. Without the baseline check, rows the model already
    # misclassified as satisfied would be credited to the intervention.
    changed = int(((y_test == 0) & (initial_predictions == 0) & (y_pred_increase_new == 1)).sum())
    profit = profit_per_change * changed  # profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_6_class = pd.DataFrame(table_data, columns=table_head)
# A zero total cost (feature missing from `costs`) gives an undefined ratio;
# use NaN instead of dividing by zero, which would yield inf.
results_age_6_class["Cost Benefit ($)"] = (
    results_age_6_class["Total Profit ($)"] / results_age_6_class["Total Cost ($)"].replace(0, np.nan)
)
results_age_6_class

In [None]:
# Cost-benefit ratio of each simulated feature for age group 6
plt.figure(figsize=(7, 7))
sns.barplot(data=results_age_6_class, x='Feature', y='Cost Benefit ($)', color='purple', ax=plt.gca())

### For Age group 7(above 65): Gradient Boosting Classifier

In [None]:
# Separate the target from the predictors for age group 7
y = age_7['satisfaction']
X = age_7.drop(columns=['satisfaction'])

# Hold out 40% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Fit a gradient boosting classifier on the full feature set
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gb.fit(X_train, y_train)

# Plot the feature importances in ascending order
features = X.columns
f_i = sorted(zip(features, gb.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation
rfe = RFECV(estimator=gb, cv=2, scoring="neg_mean_squared_error")
rfe.fit(X_train, y_train)

# Columns kept by RFE vs. columns it rejected
kept_mask = rfe.get_support()
selected_features = np.array(features)[kept_mask]
features = X_test.columns
no_common = [column for column in features if column not in selected_features]
no_common

In [None]:
# Rebuild the train/test split without the columns rejected by RFE
y = age_7['satisfaction']
X = age_7.drop(columns=['satisfaction'] + no_common)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,  # 40% test
    random_state=42,
)

In [None]:
# Retrain the model on the reduced feature set
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gb.fit(X_train, y_train)
features = X.columns

# Plot the importances of the remaining features in ascending order
f_i = sorted(zip(features, gb.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='k')
plt.show()

In [None]:
# Initial (baseline) predictions of the retrained model on the unmodified test set
initial_predictions = gb.predict(X_test)

In [None]:
# What-if analysis for age group 7: raise each of the three most important
# features by `increased_values` and estimate the profit from customers whose
# prediction flips to "satisfied".
# Assumes `satisfaction` is encoded 0 = dissatisfied, 1 = satisfied, as the
# comparisons below imply.
top3 = [name for name, _ in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common  # columns removed by recursive feature elimination
increased_values = 1  # size of the simulated increase
profits = []  # profit obtained for each simulated feature
table_data = []

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # Vectorised "add, then cap at max_value" on the simulated column only
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        total_cost += costs.get(feature, 0) * increased_values

    y_pred_increase_new = gb.predict(X_test_increase)  # predictions after the increase

    # A customer counts as "changed" only if they are truly dissatisfied, were
    # predicted dissatisfied before the increase and are predicted satisfied
    # after it. Without the baseline check, rows the model already
    # misclassified as satisfied would be credited to the intervention.
    changed = int(((y_test == 0) & (initial_predictions == 0) & (y_pred_increase_new == 1)).sum())
    profit = profit_per_change * changed  # profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_7_class = pd.DataFrame(table_data, columns=table_head)
# A zero total cost (feature missing from `costs`) gives an undefined ratio;
# use NaN instead of dividing by zero, which would yield inf.
results_age_7_class["Cost Benefit ($)"] = (
    results_age_7_class["Total Profit ($)"] / results_age_7_class["Total Cost ($)"].replace(0, np.nan)
)
results_age_7_class

In [None]:
# Cost-benefit ratio of each simulated feature for age group 7
plt.figure(figsize=(7, 7))
sns.barplot(data=results_age_7_class, x='Feature', y='Cost Benefit ($)', color='purple', ax=plt.gca())

## For Type of Travel

### For Business Travel: Random Forest Classifier 

In [None]:
# Separate the target from the predictors for business travel
y = btravel['satisfaction']
X = btravel.drop(columns=['satisfaction'])

# Hold out 40% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Fit a random forest classifier on the full feature set
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Plot the feature importances in ascending order
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation
rfe = RFECV(estimator=rf, cv=2, scoring="neg_mean_squared_error")
rfe.fit(X_train, y_train)

# Columns kept by RFE vs. columns it rejected
kept_mask = rfe.get_support()
selected_features = np.array(features)[kept_mask]
features = X_test.columns
no_common = [column for column in features if column not in selected_features]
no_common

In [None]:
# Rebuild the train/test split without the columns rejected by RFE
y = btravel['satisfaction']
X = btravel.drop(columns=['satisfaction'] + no_common)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,  # 40% test
    random_state=42,
)

In [None]:
# Retrain the model on the reduced feature set
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
features = X.columns

# Plot the importances of the remaining features in ascending order
f_i = sorted(zip(features, rf.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='k')
plt.show()

In [None]:
# Initial (baseline) predictions of the retrained model on the unmodified test set
initial_predictions = rf.predict(X_test)

In [None]:
# What-if analysis for business travel: raise each of the three most important
# features by `increased_values` and estimate the profit from customers whose
# prediction flips to "satisfied".
# Assumes `satisfaction` is encoded 0 = dissatisfied, 1 = satisfied, as the
# comparisons below imply.
top3 = [name for name, _ in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common  # columns removed by recursive feature elimination
increased_values = 1  # size of the simulated increase
profits = []  # profit obtained for each simulated feature
table_data = []

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # Vectorised "add, then cap at max_value" on the simulated column only
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        total_cost += costs.get(feature, 0) * increased_values

    y_pred_increase_new = rf.predict(X_test_increase)  # predictions after the increase

    # A customer counts as "changed" only if they are truly dissatisfied, were
    # predicted dissatisfied before the increase and are predicted satisfied
    # after it. Without the baseline check, rows the model already
    # misclassified as satisfied would be credited to the intervention.
    changed = int(((y_test == 0) & (initial_predictions == 0) & (y_pred_increase_new == 1)).sum())
    profit = profit_per_change * changed  # profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_btravel = pd.DataFrame(table_data, columns=table_head)
# A zero total cost (feature missing from `costs`) gives an undefined ratio;
# use NaN instead of dividing by zero, which would yield inf.
results_btravel["Cost Benefit ($)"] = (
    results_btravel["Total Profit ($)"] / results_btravel["Total Cost ($)"].replace(0, np.nan)
)
results_btravel

In [None]:
# Cost-benefit ratio of each simulated feature for business travel
plt.figure(figsize=(7, 7))
sns.barplot(data=results_btravel, x='Feature', y='Cost Benefit ($)', color='purple', ax=plt.gca())

### For Personal Travel: Gradient Boosting Classifier

In [None]:
# Separate the target from the predictors for personal travel
y = ptravel['satisfaction']
X = ptravel.drop(columns=['satisfaction'])  # only the target is dropped at this stage

# Hold out 40% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Fit a gradient boosting classifier on the full feature set
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# Plot the feature importances in ascending order
features = X.columns
f_i = sorted(zip(features, gb.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation
rfe = RFECV(estimator=gb, cv=2, scoring="neg_mean_squared_error")
rfe.fit(X_train, y_train)

# Columns kept by RFE vs. columns it rejected
kept_mask = rfe.get_support()
selected_features = np.array(features)[kept_mask]
features = X_test.columns
no_common = [column for column in features if column not in selected_features]
no_common

In [None]:
# Rebuild the train/test split without the columns rejected by RFE
y = ptravel['satisfaction']
X = ptravel.drop(columns=['satisfaction'] + no_common)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,  # 40% test
    random_state=42,
)

In [None]:
# Retrain the model on the reduced feature set
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gb.fit(X_train, y_train)
features = X.columns

# Plot the importances of the remaining features in ascending order
f_i = sorted(zip(features, gb.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='k')
plt.show()

In [None]:
# Initial (baseline) predictions of the retrained model on the unmodified test set
initial_predictions = gb.predict(X_test)

In [None]:
# What-if analysis for personal travel: raise each of the three most important
# features by `increased_values` and estimate the profit from customers whose
# prediction flips to "satisfied".
# Assumes `satisfaction` is encoded 0 = dissatisfied, 1 = satisfied, as the
# comparisons below imply.
top3 = [name for name, _ in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common  # columns removed by recursive feature elimination
increased_values = 1  # size of the simulated increase
profits = []  # profit obtained for each simulated feature
table_data = []

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # Vectorised "add, then cap at max_value" on the simulated column only
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        total_cost += costs.get(feature, 0) * increased_values

    y_pred_increase_new = gb.predict(X_test_increase)  # predictions after the increase

    # A customer counts as "changed" only if they are truly dissatisfied, were
    # predicted dissatisfied before the increase and are predicted satisfied
    # after it. Without the baseline check, rows the model already
    # misclassified as satisfied would be credited to the intervention.
    changed = int(((y_test == 0) & (initial_predictions == 0) & (y_pred_increase_new == 1)).sum())
    profit = profit_per_change * changed  # profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_ptravel = pd.DataFrame(table_data, columns=table_head)
# A zero total cost (feature missing from `costs`) gives an undefined ratio;
# use NaN instead of dividing by zero, which would yield inf.
results_ptravel["Cost Benefit ($)"] = (
    results_ptravel["Total Profit ($)"] / results_ptravel["Total Cost ($)"].replace(0, np.nan)
)
results_ptravel

In [None]:
# Cost-benefit ratio of each simulated feature for personal travel
plt.figure(figsize=(7, 7))
sns.barplot(data=results_ptravel, x='Feature', y='Cost Benefit ($)', color='purple', ax=plt.gca())

## For Type of Customer

### For Loyal Customer: Random Forest Classifier 

In [None]:
# Separate the target from the predictors for loyal customers
y = loyal['satisfaction']
X = loyal.drop(columns=['satisfaction'])

# Hold out 40% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Fit a random forest classifier on the full feature set
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Plot the feature importances in ascending order
features = X.columns
f_i = sorted(zip(features, rf.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation
rfe = RFECV(estimator=rf, cv=2, scoring="neg_mean_squared_error")
rfe.fit(X_train, y_train)

# Columns kept by RFE vs. columns it rejected
kept_mask = rfe.get_support()
selected_features = np.array(features)[kept_mask]
features = X_test.columns
no_common = [column for column in features if column not in selected_features]
no_common

In [None]:
# Rebuild the train/test split without the columns rejected by RFE
y = loyal['satisfaction']
X = loyal.drop(columns=['satisfaction'] + no_common)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,  # 40% test
    random_state=42,
)

In [None]:
# Retrain the model on the reduced feature set
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
features = X.columns

# Plot the importances of the remaining features in ascending order
f_i = sorted(zip(features, rf.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='k')
plt.show()

In [None]:
# Initial (baseline) predictions of the retrained model on the unmodified test set
initial_predictions = rf.predict(X_test)

In [None]:
# What-if analysis for loyal customers: raise each of the three most important
# features by `increased_values` and estimate the profit from customers whose
# prediction flips to "satisfied".
# Assumes `satisfaction` is encoded 0 = dissatisfied, 1 = satisfied, as the
# comparisons below imply.
top3 = [name for name, _ in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common  # columns removed by recursive feature elimination
increased_values = 1  # size of the simulated increase
profits = []  # profit obtained for each simulated feature
table_data = []

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # Vectorised "add, then cap at max_value" on the simulated column only
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        total_cost += costs.get(feature, 0) * increased_values

    y_pred_increase_new = rf.predict(X_test_increase)  # predictions after the increase

    # A customer counts as "changed" only if they are truly dissatisfied, were
    # predicted dissatisfied before the increase and are predicted satisfied
    # after it. Without the baseline check, rows the model already
    # misclassified as satisfied would be credited to the intervention.
    changed = int(((y_test == 0) & (initial_predictions == 0) & (y_pred_increase_new == 1)).sum())
    profit = profit_per_change * changed  # profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_loyal = pd.DataFrame(table_data, columns=table_head)
# A zero total cost (feature missing from `costs`) gives an undefined ratio;
# use NaN instead of dividing by zero, which would yield inf.
results_loyal["Cost Benefit ($)"] = (
    results_loyal["Total Profit ($)"] / results_loyal["Total Cost ($)"].replace(0, np.nan)
)
results_loyal

In [None]:
# Cost-benefit ratio of each simulated feature for loyal customers
plt.figure(figsize=(7, 7))
sns.barplot(data=results_loyal, x='Feature', y='Cost Benefit ($)', color='purple', ax=plt.gca())

### For Disloyal Customer: Gradient Boosting Classifier

In [None]:
# Separate the target from the predictors for disloyal customers
y = disloyal['satisfaction']
X = disloyal.drop(columns=['satisfaction'])  # only the target is dropped at this stage

# Hold out 40% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Fit a gradient boosting classifier on the full feature set
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# Plot the feature importances in ascending order
features = X.columns
f_i = sorted(zip(features, gb.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='pink')
plt.show()

In [None]:
# Recursive feature elimination with 2-fold cross-validation
rfe = RFECV(estimator=gb, cv=2, scoring="neg_mean_squared_error")
rfe.fit(X_train, y_train)

# Columns kept by RFE vs. columns it rejected
kept_mask = rfe.get_support()
selected_features = np.array(features)[kept_mask]
features = X_test.columns
no_common = [column for column in features if column not in selected_features]
no_common

In [None]:
# Rebuild the train/test split without the columns rejected by RFE
y = disloyal['satisfaction']
X = disloyal.drop(columns=['satisfaction'] + no_common)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,  # 40% test
    random_state=42,
)

In [None]:
# Retrain the model on the reduced feature set
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gb.fit(X_train, y_train)
features = X.columns

# Plot the importances of the remaining features in ascending order
f_i = sorted(zip(features, gb.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i], color='k')
plt.show()

In [None]:
# Initial (baseline) predictions of the retrained model on the unmodified test set
initial_predictions = gb.predict(X_test)

In [None]:
# What-if analysis for disloyal customers: raise each of the three most
# important features by `increased_values` and estimate the profit from
# customers whose prediction flips to "satisfied".
# Assumes `satisfaction` is encoded 0 = dissatisfied, 1 = satisfied, as the
# comparisons below imply.
top3 = [name for name, _ in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common  # columns removed by recursive feature elimination
increased_values = 1  # size of the simulated increase
profits = []  # profit obtained for each simulated feature
table_data = []

for feature in top3:
    X_test_increase = X_test.copy()
    total_cost = 0

    if feature not in excluded:
        # Vectorised "add, then cap at max_value" on the simulated column only
        X_test_increase[feature] = (X_test_increase[feature] + increased_values).clip(upper=max_value)
        total_cost += costs.get(feature, 0) * increased_values

    y_pred_increase_new = gb.predict(X_test_increase)  # predictions after the increase

    # A customer counts as "changed" only if they are truly dissatisfied, were
    # predicted dissatisfied before the increase and are predicted satisfied
    # after it. Without the baseline check, rows the model already
    # misclassified as satisfied would be credited to the intervention.
    changed = int(((y_test == 0) & (initial_predictions == 0) & (y_pred_increase_new == 1)).sum())
    profit = profit_per_change * changed  # profit per changed customer
    profits.append(profit)
    table_data.append([feature, increased_values, total_cost, profit, changed])

table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_disloyal = pd.DataFrame(table_data, columns=table_head)
# A zero total cost (feature missing from `costs`) gives an undefined ratio;
# use NaN instead of dividing by zero, which would yield inf.
results_disloyal["Cost Benefit ($)"] = (
    results_disloyal["Total Profit ($)"] / results_disloyal["Total Cost ($)"].replace(0, np.nan)
)
results_disloyal

In [None]:
# Cost-benefit ratio of each simulated feature for disloyal customers
plt.figure(figsize=(7, 7))
sns.barplot(data=results_disloyal, x='Feature', y='Cost Benefit ($)', color='purple', ax=plt.gca())

# Other visualizations

## Total profits over different Classes

In [None]:
# Total simulated profit for each travel class
total_profit_business_class, total_profit_economy_class, total__economy_plus_class = (
    segment["Total Profit ($)"].sum()
    for segment in (results_business_class, results_economy_class, results_economy_plus_class)
)

labels = ["Business Class", "Economy Class", "Economy Plus Class"]
profits = [total_profit_business_class, total_profit_economy_class, total__economy_plus_class]

# Compare the classes side by side
plt.figure(figsize=(10, 6))
sns.barplot(x=labels, y=profits, palette="pink")
plt.ylabel("Total Profit ($)")
plt.show()

## Total profits over different Genders

In [None]:
# Total simulated profit for each gender
total_profit_female = results_female_class["Total Profit ($)"].sum()
total_profit_male = results_male_class["Total Profit ($)"].sum()

profits = [total_profit_female, total_profit_male]
labels = ["Female", "Male"]

plt.figure(figsize=(10, 6))
sns.barplot(x = labels, y = profits, palette = "Reds")
# Label the y-axis like the other "total profits" charts in this section
plt.ylabel("Total Profit ($)")
plt.show()

## Total profits over different Age groups

In [None]:
# Total simulated profit for each of the seven age groups
(
    total_profit_age1,
    total_profit_age2,
    total_profit_age3,
    total_profit_age4,
    total_profit_age5,
    total_profit_age6,
    total_profit_age7,
) = (
    segment["Total Profit ($)"].sum()
    for segment in (
        results_age_1_class,
        results_age_2_class,
        results_age_3_class,
        results_age_4_class,
        results_age_5_class,
        results_age_6_class,
        results_age_7_class,
    )
)

labels = ["Age group 1", "Age group 2", "Age group 3", "Age group 4", "Age group 5", "Age group 6", "Age group 7"]
profits = [
    total_profit_age1,
    total_profit_age2,
    total_profit_age3,
    total_profit_age4,
    total_profit_age5,
    total_profit_age6,
    total_profit_age7,
]

# Compare the age groups side by side
plt.figure(figsize=(10, 6))
sns.barplot(x=labels, y=profits, palette="Purples")
plt.ylabel("Total Profit ($)")
plt.show()

## Total profits over different Type of Travel

In [None]:
# Total simulated profit for business vs. personal travel
total_profit_btravel, total_profit_ptravel = (
    segment["Total Profit ($)"].sum() for segment in (results_btravel, results_ptravel)
)

labels = ["Business Travel", "Personal Travel"]
profits = [total_profit_btravel, total_profit_ptravel]

# Compare the two travel types side by side
plt.figure(figsize=(10, 6))
sns.barplot(x=labels, y=profits, palette="Greens")
plt.ylabel("Total Profit ($)")
plt.show()

## Total profits over different Type of Customer

In [None]:
# Total simulated profit for loyal vs. disloyal customers
total_profit_loyal = results_loyal["Total Profit ($)"].sum()
total_profit_disloyal = results_disloyal["Total Profit ($)"].sum()

profits = [total_profit_loyal, total_profit_disloyal]
labels = ["Loyal Customer", "Disloyal Customer"]  # tick label previously misspelled "Disoyal"

plt.figure(figsize=(10, 6))
sns.barplot(x = labels, y = profits, palette = "Blues")
plt.ylabel("Total Profit ($)")
plt.show()

## Overall Profits for different Features

In [None]:
# Stack the per-segment result tables into one frame
all_results = pd.concat([results_business_class, results_economy_class, results_economy_plus_class,
                         results_female_class, results_male_class,
                         results_age_1_class, results_age_2_class, results_age_3_class, results_age_4_class,
                         results_age_5_class, results_age_6_class, results_age_7_class,
                         results_btravel, results_ptravel,
                         results_loyal, results_disloyal])

# Sum per feature across all segments
all_results = all_results.groupby('Feature').sum().reset_index()

# Ratios are not additive: the summed "Cost Benefit ($)" column would be a sum
# of per-segment ratios. Recompute it from the aggregated profit and cost,
# leaving it undefined (NaN) where the aggregated cost is zero.
all_results["Cost Benefit ($)"] = (
    all_results["Total Profit ($)"] / all_results["Total Cost ($)"].replace(0, np.nan)
)

In [None]:
# Total profit per feature, aggregated over all segments
plt.figure(figsize=(12, 6))
sns.barplot(x ='Feature', y ='Total Profit ($)', data = all_results, palette = 'crest')
plt.xticks(rotation=45, ha ='right')
# No plt.legend() here: the bars carry no labelled artists, so the call only
# produced a "no artists with labels" warning.
plt.tight_layout()
plt.show()

## Overall Costs and Cost Benefit for different Features

In [None]:
# Total cost per feature, aggregated over all segments
plt.figure(figsize=(12, 6))
sns.barplot(x ='Feature', y ='Total Cost ($)', data = all_results, palette = 'coolwarm')
plt.xticks(rotation=45, ha ='right')
# No plt.legend() here: the bars carry no labelled artists, so the call only
# produced a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [None]:
# Stack the per-segment result tables into one frame
all_results = pd.concat([results_business_class, results_economy_class, results_economy_plus_class,
                         results_female_class, results_male_class,
                         results_age_1_class, results_age_2_class, results_age_3_class, results_age_4_class,
                         results_age_5_class, results_age_6_class, results_age_7_class,
                         results_btravel, results_ptravel,
                         results_loyal, results_disloyal])

# Sum per feature across all segments
all_results = all_results.groupby('Feature').sum().reset_index()

# Ratios are not additive: plotting the summed "Cost Benefit ($)" column would
# show a sum of per-segment ratios. Recompute the overall ratio from the
# aggregated profit and cost, leaving it undefined (NaN) where the aggregated
# cost is zero.
all_results["Cost Benefit ($)"] = (
    all_results["Total Profit ($)"] / all_results["Total Cost ($)"].replace(0, np.nan)
)

plt.figure(figsize=(12, 6))
sns.barplot(x ='Feature', y ='Cost Benefit ($)', data = all_results, palette = 'rocket')
plt.xticks(rotation=45, ha ='right')
# No plt.legend() here: the bars carry no labelled artists, so the call only
# produced a "no artists with labels" warning.
plt.tight_layout()
plt.show()