<a href="https://colab.research.google.com/github/minhazulamin1/Irish-Hospital-Patient-Waiting-Dashboard-2021-2023/blob/main/Aviation_Operator_Customer_Satisfaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive are
# reachable through the Colab runtime's file system (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import SGDClassifier
from imblearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning)
# from sklearn.exceptions import ConvergenceWarning
# warnings.filterwarnings("ignore", category=ConvergenceWarning)
# from sklearn.exceptions import UndefinedMetricWarning
# warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [None]:
# Dataset Import and Investigation
# Read the survey CSV from the mounted Google Drive folder.
dataset = pd.read_csv("/content/drive/MyDrive/Data Mining/CA 1/Aviation Operator.csv")
# Lift the column display limit so wide frames are printed in full.
pd.set_option('display.max_columns', None)
print(dataset.head())
print(dataset.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
dataset.info()
print(dataset.describe())

   Unnamed: 0      id  Gender      Customer Type  Age   Type of Travel  \
0           0   70172    Male     Loyal Customer   13  Personal Travel   
1           1    5047    Male  disloyal Customer   25  Business travel   
2           2  110028  Female     Loyal Customer   26  Business travel   
3           3   24026  Female     Loyal Customer   25  Business travel   
4           4  119299    Male     Loyal Customer   61  Business travel   

      Class  Flight Distance  Inflight wifi service  \
0  Eco Plus              460                      3   
1  Business              235                      3   
2  Business             1142                      2   
3  Business              562                      2   
4  Business              214                      3   

   Departure/Arrival time convenient  Ease of Online booking  Gate location  \
0                                  4                       3              1   
1                                  2                       3      

In [None]:
# Categorical Data Conversion to Numerical Data
# NOTE: Series.map() turns every value that is absent from the mapping into
# NaN, so this cell must be run once on the freshly loaded frame; re-running
# it on the already encoded columns would replace them with NaN.
dataset['Gender'] = dataset['Gender'].map({'Male':1, 'Female':0})
dataset['Customer Type'] = dataset['Customer Type'].map({'Loyal Customer':1, 'disloyal Customer':0})
dataset['Type of Travel'] = dataset['Type of Travel'].map({'Business travel':1, 'Personal Travel':0})
dataset['Class'] = dataset['Class'].map({'Business':0, 'Eco Plus':1, 'Eco':2})
# 'neutral or dissatisfied' is encoded as 1, i.e. it is the label value that
# scikit-learn's binary precision/recall/F1 treat as the positive class by default.
dataset['satisfaction'] = dataset['satisfaction'].map({'neutral or dissatisfied':1, 'satisfied':0})
# Missing value handling (using the median for simplicity).
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and does not modify the frame under Copy-on-Write.
arrival_delay_median = dataset['Arrival Delay in Minutes'].median()
dataset['Arrival Delay in Minutes'] = dataset['Arrival Delay in Minutes'].fillna(arrival_delay_median)
# info() prints its own report and returns None, so it is not wrapped in print().
dataset.info()
print(dataset.head(2))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  int64  
 3   Customer Type                      103904 non-null  int64  
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  int64  
 6   Class                              103904 non-null  int64  
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

In [None]:
# Separate the target column from the predictors
Y = dataset['satisfaction']  # Labels
X = dataset.drop(columns=['satisfaction', 'Gate location', 'Unnamed: 0', 'id'])  # Features

# Hold out 20% of the rows as a test set (fixed seed keeps the split repeatable)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

# Standardise the features: the scaler learns its statistics from the
# training rows only and the same transformation is then applied to both sets
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Relative frequency of each 'satisfaction' class among the training labels
class_distribution = Y_train.value_counts(normalize=True)
print("Class Distribution in Training Set:\n", class_distribution)

Class Distribution in Training Set:
 1    0.565475
0    0.434525
Name: satisfaction, dtype: float64


In [None]:
#Logistic Regression

# Pipeline: SMOTE oversampling followed by a logistic regression
# (loss='log_loss') with an elastic-net penalty, trained by SGD.
model = ImbPipeline([
        ('balancing', SMOTE(random_state=101)),
        ('classification', SGDClassifier(loss='log_loss', penalty='elasticnet', random_state=1))
])

# Define the grid parameters.
# 'classification__eta0' is deliberately not searched: the classifier keeps the
# default learning_rate='optimal' schedule, which does not use eta0, so listing
# it only doubled the number of fits and reported a meaningless "best" value.
# To tune the step size, set learning_rate to 'constant', 'invscaling' or
# 'adaptive' on the classifier and add 'classification__eta0' back to the grid.
grid_param = {
    'classification__max_iter': [1000, 1500],
    'classification__alpha': [0.001, 0.1],
    'classification__l1_ratio': [0.5, 1]
}

# Define scoring metrics
scoring_metrics = {'precision': 'precision', 'recall': 'recall', 'f1': 'f1'}

# Initialize GridSearchCV; the configuration with the best mean F1 is refitted
# on the whole training set.
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring=scoring_metrics, refit='f1', cv=5)

# Fit on the scaled training data. X_train_scaled / X_test_scaled are the
# arrays produced by the train/test split cell, so the scaler is not re-fitted here.
gd_sr.fit(X_train_scaled, Y_train)

# Get the best parameters and the best model
best_parameters = gd_sr.best_params_
best_model = gd_sr.best_estimator_

# Predict on the scaled test data
y_pred = best_model.predict(X_test_scaled)

# Calculate evaluation metrics on the held-out test set
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)

# Print the best parameter
print("Best Parameters:", best_parameters)

# Print the evaluation metrics
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Best Parameters: {'classification__alpha': 0.001, 'classification__eta0': 0.001, 'classification__l1_ratio': 0.5, 'classification__max_iter': 1000}
Precision: 0.8937
Recall: 0.8782
F1-Score: 0.8859


In [None]:
#Adaboost

# Pipeline: SMOTE oversampling feeding an AdaBoost classifier
model = ImbPipeline(steps=[
    ('balancing', SMOTE(random_state=101)),
    ('classification', AdaBoostClassifier(random_state=1)),
])

# Candidate ensemble sizes: 80, 90, 100, 110, 120
grid_param = {'classification__n_estimators': list(range(80, 121, 10))}

# Metrics tracked during cross-validation (scorer name -> sklearn scorer string)
scoring_metrics = {metric: metric for metric in ('precision', 'recall', 'f1')}

# 3-fold grid search; the winner is chosen and refitted by mean precision
gd_sr = GridSearchCV(
    estimator=model,
    param_grid=grid_param,
    scoring=scoring_metrics,
    refit='precision',
    cv=3,
)
gd_sr.fit(X_train_scaled, Y_train)

# Report the selected configuration and its cross-validated precision
print("Best Parameters:", gd_sr.best_params_)
print("Best Precision Score:", gd_sr.best_score_)

# Score the refitted pipeline on the held-out test set
best_model = gd_sr.best_estimator_
Y_pred = best_model.predict(X_test_scaled)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Rank the input columns by the importance the fitted booster assigns to them
boosted_clf = best_model.named_steps['classification']
feature_importances = boosted_clf.feature_importances_
featimp = pd.Series(feature_importances, index=X_train.columns)
featimp = featimp.sort_values(ascending=False)
print(featimp)

Best Parameters: {'classification__n_estimators': 110}
Best Precision Score: 0.9193782847457034
Precision: 0.9191
Recall: 0.9167
F1-Score: 0.9179
Age                                  0.100000
Seat comfort                         0.081818
Inflight wifi service                0.081818
Departure/Arrival time convenient    0.081818
Online boarding                      0.072727
Leg room service                     0.072727
Cleanliness                          0.054545
Checkin service                      0.054545
Inflight service                     0.054545
Ease of Online booking               0.054545
Customer Type                        0.045455
Inflight entertainment               0.045455
Baggage handling                     0.045455
Type of Travel                       0.045455
Flight Distance                      0.036364
On-board service                     0.027273
Class                                0.018182
Food and drink                       0.018182
Arrival Delay in Minutes  

In [None]:
#RFC

# Pipeline: SMOTE oversampling feeding an entropy-criterion random forest
model = ImbPipeline(steps=[
    ('balancing', SMOTE(random_state=101)),
    ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)),
])

# Forest sizes to compare
grid_param = dict(classification__n_estimators=[1000, 3000, 5000])

# Metrics recorded for every candidate
scoring_metrics = dict(precision='precision', recall='recall', f1='f1')

# 5-fold grid search, refitting the candidate with the best mean F1
gd_sr = GridSearchCV(
    estimator=model,
    param_grid=grid_param,
    scoring=scoring_metrics,
    refit='f1',
    cv=5,
    return_train_score=True,
)
gd_sr.fit(X_train_scaled, Y_train)

# Look up the mean cross-validated scores of the selected candidate
cv_results = gd_sr.cv_results_
best_row = gd_sr.best_index_
best_parameters = gd_sr.best_params_
best_precision, best_recall, best_f1 = (
    cv_results[f'mean_test_{metric}'][best_row]
    for metric in ('precision', 'recall', 'f1')
)

# Report the selected configuration and its scores
print(f"Best Parameters: {best_parameters}")
print(f"Best Precision Score: {best_precision:.4f}")
print(f"Best Recall Score: {best_recall:.4f}")
print(f"Best F1 Score: {best_f1:.4f}")

# Rank the input columns by the refitted forest's feature importances
best_estimator = gd_sr.best_estimator_.named_steps['classification']
featimp = pd.Series(best_estimator.feature_importances_, index=X_train.columns)
featimp = featimp.sort_values(ascending=False)
print(featimp)


Best Parameters: {'classification__n_estimators': 3000}
Best Precision Score: 0.9698
Best Recall Score: 0.9407
Best F1 Score: 0.9550
Online boarding                      0.165953
Inflight wifi service                0.159061
Type of Travel                       0.092348
Class                                0.084703
Inflight entertainment               0.049311
Ease of Online booking               0.046072
Seat comfort                         0.044661
Leg room service                     0.039568
Customer Type                        0.037521
Flight Distance                      0.037052
Age                                  0.036682
On-board service                     0.032492
Baggage handling                     0.028092
Checkin service                      0.028083
Cleanliness                          0.025950
Inflight service                     0.025889
Departure/Arrival time convenient    0.020493
Arrival Delay in Minutes             0.014631
Food and drink                       0.

In [None]:
#RFC with important feature
# NOTE(review): despite the heading, this cell still fits on X_train_scaled,
# i.e. on every feature; no feature subset is selected here. It differs from
# the previous forest cell only in the smaller n_estimators grid.

# Setting up the pipeline with SMOTE and RandomForestClassifier.
# max_features='sqrt' replaces the former 'auto': for classifiers 'auto' meant
# sqrt(n_features), it was deprecated in scikit-learn 1.1 and is rejected from
# 1.3 onwards, so 'sqrt' keeps the same behaviour on current releases.
model = ImbPipeline([
    ('balancing', SMOTE(random_state=101)),
    ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1))
])

# Define the grid of parameters to search
grid_param = {
    'classification__n_estimators': [100, 200, 300]
}

# Scoring metrics defining
scoring_metrics = {
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

# Setting up the grid search with cross-validation; the candidate with the
# best mean F1 is refitted on the whole training set
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring=scoring_metrics, refit='f1', cv=5, return_train_score=True)

# Fitting the grid search to the data
gd_sr.fit(X_train_scaled, Y_train)

# Retrieving the best parameters and the mean cross-validated score of the
# selected candidate for each metric
best_parameters = gd_sr.best_params_
best_precision = gd_sr.cv_results_['mean_test_precision'][gd_sr.best_index_]
best_recall = gd_sr.cv_results_['mean_test_recall'][gd_sr.best_index_]
best_f1 = gd_sr.cv_results_['mean_test_f1'][gd_sr.best_index_]

# Printing the best parameters and the best scores
print(f"Best Parameters: {best_parameters}")
print(f"Best Precision Score: {best_precision:.4f}")
print(f"Best Recall Score: {best_recall:.4f}")
print(f"Best F1 Score: {best_f1:.4f}")


Best Parameters: {'classification__n_estimators': 300}
Best Precision Score: 0.9691
Best Recall Score: 0.9403
Best F1 Score: 0.9545


In [None]:
# Linear support vector classifier (LinearSVC) preceded by SMOTE oversampling
model = ImbPipeline(steps=[
    ('balancing', SMOTE(random_state=101)),
    ('classification', LinearSVC(random_state=1, max_iter=10000)),
])

# Values of the parameter C to compare
grid_param = dict(classification__C=[0.01, 0.1, 1, 10])

# Metrics recorded for every candidate
scoring_metrics = dict(precision='precision', recall='recall', f1='f1')

# 5-fold grid search on all available cores, refitting the best-F1 candidate
gd_sr = GridSearchCV(
    estimator=model,
    param_grid=grid_param,
    scoring=scoring_metrics,
    refit='f1',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gd_sr.fit(X_train_scaled, Y_train)

# Collect the selected parameters and that candidate's mean CV scores
cv_results = gd_sr.cv_results_
best_row = gd_sr.best_index_
best_parameters = gd_sr.best_params_
best_precision = cv_results['mean_test_precision'][best_row]
best_recall = cv_results['mean_test_recall'][best_row]
best_f1 = cv_results['mean_test_f1'][best_row]

# Report them in the order: parameters, precision, recall, F1
print(f"Best Parameters: {best_parameters}")
print(f"Best Precision Score: {best_precision:.4f}")
print(f"Best Recall Score: {best_recall:.4f}")
print(f"Best F1 Score: {best_f1:.4f}")


Best Parameters: {'classification__C': 10}
Best Precision Score: 0.8453
Best Recall Score: 0.8543
Best F1 Score: 0.8498


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from imblearn.pipeline import Pipeline as imblearnPipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

# K-nearest-neighbours classifier preceded by SMOTE oversampling
model = imblearnPipeline(steps=[
    ('balancing', SMOTE(random_state=101)),
    ('classification', KNeighborsClassifier()),
])

# Neighbourhood sizes to compare
grid_param = dict(classification__n_neighbors=[3, 5, 7])

# Scorers built explicitly from the metric functions
scoring = dict(
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1=make_scorer(f1_score),
)

# 5-fold grid search, refitting the candidate with the best mean F1
gd_sr = GridSearchCV(
    estimator=model,
    param_grid=grid_param,
    scoring=scoring,
    cv=5,
    refit='f1',
)
gd_sr.fit(X_train_scaled, Y_train)

# Pull out the selected parameters and that candidate's mean CV scores
knn_cv_results = gd_sr.cv_results_
knn_best_row = gd_sr.best_index_
best_parameters_knn = gd_sr.best_params_
best_precision_knn = knn_cv_results['mean_test_precision'][knn_best_row]
best_recall_knn = knn_cv_results['mean_test_recall'][knn_best_row]
best_f1_knn = knn_cv_results['mean_test_f1'][knn_best_row]

# Output the results
print(f"Best Parameters: {best_parameters_knn}")
print(f"Best Precision: {best_precision_knn:.4f}")
print(f"Best Recall: {best_recall_knn:.4f}")
print(f"Best F1 Score: {best_f1_knn:.4f}")


Best Parameters: {'classification__n_neighbors': 7}
Best Precision: 0.9301
Best Recall: 0.8997
Best F1 Score: 0.9147
