In [54]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import datetime

from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
import joblib

In [55]:
# Read the sample flights that the saved encoders/models are applied to below.
samples_csv = 'Samples/airline-test-samples.csv'
df = pd.read_csv(samples_csv)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,TicketCategory
0,12/2/2022,Vistara,UK,852,9:30,11h 10m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:40,economy,"{'source': 'Bangalore', 'destination': 'Delhi'}",moderate
1,3/3/2022,Indigo,6E,248,22:20,02h 05m,non-stop,0:25,economy,"{'source': 'Mumbai', 'destination': 'Delhi'}",cheap
2,19-03-2022,Air India,AI,570,5:20,07h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,13:00,economy,"{'source': 'Mumbai', 'destination': 'Delhi'}",cheap


In [56]:
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by the
# cells below will not be shown either — narrow it if something looks off.
warnings.filterwarnings(action="ignore")


#### Loading TicketCategory Encoder :

In [57]:
# Restore the saved TicketCategory encoder.
# joblib files are pickles: only load artifacts from a trusted source.
price_enc = joblib.load("EncoderModels/price_enc.save")

# Replace the category labels with the encoder's numeric codes
# (the encoder is given a one-column frame, hence the double brackets).
ticket_category_codes = price_enc.transform(df[["TicketCategory"]])
df['TicketCategory'] = ticket_category_codes

df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,TicketCategory
0,12/2/2022,Vistara,UK,852,9:30,11h 10m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:40,economy,"{'source': 'Bangalore', 'destination': 'Delhi'}",2.0
1,3/3/2022,Indigo,6E,248,22:20,02h 05m,non-stop,0:25,economy,"{'source': 'Mumbai', 'destination': 'Delhi'}",0.0
2,19-03-2022,Air India,AI,570,5:20,07h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,13:00,economy,"{'source': 'Mumbai', 'destination': 'Delhi'}",0.0


#### Data preprocessing on Date

In [58]:
# Parse the raw date strings and store them back as zero-padded
# month/day/year text.
# NOTE(review): the sample file mixes styles ("12/2/2022" and "19-03-2022") and
# parsing is left to pandas' inference; the first row comes out as month 12,
# day 2. Confirm this matches how the training data was parsed.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates.dt.strftime("%m/%d/%Y")

# Build the calendar features from the normalised text, parsing it only once.
date_index = pd.DatetimeIndex(df["date"])
df['month'] = date_index.month
df['day'] = date_index.day
df['dayofyear'] = date_index.dayofyear

#### Loading Ch_code Encoder :

In [59]:
# Restore the saved carrier-code encoder.
# joblib files are pickles: only load artifacts from a trusted source.
ch_enc = joblib.load("EncoderModels/ch_enc.save")

# Swap the carrier code strings for the encoder's numeric codes.
ch_code_values = ch_enc.transform(df[["ch_code"]])
df['ch_code'] = ch_code_values

df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,TicketCategory,month,day,dayofyear
0,12/02/2022,Vistara,7.0,852,9:30,11h 10m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:40,economy,"{'source': 'Bangalore', 'destination': 'Delhi'}",2.0,12,2,336
1,03/03/2022,Indigo,1.0,248,22:20,02h 05m,non-stop,0:25,economy,"{'source': 'Mumbai', 'destination': 'Delhi'}",0.0,3,3,62
2,03/19/2022,Air India,2.0,570,5:20,07h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,13:00,economy,"{'source': 'Mumbai', 'destination': 'Delhi'}",0.0,3,19,78


#### Departure time preprocessing

In [60]:
# Parse the departure clock strings once; pandas attaches a placeholder date,
# only the time-of-day part is used.
dep_parsed = pd.to_datetime(df["dep_time"])

# Keep dep_time as "H:MM" text without a leading zero on the hour.
# The "%-H" directive is a platform-specific strftime extension that is not
# available on Windows, so format with the portable "%H" and strip the padding
# zero explicitly ("09:30" -> "9:30", "00:25" -> "0:25").
df['dep_time'] = dep_parsed.dt.strftime("%H:%M").str.replace(r"^0(\d)", r"\1", regex=True)

# Derive the numeric features straight from the parsed timestamps instead of
# re-parsing the formatted text.
df["dep_hour"] = dep_parsed.dt.hour
df["dep_minute"] = dep_parsed.dt.minute

#### Time_taken preprocessing

In [61]:
# Split the duration text (e.g. "11h 10m") into numeric hour and minute columns.
duration_text = df["time_taken"]

# Hours: everything before the first 'h'.
hours_text = duration_text.str.split('h').str.get(0)

# Minutes: characters 4-5 of the text, with any stray 'm' / 'h' removed.
minutes_text = duration_text.str[4:6].str.replace('m', '').str.replace('h', '')

df["hours_taken"] = pd.to_numeric(hours_text)
# Anything left that is not a number becomes NaN rather than raising.
df["minutes_taken"] = pd.to_numeric(minutes_text, errors='coerce')

#### Stop preprocessing

In [62]:
# Keep only the text before the first '-' ("1-stop..." -> "1", "non-stop" -> "non").
stop_prefix = df["stop"].str.split('-').str.get(0)

# Map the non-numeric prefixes to counts: "non" means 0 stops, and "2+"
# (two or more stops) is recorded as 2.
stop_counts = stop_prefix.replace(['non'], 0).replace(['2+'], 2)

# Missing values are treated as 0 stops before converting the column to numbers.
df['stop'] = pd.to_numeric(stop_counts.fillna(0))

#### Arrival time preprocessing

In [63]:
# Parse the arrival clock strings once; pandas attaches a placeholder date,
# only the time-of-day part is used.
arr_parsed = pd.to_datetime(df["arr_time"])

# Keep arr_time as "H:MM" text without a leading zero on the hour.
# The "%-H" directive is a platform-specific strftime extension that is not
# available on Windows, so format with the portable "%H" and strip the padding
# zero explicitly ("09:30" -> "9:30", "00:25" -> "0:25").
df['arr_time'] = arr_parsed.dt.strftime("%H:%M").str.replace(r"^0(\d)", r"\1", regex=True)

# The .dt accessors already return numeric values, so no further
# pd.to_numeric conversion is needed.
df["arr_hour"] = arr_parsed.dt.hour
df["arr_minute"] = arr_parsed.dt.minute

#### Source & Destination preprocessing

In [64]:
# The route column holds text such as "{'source': 'Mumbai', 'destination': 'Delhi'}".
# Split it into its two "key: value" halves first.
route_halves = df['route'].str.split(', ')

# Take the text after ':' in each half; the destination half also has the
# closing '}' cut off.
source_raw = route_halves.str.get(0).str.split(':').str.get(1)
destination_raw = route_halves.str.get(1).str.split(':').str.get(1).str.split('}').str.get(0)

# Drop the single quotes around the city names.
# NOTE(review): the space that follows ':' is kept in the value; presumably the
# saved encoders were fitted on the same form — confirm before stripping it.
df['source'] = source_raw.str.replace("'", "")
df['destination'] = destination_raw.str.replace("'", "")

#### Loading Type Encoder :

In [65]:
# Restore the saved ticket-type encoder.
# joblib files are pickles: only load artifacts from a trusted source.
type_enc = joblib.load("EncoderModels/type_enc.save")

# Swap the ticket type labels for the encoder's numeric codes.
type_codes = type_enc.transform(df[["type"]])
df['type'] = type_codes

df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,...,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,12/02/2022,Vistara,7.0,852,2022-05-27 09:30:00,11h 10m,1,2022-05-27 20:40:00,1.0,"{'source': 'Bangalore', 'destination': 'Delhi'}",...,2,336,9,30,11,10,20,40,Bangalore,Delhi
1,03/03/2022,Indigo,1.0,248,2022-05-27 22:20:00,02h 05m,0,2022-05-27 00:25:00,1.0,"{'source': 'Mumbai', 'destination': 'Delhi'}",...,3,62,22,20,2,5,0,25,Mumbai,Delhi
2,03/19/2022,Air India,2.0,570,2022-05-27 05:20:00,07h 40m,1,2022-05-27 13:00:00,1.0,"{'source': 'Mumbai', 'destination': 'Delhi'}",...,19,78,5,20,7,40,13,0,Mumbai,Delhi


#### Loading Source Encoder :

In [66]:
# Restore the saved source-city encoder.
# joblib files are pickles: only load artifacts from a trusted source.
source_enc = joblib.load("EncoderModels/source_enc.save")

# Swap the source city names for the encoder's numeric codes.
source_codes = source_enc.transform(df[["source"]])
df['source'] = source_codes

df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,...,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,12/02/2022,Vistara,7.0,852,2022-05-27 09:30:00,11h 10m,1,2022-05-27 20:40:00,1.0,"{'source': 'Bangalore', 'destination': 'Delhi'}",...,2,336,9,30,11,10,20,40,0.0,Delhi
1,03/03/2022,Indigo,1.0,248,2022-05-27 22:20:00,02h 05m,0,2022-05-27 00:25:00,1.0,"{'source': 'Mumbai', 'destination': 'Delhi'}",...,3,62,22,20,2,5,0,25,5.0,Delhi
2,03/19/2022,Air India,2.0,570,2022-05-27 05:20:00,07h 40m,1,2022-05-27 13:00:00,1.0,"{'source': 'Mumbai', 'destination': 'Delhi'}",...,19,78,5,20,7,40,13,0,5.0,Delhi


#### Loading Destination Encoder :

In [67]:
# Restore the saved destination-city encoder.
# joblib files are pickles: only load artifacts from a trusted source.
destination_enc = joblib.load("EncoderModels/destination_enc.save")

# Encode the destination column with its own encoder. The previous code loaded
# destination_enc but transformed with the source encoder, which only gives the
# right codes if both encoders happen to hold identical categories.
df['destination'] = destination_enc.transform(df[["destination"]])

df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,...,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,12/02/2022,Vistara,7.0,852,2022-05-27 09:30:00,11h 10m,1,2022-05-27 20:40:00,1.0,"{'source': 'Bangalore', 'destination': 'Delhi'}",...,2,336,9,30,11,10,20,40,0.0,2.0
1,03/03/2022,Indigo,1.0,248,2022-05-27 22:20:00,02h 05m,0,2022-05-27 00:25:00,1.0,"{'source': 'Mumbai', 'destination': 'Delhi'}",...,3,62,22,20,2,5,0,25,5.0,2.0
2,03/19/2022,Air India,2.0,570,2022-05-27 05:20:00,07h 40m,1,2022-05-27 13:00:00,1.0,"{'source': 'Mumbai', 'destination': 'Delhi'}",...,19,78,5,20,7,40,13,0,5.0,2.0


#### Cleaning Data

In [68]:
# Columns that have been expanded into numeric features above (plus the airline
# name) and are no longer needed.
raw_columns = ['airline', 'date', 'dep_time', "time_taken", 'arr_time', 'route']

# Fill any remaining missing values with the sentinel -1, then drop the raw columns.
df = df.fillna(-1).drop(columns=raw_columns)

df.head()

Unnamed: 0,ch_code,num_code,stop,type,TicketCategory,month,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,7.0,852,1,1.0,2.0,12,2,336,9,30,11,10,20,40,0.0,2.0
1,1.0,248,0,1.0,0.0,3,3,62,22,20,2,5,0,25,5.0,2.0
2,2.0,570,1,1.0,0.0,3,19,78,5,20,7,40,13,0,5.0,2.0


In [69]:
# Features: every column except the target, in their existing order.
feature_columns = [column for column in df.columns if column != 'TicketCategory']
X = df[feature_columns]

# Target: the encoded ticket category.
Y = df['TicketCategory']

#### Loading Scaler Model :

In [70]:
# Restore the saved scaler and apply it to the feature matrix.
# joblib files are pickles: only load artifacts from a trusted source.
scaler = joblib.load("scaler.save")

# NOTE(review): X must have the same columns, in the same order, as the data
# the scaler was fitted on — that fit is not shown here; confirm.
X_scaled = scaler.transform(X)

In [71]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier

#### Loading classification models:

In [72]:
def _load_classifier(model_repr):
    """Load one saved classifier from the PredictionModels directory.

    Parameters
    ----------
    model_repr : str
        The estimator text used as the file-name prefix, e.g.
        "KNeighborsClassifier()".

    Returns
    -------
    The object stored in
    "PredictionModels/<model_repr>_ClassificationModel.save".

    Notes
    -----
    The path uses a forward slash, which works on Windows as well as POSIX
    systems and matches the encoder paths used earlier. The previous
    backslash separators were sequences such as "\\R" and "\\D" inside normal
    string literals: invalid escape sequences that Python warns about, and
    paths that only resolve on Windows.
    joblib files are pickles: only load artifacts from a trusted source.
    """
    return joblib.load("PredictionModels/" + model_repr + "_ClassificationModel.save")


rfc_model = _load_classifier("RandomForestClassifier(n_jobs=-1, random_state=42)")
dtc_model = _load_classifier("DecisionTreeClassifier(max_depth=50, random_state=42)")
knn_model = _load_classifier("KNeighborsClassifier()")
logestic_model = _load_classifier("LogisticRegression(C=0.1)")
mlpc_model = _load_classifier("MLPClassifier(activation='tanh', hidden_layer_sizes=(150, 100, 50), max_iter=50)")
ridge_model = _load_classifier("RidgeClassifier(alpha=0.1)")

In [73]:
# Score every loaded classifier on the scaled sample features and print its
# accuracy followed by the per-class precision/recall report.
classifiers = (rfc_model, dtc_model, knn_model, logestic_model, mlpc_model, ridge_model)

for clf in classifiers:
    predictions = clf.predict(X_scaled)
    accuracy = accuracy_score(Y, predictions)

    print("Accuracy of: " + str(clf) + ":" + str(accuracy))
    print('\n')
    print(classification_report(Y, predictions))


Accuracy of: RandomForestClassifier(n_jobs=-1, random_state=42):0.6666666666666666


              precision    recall  f1-score   support

         0.0       1.00      0.50      0.67         2
         2.0       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

Accuracy of: DecisionTreeClassifier(max_depth=50, random_state=42):0.6666666666666666


              precision    recall  f1-score   support

         0.0       1.00      0.50      0.67         2
         2.0       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

Accuracy of: KNeighborsClassifier():0.6666666666666666


              precision    recall  f1-score   support

         0.0       0.67      1.00      0.80         2
       