In [75]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import datetime

from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
import joblib

In [76]:
# Read the held-out classification sample into the working frame and preview it.
test_csv_path = 'Samples/airline-tas-classification-test.csv'
df = pd.read_csv(test_csv_path)
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,TicketCategory
0,8/3/2022,Air India,AI,430,9:55,11h 20m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,21:15,economy,"{'source': 'Chennai', 'destination': 'Mumbai'}",cheap
1,7/3/2022,Indigo,6E,926,18:30,04h 55m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:25,economy,"{'source': 'Delhi', 'destination': 'Mumbai'}",cheap
2,24-03-2022,Indigo,6E,6491,13:05,07h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:45,economy,"{'source': 'Bangalore', 'destination': 'Mumbai'}",cheap
3,27-03-2022,Air India,AI,473,18:40,22h 25m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,17:05,economy,"{'source': 'Delhi', 'destination': 'Bangalore'}",cheap
4,7/3/2022,Indigo,6E,684,21:55,01h 15m,non-stop,23:10,economy,"{'source': 'Bangalore', 'destination': 'Hydera...",cheap


In [77]:
import warnings

# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides pandas/sklearn deprecation and parsing warnings.
warnings.filterwarnings(action="ignore")


#### Loading TicketCategory Encoder :

In [78]:
# Convert the TicketCategory labels to numeric codes with the persisted encoder
# (presumably the one fitted during training — confirm against the training notebook).
price_enc = joblib.load("EncoderModels/price_enc.save")
ticket_category_frame = df[["TicketCategory"]]
df["TicketCategory"] = price_enc.transform(ticket_category_frame)
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,TicketCategory
0,8/3/2022,Air India,AI,430,9:55,11h 20m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,21:15,economy,"{'source': 'Chennai', 'destination': 'Mumbai'}",0.0
1,7/3/2022,Indigo,6E,926,18:30,04h 55m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:25,economy,"{'source': 'Delhi', 'destination': 'Mumbai'}",0.0
2,24-03-2022,Indigo,6E,6491,13:05,07h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:45,economy,"{'source': 'Bangalore', 'destination': 'Mumbai'}",0.0
3,27-03-2022,Air India,AI,473,18:40,22h 25m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,17:05,economy,"{'source': 'Delhi', 'destination': 'Bangalore'}",0.0
4,7/3/2022,Indigo,6E,684,21:55,01h 15m,non-stop,23:10,economy,"{'source': 'Bangalore', 'destination': 'Hydera...",0.0


#### Data preprocessing on Date

In [79]:
# Normalise the date column to "MM/DD/YYYY" text, then derive calendar features from it.
# NOTE(review): the sample rows mix "8/3/2022" and "24-03-2022" styles; with the default
# parser "8/3/2022" becomes month 8 / day 3 (see the rendered output). Confirm that this
# matches how the training features were built before changing it, and verify that the
# installed pandas version still accepts mixed formats in a single column.
df["date"] = pd.to_datetime(df["date"]).dt.strftime("%m/%d/%Y")
date_index = pd.DatetimeIndex(df["date"])
df["month"] = date_index.month
df["day"] = date_index.day
df["dayofyear"] = date_index.dayofyear

#### Loading the `ch_code` encoder:

In [80]:
# Map the carrier code column to numeric codes with the persisted encoder, then preview.
ch_enc = joblib.load("EncoderModels/ch_enc.save")
ch_code_frame = df[["ch_code"]]
df["ch_code"] = ch_enc.transform(ch_code_frame)
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,TicketCategory,month,day,dayofyear
0,08/03/2022,Air India,2.0,430,9:55,11h 20m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,21:15,economy,"{'source': 'Chennai', 'destination': 'Mumbai'}",0.0,8,3,215
1,07/03/2022,Indigo,1.0,926,18:30,04h 55m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:25,economy,"{'source': 'Delhi', 'destination': 'Mumbai'}",0.0,7,3,184
2,03/24/2022,Indigo,1.0,6491,13:05,07h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:45,economy,"{'source': 'Bangalore', 'destination': 'Mumbai'}",0.0,3,24,83
3,03/27/2022,Air India,2.0,473,18:40,22h 25m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,17:05,economy,"{'source': 'Delhi', 'destination': 'Bangalore'}",0.0,3,27,86
4,07/03/2022,Indigo,1.0,684,21:55,01h 15m,non-stop,23:10,economy,"{'source': 'Bangalore', 'destination': 'Hydera...",0.0,7,3,184


#### Departure time preprocessing

In [81]:
# Parse the departure clock strings once and read hour/minute directly from the
# parsed values. The previous version formatted with "%-H:%M" and re-parsed the
# result; "%-H" is a platform-specific strftime directive (not available on
# Windows), and the string round trip added nothing to the extracted features.
dep_parsed = pd.to_datetime(df["dep_time"])
# Keep dep_time as text (portable zero-padded form); the column is dropped later.
df["dep_time"] = dep_parsed.dt.strftime("%H:%M")
df["dep_hour"] = dep_parsed.dt.hour
df["dep_minute"] = dep_parsed.dt.minute

#### Time_taken preprocessing

In [82]:
# Split "<hours>h <minutes>m" duration strings into two numeric columns.
duration_text = df["time_taken"]

# Hours: everything before the first 'h'; must be numeric or to_numeric raises.
df["hours_taken"] = pd.to_numeric(duration_text.str.split('h').str.get(0))

# Minutes: fixed character positions 4-5, with any stray 'm'/'h' removed.
# Anything that still is not a number becomes NaN (errors='coerce').
minutes_text = duration_text.str[4:6].str.replace('m', '').str.replace('h', '')
df["minutes_taken"] = pd.to_numeric(minutes_text, errors='coerce')

#### Stop preprocessing

In [83]:
# Reduce the stop description to a stop count.
# The text before the first '-' is "non", "1" or "2+" in the visible code paths:
#   "non" -> 0 stops, "2+" -> 2 (meaning two or more stops).
# Missing values are treated as 0 stops before the numeric conversion.
# (A discarded mid-cell `df.isna().sum()` call was removed: its result was never
# displayed or used.)
stop_token = df["stop"].str.split('-').str.get(0)
stop_token = stop_token.replace({'non': 0, '2+': 2})
df["stop"] = pd.to_numeric(stop_token.fillna(0))

#### Arrival time preprocessing

In [84]:
# Parse the arrival clock strings once and read hour/minute directly from the
# parsed values. The previous version formatted with "%-H:%M" (a platform-specific
# strftime directive, not available on Windows), re-parsed the text, and then
# called pd.to_numeric on columns that were already numeric.
arr_parsed = pd.to_datetime(df["arr_time"])
# Keep arr_time as text (portable zero-padded form); the column is dropped later.
df["arr_time"] = arr_parsed.dt.strftime("%H:%M")
df["arr_hour"] = arr_parsed.dt.hour
df["arr_minute"] = arr_parsed.dt.minute

#### Source & Destination preprocessing

In [85]:
# Pull the source and destination city names out of the dict-like route text,
# e.g. "{'source': 'Chennai', 'destination': 'Mumbai'}".
route_halves = df['route'].str.split(', ')

# Left half -> text after ':'; right half -> text after ':' and before '}'.
source_quoted = route_halves.str.get(0).str.split(':').str.get(1)
destination_quoted = (
    route_halves.str.get(1).str.split(':').str.get(1).str.split('}').str.get(0)
)

# Strip the single quotes. The space that follows ':' is left in place.
# NOTE(review): presumably the saved encoders were fitted on values with that
# leading space — confirm before trimming it.
df['source'] = source_quoted.str.replace('\'', "")
df['destination'] = destination_quoted.str.replace('\'', "")

#### Loading Type Encoder :

In [86]:
# Map the cabin type column to numeric codes with the persisted encoder, then preview.
type_enc = joblib.load("EncoderModels/type_enc.save")
type_frame = df[["type"]]
df["type"] = type_enc.transform(type_frame)
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,...,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,08/03/2022,Air India,2.0,430,2022-05-28 09:55:00,11h 20m,1,2022-05-28 21:15:00,1.0,"{'source': 'Chennai', 'destination': 'Mumbai'}",...,3,215,9,55,11.0,20.0,21,15,Chennai,Mumbai
1,07/03/2022,Indigo,1.0,926,2022-05-28 18:30:00,04h 55m,1,2022-05-28 23:25:00,1.0,"{'source': 'Delhi', 'destination': 'Mumbai'}",...,3,184,18,30,4.0,55.0,23,25,Delhi,Mumbai
2,03/24/2022,Indigo,1.0,6491,2022-05-28 13:05:00,07h 40m,1,2022-05-28 20:45:00,1.0,"{'source': 'Bangalore', 'destination': 'Mumbai'}",...,24,83,13,5,7.0,40.0,20,45,Bangalore,Mumbai
3,03/27/2022,Air India,2.0,473,2022-05-28 18:40:00,22h 25m,1,2022-05-28 17:05:00,1.0,"{'source': 'Delhi', 'destination': 'Bangalore'}",...,27,86,18,40,22.0,25.0,17,5,Delhi,Bangalore
4,07/03/2022,Indigo,1.0,684,2022-05-28 21:55:00,01h 15m,0,2022-05-28 23:10:00,1.0,"{'source': 'Bangalore', 'destination': 'Hydera...",...,3,184,21,55,1.0,15.0,23,10,Bangalore,Hyderabad


#### Loading Source Encoder :

In [87]:
# Map the source city column to numeric codes with the persisted encoder, then preview.
source_enc = joblib.load("EncoderModels/source_enc.save")
source_frame = df[["source"]]
df["source"] = source_enc.transform(source_frame)
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,...,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,08/03/2022,Air India,2.0,430,2022-05-28 09:55:00,11h 20m,1,2022-05-28 21:15:00,1.0,"{'source': 'Chennai', 'destination': 'Mumbai'}",...,3,215,9,55,11.0,20.0,21,15,1.0,Mumbai
1,07/03/2022,Indigo,1.0,926,2022-05-28 18:30:00,04h 55m,1,2022-05-28 23:25:00,1.0,"{'source': 'Delhi', 'destination': 'Mumbai'}",...,3,184,18,30,4.0,55.0,23,25,2.0,Mumbai
2,03/24/2022,Indigo,1.0,6491,2022-05-28 13:05:00,07h 40m,1,2022-05-28 20:45:00,1.0,"{'source': 'Bangalore', 'destination': 'Mumbai'}",...,24,83,13,5,7.0,40.0,20,45,0.0,Mumbai
3,03/27/2022,Air India,2.0,473,2022-05-28 18:40:00,22h 25m,1,2022-05-28 17:05:00,1.0,"{'source': 'Delhi', 'destination': 'Bangalore'}",...,27,86,18,40,22.0,25.0,17,5,2.0,Bangalore
4,07/03/2022,Indigo,1.0,684,2022-05-28 21:55:00,01h 15m,0,2022-05-28 23:10:00,1.0,"{'source': 'Bangalore', 'destination': 'Hydera...",...,3,184,21,55,1.0,15.0,23,10,0.0,Hyderabad


#### Loading Destination Encoder :

In [88]:
# Map the destination city column to numeric codes with the persisted
# destination encoder, then preview.
# Fix: the transform previously went through `source_enc`, leaving the freshly
# loaded `destination_enc` unused. The two only agree if both encoders learned
# identical category lists, which this notebook cannot guarantee.
destination_enc = joblib.load("EncoderModels/destination_enc.save")
df['destination'] = destination_enc.transform(df[["destination"]])
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,...,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,08/03/2022,Air India,2.0,430,2022-05-28 09:55:00,11h 20m,1,2022-05-28 21:15:00,1.0,"{'source': 'Chennai', 'destination': 'Mumbai'}",...,3,215,9,55,11.0,20.0,21,15,1.0,5.0
1,07/03/2022,Indigo,1.0,926,2022-05-28 18:30:00,04h 55m,1,2022-05-28 23:25:00,1.0,"{'source': 'Delhi', 'destination': 'Mumbai'}",...,3,184,18,30,4.0,55.0,23,25,2.0,5.0
2,03/24/2022,Indigo,1.0,6491,2022-05-28 13:05:00,07h 40m,1,2022-05-28 20:45:00,1.0,"{'source': 'Bangalore', 'destination': 'Mumbai'}",...,24,83,13,5,7.0,40.0,20,45,0.0,5.0
3,03/27/2022,Air India,2.0,473,2022-05-28 18:40:00,22h 25m,1,2022-05-28 17:05:00,1.0,"{'source': 'Delhi', 'destination': 'Bangalore'}",...,27,86,18,40,22.0,25.0,17,5,2.0,0.0
4,07/03/2022,Indigo,1.0,684,2022-05-28 21:55:00,01h 15m,0,2022-05-28 23:10:00,1.0,"{'source': 'Bangalore', 'destination': 'Hydera...",...,3,184,21,55,1.0,15.0,23,10,0.0,3.0


#### Cleaning Data

In [89]:
# Replace every remaining missing value with the sentinel -1, then discard the
# raw columns that have already been expanded into numeric features.
raw_columns = ['airline', 'date', 'dep_time', "time_taken", 'arr_time', 'route']
df = df.fillna(-1).drop(columns=raw_columns)
df.head()

Unnamed: 0,ch_code,num_code,stop,type,TicketCategory,month,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,2.0,430,1,1.0,0.0,8,3,215,9,55,11.0,20.0,21,15,1.0,5.0
1,1.0,926,1,1.0,0.0,7,3,184,18,30,4.0,55.0,23,25,2.0,5.0
2,1.0,6491,1,1.0,0.0,3,24,83,13,5,7.0,40.0,20,45,0.0,5.0
3,2.0,473,1,1.0,0.0,3,27,86,18,40,22.0,25.0,17,5,2.0,0.0
4,1.0,684,0,1.0,0.0,7,3,184,21,55,1.0,15.0,23,10,0.0,3.0


In [90]:
# Target is the encoded TicketCategory; every other column is a feature.
Y = df['TicketCategory']
X = df.drop(columns=['TicketCategory'])

#### Loading Scaler Model :

In [91]:
# Apply the persisted scaler to the feature matrix (transform only, no refit).
# NOTE(review): assumes X has the same columns, in the same order, as the data the
# scaler was fitted on — confirm against the training notebook.
scaler_path = "scaler.save"
scaler = joblib.load(scaler_path)
X_scaled = scaler.transform(X)

In [92]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier

#### Loading the classification models:

In [93]:
# Load the six persisted classifiers that are evaluated below.
# Paths use forward slashes: the previous literals embedded backslashes in
# non-raw strings ("\R", "\D", "\K", "\L", "\M" are invalid escape sequences) and
# only resolved on Windows. Forward slashes work on Windows, Linux and macOS and
# match the encoder paths used earlier in this notebook.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
rfc_model = joblib.load("PredictionModels/RandomForestClassifier(n_jobs=-1, random_state=42)_ClassificationModel.save")
dtc_model = joblib.load("PredictionModels/DecisionTreeClassifier(max_depth=50, random_state=42)_ClassificationModel.save")
knn_model = joblib.load("PredictionModels/KNeighborsClassifier()_ClassificationModel.save")
logestic_model = joblib.load("PredictionModels/LogisticRegression(C=0.1)_ClassificationModel.save")
mlpc_model = joblib.load("PredictionModels/MLPClassifier(activation='tanh', hidden_layer_sizes=(150, 100, 50), max_iter=50)_ClassificationModel.save")
ridge_model = joblib.load("PredictionModels/RidgeClassifier(alpha=0.1)_ClassificationModel.save")

In [94]:
# Score each loaded classifier on the scaled test features: print its overall
# accuracy followed by the per-class precision/recall/F1 report.
# The loop index from enumerate() was never used, so the models are iterated directly.
for clf in (rfc_model, dtc_model, knn_model, logestic_model, mlpc_model, ridge_model):
    predictions = clf.predict(X_scaled)
    print("Accuracy of: " + str(clf) + ":" + str(accuracy_score(Y, predictions)))
    print('\n')
    print(classification_report(Y, predictions))


Accuracy of: RandomForestClassifier(n_jobs=-1, random_state=42):0.95835


              precision    recall  f1-score   support

         0.0       0.93      0.95      0.94      5380
         1.0       0.95      0.96      0.96      1954
         2.0       0.96      0.96      0.96      8621
         3.0       0.99      0.98      0.98      4045

    accuracy                           0.96     20000
   macro avg       0.96      0.96      0.96     20000
weighted avg       0.96      0.96      0.96     20000

Accuracy of: DecisionTreeClassifier(max_depth=50, random_state=42):0.95515


              precision    recall  f1-score   support

         0.0       0.93      0.94      0.93      5380
         1.0       0.96      0.96      0.96      1954
         2.0       0.96      0.95      0.95      8621
         3.0       0.98      0.99      0.98      4045

    accuracy                           0.96     20000
   macro avg       0.96      0.96      0.96     20000
weighted avg       0.96      0.96 