In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import datetime

from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
import joblib

In [21]:
# Read the regression test sample into the working frame and preview it.
test_csv_path = 'Samples/airline-tas-regression-test.csv'
df = pd.read_csv(test_csv_path)
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,price
0,8/3/2022,Air India,AI,430,9:55,11h 20m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,21:15,economy,"{'source': 'Chennai', 'destination': 'Mumbai'}",4357
1,7/3/2022,Indigo,6E,926,18:30,04h 55m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:25,economy,"{'source': 'Delhi', 'destination': 'Mumbai'}",4270
2,24-03-2022,Indigo,6E,6491,13:05,07h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:45,economy,"{'source': 'Bangalore', 'destination': 'Mumbai'}",3153
3,27-03-2022,Air India,AI,473,18:40,22h 25m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,17:05,economy,"{'source': 'Delhi', 'destination': 'Bangalore'}",4728
4,7/3/2022,Indigo,6E,684,21:55,01h 15m,non-stop,23:10,economy,"{'source': 'Bangalore', 'destination': 'Hydera...",1714


In [22]:
import warnings
warnings.filterwarnings("ignore")

### Data preprocessing on 'price'

In [23]:
# Remove commas from the price text (presumably thousands separators),
# then convert the column to a numeric dtype.
price_text = df['price'].str.replace(",", "")
df['price'] = pd.to_numeric(price_text)

#### Data preprocessing on Date

In [24]:
# Parse the raw date strings and rewrite the column as MM/DD/YYYY text.
# NOTE(review): the raw column mixes "8/3/2022" and "24-03-2022"; without
# dayfirst=True the ambiguous values are read month-first. Confirm this matches
# the preprocessing used when the scaler/models were fitted before changing it.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates.dt.strftime("%m/%d/%Y")

# Calendar features are read back from the normalised text column.
date_index = pd.DatetimeIndex(df['date'])
df['month'] = date_index.month
df['day'] = date_index.day
df['dayofyear'] = date_index.dayofyear

#### Loading Ch_code Encoder :

In [25]:
# Load the persisted ch_code encoder (joblib unpickles, so only load trusted
# artifacts) and replace the column with its encoded values.
ch_enc = joblib.load("EncoderModels/ch_enc.save")

ch_code_encoded = ch_enc.transform(df[["ch_code"]])
df['ch_code'] = ch_code_encoded

df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,price,month,day,dayofyear
0,08/03/2022,Air India,2.0,430,9:55,11h 20m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,21:15,economy,"{'source': 'Chennai', 'destination': 'Mumbai'}",4357,8,3,215
1,07/03/2022,Indigo,1.0,926,18:30,04h 55m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:25,economy,"{'source': 'Delhi', 'destination': 'Mumbai'}",4270,7,3,184
2,03/24/2022,Indigo,1.0,6491,13:05,07h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:45,economy,"{'source': 'Bangalore', 'destination': 'Mumbai'}",3153,3,24,83
3,03/27/2022,Air India,2.0,473,18:40,22h 25m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,17:05,economy,"{'source': 'Delhi', 'destination': 'Bangalore'}",4728,3,27,86
4,07/03/2022,Indigo,1.0,684,21:55,01h 15m,non-stop,23:10,economy,"{'source': 'Bangalore', 'destination': 'Hydera...",1714,7,3,184


#### Departure time preprocessing

In [26]:
# Parse the departure-time strings once and derive the hour/minute features
# directly from the parsed values (no string round trip needed).
dep_parsed = pd.to_datetime(df["dep_time"])

# Keep the column as un-padded "H:MM" text, as before. "%-H" is a glibc-only
# strftime extension that fails on Windows, so format with the portable "%H"
# and strip a single leading zero from the hour instead.
df['dep_time'] = dep_parsed.dt.strftime("%H:%M").str.replace(r"^0(?=\d)", "", regex=True)

df["dep_hour"] = dep_parsed.dt.hour
df["dep_minute"] = dep_parsed.dt.minute

#### Time_taken preprocessing

In [27]:
# Split "11h 20m"-style durations into numeric hour and minute columns.
duration_text = df["time_taken"]

# Hours: everything before the first 'h'.
hours_text = duration_text.str.split('h').str.get(0)

# Minutes: characters 4-5 of the string with any stray 'm'/'h' removed.
# NOTE(review): this relies on a fixed-width layout; values that do not fit it
# become NaN through errors='coerce' below.
minutes_text = duration_text.str[4:6].str.replace('m', '').str.replace('h', '')

df["hours_taken"] = pd.to_numeric(hours_text)
df["minutes_taken"] = pd.to_numeric(minutes_text, errors='coerce')

#### Stop preprocessing

In [28]:
# Keep only the token before the first '-' (e.g. "1", "non", "2+").
stop_token = df["stop"].str.split('-').str.get(0)

# 'non' -> 0 stops; '2+' -> 2 (stands for two or more stops).
stop_token = stop_token.replace(['non'], 0).replace(['2+'], 2)

# Missing values are treated as 0 stops before the numeric conversion.
df['stop'] = pd.to_numeric(stop_token.fillna(0))

#### Arrival time preprocessing

In [29]:
# Parse the arrival-time strings once and derive the hour/minute features
# directly from the parsed values (no string round trip needed).
arr_parsed = pd.to_datetime(df["arr_time"])

# Keep the column as un-padded "H:MM" text, as before. "%-H" is a glibc-only
# strftime extension that fails on Windows, so format with the portable "%H"
# and strip a single leading zero from the hour instead.
df['arr_time'] = arr_parsed.dt.strftime("%H:%M").str.replace(r"^0(?=\d)", "", regex=True)

# .dt.hour / .dt.minute are already numeric, so no extra to_numeric pass.
df["arr_hour"] = arr_parsed.dt.hour
df["arr_minute"] = arr_parsed.dt.minute

#### Source & Destination preprocessing

In [30]:
# 'route' holds text such as "{'source': 'Chennai', 'destination': 'Mumbai'}".
# Split it on ', ' into the source part and the destination part.
route_parts = df['route'].str.split( ', ')

# Take the text after ':' in each part; the destination also loses the '}'.
source_text = route_parts.str.get(0).str.split(':').str.get(1)
destination_text = route_parts.str.get(1).str.split(':').str.get(1).str.split('}').str.get(0)

# Strip the single quotes. Other characters (e.g. the space after ':') are
# left untouched.
df['source'] = source_text.str.replace('\'', "")
df['destination'] = destination_text.str.replace('\'', "")

#### Loading Type Encoder :

In [31]:
# Load the persisted 'type' encoder and replace the column with its codes.
type_enc = joblib.load("EncoderModels/type_enc.save")
type_encoded = type_enc.transform(df[["type"]])
df['type'] = type_encoded
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,...,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,08/03/2022,Air India,2.0,430,2022-05-28 09:55:00,11h 20m,1,2022-05-28 21:15:00,1.0,"{'source': 'Chennai', 'destination': 'Mumbai'}",...,3,215,9,55,11.0,20.0,21,15,Chennai,Mumbai
1,07/03/2022,Indigo,1.0,926,2022-05-28 18:30:00,04h 55m,1,2022-05-28 23:25:00,1.0,"{'source': 'Delhi', 'destination': 'Mumbai'}",...,3,184,18,30,4.0,55.0,23,25,Delhi,Mumbai
2,03/24/2022,Indigo,1.0,6491,2022-05-28 13:05:00,07h 40m,1,2022-05-28 20:45:00,1.0,"{'source': 'Bangalore', 'destination': 'Mumbai'}",...,24,83,13,5,7.0,40.0,20,45,Bangalore,Mumbai
3,03/27/2022,Air India,2.0,473,2022-05-28 18:40:00,22h 25m,1,2022-05-28 17:05:00,1.0,"{'source': 'Delhi', 'destination': 'Bangalore'}",...,27,86,18,40,22.0,25.0,17,5,Delhi,Bangalore
4,07/03/2022,Indigo,1.0,684,2022-05-28 21:55:00,01h 15m,0,2022-05-28 23:10:00,1.0,"{'source': 'Bangalore', 'destination': 'Hydera...",...,3,184,21,55,1.0,15.0,23,10,Bangalore,Hyderabad


#### Loading Source Encoder :

In [32]:
# Load the persisted 'source' encoder and replace the column with its codes.
source_enc = joblib.load("EncoderModels/source_enc.save")
source_encoded = source_enc.transform(df[["source"]])
df['source'] = source_encoded
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,...,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,08/03/2022,Air India,2.0,430,2022-05-28 09:55:00,11h 20m,1,2022-05-28 21:15:00,1.0,"{'source': 'Chennai', 'destination': 'Mumbai'}",...,3,215,9,55,11.0,20.0,21,15,1.0,Mumbai
1,07/03/2022,Indigo,1.0,926,2022-05-28 18:30:00,04h 55m,1,2022-05-28 23:25:00,1.0,"{'source': 'Delhi', 'destination': 'Mumbai'}",...,3,184,18,30,4.0,55.0,23,25,2.0,Mumbai
2,03/24/2022,Indigo,1.0,6491,2022-05-28 13:05:00,07h 40m,1,2022-05-28 20:45:00,1.0,"{'source': 'Bangalore', 'destination': 'Mumbai'}",...,24,83,13,5,7.0,40.0,20,45,0.0,Mumbai
3,03/27/2022,Air India,2.0,473,2022-05-28 18:40:00,22h 25m,1,2022-05-28 17:05:00,1.0,"{'source': 'Delhi', 'destination': 'Bangalore'}",...,27,86,18,40,22.0,25.0,17,5,2.0,Bangalore
4,07/03/2022,Indigo,1.0,684,2022-05-28 21:55:00,01h 15m,0,2022-05-28 23:10:00,1.0,"{'source': 'Bangalore', 'destination': 'Hydera...",...,3,184,21,55,1.0,15.0,23,10,0.0,Hyderabad


#### Loading Destination Encoder :

In [33]:
# Load the persisted 'destination' encoder and replace the column with its
# codes. The destination column must go through destination_enc (the encoder
# loaded here), not the encoder fitted for the 'source' column.
destination_enc = joblib.load("EncoderModels/destination_enc.save")
df['destination'] = destination_enc.transform(df[["destination"]])
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,...,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,08/03/2022,Air India,2.0,430,2022-05-28 09:55:00,11h 20m,1,2022-05-28 21:15:00,1.0,"{'source': 'Chennai', 'destination': 'Mumbai'}",...,3,215,9,55,11.0,20.0,21,15,1.0,5.0
1,07/03/2022,Indigo,1.0,926,2022-05-28 18:30:00,04h 55m,1,2022-05-28 23:25:00,1.0,"{'source': 'Delhi', 'destination': 'Mumbai'}",...,3,184,18,30,4.0,55.0,23,25,2.0,5.0
2,03/24/2022,Indigo,1.0,6491,2022-05-28 13:05:00,07h 40m,1,2022-05-28 20:45:00,1.0,"{'source': 'Bangalore', 'destination': 'Mumbai'}",...,24,83,13,5,7.0,40.0,20,45,0.0,5.0
3,03/27/2022,Air India,2.0,473,2022-05-28 18:40:00,22h 25m,1,2022-05-28 17:05:00,1.0,"{'source': 'Delhi', 'destination': 'Bangalore'}",...,27,86,18,40,22.0,25.0,17,5,2.0,0.0
4,07/03/2022,Indigo,1.0,684,2022-05-28 21:55:00,01h 15m,0,2022-05-28 23:10:00,1.0,"{'source': 'Bangalore', 'destination': 'Hydera...",...,3,184,21,55,1.0,15.0,23,10,0.0,3.0


#### Cleaning Data

In [34]:
# Replace every remaining missing value with the sentinel -1.
df = df.fillna(-1)

# Drop the raw columns that have already been expanded into numeric features.
raw_columns = ['airline', 'date', 'dep_time', "time_taken", 'arr_time', 'route']
df = df.drop(columns=raw_columns)
df.head()

Unnamed: 0,ch_code,num_code,stop,type,price,month,day,dayofyear,dep_hour,dep_minute,hours_taken,minutes_taken,arr_hour,arr_minute,source,destination
0,2.0,430,1,1.0,4357,8,3,215,9,55,11.0,20.0,21,15,1.0,5.0
1,1.0,926,1,1.0,4270,7,3,184,18,30,4.0,55.0,23,25,2.0,5.0
2,1.0,6491,1,1.0,3153,3,24,83,13,5,7.0,40.0,20,45,0.0,5.0
3,2.0,473,1,1.0,4728,3,27,86,18,40,22.0,25.0,17,5,2.0,0.0
4,1.0,684,0,1.0,1714,7,3,184,21,55,1.0,15.0,23,10,0.0,3.0


In [35]:
# Features: every column except the target, in their existing order.
# NOTE(review): this keeps the 'Unnamed: 0' row-index column as a feature;
# confirm the scaler/models were fitted with it before removing it.
feature_columns = [column for column in df.columns if column != 'price']
X = df[feature_columns]

# Target: the ticket price.
Y = df['price']

#### Loading Scaler Model :

In [36]:
# Load the persisted scaler and apply it to the feature matrix.
scaler_path = "scaler.save"
scaler = joblib.load(scaler_path)
X_scaled = scaler.transform(X)

In [37]:
# Load the four persisted regressors, in the same order as before.
# joblib.load unpickles the files, so only trusted artifacts should be loaded.
dtr_model, elastic_model, lasso_model, ridge_model = (
    joblib.load(model_path)
    for model_path in (
        "RegressionModels/DecisionTreeRegressor(random_state=42)_RegressionModel.save",
        "RegressionModels/ElasticNet()_RegressionModel.save",
        "RegressionModels/Lasso()_RegressionModel.save",
        "RegressionModels/Ridge()_RegressionModel.save",
    )
)

In [38]:
# Score each loaded regressor on the scaled test features and print its
# mean squared error and R^2 against the true prices.
for model in (dtr_model, elastic_model, lasso_model, ridge_model):
    predictions = model.predict(X_scaled)
    print('Mean Square Error of ' + str(model) + ":" + str(metrics.mean_squared_error(Y, predictions)))
    # r2_score is the coefficient of determination, not an accuracy, so the
    # label says so.
    print('R2 score of ' + str(model) + ":" + str(r2_score(Y, predictions)))
    print('\n')


Mean Square Error of DecisionTreeRegressor(random_state=42):8904947.029593056
Accuracy of DecisionTreeRegressor(random_state=42):0.982951627567055


Mean Square Error of ElasticNet():102471463.77730359
Accuracy of ElasticNet():0.8038200932112289


Mean Square Error of Lasso():52277270.53950954
Accuracy of Lasso():0.8999160382455287


Mean Square Error of Ridge():50684257.72401149
Accuracy of Ridge():0.9029658346877548


