# Predict The Flight Ticket Price 
# Regression Problem

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import datetime as dt
from datetime import timedelta

In [2]:
# Importing the excel data
df_train = pd.read_excel("C:\\Users\\PUNEET SINGH\\flight train.xlsx")
df_test = pd.read_excel("C:\\Users\\PUNEET SINGH\\flight test.xlsx")
print("Shape of the training set :",df_train.shape)
print("Shape of the test set :",df_test.shape)

Shape of the training set : (10683, 11)
Shape of the test set : (2671, 10)


In [3]:
print("5 rows of the training set \n",df_train.head())
print("__"*40)
print("5 rows of the testing set \n",df_test.head())

5 rows of the training set 
        Airline Date_of_Journey    Source Destination                  Route  \
0       IndiGo      24/03/2019  Banglore   New Delhi              BLR → DEL   
1    Air India       1/05/2019   Kolkata    Banglore  CCU → IXR → BBI → BLR   
2  Jet Airways       9/06/2019     Delhi      Cochin  DEL → LKO → BOM → COK   
3       IndiGo      12/05/2019   Kolkata    Banglore        CCU → NAG → BLR   
4       IndiGo      01/03/2019  Banglore   New Delhi        BLR → NAG → DEL   

  Dep_Time  Arrival_Time Duration Total_Stops Additional_Info  Price  
0    22:20  01:10 22 Mar   2h 50m    non-stop         No info   3897  
1    05:50         13:15   7h 25m     2 stops         No info   7662  
2    09:25  04:25 10 Jun      19h     2 stops         No info  13882  
3    18:05         23:30   5h 25m      1 stop         No info   6218  
4    16:50         21:35   4h 45m      1 stop         No info  13302  
______________________________________________________________________

In [4]:
# Data Type of the trainig set
df_train.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

In [5]:
# Checking for the missing values in the training and testing set
print("Null values in the training set \n{}".format(df_train.isnull().sum()))
print("__"*40)
print("Null values in the testing set \n{}".format(df_test.isnull().sum()))

Null values in the training set 
Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64
________________________________________________________________________________
Null values in the testing set 
Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
dtype: int64


In [6]:
# Training set only have 1 missing values in two features 
# so dropping the missing values
#drop null column values
df_train.dropna(inplace=True)

In [7]:
# Checking the value count of the feature 'Destination' in training and testing set
print("Training set \n{}".format(df_train['Destination'].value_counts()))

Training set 
Cochin       4536
Banglore     2871
Delhi        1265
New Delhi     932
Hyderabad     697
Kolkata       381
Name: Destination, dtype: int64


In [8]:
# New delhi and delhi has two different values
print("Testing  set \n{}".format(df_test['Destination'].value_counts()))

Testing  set 
Cochin       1145
Banglore      710
Delhi         317
New Delhi     238
Hyderabad     186
Kolkata        75
Name: Destination, dtype: int64


In [9]:
# Replacing New delhi to delhi in both training and testing set
df_train["Destination"] = df_train["Destination"].replace({'New Delhi': 'Delhi'})
df_test["Destination"] = df_test["Destination"].replace({'New Delhi': 'Delhi'})

In [10]:
# value count of feature Total_stops 
df_train['Total_Stops'].value_counts()

1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: Total_Stops, dtype: int64

In [11]:
# Assigning the binary form manually
# training set
df_train.Total_Stops.replace(['1 stop', 'non-stop', '2 stops', '3 stops', '4 stops'], [1, 0, 2, 3, 4], inplace=True)
# Testing set
df_test.Total_Stops.replace(['1 stop', 'non-stop', '2 stops', '3 stops', '4 stops'], [1, 0, 2, 3, 4], inplace=True)

In [12]:
# Changing the object type into int form
df_train["Total_Stops"] = df_train["Total_Stops"].astype(int)
df_test["Total_Stops"] = df_test["Total_Stops"].astype(int)

In [13]:
print("Training set:\n",df_train["Additional_Info"].value_counts())
print('__'*30)
print("Testing set:\n",df_test["Additional_Info"].value_counts())

Training set:
 No info                         8344
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
1 Short layover                    1
2 Long layover                     1
Red-eye flight                     1
Name: Additional_Info, dtype: int64
____________________________________________________________
Testing set:
 No info                         2148
In-flight meal not included      444
No check-in baggage included      76
Business class                     1
1 Long layover                     1
Change airports                    1
Name: Additional_Info, dtype: int64


In [14]:
# No info is there twice in the training set
df_train["Additional_Info"] = df_train["Additional_Info"].replace({'No Info': 'No info'})

In [15]:
# Now using datetime library
# will examine the feature 'Date_of_Journey'
# Training set
df_train["isWeekend"] = ((pd.to_datetime(df_train["Date_of_Journey"], format = '%d/%m/%Y').dt.dayofweek) // 5 == 1).astype(int)
df_train["weekday"] = pd.to_datetime(df_train["Date_of_Journey"], format = '%d/%m/%Y').dt.day_name()
df_train["journey_day"] = pd.to_datetime(df_train["Date_of_Journey"], format = '%d/%m/%Y').dt.day.astype(int)
df_train["Month_of_Journey"] = pd.to_datetime(df_train["Date_of_Journey"], format = '%d/%m/%Y').dt.month.astype(int)

In [16]:
# Testing set
df_test["isWeekend"] = ((pd.to_datetime(df_test["Date_of_Journey"], format = '%d/%m/%Y').dt.dayofweek) // 5 == 1).astype(int)
df_test["journey_day"] = pd.to_datetime(df_test["Date_of_Journey"], format = '%d/%m/%Y').dt.day.astype(int)
df_test["weekday"] = pd.to_datetime(df_test["Date_of_Journey"], format = '%d/%m/%Y').dt.day_name()
df_test["Month_of_Journey"] = pd.to_datetime(df_test["Date_of_Journey"], format = '%d/%m/%Y').dt.month.astype(int)

In [17]:
# Drop original feature from train and test dataset 
df_train.drop( 'Date_of_Journey', axis = 1, inplace = True)
df_test.drop('Date_of_Journey', axis = 1, inplace = True)

In [18]:
# cleaning duration column in dataset to predict
duration = list(df_test["Duration"])

for i in range(len(duration)) :
    if len(duration[i].split()) != 2:
        if 'h' in duration[i] :
            duration[i] = duration[i].strip() + ' 0m'
        elif 'm' in duration[i] :
            duration[i] = '0h {}'.format(duration[i].strip())

dur_hours = []
dur_minutes = []  
dur_seconds = []
for i in range(len(duration)) :
    dur_minutes.append(int(duration[i].split()[0][:-1])*60 + int(duration[i].split()[1][:-1]))
    dur_seconds.append(int(duration[i].split()[0][:-1])*60*60 + int(duration[i].split()[1][:-1])*60)
df_test["Duration_minutes"] = dur_minutes
df_test.drop(labels = 'Duration', axis = 1, inplace = True)

In [19]:
#Cleaning Departure and Arrival Times
# Training Set
df_train['Depart_Time_Hour'] = pd.to_datetime(df_train.Dep_Time).dt.hour.astype(int)
df_train['Depart_Time_Minutes'] = pd.to_datetime(df_train.Dep_Time).dt.minute.astype(int)
df_train.drop(labels = 'Dep_Time', axis = 1, inplace = True)
df_train['Arr_Time_Hour'] = pd.to_datetime(df_train.Arrival_Time).dt.hour.astype(int)
df_train['Arr_Time_Minutes'] = pd.to_datetime(df_train.Arrival_Time).dt.minute.astype(int)
df_train.drop(labels = 'Arrival_Time', axis = 1, inplace = True)

# Testing Set
df_test['Depart_Time_Hour'] = pd.to_datetime(df_test.Dep_Time).dt.hour.astype(int)
df_test['Depart_Time_Minutes'] = pd.to_datetime(df_test.Dep_Time).dt.minute.astype(int)
df_test.drop(labels = 'Dep_Time', axis = 1, inplace = True)
df_test['Arr_Time_Hour'] = pd.to_datetime(df_test.Arrival_Time).dt.hour.astype(int)
df_test['Arr_Time_Minutes'] = pd.to_datetime(df_test.Arrival_Time).dt.minute.astype(int)
df_test.drop(labels = 'Arrival_Time', axis = 1, inplace = True)

In [20]:
# df_train['part_of_day'].value_counts()
# We created X and y columns for training dataset while y column for dataset to predict will
# be generated by our model in final step
X = df_train.drop(["Price"], axis=1)
y = np.log1p(df_train["Price"])
# y = df_train["Price"]
X_to_predict = df_test
# Test set

In [21]:
X.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,isWeekend,weekday,journey_day,Month_of_Journey,Depart_Time_Hour,Depart_Time_Minutes,Arr_Time_Hour,Arr_Time_Minutes
0,IndiGo,Banglore,Delhi,BLR → DEL,2h 50m,0,No info,1,Sunday,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2,No info,0,Wednesday,1,5,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2,No info,1,Sunday,9,6,9,25,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1,No info,1,Sunday,12,5,18,5,23,30
4,IndiGo,Banglore,Delhi,BLR → NAG → DEL,4h 45m,1,No info,0,Friday,1,3,16,50,21,35


In [22]:
#Separate categorical and numerical columns in dataframe
X_categorical = X.select_dtypes(exclude=['int', 'float'])
X_numerical = X.select_dtypes(include=['int', 'float'])

X_to_predict_categorical = X_to_predict.select_dtypes(exclude=['int', 'float'])
X_to_predict_numerical = X_to_predict.select_dtypes(include=['int', 'float'])

In [23]:
X_numerical.head()

Unnamed: 0,Total_Stops,isWeekend,journey_day,Month_of_Journey,Depart_Time_Hour,Depart_Time_Minutes,Arr_Time_Hour,Arr_Time_Minutes
0,0,1,24,3,22,20,1,10
1,2,0,1,5,5,50,13,15
2,2,1,9,6,9,25,4,25
3,1,1,12,5,18,5,23,30
4,1,0,1,3,16,50,21,35


In [24]:
#Create a back up copy specifically for plotting graphs
X_numerical_graph = X_numerical.copy()
X_categorical_graph = X_categorical.copy()

In [25]:
# Checking the value count of feature AIRLINE 
X_categorical['Airline'].value_counts()

Jet Airways                          3849
IndiGo                               2053
Air India                            1751
Multiple carriers                    1196
SpiceJet                              818
Vistara                               479
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: Airline, dtype: int64

In [26]:
# Replacing the smaller counts values into Other
# Training set
X_categorical["Airline"].replace(to_replace={'Multiple carriers Premium economy':'Other', 
                                                        'Jet Airways Business':'Other',
                                                        'Vistara Premium economy':'Other',
                                                        'Trujet':'Other'
                                                   },    
                                        inplace=True)
# Testing set
X_to_predict_categorical["Airline"].replace(to_replace={'Multiple carriers Premium economy':'Other', 
                                                        'Jet Airways Business':'Other',
                                                        'Vistara Premium economy':'Other',
                                                        'Trujet':'Other'
                                                   },    
                                        inplace=True)

In [27]:
# Value count of Feature Additional_Info 
X_categorical['Additional_Info'].value_counts()

No info                         8347
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
1 Short layover                    1
2 Long layover                     1
Red-eye flight                     1
Name: Additional_Info, dtype: int64

In [28]:
# Replacing the smaller counts values into Other
# Training set and testing set
X_categorical["Additional_Info"].replace(to_replace={'Change airports':'Other', 
                                                        'Business class':'Other',
                                                        '1 Short layover':'Other',
                                                        'Red-eye flight':'Other',
                                                        '2 Long layover':'Other',   
                                                   },    
                                        inplace=True)
X_to_predict_categorical["Additional_Info"].replace(to_replace={'Change airports':'Other', 
                                                        'Business class':'Other',
                                                        '1 Short layover':'Other',
                                                        'Red-eye flight':'Other',
                                                        '2 Long layover':'Other',   
                                                   },    
                                        inplace=True)

In [29]:
X_categorical.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Additional_Info,weekday
0,IndiGo,Banglore,Delhi,BLR → DEL,2h 50m,No info,Sunday
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,No info,Wednesday
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,No info,Sunday
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,No info,Sunday
4,IndiGo,Banglore,Delhi,BLR → NAG → DEL,4h 45m,No info,Friday


In [30]:
# Transforming the Categorical data into binary form through Label Encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X_categorical = X_categorical.apply(LabelEncoder().fit_transform)
X_to_predict_categorical = X_to_predict_categorical.apply(LabelEncoder().fit_transform)

In [31]:
#Check values after label encoding
X_categorical.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Additional_Info,weekday
0,3,0,2,18,240,3,3
1,1,3,0,84,336,3,6
2,4,2,1,118,106,3,3
3,3,3,0,91,311,3,3
4,3,0,2,29,303,3,0


In [32]:
#Check values in numerical columns
X_numerical.head()

Unnamed: 0,Total_Stops,isWeekend,journey_day,Month_of_Journey,Depart_Time_Hour,Depart_Time_Minutes,Arr_Time_Hour,Arr_Time_Minutes
0,0,1,24,3,22,20,1,10
1,2,0,1,5,5,50,13,15
2,2,1,9,6,9,25,4,25
3,1,1,12,5,18,5,23,30
4,1,0,1,3,16,50,21,35


In [33]:
# Check the skew of all numerical features and removing it through BOX COX TRANSFORM
from scipy.stats import skew
# numerical data skewness
skewed_feats = X_numerical.apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))


Skew in numerical features: 

There are 8 skewed numerical features to Box Cox transform


In [34]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
cols_train = list(X_numerical.columns)
index_train = X_numerical.index.tolist()
cols_test = list(X_to_predict_numerical.columns)
index_test = X_to_predict_numerical.index.tolist()

In [35]:
X_numerical_names = X_numerical.columns
X_numerical_index = X_numerical.index
X_to_predict_numerical_names = X_to_predict_numerical.columns

In [36]:
from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.5
for feat in skewed_features:
    #all_data[feat] += 1
    X_numerical[feat] = boxcox1p(X_numerical[feat], lam)
    X_to_predict_numerical[feat] = boxcox1p(X_to_predict_numerical[feat], lam)

In [37]:
#Check values in numerical columns after minmax scaling
X_numerical.head(5)

Unnamed: 0,Total_Stops,isWeekend,journey_day,Month_of_Journey,Depart_Time_Hour,Depart_Time_Minutes,Arr_Time_Hour,Arr_Time_Minutes
0,0.0,0.828427,8.0,2.0,7.591663,7.165151,0.828427,4.63325
1,1.464102,0.0,0.828427,2.898979,2.898979,12.282857,5.483315,6.0
2,1.464102,0.828427,4.324555,3.291503,4.324555,8.198039,2.472136,8.198039
3,0.828427,0.828427,5.211103,2.898979,6.717798,2.898979,7.797959,9.135529
4,0.828427,0.0,0.828427,2.0,6.246211,12.282857,7.380832,10.0


In [38]:
#Merge categorical and numerical columns back into respective X and X_to_predict
X = pd.concat([X_categorical, X_numerical], axis=1)
X_to_predict = pd.concat([X_to_predict_categorical, X_to_predict_numerical], axis=1)

In [39]:
#Check shape of all three to verify that merge was done properly in above step
print(X.shape, X_numerical.shape, X_categorical.shape)
print(X_to_predict.shape, X_to_predict_numerical.shape, X_to_predict_categorical.shape)
print(y.shape)

(10682, 15) (10682, 8) (10682, 7)
(2671, 15) (2671, 8) (2671, 7)
(10682,)


In [40]:
# Splitting the data and a function for metrics measure
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
from sklearn.metrics import mean_squared_log_error
# We have created a function to print accuracy metrics which can be used
# to get accuracy metrics of all models in upcoming steps
def print_accuracy_report(y_test, y_pred,X_test, model):
 print('R Squared(Accuracy)', metrics.r2_score(y_test, y_pred)*100)   
 print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
 print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
 print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
 print('Root Mean Squared Log Error',np.sqrt(mean_squared_log_error( y_test, y_pred )))

In [41]:
# we have created a function to generate linear regression model
# which can then be called again after feature selection or other steps
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
def LinearRegressionModel(X,y):
 X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.3)
 regressor = LinearRegression()  
 regressor.fit(X_train, y_train)
 y_pred = regressor.predict(X_test)
 print(print_accuracy_report(y_test, y_pred, X_test, regressor))
 return regressor
linearModel = LinearRegressionModel(X,y)

R Squared(Accuracy) 59.99591241351404
Mean Absolute Error: 0.2639770972905898
Mean Squared Error: 0.10764255056271275
Root Mean Squared Error: 0.3280892417661889
Root Mean Squared Log Error 0.03293700844546377
None


In [42]:
# Linear Regression model has predicted 60% accuracy .

In [43]:
#similarly define a function for random forest regressor
from sklearn.ensemble import RandomForestRegressor
def RandomForestRegressorModel(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.3)
    rf = RandomForestRegressor(random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print(print_accuracy_report(y_test, y_pred, X_test, rf))
    return rf
randomForestModel = RandomForestRegressorModel(X,y)

R Squared(Accuracy) 93.78930439446829
Mean Absolute Error: 0.0729419133055475
Mean Squared Error: 0.016711670133776727
Root Mean Squared Error: 0.12927362505080736
Root Mean Squared Log Error 0.012894220177210251
None


In [44]:
# Random Forest model has predicted with 93 % which is very good.

In [45]:
# Applying the Extra regressor 
# select features using extratreesclassifier
from sklearn.ensemble import ExtraTreesRegressor
def ExtraTreesRegressorModel(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.3)
    rf = ExtraTreesRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print(print_accuracy_report(y_test, y_pred, X_test, rf))
    return rf
extraTreeRegressorModel = ExtraTreesRegressorModel(X,y)

R Squared(Accuracy) 94.23693628544623
Mean Absolute Error: 0.06791908879736915
Mean Squared Error: 0.01550718725802296
Root Mean Squared Error: 0.12452785735739196
Root Mean Squared Log Error 0.012400649255375498
None


Extratreeregressor has predicted better than Random Forest which is 94.23% while Random forest predicted accuracy of 93.78%

In [47]:
# Applying the Hyperparameter Tuning on the Random Forest Model
# Importing the GridSeachCV
from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.3)

In [48]:
params = ({'n_estimators':[20,30,40],'criterion':['mse','mae'],'max_depth':[3,4,5]})
grid_search = GridSearchCV(estimator=randomForestModel,param_grid=params,n_jobs=-1)

In [49]:
grid_search.fit(X_train,y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [20, 30, 40], 'criterion': ['mse', 'mae'], 'max_depth': [3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [50]:
grid_search.predict(X_test)

array([9.32678035, 8.73398342, 9.11852583, ..., 8.72121139, 8.07111641,
       8.73398342])

In [52]:
print("Best parameter of the model :",grid_search.best_params_)

Best parameter of the model : {'criterion': 'mse', 'max_depth': 5, 'n_estimators': 40}


In [53]:
# For applying the Refitting best parameters are there

In [54]:
# Saving the Extra Tree Regressor Model as it has predicted the best 
# Importing Pickle
import pickle
filehandler = open("Project_32.obj","wb")
# saving the model in pickel string
saved_etr_model = pickle.dump(extraTreeRegressorModel,filehandler)
filehandler.close()