# <center><b>Flight Price Predictor ML</b></center>

In [1]:
# import libraries
import numpy as np 
import pandas as pd
import plotly.express as px
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder , RobustScaler
from category_encoders import BinaryEncoder , OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
# load data
df = pd.read_pickle("Flight_Price_Detection_After_EDA.pkl")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10281 entries, 0 to 10280
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Airline               10281 non-null  object        
 1   Date_of_Journey       10281 non-null  datetime64[ns]
 2   Source                10281 non-null  object        
 3   Destination           10281 non-null  object        
 4   Route                 10281 non-null  object        
 5   Dep_Time              10281 non-null  object        
 6   Arrival_Time          10281 non-null  object        
 7   Duration              10281 non-null  int64         
 8   Additional_Info       10281 non-null  object        
 9   Price                 10281 non-null  int64         
 10  Month_of_Journey_Num  10281 non-null  int64         
 11  Month_of_Journey      10281 non-null  object        
 12  Day_of_Journey_Num    10281 non-null  int64         
 13  Day_of_Journey  

### Data Preparation for Machine Learning

In [3]:
# drop unnecessary columns
df.drop(['Date_of_Journey' , 'Day_of_Journey' , 'Route' , 'Duration' , 'Additional_Info' , 'Arrival_Day' , 'Arrival_Month' , 'Month_of_Journey' , 'Dep_Time' , 'Arrival_Time' , 'Day_Difference'], axis = 1 , inplace = True)

In [4]:
df.duplicated().sum()

117

In [5]:
df.drop_duplicates(inplace = True)

In [6]:
df.reset_index(drop = True , inplace = True)

In [7]:
# encoding categorical features using column transformer
# order_encoder = OrdinalEncoder(cols = ["Distance" , "Categorized_Duration"] , mapping = [{'col' : 'Distance' , 'mapping': {'short_dist':1 , 'medium_dist':2 , 'long_dist':3}} , {'col' : "Categorized_Duration" , 'mapping' : {'Short_duration':1 , 'Medium_duration':2 , 'Long_duration':3} }])
Encoder = ColumnTransformer(transformers = [("OrE" , OrdinalEncoder(cols = ["Distance" , "Categorized_Duration"] , mapping = [{'col' : 'Distance' , 'mapping': {'short_dist':1 ,'medium_dist':2 ,'long_dist':3}} , {'col' : "Categorized_Duration" , 'mapping' : {'Short_duration':1 , 'Medium_duration':2 , 'Long_duration':3} }]) , ["Distance" , "Categorized_Duration"]) , ("BE" , BinaryEncoder() , ['Airline' , 'Source' , 'Destination' , 'Dep_Day_Period' , 'Arrival_Day_Period' , 'Day_of_Journey_Num'] )] , remainder = "passthrough")

In [8]:
# splitting dataframe into feature and target variables
x = df.drop("Price" , axis = 1 )
y = df["Price"]

In [9]:
# creating a list of classification models
models = list()
models.append(("LR" , LinearRegression()))
models.append(("KNN" , KNeighborsRegressor()))
models.append(("CART" , DecisionTreeRegressor()))
models.append(("RF" , RandomForestRegressor()))
# models.append(("SVM" , SVR()))
models.append(("XG" , XGBRegressor()))
# models.append(("MLP", MLPRegressor(hidden_layer_sizes=(512,256,128,64,32, ) , max_iter = 1000 )))

In [10]:
# comparing the performance of different machine learning models using cross-validation and pipeline
for model in models:
    steps = list()
    steps.append(("Encoder" , Encoder))
    steps.append(("Scaler" , RobustScaler()))
    steps.append(model)
    pipeline = Pipeline(steps = steps)
    scores = cross_validate(pipeline , x , y , scoring = "r2"  , cv = 5 , return_train_score = True)
    print(model[0])
    print("Train_r2" , scores["train_score"].mean() )
    print("-" * 10)
    print("Test_r2" , scores["test_score"].mean())
    print("-" * 20)
    print("\n")

LR
Train_r2 0.5857265518218668
----------
Test_r2 0.579932579241627
--------------------


KNN
Train_r2 0.8847433314302924
----------
Test_r2 0.8174544135273083
--------------------


CART
Train_r2 0.9794544202049309
----------
Test_r2 0.8359738784410988
--------------------


RF
Train_r2 0.9704168982814668
----------
Test_r2 0.8906571745039298
--------------------


XG
Train_r2 0.9507210322689076
----------
Test_r2 0.9048158974141783
--------------------




In [11]:
# XGBregressor has the best performance
steps = list()
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("XG" , XGBRegressor()))
pipeline = Pipeline(steps = steps)
scores = cross_validate(pipeline , x , y , scoring = "r2"  ,cv = 5 , return_train_score = True , return_estimator = True)

In [12]:
scores['train_score'].mean()

0.9507210322689076

In [13]:
scores['test_score'].mean()

0.9048158974141783

In [14]:
scores['estimator'][0]['XG']

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [15]:
# tuning hyperparameters of XGBoost model using grid search cross-validation
param =  {
    "XG__learning_rate": [ 0.15, 0.2, 0.25],
    "XG__max_depth": [4, 6, 8],
    "XG__n_estimators": [250, 300, 350],
    "XG__reg_lambda": [2, 2.5, 3]
}

In [17]:
steps = []
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("XG" , XGBRegressor()))
pipeline_ = Pipeline(steps = steps)
grid_search = GridSearchCV(estimator = pipeline_ , param_grid = param , cv = 5 , scoring = "r2" , return_train_score = True , n_jobs = -1)
grid_search.fit(x,y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('Encoder',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('OrE',
                                                                         OrdinalEncoder(cols=['Distance',
                                                                                              'Categorized_Duration'],
                                                                                        mapping=[{'col': 'Distance',
                                                                                                  'data_type': dtype('O'),
                                                                                                  'mapping': short_dist     1
medium_dist    2
long_dist      3
dtype: int64},
                                                                                                 {'col': 'Categorized_Duration'

In [18]:
print("Best hyperparameters: ", grid_search.best_params_)
print("Mean train score: ", grid_search.cv_results_["mean_train_score"].mean())
print("Mean test score: ", grid_search.cv_results_["mean_test_score"].mean())

Best hyperparameters:  {'XG__learning_rate': 0.15, 'XG__max_depth': 6, 'XG__n_estimators': 350, 'XG__reg_lambda': 3}
Mean train score:  0.9514719506445093
Mean test score:  0.8988497811183154


In [None]:
steps = []
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("XG" , XGBRegressor()))
pipeline = Pipeline(steps = steps)
pipeline.fit(x,y)

In [None]:
final_model = pipeline

In [None]:
joblib.dump(final_model , "Model.pkl")
joblib.dump(x.columns , "Inputs.pkl")

In [None]:
x.columns

In [None]:
sorted(x["Meal"].unique().tolist())

In [None]:
df.columns

In [None]:
%%writefile streamlit_app.py

import streamlit as st
import pandas as pd 
import joblib
import sklearn
import xgboost
import category_encoders

# Load the trained model and input variables
Model = joblib.load("Model.pkl")
Inputs = joblib.load("Inputs.pkl")

# Function for making flight price predictions
def prediction(Airline , Source , Destination ,
               Month_of_Journey_Num , Day_of_Journey_Num ,
               Distance , Stops_Counts , Dep_Hour , Categorized_Duration,
               Meal, Arrival_Hour, Arrival_Day_Period,
       Dep_Day_Period):
    # Create a test DataFrame with the input variables
    test_df = pd.DataFrame(columns = Inputs)
    test_df.at[0,"Airline"] = Airline
    test_df.at[0,"Source"] = Source
    test_df.at[0,"Destination"] = Destination
    test_df.at[0,"Month_of_Journey"] = Month_of_Journey_Num
    test_df.at[0,"Day_of_Journey_Num"] = Day_of_Journey_Num
    test_df.at[0,"Distance"] = Distance
    test_df.at[0,"Stops_Counts"] = Stops_Counts
    test_df.at[0,"Dep_Hour"] = Dep_Hour
    test_df.at[0,"Categorized_Duration"] = Categorized_Duration
    test_df.at[0,"Meal"] = Meal
    test_df.at[0,"Arrival_Hour"] = Arrival_Hour
    test_df.at[0,"Arrival_Day_Period"] = Arrival_Day_Period
    test_df.at[0,"Dep_Day_Period"] = Dep_Day_Period
    # Make predictions using the loaded model
    result = Model.predict(test_df)
    return result[0]
def main():
    # Set up the Streamlit app title and input widgets
    st.title("Flight Price Predictor")
    Airline = st.selectbox("Airline name" ,['Air India', 'Jet Airways', 'IndiGo', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia'] )
    Source  = st.selectbox("Departure city" , ['Kolkata', 'Delhi', 'Banglore', 'Chennai', 'Mumbai'])
    Destination = st.selectbox("Arrival city" ,['Banglore', 'Cochin', 'New Delhi', 'Kolkata', 'Delhi', 'Hyderabad'] )
    Month_of_Journey_Num = st.selectbox("Departure month" ,[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] )
    Stops_Counts = st.selectbox("Number of Stops" , [2, 1, 0, 3, 4])
    Day_of_Journey_Num = st.selectbox("Day of travel" , [ 1,  9, 12, 24, 27, 18,  3, 15,  6, 21])
    Categorized_Duration = st.selectbox("Flight duration" , ['Short duration', 'Medium duration', 'Long duration'])
    Distance = st.selectbox("Flight distance" , ['medium distance', 'long distance', 'short distance'])
    Dep_Day_Period = st.selectbox("Departure period" , ['Early Morning', 'Afternoon', 'Evening', 'Night'])
    Arrival_Day_Period = st.selectbox("Arrival period" ,['Afternoon', 'Night', 'Evening', 'Early Morning'] )
    Dep_Hour = st.selectbox("Departure time",[0 , 1, 2 , 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23])
    Arrival_Hour = st.selectbox("Arrival time",[0 , 1, 2 , 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23])
    Meal = st.selectbox("Meal" , [0, 1])
    # Perform prediction when the "predict" button is clicked
    if st.button("predict"):
        results = prediction(Airline , Source , Destination , Month_of_Journey_Num , Day_of_Journey_Num , Distance , Stops_Counts , Dep_Hour , Categorized_Duration ,
               Meal , Arrival_Hour , Arrival_Day_Period , Dep_Day_Period)
        st.text(f"The flight cost will be {round(results)} Indian Rupee.")
if __name__ == '__main__':
    main() 