In [1]:
# import libraries
import numpy as np 
import pandas as pd
import plotly.express as px
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder , RobustScaler
from category_encoders import BinaryEncoder , OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
# load data
df = pd.read_pickle("Flight_Price_Detection_After_EDA.pkl")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10281 entries, 0 to 10280
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Airline               10281 non-null  object        
 1   Date_of_Journey       10281 non-null  datetime64[ns]
 2   Source                10281 non-null  object        
 3   Destination           10281 non-null  object        
 4   Route                 10281 non-null  object        
 5   Dep_Time              10281 non-null  object        
 6   Arrival_Time          10281 non-null  object        
 7   Duration              10281 non-null  int64         
 8   Additional_Info       10281 non-null  object        
 9   Price                 10281 non-null  int64         
 10  Month_of_Journey_Num  10281 non-null  int32         
 11  Month_of_Journey      10281 non-null  object        
 12  Day_of_Journey_Num    10281 non-null  int32         
 13  Day_of_Journey  

### Data Preparation for Machine Learning

In [3]:
# drop unnecessary columns
df.drop(['Date_of_Journey' , 'Day_of_Journey' , 'Route' , 'Duration' , 'Additional_Info' , 'Arrival_Day' , 'Arrival_Month' , 'Month_of_Journey' , 'Dep_Time' , 'Arrival_Time' , 'Day_Difference'], axis = 1 , inplace = True)

In [4]:
df.duplicated().sum()

117

In [5]:
df.drop_duplicates(inplace = True)

In [6]:
df.reset_index(drop = True , inplace = True)

In [7]:
# encoding categorical features using column transformer
# order_encoder = OrdinalEncoder(cols = ["Distance" , "Categorized_Duration"] , mapping = [{'col' : 'Distance' , 'mapping': {'short_dist':1 , 'medium_dist':2 , 'long_dist':3}} , {'col' : "Categorized_Duration" , 'mapping' : {'Short_duration':1 , 'Medium_duration':2 , 'Long_duration':3} }])
Encoder = ColumnTransformer(transformers = [("OrE" , OrdinalEncoder(cols = ["Distance" , "Categorized_Duration"] , mapping = [{'col' : 'Distance' , 'mapping': {'short_dist':1 ,'medium_dist':2 ,'long_dist':3}} , {'col' : "Categorized_Duration" , 'mapping' : {'Short_duration':1 , 'Medium_duration':2 , 'Long_duration':3} }]) , ["Distance" , "Categorized_Duration"]) , ("BE" , BinaryEncoder() , ['Airline' , 'Source' , 'Destination' , 'Dep_Day_Period' , 'Arrival_Day_Period' , 'Day_of_Journey_Num'] )] , remainder = "passthrough")

In [8]:
# splitting dataframe into feature and target variables
x = df.drop("Price" , axis = 1 )
y = df["Price"]

In [9]:
# creating a list of classification models
models = list()
models.append(("LR" , LinearRegression()))
models.append(("KNN" , KNeighborsRegressor()))
models.append(("CART" , DecisionTreeRegressor()))
models.append(("RF" , RandomForestRegressor()))
# models.append(("SVM" , SVR()))
models.append(("XG" , XGBRegressor()))
# models.append(("MLP", MLPRegressor(hidden_layer_sizes=(512,256,128,64,32, ) , max_iter = 1000 )))

In [10]:
# comparing the performance of different machine learning models using cross-validation and pipeline
for model in models:
    steps = list()
    steps.append(("Encoder" , Encoder))
    steps.append(("Scaler" , RobustScaler()))
    steps.append(model)
    pipeline = Pipeline(steps = steps)
    scores = cross_validate(pipeline , x , y , scoring = "r2"  , cv = 5 , return_train_score = True)
    print(model[0])
    print("Train_r2" , scores["train_score"].mean() )
    print("-" * 10)
    print("Test_r2" , scores["test_score"].mean())
    print("-" * 20)
    print("\n")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

LR
Train_r2 0.5857762738076936
----------
Test_r2 0.5800643595267898
--------------------




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

KNN
Train_r2 0.8849926552271306
----------
Test_r2 0.8169960558531375
--------------------




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

CART
Train_r2 0.9794544202049309
----------
Test_r2 0.8375994072865343
--------------------




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

RF
Train_r2 0.9704288676268475
----------
Test_r2 0.8918651046333055
--------------------




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

XG
Train_r2 0.9507210373878479
----------
Test_r2 0.9048158884048462
--------------------




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

In [11]:
# XGBregressor has the best performance
steps = list()
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("XG" , XGBRegressor()))
pipeline = Pipeline(steps = steps)
scores = cross_validate(pipeline , x , y , scoring = "r2"  ,cv = 5 , return_train_score = True , return_estimator = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

In [12]:
scores['train_score'].mean()

0.9507210373878479

In [13]:
scores['test_score'].mean()

0.9048158884048462

In [14]:
scores['estimator'][0]['XG']

In [15]:
# tuning hyperparameters of XGBoost model using grid search cross-validation
param =  {
    "XG__learning_rate": [ 0.15, 0.2, 0.25],
    "XG__max_depth": [4, 6, 8],
    "XG__n_estimators": [250, 300, 350],
    "XG__reg_lambda": [2, 2.5, 3]
}

In [16]:
steps = []
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("XG" , XGBRegressor()))
pipeline_ = Pipeline(steps = steps)
grid_search = GridSearchCV(estimator = pipeline_ , param_grid = param , cv = 5 , scoring = "r2" , return_train_score = True , n_jobs = -1)
grid_search.fit(x,y)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

In [17]:
print("Best hyperparameters: ", grid_search.best_params_)
print("Mean train score: ", grid_search.cv_results_["mean_train_score"].mean())
print("Mean test score: ", grid_search.cv_results_["mean_test_score"].mean())

Best hyperparameters:  {'XG__learning_rate': 0.15, 'XG__max_depth': 6, 'XG__n_estimators': 350, 'XG__reg_lambda': 3}
Mean train score:  0.9514719508312367
Mean test score:  0.8988497807655805


In [18]:
steps = []
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("XG" , XGBRegressor()))
pipeline = Pipeline(steps = steps)
pipeline.fit(x,y)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

In [19]:
final_model = pipeline

In [20]:
joblib.dump(final_model , "Model.pkl")
joblib.dump(x.columns , "Inputs.pkl")

['Inputs.pkl']

In [21]:
x.columns

Index(['Airline', 'Source', 'Destination', 'Month_of_Journey_Num',
       'Day_of_Journey_Num', 'Distance', 'Stops_Counts', 'Dep_Hour',
       'Categorized_Duration', 'Meal', 'Arrival_Hour', 'Arrival_Day_Period',
       'Dep_Day_Period'],
      dtype='object')

In [22]:
sorted(x["Meal"].unique().tolist())

[0, 1]

In [23]:
df.columns

Index(['Airline', 'Source', 'Destination', 'Price', 'Month_of_Journey_Num',
       'Day_of_Journey_Num', 'Distance', 'Stops_Counts', 'Dep_Hour',
       'Categorized_Duration', 'Meal', 'Arrival_Hour', 'Arrival_Day_Period',
       'Dep_Day_Period'],
      dtype='object')

In [27]:
%%writefile streamlit_app.py

import streamlit as st
import pandas as pd 
import joblib

# Load the trained model and input variables
Model = joblib.load("Model.pkl")
Inputs = joblib.load("Inputs.pkl")

# Function for making flight price predictions
def prediction(Airline, Source, Destination, Month_of_Journey_Num, Day_of_Journey_Num, Distance, Stops_Counts, Dep_Hour, Categorized_Duration, Meal, Arrival_Hour, Arrival_Day_Period, Dep_Day_Period):
    # Create a test DataFrame with the input variables
    test_df = pd.DataFrame(columns=Inputs)
    test_df.at[0, "Airline"] = Airline
    test_df.at[0, "Source"] = Source
    test_df.at[0, "Destination"] = Destination
    test_df.at[0, "Month_of_Journey"] = Month_of_Journey_Num
    test_df.at[0, "Day_of_Journey_Num"] = Day_of_Journey_Num
    test_df.at[0, "Distance"] = Distance
    test_df.at[0, "Stops_Counts"] = Stops_Counts
    test_df.at[0, "Dep_Hour"] = Dep_Hour
    test_df.at[0, "Categorized_Duration"] = Categorized_Duration
    test_df.at[0, "Meal"] = Meal
    test_df.at[0, "Arrival_Hour"] = Arrival_Hour
    test_df.at[0, "Arrival_Day_Period"] = Arrival_Day_Period
    test_df.at[0, "Dep_Day_Period"] = Dep_Day_Period
    # Make predictions using the loaded model
    result = Model.predict(test_df)
    return result[0]

def main():
    # Set up the Streamlit app title and input widgets
    st.set_page_config(page_title="Flight Price Predictor", page_icon="✈️")
    st.title("Flight Price Predictor")
    st.markdown("""
    <style>
    .main {
        background-color: #f5f5f5;
        padding: 10px;
        border-radius: 10px;
    }
    </style>
    """, unsafe_allow_html=True)

    with st.form("prediction_form"):
        st.subheader("Enter the flight details below:")
        
        Airline = st.selectbox("Airline Name", ['Air India', 'Jet Airways', 'IndiGo', 'SpiceJet', 'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia'])
        col1, col2 = st.columns(2)
        with col1:
            Source = st.selectbox("Departure City", ['Kolkata', 'Delhi', 'Banglore', 'Chennai', 'Mumbai'])
        with col2:
            Destination = st.selectbox("Arrival City", ['Banglore', 'Cochin', 'New Delhi', 'Kolkata', 'Delhi', 'Hyderabad'])
        
        Month_of_Journey_Num = st.slider("Departure Month", 1, 12, 1)
        Day_of_Journey_Num = st.slider("Day of Travel", 1, 31, 1)
        Distance = st.selectbox("Flight Distance", ['medium distance', 'long distance', 'short distance'])
        Stops_Counts = st.selectbox("Number of Stops", [0, 1, 2, 3, 4])
        Categorized_Duration = st.selectbox("Flight Duration", ['Short duration', 'Medium duration', 'Long duration'])
        
        col3, col4 = st.columns(2)
        with col3:
            Dep_Day_Period = st.selectbox("Departure Period", ['Early Morning', 'Afternoon', 'Evening', 'Night'])
            Dep_Hour = st.slider("Departure Hour", 0, 23, 0)
        with col4:
            Arrival_Day_Period = st.selectbox("Arrival Period", ['Early Morning', 'Afternoon', 'Evening', 'Night'])
            Arrival_Hour = st.slider("Arrival Hour", 0, 23, 0)
        
        Meal = st.selectbox("Meal", [0, 1])
        
        submitted = st.form_submit_button("Predict")
        if submitted:
            results = prediction(Airline, Source, Destination, Month_of_Journey_Num, Day_of_Journey_Num, Distance, Stops_Counts, Dep_Hour, Categorized_Duration, Meal, Arrival_Hour, Arrival_Day_Period, Dep_Day_Period)
            st.success(f"The predicted flight cost is {round(results)} Indian Rupees.")

if __name__ == '__main__':
    main()


Overwriting streamlit_app.py


In [None]:
!streamlit run streamlit_app.py