# Flight Price Prediction
## Problem Statement:
Flight ticket prices can be something hard to guess, today we might see a price, check out the price of the same flight tomorrow, it will be a different story. We might have often heard travellers saying that flight ticket prices are so unpredictable. Here you will be provided with prices of flight tickets for various airlines between the months of March and June of 2019 and between various cities.

Size of training set: 10683 records

Size of test set: 2671 records

FEATURES:
Airline: The name of the airline.

Date_of_Journey: The date of the journey

Source: The source from which the service begins.

Destination: The destination where the service ends.

Route: The route taken by the flight to reach the destination.

Dep_Time: The time when the journey starts from the source.

Arrival_Time: Time of arrival at the destination.

Duration: Total duration of the flight.

Total_Stops: Total stops between the source and destination.

Additional_Info: Additional information about the flight

Price: The price of the ticket

In [1]:
#importing libraries
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [3]:
#not able to import data
#importing data
df=pd.read_xlsx('C:/Users/User/DownloadsFlight_Ticket_Participant_Datasets/Sample_submission.xlsx')
df.head()

AttributeError: module 'pandas' has no attribute 'read_xlsx'

In [None]:
df.info()   #information about the data

In [None]:
#describe about the data
df.describe()

In [None]:
df.shape

In [None]:
#finding the null values
df.isnull().sum()


In [None]:
import missingno as msno
msno.bar(df)
plt.show

In [None]:
#drop the nullvalues
df.dropna(inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.dtypes # datatypes

In [None]:
def change_into_datetime(col):
    df[col]=pd.to_datetime(df[col])

In [None]:
df.columns

In [None]:
for i in ['Date_of_Journey','Dep_Time', 'Arrival_Time']:
    change_into_datetime(i)

In [None]:
df.dtypes

In [None]:
df['journey_day']=df['Date_of_Journey'].dt.day
df['journey_month']=df['Date_of_Journey'].dt.month

In [None]:
df.head(10)

In [None]:
df.drop('Date_of_Journey', axis=1, inplace=True)

In [None]:
# function for extracting hour and minutes
def extract_hour(data,col):
    data[col+'_hour']=data[col].dt.hour
    
def extract_min(data,col):
    data[col+'_min']=data[col].dt.minute
    

def drop_col(data,col):
    data.drop(col,axis=1,inplace=True)

In [None]:
#call the function
# Departure time is when a plane leaves the gate. 
# Similar to Date_of_Journey we can extract values from Dep_Time
extract_hour(df,'Dep_Time')

#extracting minutes
extract_min(df,'Dep_Time')

#drop the column
drop_col(df,'Dep_Time')


In [None]:
#extracting hour
extract_hour(df,'Arrival_Time')

#extracting min
extract_min(df,'Arrival_Time')


#drop the column
drop_col(df,'Arrival_Time')

In [None]:
df.head(10)

In [None]:
duration=list(df['Duration'])
for i in range(len(duration)):
    if len(duration[i].split(' '))==2:
        pass
    else:
        if 'h' in duration[i]: # Check if duration contains only hour
             duration[i]=duration[i] + ' 0m' # Adds 0 minute
        else:
             duration[i]='0h '+ duration[i]

In [None]:
df['Duration']=duration

In [None]:
df.head()

In [None]:
def hour(x):
    return x.split(' ')[0][0:-1]

def minutes(x):
    return x.split(' ')[1][0:-1]

In [None]:
df['dur_hour']=df['Duration'].apply(hour)

In [None]:
df['dur_min']=df['Duration'].apply(minutes)

In [None]:
df.head(10)

In [None]:
drop_col(df,'Duration')

In [None]:
df.dtypes

In [None]:
df['dur_hour'] = df['dur_hour'].astype(int)
df['dur_min'] = df['dur_min'].astype(int)

In [None]:
df.dtypes

In [None]:
column=[column for column in df.columns if df[column].dtype=='object']
column

In [None]:
continuous_col =[column for column in df.columns if df[column].dtype!='object']
continuous_col

In [None]:
categorical = df[column]

In [None]:
categorical.head()

In [None]:
categorical['Airline'].value_counts()

In [None]:
plt.figure(figsize=(15,8))
sns.boxplot(x='Airline',y='Price',data=df.sort_values('Price',ascending=False))

In [None]:
plt.figure(figsize=(15,8))
sns.boxplot(x='Total_Stops',y='Price',data=df.sort_values('Price',ascending=False))

In [None]:
# As Airline is Nominal Categorical data we will perform OneHotEncoding
Airline=pd.get_dummies(categorical['Airline'],drop_first=True)


In [None]:
Airline.head()

In [None]:
categorical['Source'].value_counts()

In [None]:
#Source vs Price

plt.figure(figsize=(15,15))
sns.catplot(x='Source',y='Price',data=df.sort_values('Price',ascending=False),kind='boxen')

In [None]:
#encoding of source column
source=pd.get_dummies(categorical['Source'],drop_first=True)
source.head()

In [None]:
categorical['Destination'].value_counts()

In [None]:
plt.figure(figsize=(15,8))
sns.boxplot(x='Destination',y='Price',data=df.sort_values('Price',ascending=False))

In [None]:
#encoding of destination column
destination=pd.get_dummies(categorical['Destination'],drop_first=True)
destination.head()

In [None]:
# now work on route column
categorical['Route'].value_counts()

In [None]:
categorical['Route1']=categorical['Route'].str.split('→').str[0]
categorical['Route2']=categorical['Route'].str.split('→').str[1]
categorical['Route3']=categorical['Route'].str.split('→').str[2]
categorical['Route4']=categorical['Route'].str.split('→').str[3]
categorical['Route5']=categorical['Route'].str.split('→').str[4]


In [None]:
categorical.head()


In [None]:
drop_col(categorical,'Route')

In [None]:
categorical.isnull().sum()

In [None]:
categorical.columns

In [None]:
for i in ['Route3', 'Route4', 'Route5']:
    categorical[i].fillna('None',inplace=True)

In [None]:
categorical.isnull().sum()

In [None]:
for i in categorical.columns:
    print('{} has total {} categories'.format(i,len(categorical[i].value_counts())))

In [None]:
df.plot.hexbin(x='Arrival_Time_hour',y='Price',gridsize=15)

In [None]:
# Applying label encoder
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
for i in ['Route1', 'Route2', 'Route3', 'Route4', 'Route5']:
    categorical[i]=encoder.fit_transform(categorical[i])


In [None]:
categorical.head()

In [None]:
drop_col(categorical,'Additional_Info')


In [None]:
categorical['Total_Stops'].unique()

In [None]:
# encoding Total stops
dict={'non-stop':0, '2 stops':2, '1 stop':1, '3 stops':3, '4 stops':4}
categorical['Total_Stops']=categorical['Total_Stops'].map(dict)

In [None]:
categorical['Total_Stops']

In [None]:
drop_col(categorical,'Source')
drop_col(categorical,'Destination')
drop_col(categorical,'Airline')

In [None]:
final_df=pd.concat([categorical,Airline,source,destination,df[continuous_col]],axis=1)

In [None]:
final_df.head()

In [None]:
pd.set_option('display.max_columns',33)
final_df.head()

In [None]:
def plot(data,col):
    fig,(ax1,ax2)=plt.subplots(2,1)
    sns.distplot(data[col],ax=ax1)
    sns.boxplot(data[col],ax=ax2)

In [None]:
final_df['Price']=np.where(final_df['Price']>=40000,final_df['Price'].median(),final_df['Price'])


In [None]:
plot(final_df,'Price')

In [None]:
X=final_df.drop('Price',axis=1)
y=df['Price']

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
mutual_info_classif(X,y)

In [None]:
imp = pd.DataFrame(mutual_info_classif(X,y),index=X.columns)
imp

In [None]:
imp.columns=['importance']
imp.sort_values(by='importance',ascending=False)

In [None]:
# spiliting the dataset
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=123)

In [None]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
def predict(ml_model):
    print('Model is: {}'.format(ml_model))
    model= ml_model.fit(X_train,y_train)
    print("Training score: {}".format(model.score(X_train,y_train)))
    predictions = model.predict(X_test)
    print("Predictions are: {}".format(predictions))
    print('\n')
    r2score=r2_score(y_test,predictions) 
    print("r2 score is: {}".format(r2score))
          
    print('MAE:{}'.format(mean_absolute_error(y_test,predictions)))
    print('MSE:{}'.format(mean_squared_error(y_test,predictions)))
    print('RMSE:{}'.format(np.sqrt(mean_squared_error(y_test,predictions))))
     
    sns.distplot(y_test-predictions)      
          

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor

In [None]:
predict(RandomForestRegressor())

In [None]:
predict(LogisticRegression())

In [None]:
predict(KNeighborsRegressor())

In [None]:
from sklearn.svm import SVR
predict(SVR())

In [None]:
predict(GradientBoostingRegressor())

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
random_grid = {
    'n_estimators' : [100, 120, 150, 180, 200,220],
    'max_features':['auto','sqrt'],
    'max_depth':[5,10,15,20],
    }

In [None]:
rf=RandomForestRegressor()
rf_random=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,cv=3,verbose=2,n_jobs=-1,)

rf_random.fit(X_train,y_train)

# best parameter
rf_random.best_params_

In [None]:
# best parameter
rf_random.best_params_

In [None]:
#predicting the values
prediction = rf_random.predict(X_test)

#distribution plot between actual value and predicted value
sns.displot(y_test-prediction)

In [None]:
r2_score(y_test,prediction)