# Predict The Flight Ticket Price 
Flight ticket prices can be hard to guess: we might see one price today, and when we check the same flight tomorrow it will be a different story. We have often heard travellers say that flight ticket prices are unpredictable. Here you are provided with the prices of flight tickets for various airlines, between various cities, for journeys between March and June of 2019.

Size of training set: 10683 records

Size of test set: 2671 records

FEATURES:
Airline: The name of the airline.

Date_of_Journey: The date of the journey

Source: The source from which the service begins.

Destination: The destination where the service ends.

Route: The route taken by the flight to reach the destination.

Dep_Time: The time when the journey starts from the source.

Arrival_Time: Time of arrival at the destination.

Duration: Total duration of the flight.

Total_Stops: Total stops between the source and destination.

Additional_Info: Additional information about the flight

Price: The price of the ticket

In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the two CSV files into `train` and `test`.
# NOTE(review): `train` is read from 'Test_air.csv' and `test` from
# 'Train_flt.csv' -- the file names look swapped relative to the variable
# names; confirm which file actually carries the Price column.
train, test = (
    pd.read_csv(csv_path) for csv_path in ('Test_air.csv', 'Train_flt.csv')
)

In [None]:
# Preview the first five rows of `train`.
train.head(n=5)

In [None]:
# Preview the first five rows of `test`.
test.head(n=5)

In [None]:
# Show (rows, columns) for each frame, train first and then test.
for frame in (train, test):
    print(frame.shape)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm,skew
from sklearn.linear_model import LinearRegression


In [None]:
# Stack `test` underneath `train` so that every preprocessing step below is
# applied to both frames at once.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to do the same row-wise stacking (original row labels are kept, as before).
df = pd.concat([train, test], sort=False)

In [None]:
# Date_of_Journey is split on '/' and its first three pieces are stored in
# the Date, Month and Year columns. The pieces are kept as strings.
journey_parts = df['Date_of_Journey'].str.split('/')
for position, column in enumerate(('Date', 'Month', 'Year')):
    df[column] = journey_parts.str[position]

In [None]:
# Arrival time: keep only the text before the first space and discard
# whatever follows it.
arrival_tokens = df['Arrival_Time'].str.split(' ')
df['Arrival_Time'] = arrival_tokens.str.get(0)

In [None]:
# Split the 'HH:MM' style arrival time on ':' once, then store the two
# pieces as integer hour and minute columns.
arrival_clock = df['Arrival_Time'].str.split(':')
df['Arrival_Hour'] = arrival_clock.str[0].astype(int)
df['Arrival_Minute'] = arrival_clock.str[1].astype(int)



In [None]:
# Total_Stops mixes a count with text. Rewrite 'non-stop' as '0 stop' so that
# every value begins with a number that can be split off afterwards.
df['Total_Stops'] = df['Total_Stops'].replace({'non-stop': '0 stop'})



In [None]:
# Stop holds the leading token of Total_Stops (the text before the first
# space). It is left as a string here.
stop_tokens = df['Total_Stops'].str.split(' ')
df['Stop'] = stop_tokens.str.get(0)


In [None]:
# Split Dep_Time on ':' once, store the integer hour and minute, then drop
# the raw Dep_Time column since it is no longer needed.
dep_clock = df['Dep_Time'].str.split(':')
df['Dep_Hour'] = dep_clock.str[0].astype(int)
df['Dep_Minute'] = dep_clock.str[1].astype(int)
df = df.drop(columns=['Dep_Time'])



Route - This column describes the cities the flight passes through on its way from the source to the destination.
This column is very important to us because the route can be used to help predict the price.

In [None]:
# Split Route on '->' once and spread the first five pieces over
# Route_1 ... Route_5. Rows with fewer pieces get NaN in the later columns.
# NOTE(review): confirm the raw Route values really use the ASCII '->'
# separator; otherwise everything ends up in Route_1.
route_legs = df['Route'].str.split('->')
route_leg_columns = ('Route_1', 'Route_2', 'Route_3', 'Route_4', 'Route_5')
for leg_index, leg_column in enumerate(route_leg_columns):
    df[leg_column] = route_legs.str[leg_index]


In [None]:
# Replace missing route legs with the literal string "None" so that every
# Route_N column holds plain strings.
# The result is assigned back instead of calling
# df[col].fillna(..., inplace=True): that chained in-place form acts on a
# temporary column object, raises a FutureWarning in pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
route_columns = ['Route_1', 'Route_2', 'Route_3', 'Route_4', 'Route_5']
df[route_columns] = df[route_columns].fillna("None")

Prepare the categorical variables for the model using LabelEncoder.

In [None]:
from sklearn.preprocessing import LabelEncoder
# A single encoder instance that the following cell re-fits on each
# categorical column in turn. After those calls it only remembers the classes
# of the last column it was fitted on.
lb_encode=LabelEncoder()


In [None]:
# Replace each categorical column with integer codes. The shared encoder is
# re-fitted per column, in the order listed, so codes are independent between
# columns.
categorical_columns = (
    "Additional_Info",
    "Airline",
    "Destination",
    "Source",
    'Route_1',
    'Route_2',
    'Route_3',
    'Route_4',
    'Route_5',
)
for column_name in categorical_columns:
    df[column_name] = lb_encode.fit_transform(df[column_name])


In [None]:
#Training and testing of the dataset
# The first N_TRAIN_ROWS rows of the stacked frame are treated as the training
# rows; everything after them is treated as the rows to predict, so its Price
# column is dropped.
# NOTE(review): this assumes the frame stacked first contributed exactly
# 10683 rows -- confirm against the files that were loaded.
N_TRAIN_ROWS = 10683
# .copy() makes df_train an independent frame. Without it, later in-place
# edits of df_train act on a slice of df and trigger SettingWithCopyWarning.
df_train = df.iloc[:N_TRAIN_ROWS].copy()
df_test = df.iloc[N_TRAIN_ROWS:].drop(columns=['Price'])

In [None]:
# Fill missing numeric values with their column mean.
# numeric_only=True is required because df_train still contains string
# columns; without it DataFrame.mean() raises TypeError on pandas >= 2.0.
# The result is assigned back rather than filled in place so that the
# operation never acts on a slice of df.
df_train = df_train.fillna(df_train.mean(numeric_only=True))
df_train.head()

In [None]:
# Columns used as model inputs, in the order they are fed to the estimators.
feature_columns = [
    'Additional_Info',
    'Airline',
    'Destination',
    'Source',
    'Arrival_Hour',
    'Dep_Hour',
    'Route_1',
    'Route_2',
    'Route_3',
    'Route_4',
    'Route_5',
]
x = df_train[feature_columns]


In [None]:
# Preview the first ten rows of the feature matrix.
x.head(n=10)

In [None]:
# Target variable: the Price column of the training frame.
y = df_train.loc[:, 'Price']
y.head(n=5)

In [None]:
# Building the ML model.
# Start with a plain linear regression; the estimator is only created here,
# it is not fitted in this cell.
lm=LinearRegression()


In [None]:
#Build cross validation method
from sklearn.model_selection import KFold,cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# Shuffled 50-fold splitter with a fixed seed so the folds are reproducible.
kfolds = KFold(
    random_state=100,
    shuffle=True,
    n_splits=50,
)

In [None]:
from sklearn.model_selection import GridSearchCV
from matplotlib.pylab import rcParams
rcParams['figure.figsize']=12,4
%matplotlib inline


In [None]:
# Hold out 22% of the rows for evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.22,
)


In [None]:
# Create and fit the linear regression on the training split (fit returns the
# estimator itself), then echo the fitted estimator as the cell output.
lm = LinearRegression().fit(x_train, y_train)
lm

In [None]:
# R^2 measured on the rows the model was fitted on is optimistic, so the
# held-out split is reported next to it to show how well the model
# generalises.
print("train R^2:", lm.score(x_train, y_train))
print("test R^2:", lm.score(x_test, y_test))


In [None]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regressor with the library's default settings.
knn = KNeighborsRegressor()
knn.fit(x_train, y_train)
# Report R^2 on both splits: the training score alone cannot show whether the
# model generalises to rows it has not seen.
print("train R^2:", knn.score(x_train, y_train))
print("test R^2:", knn.score(x_test, y_test))

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree regressor with the library's default settings.
dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)
# Report R^2 on both splits: the training score alone cannot show whether the
# model generalises to rows it has not seen.
print("train R^2:", dtr.score(x_train, y_train))
print("test R^2:", dtr.score(x_test, y_test))

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Extra-trees ensemble regressor with the library's default settings.
etr = ExtraTreesRegressor()
etr.fit(x_train, y_train)
# Report R^2 on both splits: the training score alone cannot show whether the
# model generalises to rows it has not seen.
print("train R^2:", etr.score(x_train, y_train))
print("test R^2:", etr.score(x_test, y_test))

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate regularisation strengths for the ridge grid search, from strongest
# to none.
alphas = np.array((1.0, 0.1, 0.01, 0.0001, 0.0))

In [None]:
# Cross-validated search over the candidate alpha values for a ridge
# regression, fitted on the training split.
ridge_search_space = {"alpha": alphas}
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=ridge_search_space)
grid.fit(x_train, y_train)

In [None]:
# Best cross-validated score found by the search, followed by the alpha of
# the estimator that achieved it.
best_ridge = grid.best_estimator_
print(grid.best_score_)
print(best_ridge.alpha)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with the library's default settings.
rbf = RandomForestRegressor()
rbf.fit(x_train, y_train)
# Report R^2 on both splits: the training score alone cannot show whether the
# model generalises to rows it has not seen.
print("train R^2:", rbf.score(x_train, y_train))
print("test R^2:", rbf.score(x_test, y_test))

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features, then overwrite
# x_train with its standardized version. From here on x_train holds the
# scaled values, not the original ones.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

In [None]:
from sklearn.svm import SVR
m = SVR(gamma="scale")
# x_train was already replaced by its standardized version when the scaler was
# fitted, so it is passed to the model as-is. Running it through
# scaler.transform again would standardize the training features twice while
# x_test is standardized only once at prediction time, so the model would be
# trained and evaluated on differently scaled inputs.
m.fit(x_train, y_train)

In [None]:
def score(y_pred, y):
    """Return 1 minus the root mean squared logarithmic error.

    Both inputs are log-transformed, so every value must be strictly
    positive; non-positive values produce NaN or -inf in the result.

    Parameters:
        y_pred: array-like of predicted values.
        y: array-like of true values, same length as ``y_pred``.

    Returns:
        float: ``1 - sqrt(mean((log(y_pred) - log(y)) ** 2))``; 1.0 means a
        perfect prediction and lower values mean larger errors.
    """
    # Convert to plain arrays so the subtraction is strictly positional even
    # if a pandas Series with a non-default index is passed in.
    log_pred = np.log(np.asarray(y_pred, dtype=float))
    log_true = np.log(np.asarray(y, dtype=float))
    mean_squared_log_error = np.mean((log_pred - log_true) ** 2)
    # The previous `x**1/2` parsed as `(x**1)/2`, halving the value instead of
    # taking its square root.
    return 1 - np.sqrt(mean_squared_log_error)
# Prediction: scale the held-out features with the fitted scaler, predict
# with the SVR model and evaluate with the custom score function.
x_test_scaled = scaler.transform(x_test)
y_pred = m.predict(x_test_scaled)
score(y_pred, y_test)

So here we can see that we get a good accuracy score of up to 90% with the Support Vector Regressor.

In [None]:
# Predict the held-out rows with the decision tree and print the predictions
# followed by the actual prices for a side-by-side look.
pred = dtr.predict(x_test)
for label, values in (("Predicted result price:", pred), ("actual price", y_test)):
    print(label, values)