In [1694]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import pandas_profiling as pp
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
import xgboost
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [1695]:
pd.set_option('display.max_columns', 1000)

In [1696]:
def onehotencode(data,col_name,prefix):
    return pd.concat([data, pd.get_dummies(data[col_name], prefix=prefix)],axis=1)

In [1697]:
def encode_total_stops(data):
    data['Encoded_stops'] = 0
    for i in range(0,len(data['Total_Stops'])):
        if data.loc[i, 'Total_Stops'] == '1 stop':
            data.loc[i,'Encoded_stops'] = 1
        elif data.loc[i, 'Total_Stops'] == '2 stops':
            data.loc[i,'Encoded_stops'] = 2
        elif data.loc[i, 'Total_Stops'] == '3 stops' or data.loc[i, 'Total_Stops'] == '':
            data.loc[i,'Encoded_stops'] = 3
        elif data.loc[i, 'Total_Stops'] == '4 stops':
            data.loc[i,'Encoded_stops'] = 4
        elif data.loc[i, 'Total_Stops'] == 'non-stop':
            data.loc[i,'Encoded_stops'] = 0

    return data

In [1698]:
def extract_journey_data(data):
    data['Journey_Day'] = pd.DataFrame(pd.to_datetime(data['Date_of_Journey'], format='%d/%m/%Y').dt.day)
    data['Journey_Month'] = pd.DataFrame(pd.to_datetime(data['Date_of_Journey'], format='%d/%m/%Y').dt.month)
    data['Journey_day_of_week'] =  pd.DataFrame(((pd.to_datetime(data['Date_of_Journey'], format='%d/%m/%Y').dt.weekday_name)))
    data['Journey_over_weekend'] =  pd.DataFrame(((pd.to_datetime(data['Date_of_Journey'], format='%d/%m/%Y').dt.dayofweek) // 5).astype('int'))
    data['Date_Lapse'] = ((pd.to_datetime(data['Date_of_Journey'], format='%d/%m/%Y')) - 
                                        (pd.to_datetime('01/03/2019', format='%d/%m/%Y')))/np.timedelta64(1,'D')
    data['Date_Lapse'] = data['Date_Lapse']/np.max(data['Date_Lapse'])
#    data['is_friday'] = 0
#    data.loc[data['Journey_day_of_week'] == 4, 'is_friday'] = 1
        
    return data

In [1699]:
def encode_duration(data):
    data['Duration_Type'] = 0
    for i in range(0, len(data)):
    
        split_len = data.loc[i,'Duration'].split(' ')
        if len(split_len) == 1:
            if str(data.loc[i,'Duration']).find('h') != -1:
#                print ("hours: ",data.loc[i,'Duration'])
                hours = int(split_len[0].replace('h',""))
                mins = 0
            else:
                hours = 4 
#                print ("mins: ",data.loc[i,'Duration']
                mins = int(split_len[0].replace('m',""))
        else:
            
            hours = int(split_len[0].replace('h',""))
            mins  = int(split_len[1].replace('m',""))
        
        data.loc[i,'Duration_Type'] = (hours*60 + mins)/1440
        
    return data

In [1700]:
def flight_hours(data):
    data['Fly_Hours'] = 0
    
    data_Dep_Time = data['Dep_Time'].str.split(":",n=1,expand=True).astype('int64')
    data_Dep_Time.columns = ['Hours','Mins']
    for i in range(0,len(data)):
        
        hr = data_Dep_Time.loc[i, 'Hours']
        min = data_Dep_Time.loc[i, 'Mins']
        data.loc[i,'Fly_Hours'] = (hr*60 + (min))/1440
            
    return data

In [1701]:
def arrival_hours(data):
    data['Arrival_Hours'] = 0
    for i in range(0,len(data)):
      
        data_Arr_Time = data.loc[i,'Arrival_Time'].split(" ")[0]
        data_Dep_Time = data_Arr_Time.split(":")
        
        hr = int(data_Dep_Time[0])
        min = int(data_Dep_Time[1])
        data.loc[i,'Arrival_Hours'] = (hr*60 + (min))/1440
            
    return data

In [1702]:
def log_transform(data):
    return np.log10(data)

# Data cleaning and Modeling starts here

In [1703]:
data = pd.read_excel('C:\\Users\\LENOVO\\Desktop\\Flight_Ticket_Participant_Datasets\\Data_Train.xlsx')

In [1704]:
test = pd.read_excel('C:\\Users\\LENOVO\\Desktop\\Flight_Ticket_Participant_Datasets\\Test_set.xlsx')

In [1705]:
data_copy = data
test_copy = test

In [1706]:
data_copy.shape

(10683, 11)

In [1707]:
#data_copy.drop_duplicates(keep='first', inplace=True)

In [1708]:
data_copy.shape

(10683, 11)

In [1709]:
#Drop anamolies.
#data_copy = data_copy[data_copy['Price'] > data["Price"].quantile(0.01)]
#data_copy = data_copy[data_copy['Price'] < data["Price"].quantile(0.99)]
data_copy = data_copy[data_copy['Airline'] != 'Trujet']
data_copy = data_copy[data_copy['Additional_Info'] != 'Red-eye flight']

In [1710]:
#data_copy.loc[data_copy['Airline'] == 'Jet Airways Business','Airline'] = 'Jet Airways'
#data_copy.loc[data_copy['Airline'] == 'Multiple carriers Premium economy','Airline'] = 'Multiple carriers'
#data_copy.loc[data_copy['Airline'] == 'Vistara Premium economy','Airline'] = 'Vistara'

In [1711]:
#data_copy.loc[data_copy['Destination'] == 'New Delhi','Destination'] = 'Delhi'

In [1712]:
data_copy.reset_index(inplace=True)

In [1713]:
data_copy = onehotencode(data_copy,'Airline',"Airline_")

In [1714]:
data_copy = onehotencode(data_copy,'Source',"Source_")

In [1715]:
data_copy = onehotencode(data_copy,'Destination',"Destination_")

In [1716]:
#handle No Info, No info values before one hot encoding.
data_copy.loc[data_copy['Additional_Info'] == 'No info','Additional_Info'] = 'No Info'
data_copy.loc[data_copy['Additional_Info'] == '1 Long layover','Additional_Info'] = 'Layover'
data_copy.loc[data_copy['Additional_Info'] == '1 Short layover','Additional_Info'] = 'Layover'
data_copy.loc[data_copy['Additional_Info'] == '2 Long layover','Additional_Info'] = 'Layover'
data_copy.loc[data_copy['Additional_Info'] == 'Business class','Additional_Info'] = 'Business_Class'
data_copy.loc[data_copy['Additional_Info'] == 'Change airports','Additional_Info'] = 'Airport_Changed'


data_copy = onehotencode(data_copy,'Additional_Info',"Additional_Info_")

In [1717]:
data_copy = encode_total_stops(data_copy)

In [1718]:
data_copy = extract_journey_data(data_copy)

In [1719]:
data_copy = onehotencode(data_copy,'Journey_day_of_week',"Dayis_")

In [1720]:
data_copy['Month_Start'] = 0
data_copy.loc[data_copy['Journey_Day'] == 1,'Month_Start'] = 1
data_copy.loc[data_copy['Journey_Day'] == 3,'Month_Start'] = 1
data_copy.loc[data_copy['Journey_Day'] == 6,'Month_Start'] = 1
data_copy.loc[data_copy['Journey_Day'] == 9,'Month_Start'] = 1

In [1721]:
data_copy = encode_duration(data_copy)

In [1722]:
data_copy = flight_hours(data_copy)

In [1723]:
data_copy = arrival_hours(data_copy)

In [1724]:
data_copy = onehotencode(data_copy,'Journey_Month',"Month_")

In [1725]:
data_final = data_copy

In [1726]:
data_copy.head(2)

Unnamed: 0,index,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Airline__Air Asia,Airline__Air India,Airline__GoAir,Airline__IndiGo,Airline__Jet Airways,Airline__Jet Airways Business,Airline__Multiple carriers,Airline__Multiple carriers Premium economy,Airline__SpiceJet,Airline__Vistara,Airline__Vistara Premium economy,Source__Banglore,Source__Chennai,Source__Delhi,Source__Kolkata,Source__Mumbai,Destination__Banglore,Destination__Cochin,Destination__Delhi,Destination__Hyderabad,Destination__Kolkata,Destination__New Delhi,Additional_Info__Airport_Changed,Additional_Info__Business_Class,Additional_Info__In-flight meal not included,Additional_Info__Layover,Additional_Info__No Info,Additional_Info__No check-in baggage included,Encoded_stops,Journey_Day,Journey_Month,Journey_day_of_week,Journey_over_weekend,Date_Lapse,Dayis__Friday,Dayis__Monday,Dayis__Saturday,Dayis__Sunday,Dayis__Thursday,Dayis__Tuesday,Dayis__Wednesday,Month_Start,Duration_Type,Fly_Hours,Arrival_Hours,Month__3,Month__4,Month__5,Month__6
0,0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No Info,3897,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,24,3,Sunday,1,0.194915,0,0,0,1,0,0,0,0,0.118056,0.930556,0.048611,1,0,0,0
1,1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No Info,7662,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,2,1,5,Wednesday,0,0.516949,0,0,0,0,0,0,1,1,0.309028,0.243056,0.552083,0,0,1,0


In [1727]:
data_final.drop(['Airline','Date_of_Journey','Source','Destination','Route','Dep_Time','Arrival_Time','Duration',
                'Total_Stops','Additional_Info','Journey_day_of_week','Journey_Month'],axis=1,inplace=True)

In [1728]:
data_final.drop(['index'],axis=1,inplace=True)

In [1729]:
train_Feature = data_copy[[x for x in data_final.columns if (x != 'Price' and x != 'index')]]
train_Target = data_copy['Price']

In [1730]:
X_train, X_test, y_train, y_test = train_test_split(train_Feature, train_Target, test_size=0.30, random_state=101)

In [1731]:
null_columns=X_train.columns[X_train.isnull().any()]
X_train[null_columns].isnull().sum()

Series([], dtype: float64)

In [1732]:
X_train.head()

Unnamed: 0,Airline__Air Asia,Airline__Air India,Airline__GoAir,Airline__IndiGo,Airline__Jet Airways,Airline__Jet Airways Business,Airline__Multiple carriers,Airline__Multiple carriers Premium economy,Airline__SpiceJet,Airline__Vistara,Airline__Vistara Premium economy,Source__Banglore,Source__Chennai,Source__Delhi,Source__Kolkata,Source__Mumbai,Destination__Banglore,Destination__Cochin,Destination__Delhi,Destination__Hyderabad,Destination__Kolkata,Destination__New Delhi,Additional_Info__Airport_Changed,Additional_Info__Business_Class,Additional_Info__In-flight meal not included,Additional_Info__Layover,Additional_Info__No Info,Additional_Info__No check-in baggage included,Encoded_stops,Journey_Day,Journey_over_weekend,Date_Lapse,Dayis__Friday,Dayis__Monday,Dayis__Saturday,Dayis__Sunday,Dayis__Thursday,Dayis__Tuesday,Dayis__Wednesday,Month_Start,Duration_Type,Fly_Hours,Arrival_Hours,Month__3,Month__4,Month__5,Month__6
9541,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,27,1,0.483051,0,0,1,0,0,0,0,0,0.131944,0.611111,0.743056,0,1,0,0
3798,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,2,21,0,0.169492,0,0,0,0,1,0,0,0,1.083333,0.71875,0.802083,1,0,0,0
7863,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,27,0,1.0,0,0,0,0,1,0,0,0,0.121528,0.465278,0.586806,0,0,0,1
3750,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,15,1,0.898305,0,0,1,0,0,0,0,0,0.104167,0.829861,0.934028,0,0,0,1
6726,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0.516949,0,0,0,0,0,0,1,1,0.0625,0.381944,0.444444,0,0,1,0


In [1733]:
scaler = StandardScaler()

In [1734]:
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=train_Feature.columns)

In [1735]:
X_test =  pd.DataFrame(scaler.transform(X_test),columns=train_Feature.columns)

# Final Prediction on Test set starts here

In [1736]:
#test_copy.loc[test_copy['Airline'] == 'Jet Airways Business','Airline'] = 'Jet Airways'
#test_copy.loc[test_copy['Airline'] == 'Multiple carriers Premium economy','Airline'] = 'Multiple carriers'
#test_copy.loc[test_copy['Airline'] == 'Vistara Premium economy','Airline'] = 'Vistara'

In [1737]:
test_copy = onehotencode(test_copy,'Airline',"Airline_")

In [1738]:
test_copy = onehotencode(test_copy,'Source',"Source_")

In [1739]:
test_copy = onehotencode(test_copy,'Destination',"Destination_")

In [1740]:
#handle No Info, No info values before one hot encoding.
test_copy.loc[test_copy['Additional_Info'] == 'No info','Additional_Info'] = 'No Info'
test_copy.loc[test_copy['Additional_Info'] == '1 Long layover','Additional_Info'] = 'Layover'
test_copy.loc[test_copy['Additional_Info'] == '1 Short layover','Additional_Info'] = 'Layover'
test_copy.loc[test_copy['Additional_Info'] == '1 Long layover','Additional_Info'] = 'Layover'
test_copy.loc[test_copy['Additional_Info'] == 'Business class','Additional_Info'] = 'Business_Class'
test_copy.loc[test_copy['Additional_Info'] == 'Change airports','Additional_Info'] = 'Airport_Changed'

test_copy = onehotencode(test_copy,'Additional_Info',"Additional_Info_")

In [1741]:
test_copy = encode_total_stops(test_copy)

In [1742]:
test_copy = extract_journey_data(test_copy)

In [1743]:
test_copy['Month_Start'] = 0
test_copy.loc[test_copy['Journey_Day'] == 1,'Month_Start'] = 1
test_copy.loc[test_copy['Journey_Day'] == 3,'Month_Start'] = 1
test_copy.loc[test_copy['Journey_Day'] == 6,'Month_Start'] = 1
test_copy.loc[test_copy['Journey_Day'] == 9,'Month_Start'] = 1

In [1744]:
test_copy = encode_duration(test_copy)

In [1745]:
test_copy = flight_hours(test_copy)

In [1746]:
test_copy = arrival_hours(test_copy)

In [1747]:
test_copy = onehotencode(test_copy,'Journey_day_of_week',"Dayis_")

In [1748]:
test_copy = onehotencode(test_copy,'Journey_Month',"Month_")

In [1749]:
test_final = test_copy

In [1750]:
test_copy.shape

(2671, 59)

In [1751]:
[X for X in data_copy.columns if X not in test_final.columns]

['Price']

In [1752]:
data_copy.head()

Unnamed: 0,Price,Airline__Air Asia,Airline__Air India,Airline__GoAir,Airline__IndiGo,Airline__Jet Airways,Airline__Jet Airways Business,Airline__Multiple carriers,Airline__Multiple carriers Premium economy,Airline__SpiceJet,Airline__Vistara,Airline__Vistara Premium economy,Source__Banglore,Source__Chennai,Source__Delhi,Source__Kolkata,Source__Mumbai,Destination__Banglore,Destination__Cochin,Destination__Delhi,Destination__Hyderabad,Destination__Kolkata,Destination__New Delhi,Additional_Info__Airport_Changed,Additional_Info__Business_Class,Additional_Info__In-flight meal not included,Additional_Info__Layover,Additional_Info__No Info,Additional_Info__No check-in baggage included,Encoded_stops,Journey_Day,Journey_over_weekend,Date_Lapse,Dayis__Friday,Dayis__Monday,Dayis__Saturday,Dayis__Sunday,Dayis__Thursday,Dayis__Tuesday,Dayis__Wednesday,Month_Start,Duration_Type,Fly_Hours,Arrival_Hours,Month__3,Month__4,Month__5,Month__6
0,3897,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,24,1,0.194915,0,0,0,1,0,0,0,0,0.118056,0.930556,0.048611,1,0,0,0
1,7662,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,2,1,0,0.516949,0,0,0,0,0,0,1,1,0.309028,0.243056,0.552083,0,0,1,0
2,13882,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,2,9,1,0.847458,0,0,0,1,0,0,0,1,0.791667,0.392361,0.184028,0,0,0,1
3,6218,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,12,1,0.610169,0,0,0,1,0,0,0,0,0.225694,0.753472,0.979167,0,0,1,0
4,13302,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0.0,1,0,0,0,0,0,0,1,0.197917,0.701389,0.899306,1,0,0,0


In [1753]:
test_final.drop(['Airline','Date_of_Journey','Source','Destination','Route','Dep_Time','Arrival_Time','Duration',
                'Total_Stops','Additional_Info','Journey_day_of_week','Journey_Month'],axis=1,inplace=True)

In [1754]:
test_final =  pd.DataFrame(scaler.transform(test_final),columns=test_final.columns)

In [1755]:
test_final.shape

(2671, 47)

In [1756]:
train_Feature.shape

(10681, 47)

In [1757]:
y_train_log =  y_train.apply(log_transform)

In [1758]:
y_test_log =  y_test.apply(log_transform)

##Random Forest

In [1759]:
#X_train = X_train.loc[:,['Airline__Air Asia','Airline__Air India','Airline__GoAir',
#                     'Airline__IndiGo','Airline__Multiple carriers','Airline__Vistara',
#                     'Additional_Info__Airport_Changed','Additional_Info__Business_Class',
#                     'Encoded_stops','Dayis__Friday','Dayis__Monday','Dayis__Thursday',
#                     'Dayis__Tuesday','Month_Start','Duration_Type','Fly_Hours','Arrival_Hours']]

In [1760]:
#X_test = X_test.loc[:,['Airline__Air Asia','Airline__Air India','Airline__GoAir',
#                     'Airline__IndiGo','Airline__Multiple carriers','Airline__Vistara',
#                     'Additional_Info__Airport_Changed','Additional_Info__Business_Class',
#                     'Encoded_stops','Dayis__Friday','Dayis__Monday','Dayis__Thursday',
#                     'Dayis__Tuesday','Month_Start','Duration_Type','Fly_Hours','Arrival_Hours']]

In [1761]:
#test_final = test_final.loc[:,['Airline__Air Asia','Airline__Air India','Airline__GoAir',
#                     'Airline__IndiGo','Airline__Multiple carriers','Airline__Vistara',
#                    'Additional_Info__Airport_Changed','Additional_Info__Business_Class',
#                     'Encoded_stops','Dayis__Friday','Dayis__Monday','Dayis__Thursday',
#                     'Dayis__Tuesday','Month_Start','Duration_Type','Fly_Hours','Arrival_Hours']]

In [1762]:
rf = RandomForestRegressor(random_state=101,
                          bootstrap=True,max_depth=12,
                           max_features='auto',min_samples_leaf=4,
                           min_samples_split=5,n_estimators=100)

In [1763]:
X_train.head()

Unnamed: 0,Airline__Air Asia,Airline__Air India,Airline__GoAir,Airline__IndiGo,Airline__Jet Airways,Airline__Jet Airways Business,Airline__Multiple carriers,Airline__Multiple carriers Premium economy,Airline__SpiceJet,Airline__Vistara,Airline__Vistara Premium economy,Source__Banglore,Source__Chennai,Source__Delhi,Source__Kolkata,Source__Mumbai,Destination__Banglore,Destination__Cochin,Destination__Delhi,Destination__Hyderabad,Destination__Kolkata,Destination__New Delhi,Additional_Info__Airport_Changed,Additional_Info__Business_Class,Additional_Info__In-flight meal not included,Additional_Info__Layover,Additional_Info__No Info,Additional_Info__No check-in baggage included,Encoded_stops,Journey_Day,Journey_over_weekend,Date_Lapse,Dayis__Friday,Dayis__Monday,Dayis__Saturday,Dayis__Sunday,Dayis__Thursday,Dayis__Tuesday,Dayis__Wednesday,Month_Start,Duration_Type,Fly_Hours,Arrival_Hours,Month__3,Month__4,Month__5,Month__6
0,-0.170421,-0.444843,-0.131458,-0.489485,-0.752536,-0.016358,-0.356023,-0.036598,-0.281799,4.505457,-0.011566,-0.511637,-0.189818,1.163256,-0.602899,-0.266283,-0.602899,1.163256,-0.365729,-0.266283,-0.189818,-0.313498,-0.020036,-0.016358,-0.476426,-0.043315,0.525343,-0.172076,-1.221631,1.604183,1.54761,-0.214522,-0.306761,-0.461782,2.346599,-0.404898,-0.45044,-0.300719,-0.487594,-0.877909,-0.89394,0.315136,0.592942,-0.584764,2.979232,-0.691626,-0.686559
1,-0.170421,2.247986,-0.131458,-0.489485,-0.752536,-0.016358,-0.356023,-0.036598,-0.281799,-0.221953,-0.011566,-0.511637,-0.189818,1.163256,-0.602899,-0.266283,-0.602899,1.163256,-0.365729,-0.266283,-0.189818,-0.313498,-0.020036,-0.016358,-0.476426,-0.043315,0.525343,-0.172076,1.738315,0.893206,-0.646158,-1.230589,-0.306761,-0.461782,-0.426149,-0.404898,2.220051,-0.300719,-0.487594,-0.877909,1.796806,0.764349,0.800699,1.71009,-0.335657,-0.691626,-0.686559
2,-0.170421,-0.444843,-0.131458,-0.489485,1.32884,-0.016358,-0.356023,-0.036598,-0.281799,-0.221953,-0.011566,1.954512,-0.189818,-0.859656,-0.602899,-0.266283,-0.602899,-0.859656,2.734262,-0.266283,-0.189818,-0.313498,-0.020036,-0.016358,2.098962,-0.043315,-1.903517,-0.172076,-1.221631,1.604183,-0.646158,1.460616,-0.306761,-0.461782,-0.426149,-0.404898,2.220051,-0.300719,-0.487594,-0.877909,-0.923401,-0.293474,0.042994,-0.584764,-0.335657,-0.691626,1.456538
3,5.867817,-0.444843,-0.131458,-0.489485,-0.752536,-0.016358,-0.356023,-0.036598,-0.281799,-0.221953,-0.011566,-0.511637,-0.189818,-0.859656,1.658653,-0.266283,1.658653,-0.859656,-0.365729,-0.266283,-0.189818,-0.313498,-0.020036,-0.016358,-0.476426,-0.043315,0.525343,-0.172076,-1.221631,0.18223,1.54761,1.131081,-0.306761,-0.461782,2.346599,-0.404898,-0.45044,-0.300719,-0.487594,-0.877909,-0.972502,1.228051,1.265099,-0.584764,-0.335657,-0.691626,1.456538
4,-0.170421,-0.444843,-0.131458,2.042964,-0.752536,-0.016358,-0.356023,-0.036598,-0.281799,-0.221953,-0.011566,-0.511637,-0.189818,-0.859656,-0.602899,3.7554,-0.602899,-0.859656,-0.365729,3.7554,-0.189818,-0.313498,-0.020036,-0.016358,-0.476426,-0.043315,0.525343,-0.172076,-1.221631,-1.476716,-0.646158,-0.104677,-0.306761,-0.461782,-0.426149,-0.404898,-0.45044,-0.300719,2.050885,1.13907,-1.090345,-0.641251,-0.458069,-0.584764,-0.335657,1.445868,-0.686559


In [1764]:
rf.fit(X_train, y_train_log)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=101, verbose=0, warm_start=False)

In [1765]:
np.sqrt(mean_squared_log_error(y_test,10**rf.predict(X_test)))

0.1407015124527019

In [1766]:
np.sqrt(mean_squared_log_error(y_train,10**rf.predict(X_train)))

0.11611850135127433

In [1767]:
test_pred = rf.predict(test_final)

In [1768]:
test_pred = pd.DataFrame(10**test_pred,columns=['Price'])

In [1769]:
test_pred.to_excel("C:/Users/LENOVO/Desktop/RandomForest_v1.xlsx",index=False)

In [1770]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [10,20,35,50],
    'n_estimators': [200,400,600,800,1000]
}

In [1771]:
rf = RandomForestRegressor()

In [1772]:
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [1773]:
grid_search.fit(X_train, y_train_log)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 11.9min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [200, 400, 600, 800, 1000], 'max_depth': [10, 20, 35, 50], 'bootstrap': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [1774]:
grid_search.best_params_

{'bootstrap': True, 'max_depth': 35, 'n_estimators': 200}

In [1775]:
np.sqrt(mean_squared_log_error(y_test,10**grid_search.predict(X_test)))

0.12254197530298852

In [1776]:
np.sqrt(mean_squared_log_error(y_train,10**grid_search.predict(X_train)))

0.053537698409291262

In [1777]:
test_pred = 10**grid_search.predict(test_final)

In [1778]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])

In [1785]:
test_pred.to_excel("C:/Users/LENOVO/Desktop/FinalOutput.xlsx",index=False)

In [1786]:
for i in zip(X_train.columns,grid_search.best_estimator_.feature_importances_):
    print (i)

('Airline__Air Asia', 0.0017665909542312174)
('Airline__Air India', 0.0075490430061892679)
('Airline__GoAir', 0.0015146364648180278)
('Airline__IndiGo', 0.0093781561917556343)
('Airline__Jet Airways', 0.060738411563027291)
('Airline__Jet Airways Business', 0.0012044165118728072)
('Airline__Multiple carriers', 0.017594707067675016)
('Airline__Multiple carriers Premium economy', 0.0011430142918755417)
('Airline__SpiceJet', 0.0058009378602809939)
('Airline__Vistara', 0.0065774394162790104)
('Airline__Vistara Premium economy', 3.9221711745292223e-05)
('Source__Banglore', 0.00096912715285481189)
('Source__Chennai', 0.00090023047646028813)
('Source__Delhi', 0.0020198442726403433)
('Source__Kolkata', 0.0035210266346373271)
('Source__Mumbai', 0.00241998991808019)
('Destination__Banglore', 0.003654471362548518)
('Destination__Cochin', 0.0019959059153145521)
('Destination__Delhi', 0.0010942645951736313)
('Destination__Hyderabad', 0.0027754482738147988)
('Destination__Kolkata', 0.0007527825049992

In [1781]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [1782]:
data_copy1 = data_copy

In [1783]:
data_copy2 = add_constant(data_copy1)

In [1784]:
pd.Series([variance_inflation_factor(data_copy2.values, i) 
               for i in range(data_copy2.shape[1])], 
              index=data_copy2.columns)

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


const                                            0.000000
Price                                            3.506180
Airline__Air Asia                                     inf
Airline__Air India                                    inf
Airline__GoAir                                        inf
Airline__IndiGo                                       inf
Airline__Jet Airways                                  inf
Airline__Jet Airways Business                         inf
Airline__Multiple carriers                            inf
Airline__Multiple carriers Premium economy            inf
Airline__SpiceJet                                     inf
Airline__Vistara                                      inf
Airline__Vistara Premium economy                      inf
Source__Banglore                                      inf
Source__Chennai                                       inf
Source__Delhi                                         inf
Source__Kolkata                                       inf
Source__Mumbai