In [120]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import pandas_profiling as pp
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
import xgboost
from sklearn.preprocessing import MinMaxScaler

In [121]:
pd.set_option('display.max_columns', 1000)

In [122]:
def onehotencode(data,col_name,prefix):
    """Return ``data`` with indicator columns for ``data[col_name]`` appended.

    One column per distinct value is produced by ``pd.get_dummies`` (named
    ``<prefix>_<value>``) and concatenated to the right of the existing
    columns. The source column is kept and ``data`` itself is not modified.
    """
    indicator_columns = pd.get_dummies(data[col_name], prefix=prefix)
    combined = pd.concat([data, indicator_columns], axis=1)
    return combined

In [123]:
def encode_total_stops(data):
    """Add an ``Encoded_stops`` float column derived from ``Total_Stops``.

    'non-stop' -> 0.0, '1 stop' -> 0.1, '2 stops' -> 0.2, '3 stops' -> 0.3,
    '4 stops' -> 0.4. Any other value (including missing) is encoded as 0.0.

    The frame is modified in place and also returned. The lookup is done
    column-wise, so it works for any row index, not only 0..n-1.
    """
    stop_codes = {
        'non-stop': 0.0,
        '1 stop': 0.1,
        '2 stops': 0.2,
        '3 stops': 0.3,
        '4 stops': 0.4,
    }
    # Unmapped / missing labels come back as NaN from map(); keep the 0.0 default.
    data['Encoded_stops'] = data['Total_Stops'].map(stop_codes).fillna(0.0).astype(float)

    return data

In [124]:
def extract_journey_data(data):
    """Add ``Journey_Month`` and ``Journey_over_weekend`` from ``Date_of_Journey``.

    ``Date_of_Journey`` is parsed as day/month/year. ``Journey_Month`` is the
    calendar month number; ``Journey_over_weekend`` is 1 for Saturday/Sunday
    and 0 otherwise. The frame is modified in place and also returned.
    """
    # Parse once and reuse; both features come from the same timestamps.
    journey_date = pd.to_datetime(data['Date_of_Journey'], format='%d/%m/%Y')

    data['Journey_Month'] = journey_date.dt.month
    # dayofweek is 0 (Mon) .. 6 (Sun), so integer-dividing by 5 flags Sat/Sun.
    data['Journey_over_weekend'] = (journey_date.dt.dayofweek // 5).astype('int')

    return data

In [125]:
def _leading_hours(duration, fallback=4):
    """Return the integer in front of the first 'h' of ``duration``.

    ``fallback`` is returned when no integer can be read (for example the
    value '5m', or a missing value).
    """
    try:
        return int(str(duration.split('h')[0]))
    except (AttributeError, ValueError):
        # this is for AirIndia for with flight duration as '5m', changing the duration
        # based on same row for AirIndia for same destination at same time with same stop overs.
        return fallback


def encode_duration(data):
    """Add a ``Duration_Type`` float column bucketing the hours in ``Duration``.

    Hours < 3 -> 0.0, < 6 -> 0.1, < 12 -> 0.2, otherwise 0.3. Durations whose
    hour part cannot be read are treated as 4 hours.

    Only the frame passed in is written to (no module-level frame is touched),
    and the result does not depend on the row index being 0..n-1. The frame is
    modified in place and also returned.
    """
    hours = data['Duration'].map(_leading_hours).astype('int64')

    # Conditions are evaluated in order, so each one only needs its upper bound.
    data['Duration_Type'] = np.select(
        [hours < 3, hours < 6, hours < 12],
        [0.0, 0.1, 0.2],
        default=0.3,
    )

    return data

In [126]:
def flight_hours(data):
    """Add a ``Fly_Hours`` column naming the part of day of ``Dep_Time``.

    ``Dep_Time`` is expected as 'HH:MM'; only the hour is used:
    0 -> 'Midnight', 1-3 -> 'Early_Morning', 4-5 -> 'Dawn', 6-8 -> 'Morning',
    9-11 -> 'Mid_Morning', 12 -> 'Noon', 13-15 -> 'After_Noon',
    16-20 -> 'Evening', 21-23 -> 'Night'. An hour outside 0-23 is left as 0.

    The frame is modified in place and also returned; the lookup is
    column-wise, so any row index is supported.
    """
    # (first hour, one-past-last hour, label)
    bands = (
        (0, 1, 'Midnight'),
        (1, 4, 'Early_Morning'),
        (4, 6, 'Dawn'),
        (6, 9, 'Morning'),
        (9, 12, 'Mid_Morning'),
        (12, 13, 'Noon'),
        (13, 16, 'After_Noon'),
        (16, 21, 'Evening'),
        (21, 24, 'Night'),
    )
    label_by_hour = {
        hour: label
        for first, stop, label in bands
        for hour in range(first, stop)
    }

    dep_hour = data['Dep_Time'].str.split(":", n=1).str[0].astype('int64')
    data['Fly_Hours'] = dep_hour.map(lambda hour: label_by_hour.get(hour, 0))

    return data

# Data cleaning and Modeling starts here

In [127]:
# Load the training workbook.
# NOTE(review): this is a machine-specific absolute Windows path; the cell only runs
# where the file exists at exactly this location.
data = pd.read_excel('C:\\Users\\LENOVO\\Desktop\\Flight_Ticket_Participant_Datasets\\Data_Train.xlsx')

In [128]:
# Load the workbook of rows to predict on.
# NOTE(review): this is a machine-specific absolute Windows path; the cell only runs
# where the file exists at exactly this location.
test = pd.read_excel('C:\\Users\\LENOVO\\Desktop\\Flight_Ticket_Participant_Datasets\\Test_set.xlsx')

In [131]:
# Take real copies: plain assignment would only create a second name for the same
# DataFrame, so in-place edits made later would also change the frames as loaded.
data_copy = data.copy()
test_copy = test.copy()

In [132]:
# Drop anomalies.
# The two commented-out lines below would additionally trim prices outside the
# 1st-99th percentile range; they are currently disabled.
#data_copy = data_copy[data_copy['Price'] > data["Price"].quantile(0.01)]
#data_copy = data_copy[data_copy['Price'] < data["Price"].quantile(0.99)]
# Remove training rows for the 'Trujet' airline and for 'Red-eye flight' info.
# NOTE(review): presumably these categories are too rare to model or absent
# from the test set — TODO confirm.
data_copy = data_copy[data_copy['Airline'] != 'Trujet']
data_copy = data_copy[data_copy['Additional_Info'] != 'Red-eye flight']

In [133]:
# Renumber rows 0..n-1 after the filtering above. The previous labels are kept in
# an 'index' column. Reassigning (rather than inplace=True) avoids mutating a frame
# that was produced by boolean filtering.
data_copy = data_copy.reset_index()

In [134]:
data_copy = onehotencode(data_copy,'Airline',"Airline_")

In [135]:
data_copy.columns

Index(['index', 'Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price', 'Airline__Air Asia', 'Airline__Air India',
       'Airline__GoAir', 'Airline__IndiGo', 'Airline__Jet Airways',
       'Airline__Jet Airways Business', 'Airline__Multiple carriers',
       'Airline__Multiple carriers Premium economy', 'Airline__SpiceJet',
       'Airline__Vistara', 'Airline__Vistara Premium economy'],
      dtype='object')

In [136]:
data_check = onehotencode(data,'Airline',"Airline_")

In [137]:
data_copy = onehotencode(data_copy,'Source',"Source_")

In [138]:
data_copy = onehotencode(data_copy,'Destination',"Destination_")

In [139]:
# Normalise Additional_Info before one-hot encoding: unify the two spellings of
# "no info" and fold the listed categories into a single 'Others' bucket.
data_copy['Additional_Info'] = data_copy['Additional_Info'].replace({
    'No info': 'No Info',
    '1 Long layover': 'Others',
    '1 Short layover': 'Others',
    '2 Long layover': 'Others',
    'Business class': 'Others',
    'Change airports': 'Others',
})


data_copy = onehotencode(data_copy,'Additional_Info',"Additional_Info_")

In [140]:
data_copy = encode_total_stops(data_copy)

In [141]:
data_copy = extract_journey_data(data_copy)

In [142]:
data_copy = encode_duration(data_copy)

In [143]:
data_copy = flight_hours(data_copy)

In [144]:
data_copy = onehotencode(data_copy,'Fly_Hours',"Fly_Hours_")

In [145]:
data_final = data_copy

In [146]:
# Set Encoded_stops to 0.3 wherever it is null.
# NOTE(review): encode_total_stops fills Encoded_stops with 0.0 for every row before
# assigning the per-stop codes, so this null filter appears unable to match any row.
# If the intent was to handle rows whose Total_Stops is missing, that needs to be
# checked on Total_Stops instead — TODO confirm.
data_final.loc[data_final['Encoded_stops'].isnull(),'Encoded_stops'] = 0.3

In [147]:
data_final.drop(['Airline','Date_of_Journey','Source','Destination','Route','Dep_Time','Arrival_Time','Duration',
                'Total_Stops','Additional_Info','Fly_Hours'],axis=1,inplace=True)

In [148]:
data_final.drop(['index'],axis=1,inplace=True)

In [149]:
train_Feature = data_copy[[x for x in data_final.columns if (x != 'Price' and x != 'index')]]
train_Target = data_copy['Price']

In [150]:
train_Feature.columns

Index(['Airline__Air Asia', 'Airline__Air India', 'Airline__GoAir',
       'Airline__IndiGo', 'Airline__Jet Airways',
       'Airline__Jet Airways Business', 'Airline__Multiple carriers',
       'Airline__Multiple carriers Premium economy', 'Airline__SpiceJet',
       'Airline__Vistara', 'Airline__Vistara Premium economy',
       'Source__Banglore', 'Source__Chennai', 'Source__Delhi',
       'Source__Kolkata', 'Source__Mumbai', 'Destination__Banglore',
       'Destination__Cochin', 'Destination__Delhi', 'Destination__Hyderabad',
       'Destination__Kolkata', 'Destination__New Delhi',
       'Additional_Info__In-flight meal not included',
       'Additional_Info__No Info',
       'Additional_Info__No check-in baggage included',
       'Additional_Info__Others', 'Encoded_stops', 'Journey_Month',
       'Journey_over_weekend', 'Duration_Type', 'Fly_Hours__After_Noon',
       'Fly_Hours__Dawn', 'Fly_Hours__Early_Morning', 'Fly_Hours__Evening',
       'Fly_Hours__Mid_Morning', 'Fly_Hours__

In [151]:
X_train, X_test, y_train, y_test = train_test_split(train_Feature, train_Target, test_size=0.30, random_state=101)

In [152]:
null_columns=X_train.columns[X_train.isnull().any()]
X_train[null_columns].isnull().sum()

Series([], dtype: float64)

In [153]:
X_train.head()

Unnamed: 0,Airline__Air Asia,Airline__Air India,Airline__GoAir,Airline__IndiGo,Airline__Jet Airways,Airline__Jet Airways Business,Airline__Multiple carriers,Airline__Multiple carriers Premium economy,Airline__SpiceJet,Airline__Vistara,Airline__Vistara Premium economy,Source__Banglore,Source__Chennai,Source__Delhi,Source__Kolkata,Source__Mumbai,Destination__Banglore,Destination__Cochin,Destination__Delhi,Destination__Hyderabad,Destination__Kolkata,Destination__New Delhi,Additional_Info__In-flight meal not included,Additional_Info__No Info,Additional_Info__No check-in baggage included,Additional_Info__Others,Encoded_stops,Journey_Month,Journey_over_weekend,Duration_Type,Fly_Hours__After_Noon,Fly_Hours__Dawn,Fly_Hours__Early_Morning,Fly_Hours__Evening,Fly_Hours__Mid_Morning,Fly_Hours__Midnight,Fly_Hours__Morning,Fly_Hours__Night,Fly_Hours__Noon
9541,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0.0,4,1,0.1,1,0,0,0,0,0,0,0,0
3798,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0.2,3,0,0.3,0,0,0,1,0,0,0,0,0
7863,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0.0,6,0,0.0,0,0,0,0,1,0,0,0,0
3750,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0.0,6,1,0.0,0,0,0,1,0,0,0,0,0
6726,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0.0,5,0,0.0,0,0,0,0,1,0,0,0,0


In [154]:
scaler = MinMaxScaler()

In [155]:
# Fit the scaler on the training split only. The original row labels are carried
# over so the scaled frame stays index-aligned with y_train (rebuilding the frame
# without `index=` would silently renumber the rows 0..n-1).
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=train_Feature.columns, index=X_train.index)

In [157]:
# Apply the already-fitted scaler to the hold-out split (transform only, no refit).
# The original row labels are carried over so the frame stays index-aligned with y_test.
X_test = pd.DataFrame(scaler.transform(X_test), columns=train_Feature.columns, index=X_test.index)

In [159]:
# Ordinary least squares without an intercept term.
# `normalize=True` is intentionally not passed: scikit-learn documents it as ignored
# when fit_intercept=False, and the argument has been removed from newer releases,
# where passing it raises an error.
# NOTE(review): every one-hot group is fully encoded (no dropped level), so the
# feature columns are linearly dependent; very large, offsetting coefficients on
# the dummy columns are expected from this fit.
reg = LinearRegression(fit_intercept=False)

In [160]:
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=True)

In [161]:
reg.coef_

array([ -3.07602085e+15,  -3.07602085e+15,  -3.07602085e+15,
        -3.07602085e+15,  -3.07602085e+15,  -3.07602085e+15,
        -3.07602085e+15,  -3.07602085e+15,  -3.07602085e+15,
        -3.07602085e+15,  -3.07602085e+15,  -4.33038778e+15,
        -3.50280708e+15,  -5.80637915e+15,  -1.81995586e+15,
         2.30556611e+15,  -3.37007010e+15,   6.16353195e+14,
        -8.59638176e+14,  -7.49559207e+15,  -1.68721888e+15,
        -8.59638176e+14,   3.75792159e+15,   3.75792159e+15,
         3.75792159e+15,   3.75792159e+15,   9.22850000e+03,
        -1.03675000e+03,   3.84062500e+02,   1.11850000e+03,
         4.50812521e+15,   4.50812521e+15,   4.50812521e+15,
         4.50812521e+15,   4.50812521e+15,   4.50812521e+15,
         4.50812521e+15,   4.50812521e+15,   4.50812521e+15])

In [162]:
reg.predict(X_test)[reg.predict(X_test) < 0]

array([], dtype=float64)

In [163]:
test_msle = mean_squared_log_error(y_test,reg.predict(X_test))

In [164]:
np.sqrt(test_msle)

0.25809218360684322

In [165]:
train_msle = mean_squared_log_error(y_train,reg.predict(X_train))

In [166]:
np.sqrt(train_msle)

0.25982089282664667

# Final Prediction on Test set starts here

In [167]:
test_copy = onehotencode(test_copy,'Airline',"Airline_")

In [168]:
test_copy = onehotencode(test_copy,'Source',"Source_")

In [169]:
test_copy = onehotencode(test_copy,'Destination',"Destination_")

In [170]:
# Normalise Additional_Info before one-hot encoding, using the same mapping as the
# training data: unify the two spellings of "no info" and fold the listed categories
# into 'Others'. '2 Long layover' is included here; it was previously missed because
# '1 Long layover' was listed twice.
test_copy['Additional_Info'] = test_copy['Additional_Info'].replace({
    'No info': 'No Info',
    '1 Long layover': 'Others',
    '1 Short layover': 'Others',
    '2 Long layover': 'Others',
    'Business class': 'Others',
    'Change airports': 'Others',
})

test_copy = onehotencode(test_copy,'Additional_Info',"Additional_Info_")

In [171]:
test_copy = encode_total_stops(test_copy)

In [172]:
test_copy = extract_journey_data(test_copy)

In [173]:
test_copy = encode_duration(test_copy)

In [174]:
test_copy = flight_hours(test_copy)

In [175]:
test_copy = onehotencode(test_copy,'Fly_Hours',"Fly_Hours_")

In [176]:
test_final = test_copy

In [177]:
test_final.drop(['Airline','Date_of_Journey','Source','Destination','Route','Dep_Time','Arrival_Time','Duration',
                'Total_Stops','Additional_Info','Fly_Hours'],axis=1,inplace=True)

In [178]:
test_copy.columns

Index(['Airline__Air Asia', 'Airline__Air India', 'Airline__GoAir',
       'Airline__IndiGo', 'Airline__Jet Airways',
       'Airline__Jet Airways Business', 'Airline__Multiple carriers',
       'Airline__Multiple carriers Premium economy', 'Airline__SpiceJet',
       'Airline__Vistara', 'Airline__Vistara Premium economy',
       'Source__Banglore', 'Source__Chennai', 'Source__Delhi',
       'Source__Kolkata', 'Source__Mumbai', 'Destination__Banglore',
       'Destination__Cochin', 'Destination__Delhi', 'Destination__Hyderabad',
       'Destination__Kolkata', 'Destination__New Delhi',
       'Additional_Info__In-flight meal not included',
       'Additional_Info__No Info',
       'Additional_Info__No check-in baggage included',
       'Additional_Info__Others', 'Encoded_stops', 'Journey_Month',
       'Journey_over_weekend', 'Duration_Type', 'Fly_Hours__After_Noon',
       'Fly_Hours__Dawn', 'Fly_Hours__Early_Morning', 'Fly_Hours__Evening',
       'Fly_Hours__Mid_Morning', 'Fly_Hours__

In [179]:
train_Feature.columns

Index(['Airline__Air Asia', 'Airline__Air India', 'Airline__GoAir',
       'Airline__IndiGo', 'Airline__Jet Airways',
       'Airline__Jet Airways Business', 'Airline__Multiple carriers',
       'Airline__Multiple carriers Premium economy', 'Airline__SpiceJet',
       'Airline__Vistara', 'Airline__Vistara Premium economy',
       'Source__Banglore', 'Source__Chennai', 'Source__Delhi',
       'Source__Kolkata', 'Source__Mumbai', 'Destination__Banglore',
       'Destination__Cochin', 'Destination__Delhi', 'Destination__Hyderabad',
       'Destination__Kolkata', 'Destination__New Delhi',
       'Additional_Info__In-flight meal not included',
       'Additional_Info__No Info',
       'Additional_Info__No check-in baggage included',
       'Additional_Info__Others', 'Encoded_stops', 'Journey_Month',
       'Journey_over_weekend', 'Duration_Type', 'Fly_Hours__After_Noon',
       'Fly_Hours__Dawn', 'Fly_Hours__Early_Morning', 'Fly_Hours__Evening',
       'Fly_Hours__Mid_Morning', 'Fly_Hours__

In [180]:
test_final.shape

(2671, 39)

In [181]:
train_Feature.shape

(10681, 39)

In [182]:
reg.score(X_train,y_train)

0.66699202889432141

In [183]:
reg.score(X_test, y_test)

0.70849751687926654

In [184]:
np.sum(reg.coef_!=0) #coeff_used

39

In [185]:
# Put the test features into exactly the training column layout before scaling:
# the scaler and the models work by column position, so a category that is missing
# from (or extra in) the test set, or a different column order, would otherwise
# fail or silently feed the wrong feature. Missing indicator columns become 0.
test_final = test_final.reindex(columns=train_Feature.columns, fill_value=0)
test_final = pd.DataFrame(scaler.transform(test_final), columns=train_Feature.columns)

In [186]:
test_pred = reg.predict(test_final)

In [187]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])
test_pred.to_excel("C:/Users/LENOVO/Desktop/LR_v2.xlsx",index=False)

## Lasso Regression

In [188]:
lasso = Lasso(alpha=1.0)
#lasso = Lasso(alpha=0.01, max_iter=10e5)
#lasso = Lasso(alpha=0.05)

In [189]:
lasso.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [190]:
lasso.score(X_train,y_train)

0.66701202855960617

In [191]:
lasso.score(X_test,y_test)

0.70779737241411878

In [192]:
np.sum(lasso.coef_!=0) #coeff_used

33

In [193]:
np.sqrt(mean_squared_log_error(y_test,lasso.predict(X_test)))

0.25845032410539537

In [194]:
np.sqrt(mean_squared_log_error(y_train,lasso.predict(X_train)))

0.26019571913221096

In [195]:
test_pred = lasso.predict(test_final)

In [196]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])

In [197]:
test_pred.to_excel("C:/Users/LENOVO/Desktop/Lasso_v2.xlsx",index=False)

## Ridge Regression

In [198]:
ridge = Ridge(alpha=1.0)
#ridge = Ridge(alpha=0.0001)

In [199]:
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [200]:
ridge.score(X_train,y_train)

0.66492460304511436

In [201]:
ridge.score(X_test,y_test)

0.69878833608426372

In [202]:
np.sum(ridge.coef_!=0) #coeff_used

39

In [203]:
np.sqrt(mean_squared_log_error(y_test,ridge.predict(X_test)))

0.2587877837308209

In [204]:
np.sqrt(mean_squared_log_error(y_train,ridge.predict(X_train)))

0.26056128007135287

In [205]:
test_pred = ridge.predict(test_final)

In [206]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])

In [207]:
test_pred.to_excel("C:/Users/LENOVO/Desktop/Ridge_v2.xlsx",index=False)

## RandomForest

In [208]:
rf = RandomForestRegressor(random_state=101)

In [209]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=101, verbose=0, warm_start=False)

In [210]:
np.sqrt(mean_squared_log_error(y_test,rf.predict(X_test)))

0.20762694284185321

In [211]:
np.sqrt(mean_squared_log_error(y_train,rf.predict(X_train)))

0.16362481797586448

In [212]:
test_pred = rf.predict(test_final)

In [213]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])

In [214]:
test_pred.to_excel("C:/Users/LENOVO/Desktop/RF_v2.xlsx",index=False)

## XGB Regressor

In [215]:
xgb = xgboost.XGBRegressor(seed=101)

In [216]:
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=101,
       silent=True, subsample=1)

In [217]:
np.sqrt(mean_squared_log_error(y_test,xgb.predict(X_test)))

0.22245484371432186

In [218]:
np.sqrt(mean_squared_log_error(y_train,xgb.predict(X_train)))

0.21914058448416485

In [219]:
test_pred = xgb.predict(test_final)

In [220]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])

In [221]:
test_pred.to_excel("C:/Users/LENOVO/Desktop/XGB_v2.xlsx",index=False)