In [232]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import pandas_profiling as pp
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
import xgboost

In [16]:
pd.set_option('display.max_columns', 1000)

In [17]:
def onehotencode(data,col_name,prefix):
    """Return ``data`` with one-hot indicator columns for ``col_name`` appended.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the categorical column; it is not modified.
    col_name : str
        Name of the column to encode.
    prefix : str
        Prefix passed to ``pd.get_dummies``; each new column is named
        ``<prefix>_<category>``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns (``col_name`` included) followed
        by one indicator column per distinct category.
    """
    indicator_columns = pd.get_dummies(data[col_name], prefix=prefix)
    combined = pd.concat([data, indicator_columns], axis=1)
    return combined

In [18]:
def encode_total_stops(data):
    """Add a numeric ``Encoded_stops`` column derived from ``Total_Stops``.

    The text labels are mapped to ``0.0`` ('non-stop'), ``0.1`` ('1 stop'),
    ``0.2`` ('2 stops'), ``0.3`` ('3 stops') and ``0.4`` ('4 stops').
    Any other value, including a missing one, is encoded as ``0.0``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Total_Stops`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``Encoded_stops`` added or overwritten.
    """
    stop_codes = {
        'non-stop': 0.0,
        '1 stop': 0.1,
        '2 stops': 0.2,
        '3 stops': 0.3,
        '4 stops': 0.4,
    }
    # A vectorised lookup works for any index; a positional loop over
    # ``.loc[i]`` only works when the labels happen to be 0..n-1.
    encoded = data['Total_Stops'].map(stop_codes)
    # Unrecognised or missing labels fall back to 0.0.
    data['Encoded_stops'] = encoded.fillna(0.0).astype('float64')

    return data

In [19]:
def extract_journey_data(data):
    """Add ``Journey_Month`` and ``Journey_over_weekend`` from ``Date_of_Journey``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Date_of_Journey`` column of ``dd/mm/YYYY`` strings.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with two extra columns:
        ``Journey_Month`` (calendar month number) and
        ``Journey_over_weekend`` (1 for Saturday/Sunday, otherwise 0).
    """
    # Parse once and reuse the result for both derived columns.
    journey_dates = pd.to_datetime(data['Date_of_Journey'], format='%d/%m/%Y')

    data['Journey_Month'] = journey_dates.dt.month
    # dayofweek runs 0 (Monday) .. 6 (Sunday); integer division by 5 is
    # therefore 1 only for Saturday (5) and Sunday (6).
    data['Journey_over_weekend'] = (journey_dates.dt.dayofweek // 5).astype('int')

    return data

In [20]:
def _duration_hours(duration, default=4):
    """Return the whole-hour part of a duration string such as ``'2h 50m'``.

    Values that cannot be parsed (for example ``'5m'``, which has no hour
    part, or a missing value) yield ``default``.
    """
    try:
        return int(str(duration.split('h')[0]))
    except (AttributeError, ValueError):
        # this is for AirIndia for with flight duration as '5m', changing the duration
        # based on same row for AirIndia for same destination at same time with same stop overs.
        return default


def encode_duration(data):
    """Add a ``Duration_Type`` column that buckets ``Duration`` by whole hours.

    Buckets: under 3 hours -> ``0.0``, under 6 -> ``0.1``, under 12 -> ``0.2``,
    12 hours or more -> ``0.3``. Durations whose hour part cannot be parsed
    are treated as 4 hours.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Duration`` column of strings like ``'2h 50m'``.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``Duration_Type`` added or overwritten.
    """
    # Operate only on the frame that was passed in (never on a notebook
    # global), and positionally, so any index works.
    hours = data['Duration'].map(_duration_hours).to_numpy(dtype='int64')

    # np.select takes the first matching condition, mirroring an if/elif chain.
    data['Duration_Type'] = np.select(
        [hours < 3, hours < 6, hours < 12],
        [0.0, 0.1, 0.2],
        default=0.3,
    )

    return data

In [21]:
def flight_hours(data):
    """Add a ``Fly_Hours`` column labelling the departure hour's time-of-day band.

    The hour is taken from the part of ``Dep_Time`` before the first ``':'``.
    Bands (start inclusive, end exclusive): 0 'Midnight', 1-4 'Early_Morning',
    4-6 'Dawn', 6-9 'Morning', 9-12 'Mid_Morning', 12-13 'Noon',
    13-16 'After_Noon', 16-21 'Evening', 21-24 'Night'.
    An hour outside 0-23 gets the placeholder value ``0``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Dep_Time`` column of ``'HH:MM'`` strings.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``Fly_Hours`` added or overwritten.
    """
    bands = (
        (0, 1, 'Midnight'),
        (1, 4, 'Early_Morning'),
        (4, 6, 'Dawn'),
        (6, 9, 'Morning'),
        (9, 12, 'Mid_Morning'),
        (12, 13, 'Noon'),
        (13, 16, 'After_Noon'),
        (16, 21, 'Evening'),
        (21, 24, 'Night'),
    )
    # Expand the bands into an hour -> label table for direct lookup.
    label_for_hour = {
        hour: label
        for start, stop, label in bands
        for hour in range(start, stop)
    }

    hours = data['Dep_Time'].str.split(":", n=1).str[0].astype('int64')

    # Assigning a plain list is positional, so this works for any index and
    # never mixes an integer-initialised column with string labels.
    data['Fly_Hours'] = [label_for_hour.get(hour, 0) for hour in hours.tolist()]

    return data

# Data cleaning and Modeling starts here

In [22]:
data = pd.read_excel('C:\\Users\\LENOVO\\Desktop\\Flight_Ticket_Participant_Datasets\\Data_Train.xlsx')

In [23]:
test = pd.read_excel('C:\\Users\\LENOVO\\Desktop\\Flight_Ticket_Participant_Datasets\\Test_set.xlsx')

In [24]:
# Take real copies: plain assignment would only alias the raw frames, so any
# in-place change made through *_copy would also alter `data` / `test`.
data_copy = data.copy()
test_copy = test.copy()

In [25]:
# Drop anomalies.
# The price-quantile trimming below is currently disabled.
#data_copy = data_copy[data_copy['Price'] > data["Price"].quantile(0.01)]
#data_copy = data_copy[data_copy['Price'] < data["Price"].quantile(0.99)]
# Remove training rows for the 'Trujet' airline and for 'Red-eye flight' entries
# (presumably because these categories are absent from the test set — TODO confirm).
data_copy = data_copy[data_copy['Airline'] != 'Trujet']
data_copy = data_copy[data_copy['Additional_Info'] != 'Red-eye flight']

In [26]:
# Give the filtered frame a fresh 0..n-1 index. The previous labels are kept
# in an 'index' column, which a later cell drops. The assignment form avoids
# mutating in place a frame that was produced by boolean filtering.
data_copy = data_copy.reset_index()

In [27]:
data_copy = onehotencode(data_copy,'Airline',"Airline_")

In [28]:
data_copy.columns

Index(['index', 'Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price', 'Airline__Air Asia', 'Airline__Air India',
       'Airline__GoAir', 'Airline__IndiGo', 'Airline__Jet Airways',
       'Airline__Jet Airways Business', 'Airline__Multiple carriers',
       'Airline__Multiple carriers Premium economy', 'Airline__SpiceJet',
       'Airline__Vistara', 'Airline__Vistara Premium economy'],
      dtype='object')

In [29]:
data_check = onehotencode(data,'Airline',"Airline_")

In [30]:
data_copy = onehotencode(data_copy,'Source',"Source_")

In [31]:
data_copy = onehotencode(data_copy,'Destination',"Destination_")

In [32]:
# Normalise Additional_Info before one hot encoding: unify the two spellings
# of "No info" and fold the listed rare categories into a single 'Others' bucket.
additional_info_aliases = {
    'No info': 'No Info',
    '1 Long layover': 'Others',
    '1 Short layover': 'Others',
    '2 Long layover': 'Others',
    'Business class': 'Others',
    'Change airports': 'Others',
}
for raw_label, merged_label in additional_info_aliases.items():
    data_copy.loc[data_copy['Additional_Info'] == raw_label, 'Additional_Info'] = merged_label


data_copy = onehotencode(data_copy,'Additional_Info',"Additional_Info_")

In [33]:
data_copy = encode_total_stops(data_copy)

In [34]:
data_copy = extract_journey_data(data_copy)

In [35]:
data_copy = encode_duration(data_copy)

In [36]:
data_copy = flight_hours(data_copy)

In [37]:
data_copy = onehotencode(data_copy,'Fly_Hours',"Fly_Hours_")

In [38]:
data_final = data_copy

In [39]:
# Replace any missing 'Encoded_stops' value with 0.3.
# NOTE(review): encode_total_stops creates 'Encoded_stops' as 0.0 for every row and
# only ever writes numeric codes, so this appears to match no rows — confirm whether
# rows with a missing Total_Stops were meant to end up as 0.3 rather than 0.0.
data_final.loc[data_final['Encoded_stops'].isnull(),'Encoded_stops'] = 0.3

In [40]:
data_final.drop(['Airline','Date_of_Journey','Source','Destination','Route','Dep_Time','Arrival_Time','Duration',
                'Total_Stops','Additional_Info','Fly_Hours'],axis=1,inplace=True)

In [41]:
data_final.drop(['index'],axis=1,inplace=True)

In [42]:
# Split the prepared frame into model inputs (everything except the target
# 'Price' and any leftover 'index' column) and the target itself.
feature_columns = [col for col in data_final.columns if col not in ('Price', 'index')]
train_Feature = data_copy[feature_columns]
train_Target = data_copy['Price']

In [43]:
train_Feature.columns

Index(['Airline__Air Asia', 'Airline__Air India', 'Airline__GoAir',
       'Airline__IndiGo', 'Airline__Jet Airways',
       'Airline__Jet Airways Business', 'Airline__Multiple carriers',
       'Airline__Multiple carriers Premium economy', 'Airline__SpiceJet',
       'Airline__Vistara', 'Airline__Vistara Premium economy',
       'Source__Banglore', 'Source__Chennai', 'Source__Delhi',
       'Source__Kolkata', 'Source__Mumbai', 'Destination__Banglore',
       'Destination__Cochin', 'Destination__Delhi', 'Destination__Hyderabad',
       'Destination__Kolkata', 'Destination__New Delhi',
       'Additional_Info__In-flight meal not included',
       'Additional_Info__No Info',
       'Additional_Info__No check-in baggage included',
       'Additional_Info__Others', 'Encoded_stops', 'Journey_Month',
       'Journey_over_weekend', 'Duration_Type', 'Fly_Hours__After_Noon',
       'Fly_Hours__Dawn', 'Fly_Hours__Early_Morning', 'Fly_Hours__Evening',
       'Fly_Hours__Mid_Morning', 'Fly_Hours__

In [44]:
X_train, X_test, y_train, y_test = train_test_split(train_Feature, train_Target, test_size=0.30, random_state=101)

In [45]:
null_columns=X_train.columns[X_train.isnull().any()]
X_train[null_columns].isnull().sum()

Series([], dtype: float64)

In [46]:
# `normalize` is dropped: scikit-learn ignores it when fit_intercept=False, and
# the argument was removed in scikit-learn 1.2, so passing it breaks newer
# installs without changing the fit.
reg = LinearRegression(fit_intercept=False)

In [47]:
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=True)

In [48]:
reg.coef_

array([ -1.89224361e+15,  -1.89224361e+15,  -1.89224361e+15,
        -1.89224361e+15,  -1.89224361e+15,  -1.89224361e+15,
        -1.89224361e+15,  -1.89224361e+15,  -1.89224361e+15,
        -1.89224361e+15,  -1.89224361e+15,  -3.50692726e+15,
        -7.01470458e+14,  -3.32098148e+15,   5.47226470e+14,
         9.53957262e+15,  -3.78111225e+15,   8.70956916e+13,
         2.73041474e+14,  -1.27734584e+16,  -2.53241533e+15,
         2.73041474e+14,   3.89349088e+15,   3.89349088e+15,
         3.89349088e+15,   3.89349088e+15,   2.31379219e+04,
        -3.48937500e+02,   3.82750000e+02,   3.70618750e+03,
         1.23263852e+15,   1.23263852e+15,   1.23263852e+15,
         1.23263852e+15,   1.23263852e+15,   1.23263852e+15,
         1.23263852e+15,   1.23263852e+15,   1.23263852e+15])

In [49]:
# Show any negative price predictions on the hold-out split
# (an empty array means there are none).
holdout_predictions = reg.predict(X_test)
holdout_predictions[holdout_predictions < 0]

array([], dtype=float64)

In [50]:
test_msle = mean_squared_log_error(y_test,reg.predict(X_test))

In [51]:
np.sqrt(test_msle)

0.25829194896638258

In [52]:
train_msle = mean_squared_log_error(y_train,reg.predict(X_train))

In [53]:
np.sqrt(train_msle)

0.26005852315754074

# Final Prediction on Test set starts here

In [54]:
test_copy = onehotencode(test_copy,'Airline',"Airline_")

In [55]:
test_copy = onehotencode(test_copy,'Source',"Source_")

In [56]:
test_copy = onehotencode(test_copy,'Destination',"Destination_")

In [57]:
#handle No Info, No info values before one hot encoding.
# These are the same merges the training frame receives, so both frames end up
# with the same Additional_Info categories.
test_copy.loc[test_copy['Additional_Info'] == 'No info','Additional_Info'] = 'No Info'
test_copy.loc[test_copy['Additional_Info'] == '1 Long layover','Additional_Info'] = 'Others'
test_copy.loc[test_copy['Additional_Info'] == '1 Short layover','Additional_Info'] = 'Others'
# '2 Long layover' (previously a duplicated '1 Long layover' line) must also be
# folded into 'Others', otherwise it would produce its own dummy column.
test_copy.loc[test_copy['Additional_Info'] == '2 Long layover','Additional_Info'] = 'Others'
test_copy.loc[test_copy['Additional_Info'] == 'Business class','Additional_Info'] = 'Others'
test_copy.loc[test_copy['Additional_Info'] == 'Change airports','Additional_Info'] = 'Others'

test_copy = onehotencode(test_copy,'Additional_Info',"Additional_Info_")

In [58]:
test_copy = encode_total_stops(test_copy)

In [59]:
test_copy = extract_journey_data(test_copy)

In [60]:
test_copy = encode_duration(test_copy)

In [61]:
test_copy = flight_hours(test_copy)

In [62]:
test_copy = onehotencode(test_copy,'Fly_Hours',"Fly_Hours_")

In [63]:
test_final = test_copy

In [64]:
# Remove the raw columns that have been replaced by engineered features.
test_final.drop(['Airline','Date_of_Journey','Source','Destination','Route','Dep_Time','Arrival_Time','Duration',
                'Total_Stops','Additional_Info','Fly_Hours'],axis=1,inplace=True)
# The test set was one-hot encoded independently of the training set, so make
# its columns match the training features exactly (same set, same order).
# A category missing from the test data becomes an all-zero column; a column
# the models were never trained on is discarded.
test_final = test_final.reindex(columns=train_Feature.columns, fill_value=0)

In [65]:
test_copy.columns

Index(['Airline__Air Asia', 'Airline__Air India', 'Airline__GoAir',
       'Airline__IndiGo', 'Airline__Jet Airways',
       'Airline__Jet Airways Business', 'Airline__Multiple carriers',
       'Airline__Multiple carriers Premium economy', 'Airline__SpiceJet',
       'Airline__Vistara', 'Airline__Vistara Premium economy',
       'Source__Banglore', 'Source__Chennai', 'Source__Delhi',
       'Source__Kolkata', 'Source__Mumbai', 'Destination__Banglore',
       'Destination__Cochin', 'Destination__Delhi', 'Destination__Hyderabad',
       'Destination__Kolkata', 'Destination__New Delhi',
       'Additional_Info__In-flight meal not included',
       'Additional_Info__No Info',
       'Additional_Info__No check-in baggage included',
       'Additional_Info__Others', 'Encoded_stops', 'Journey_Month',
       'Journey_over_weekend', 'Duration_Type', 'Fly_Hours__After_Noon',
       'Fly_Hours__Dawn', 'Fly_Hours__Early_Morning', 'Fly_Hours__Evening',
       'Fly_Hours__Mid_Morning', 'Fly_Hours__

In [66]:
train_Feature.columns

Index(['Airline__Air Asia', 'Airline__Air India', 'Airline__GoAir',
       'Airline__IndiGo', 'Airline__Jet Airways',
       'Airline__Jet Airways Business', 'Airline__Multiple carriers',
       'Airline__Multiple carriers Premium economy', 'Airline__SpiceJet',
       'Airline__Vistara', 'Airline__Vistara Premium economy',
       'Source__Banglore', 'Source__Chennai', 'Source__Delhi',
       'Source__Kolkata', 'Source__Mumbai', 'Destination__Banglore',
       'Destination__Cochin', 'Destination__Delhi', 'Destination__Hyderabad',
       'Destination__Kolkata', 'Destination__New Delhi',
       'Additional_Info__In-flight meal not included',
       'Additional_Info__No Info',
       'Additional_Info__No check-in baggage included',
       'Additional_Info__Others', 'Encoded_stops', 'Journey_Month',
       'Journey_over_weekend', 'Duration_Type', 'Fly_Hours__After_Noon',
       'Fly_Hours__Dawn', 'Fly_Hours__Early_Morning', 'Fly_Hours__Evening',
       'Fly_Hours__Mid_Morning', 'Fly_Hours__

In [67]:
test_final.shape

(2671, 39)

In [68]:
train_Feature.shape

(10681, 39)

In [101]:
reg.score(X_train,y_train)

0.66696131458101893

In [102]:
reg.score(X_test, y_test)

0.70852197106903114

In [103]:
np.sum(reg.coef_!=0) #coeff_used

39

In [69]:
test_pred = reg.predict(test_final)

In [72]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])
test_pred.to_excel("C:/Users/LENOVO/Desktop/LR_v1.xlsx",index=False)

## Lasso Regression

In [196]:
#lasso = Lasso(alpha=1.0)
#lasso = Lasso(alpha=0.01, max_iter=10e5)
lasso = Lasso(alpha=0.05)

In [197]:
lasso.fit(X_train, y_train)



Lasso(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [198]:
lasso.score(X_train,y_train)

0.66727081318645021

In [199]:
lasso.score(X_test,y_test)

0.70920246729869352

In [200]:
np.sum(lasso.coef_!=0) #coeff_used

36

In [201]:
np.sqrt(mean_squared_log_error(y_test,lasso.predict(X_test)))

0.25881525680699358

In [202]:
np.sqrt(mean_squared_log_error(y_train,lasso.predict(X_train)))

0.26064992583202989

In [111]:
test_pred = lasso.predict(test_final)

In [112]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])

In [113]:
test_pred.to_excel("C:/Users/LENOVO/Desktop/Lasso01_v1.xlsx",index=False)

## Ridge Regression

In [224]:
#ridge = Ridge(alpha=1.0)
ridge = Ridge(alpha=0.0001)

In [225]:
ridge.fit(X_train, y_train)

Ridge(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [226]:
ridge.score(X_train,y_train)

0.6672724124477587

In [227]:
ridge.score(X_test,y_test)

0.70916703027859451

In [228]:
np.sum(ridge.coef_!=0) #coeff_used

39

In [229]:
np.sqrt(mean_squared_log_error(y_test,ridge.predict(X_test)))

0.2588651409886964

In [230]:
np.sqrt(mean_squared_log_error(y_train,ridge.predict(X_train)))

0.26066408818439157

In [98]:
test_pred = ridge.predict(test_final)

In [99]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])

In [100]:
test_pred.to_excel("C:/Users/LENOVO/Desktop/Ridge_v1.xlsx",index=False)

## RandomForest

In [233]:
rf = RandomForestRegressor(random_state=101)

In [234]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=101, verbose=0, warm_start=False)

In [235]:
np.sqrt(mean_squared_log_error(y_test,rf.predict(X_test)))

0.20762694284185321

In [236]:
np.sqrt(mean_squared_log_error(y_train,rf.predict(X_train)))

0.16362481797586448

In [237]:
test_pred = rf.predict(test_final)

In [238]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])

In [239]:
test_pred.to_excel("C:/Users/LENOVO/Desktop/RF_v1.xlsx",index=False)

## XGB Regressor

In [245]:
# `random_state` is the supported name for the seed in the scikit-learn style
# wrapper; `seed` is a deprecated alias for it.
xgb = xgboost.XGBRegressor(random_state=101)

In [246]:
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=101,
       silent=True, subsample=1)

In [247]:
np.sqrt(mean_squared_log_error(y_test,xgb.predict(X_test)))

0.22245484371432186

In [248]:
np.sqrt(mean_squared_log_error(y_train,xgb.predict(X_train)))

0.21914058448416485

In [249]:
test_pred = xgb.predict(test_final)

In [250]:
test_pred = pd.DataFrame(test_pred,columns=['Price'])

In [251]:
test_pred.to_excel("C:/Users/LENOVO/Desktop/XGB_v1.xlsx",index=False)