In [25]:
# Import required libraries
import pandas as pd
import os
from sklearn.ensemble import ExtraTreesRegressor
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
import pickle

In [2]:
def is_path_exist(path):
    if not os.path.exists(path):
        print("please check the path")
    else: 
        print(path)

In [3]:
# project_path = os.getcwd()
project_path = "F:\\Machine Learning\\flight-price-prediction\\"
train_dataset_path = os.path.join(project_path ,"Dataset\\Data_Train.xlsx")
test_dataset_path = os.path.join(project_path ,"Dataset\\Test_set.xlsx")
is_path_exist(train_dataset_path)
is_path_exist(test_dataset_path)


F:\Machine Learning\flight-price-prediction\Dataset\Data_Train.xlsx
F:\Machine Learning\flight-price-prediction\Dataset\Test_set.xlsx


In [4]:
train_dataset = pd.read_excel(train_dataset_path)
train_dataset.head(5)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [5]:
# As we can not handle date/time directly in our ML model we have to find a way which can hold the information in  numeric way, 
# So we will add 2 or 3 column for these cols date or time.

def convert_date_time(dataset, col_name, prefix = None, extract_date = False, extract_time = False, 
                      extract_year = False, extract_second = False):
    if prefix == None:
        prefix = col_name
    if extract_date:
        dataset[prefix+"_day"] = pd.to_datetime(dataset[col_name], format = "%d/%m/%Y").dt.day
        dataset[prefix+"_month"] = pd.to_datetime(dataset[col_name], format = "%d/%m/%Y").dt.month
        if extract_year:
            dataset[prefix+"_year"] = pd.to_datetime(dataset[col_name], format = "%d/%m/%Y").dt.year
    
    elif extract_time:
        dataset[prefix+"_hour"] = pd.to_datetime(dataset[col_name]).dt.hour
        dataset[prefix+"_min"] = pd.to_datetime(dataset[col_name]).dt.minute
        if extract_second:
            dataset[prefix+"_sec"] = pd.to_datetime(dataset[col_name]).dt.second
    dataset.drop([col_name], axis = 1, inplace = True)
    
    return dataset

def convert_duration(dataset,col_name):
    duration_hour = []
    duration_min =[]
    for duration in dataset[col_name]:
        duration = duration.split(" ")
        if len(duration) > 1:
            duration_hour.append(duration[0].split("h")[0])
            duration_min.append(duration[1].split("m")[0])
        else:
            if "h" in duration[0]:
                duration_hour.append(duration[0].split("h")[0])
                duration_min.append(0)
            elif "m" in duration[0]:
                duration_hour.append(0)
                duration_min.append(duration[0].split("m")[0])
    dataset["Durations_hours"] = duration_hour 
    dataset["Durations_mins"] = duration_min
    dataset.drop([col_name], axis = 1, inplace = True)
    
    return dataset

def add_dummies(dataset,col_name):
    col_data = dataset[col_name]
    col_dummies = pd.get_dummies(col_data,drop_first= True)
    dataset = pd.concat([dataset,col_dummies],axis = 1)
    dataset.drop([col_name], axis = 1, inplace = True)
    return dataset

def preprocessing(dataset):
    dataset.dropna(inplace = True)
    dataset = convert_date_time(dataset, "Date_of_Journey", prefix = "Journey", extract_date=True)
    dataset = convert_date_time(dataset, "Arrival_Time", prefix = "Arrival", extract_time=True)
    dataset = convert_date_time(dataset, "Dep_Time", prefix = "Departure", extract_time=True)
    dataset = convert_duration(dataset, "Duration")
    # As we can see Route and addtional_info column seems less important, so we drop them
    dataset.drop(["Route","Additional_Info"], axis = 1, inplace = True)
    dataset = add_dummies(dataset,"Airline")
    dataset = add_dummies(dataset,"Source")
    dataset = add_dummies(dataset,"Destination")
    dataset.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace = True)
    
    return dataset
train_dataset = pd.read_excel(train_dataset_path)
train_dataset = preprocessing(train_dataset)
train_dataset.head()


Unnamed: 0,Total_Stops,Price,Journey_day,Journey_month,Arrival_hour,Arrival_min,Departure_hour,Departure_min,Durations_hours,Durations_mins,...,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi
0,0,3897,24,3,1,10,22,20,2,50,...,0,0,0,0,0,0,0,0,0,1
1,2,7662,1,5,13,15,5,50,7,25,...,0,0,0,1,0,0,0,0,0,0
2,2,13882,9,6,4,25,9,25,19,0,...,0,0,1,0,0,1,0,0,0,0
3,1,6218,12,5,23,30,18,5,5,25,...,0,0,0,1,0,0,0,0,0,0
4,1,13302,1,3,21,35,16,50,4,45,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# For testing
test_dataset = pd.read_excel(test_dataset_path)
test_dataset = preprocessing(test_dataset)
test_dataset.head()

Unnamed: 0,Total_Stops,Journey_day,Journey_month,Arrival_hour,Arrival_min,Departure_hour,Departure_min,Durations_hours,Durations_mins,Air India,...,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi
0,1,6,6,4,25,17,30,10,55,0,...,0,0,1,0,0,1,0,0,0,0
1,1,12,5,10,20,6,20,4,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,21,5,19,0,19,15,23,45,0,...,0,0,1,0,0,1,0,0,0,0
3,1,21,5,21,0,8,0,13,0,0,...,0,0,1,0,0,1,0,0,0,0
4,0,24,6,2,45,23,55,2,50,0,...,0,0,0,0,0,0,1,0,0,0


In [8]:
features = train_dataset.drop(["Price"], axis = 1)
labels = train_dataset.Price


# Feature Selection

In [9]:
# Let's check important features by using extra_tree_regressior
selection = ExtraTreesRegressor()
selection.fit(features_train, labels_train)
print(selection.feature_importances_)

NameError: name 'features_train' is not defined

In [None]:
plt.figure(figsize = (12,8))
feat_importances = pd.Series(selection.feature_importances_, index=features_train.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()

# Fitting model using Random Forest¶

1. Split dataset into train and test set in order to prediction w.r.t X_test
2. If needed do scaling of data<br/>
    a. Scaling is not done in Random forest
3. Import model
4. Fit the data
5. Predict w.r.t X_test

6. In regression check RSME Score


In [13]:
features_train, features_test, labels_train, labels_test = train_test_split(features, 
                                            labels, test_size = 0.2, random_state = 42)
model = RandomForestRegressor()
model.fit(features_train,labels_train)
labels_predict = model.predict(features_test)



In [17]:
print("Training Accuracy : ", model.score(features_train, labels_train))
print("Testing Accuracy : ", model.score(features_test, labels_test))

print("-"*75)

print('MAE:', metrics.mean_absolute_error(labels_test, labels_predict))
print('MSE:', metrics.mean_squared_error(labels_test, labels_predict))
print('RMSE:', np.sqrt(metrics.mean_squared_error(labels_test, labels_predict)))
print("R2 score :", metrics.r2_score(labels_test, labels_predict))

Training Accuracy :  0.9533015293443073
Testing Accuracy :  0.7963709161352978
---------------------------------------------------------------------------
MAE: 1180.51323202602
MSE: 4390661.775528173
RMSE: 2095.3906021379817
R2 score : 0.7963709161352978


## Hyperparameter Tuning
1. Choose following method for hyperparameter tuning<br/>
    a. RandomizedSearchCV --> Fast<br>
    b. GridSearchCV
2. Assign hyperparameters in form of dictionery
3. Fit the model
4. Check best paramters and best score

In [21]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
model = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, 
                               scoring='neg_mean_squared_error', n_iter = 10, cv = 5, 
                               verbose=2, random_state=42, n_jobs = 1)

rf_random.fit(features_train, labels_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   6.9s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.8s remaining:    0.0s


[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   7.0s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   6.9s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   7.6s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   6.9s
[CV] n_estimators=1100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=15 
[CV]  n_estimators=1100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=15, total=  10.5s
[CV] n_estimators=1100, min_samples_split=10, mi

[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.8s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.6s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.6s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.6s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.6s
[CV] n_estimators=700, min_samples_split=15, min_sam

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  8.5min finished


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [22]:
rf_random.best_params_


{'n_estimators': 700,
 'min_samples_split': 15,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 20}

In [29]:
labels_predict = rf_random.predict(features_test)
print('MAE:', metrics.mean_absolute_error(labels_test, labels_predict))
print('MSE:', metrics.mean_squared_error(labels_test, labels_predict))
print('RMSE:', np.sqrt(metrics.mean_squared_error(labels_test, labels_predict)))
print("R2 score :", metrics.r2_score(labels_test, labels_predict))

MAE: 1165.1785809562468
MSE: 4050581.7853487646
RMSE: 2012.6057202911763
R2 score : 0.8121430662988388


In [30]:

# open a file, where you ant to store the data
file = open('flight_rf.pkl', 'wb')

# dump information to that file
pickle.dump(rf_random, file)