In [1]:
import matplotlib
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, mean_absolute_percentage_error



In [2]:
data = pd.read_csv ('/Users/nina/Downloads/mlproject/data/Clean_Dataset.csv')
# drop index column and check the datatype
data = data.drop(['Unnamed: 0'], axis=1)
# label encode three categorical columns
le = LabelEncoder()
data["airline_label"] = le.fit_transform(data['airline'])
data["source_city_label"] = le.fit_transform(data['source_city'])
data["destination_city_label"] = le.fit_transform(data['destination_city'])
# category time and stops according to sequence

def time_label(value):
    if value == "Early_Morning":
        return 0
    elif value == "Morning":
        return 1
    elif value == "Afternoon":
        return 2
    elif value == "Evening":
        return 3
    elif value == "Night":
        return 4
    elif value == "Late_Night":
        return 5

def stops_label(value):
    if value == "zero":
        return 0
    elif value == "one":
        return 1
    elif value == "two_or_more":
        return 2
    

data['departure_time_label'] = data['departure_time'].map(time_label)
data['arrival_time_label'] = data['arrival_time'].map(time_label)
data['stops_label'] = data['stops'].map(stops_label)

# Split Dataframe using groupby()
# grouping by economy and business class
data['class_label'] = np.where(data['class'] == "Economy", True, False)
grouped = data.groupby(data.class_label)
economyData = grouped.get_group(True)
economyData=economyData.drop(['class_label'],axis=1)
print(economyData.head())
data=economyData.drop(['airline', 'flight', 'source_city','departure_time','stops','arrival_time', 'destination_city','class'],axis=1)


    airline   flight source_city departure_time stops   arrival_time  \
0  SpiceJet  SG-8709       Delhi        Evening  zero          Night   
1  SpiceJet  SG-8157       Delhi  Early_Morning  zero        Morning   
2   AirAsia   I5-764       Delhi  Early_Morning  zero  Early_Morning   
3   Vistara   UK-995       Delhi        Morning  zero      Afternoon   
4   Vistara   UK-963       Delhi        Morning  zero        Morning   

  destination_city    class  duration  days_left  price  airline_label  \
0           Mumbai  Economy      2.17          1   5953              4   
1           Mumbai  Economy      2.33          1   5953              4   
2           Mumbai  Economy      2.17          1   5956              0   
3           Mumbai  Economy      2.25          1   5955              5   
4           Mumbai  Economy      2.33          1   5955              5   

   source_city_label  destination_city_label  departure_time_label  \
0                  2                       5        

In [3]:
X=data.drop(['price'],axis=1)
y=data[['price']]
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state = 42)
rav_train_Y = np.ravel(Train_Y)
rav_test_Y = np.ravel(Test_Y)
print(X.head())
print(y.head())

   duration  days_left  airline_label  source_city_label  \
0      2.17          1              4                  2   
1      2.33          1              4                  2   
2      2.17          1              0                  2   
3      2.25          1              5                  2   
4      2.33          1              5                  2   

   destination_city_label  departure_time_label  arrival_time_label  \
0                       5                     3                   4   
1                       5                     0                   1   
2                       5                     0                   0   
3                       5                     1                   2   
4                       5                     1                   1   

   stops_label  
0            0  
1            0  
2            0  
3            0  
4            0  
   price
0   5953
1   5953
2   5956
3   5955
4   5955


# models

In [4]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [5]:
Ridge_model = Ridge()
# define Ridge_grid
Ridge_grid = dict()
##tune alpha
Ridge_grid['alpha'] = np.arange(0.01, 1, 0.01)
Ridge_grid['max_iter'] = np.array([500,1000,2000])
Ridge_search = GridSearchCV(Ridge_model, Ridge_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
Ridge_results = Ridge_search.fit(Train_X, Train_Y)
print('Negative MAPE: ' , Ridge_results.best_score_)
print('Best param are : %s' % Ridge_results.best_params_)

Negative MAPE:  -0.369246700299228
Best param are : {'alpha': 0.99, 'max_iter': 500}


In [6]:
Lasso_model = Lasso()
# define Lasso_grid
Lasso_grid = dict()
##tune alpha
Lasso_grid['alpha'] = np.arange(0.01, 1, 0.01)
Lasso_grid['max_iter'] = np.array([500,1000,2000])
Lasso_search = GridSearchCV(Lasso_model, Lasso_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
Lasso_results = Lasso_search.fit(Train_X, Train_Y)
print('Negative MAPE: ' ,Lasso_results.best_score_)
print('Best params are : %s' % Lasso_results.best_params_)

Negative MAPE:  -0.3692240875097642
Best params are : {'alpha': 0.99, 'max_iter': 500}


In [7]:
ElasticNet_model = ElasticNet()
ElasticNet_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define ElasticNet_grid
ElasticNet_grid = dict()
##tune alpha
ElasticNet_grid['alpha'] = np.arange(0.01, 1, 0.01)
ElasticNet_grid['max_iter'] = np.array([500,1000,2000])
ElasticNet_search = GridSearchCV(ElasticNet_model, ElasticNet_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
ElasticNet_results = ElasticNet_search.fit(Train_X,Train_Y)
print('Negative MAPE: ' , ElasticNet_results.best_score_)
print('Best params are : %s'% ElasticNet_results.best_params_)

Negative MAPE:  -0.3683633311842456
Best params are : {'alpha': 0.08, 'max_iter': 500}


In [8]:
DecisionTree_model = DecisionTreeRegressor()
DecisionTree_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define DecisionTree_grid
DecisionTree_grid = dict()
##tune alpha
DecisionTree_grid['max_depth'] = np.arange(5, 30, 5)
DecisionTree_grid['max_leaf_nodes'] = np.array([10,100,1000])
DecisionTree_search = GridSearchCV(DecisionTree_model, DecisionTree_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
DecisionTree_results = DecisionTree_search.fit(Train_X, Train_Y)
print('Negative MAPE:' , DecisionTree_results.best_score_)
print('Best params are : %s'% DecisionTree_results.best_params_)

Negative MAPE: -0.16828584855564635
Best params are : {'max_depth': 25, 'max_leaf_nodes': 1000}


In [9]:
#Random forest regression
RandomForest_model = RandomForestRegressor()
RandomForest_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define RandomForest_grid
RandomForest_grid = dict()
##tune alpha
RandomForest_grid['max_depth'] = np.arange(5, 30, 5)
RandomForest_grid['max_leaf_nodes'] = np.array([10,100,1000])
# RandomForest_grid['n_estimators'] = np.array([50,100,200])

RandomForest_search = GridSearchCV(RandomForest_model, RandomForest_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
RandomForest_results = RandomForest_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:', RandomForest_results.best_score_)
print('Best params are : %s'% RandomForest_results.best_params_)

Negative MAPE: -0.1630026039870373
Best params are : {'max_depth': 15, 'max_leaf_nodes': 1000}


In [10]:
BayesianRidge_model = BayesianRidge()
BayesianRidge_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

BayesianRidge_grid = dict()


BayesianRidge_grid['alpha_1'] = np.arange(0.01, 1, 0.01)
BayesianRidge_grid['n_iter'] = np.array([200,400,600])
BayesianRidge_grid['lambda_1'] = np.array([1e-6, 1e-3, 1])
BayesianRidge_grid['lambda_2'] = np.array([1e-6, 1e-3, 1])


BayesianRidge_search = GridSearchCV(BayesianRidge_model, BayesianRidge_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
BayesianRidge_results = BayesianRidge_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , BayesianRidge_results.best_score_)
print('Best params are : %s'% BayesianRidge_results.best_params_)

Negative MAPE: -0.3692387796853123
Best params are : {'alpha_1': 0.01, 'lambda_1': 1.0, 'lambda_2': 1e-06, 'n_iter': 200}


In [11]:
from sklearn.ensemble import AdaBoostRegressor
adaBoost = AdaBoostRegressor()
adaBoost_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
adaBoost_grid = dict()
adaBoost_grid['learning_rate'] = np.arange(0.1, 1, 0.1)
adaBoost_grid['n_estimators'] = np.array([1,50,10])

adaBoost_search = GridSearchCV(adaBoost, adaBoost_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
adaBoost_results = adaBoost_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , adaBoost_results.best_score_)
print('Best params are : %s'% adaBoost_results.best_params_)

Negative MAPE: -0.2893346208831779
Best params are : {'learning_rate': 0.8, 'n_estimators': 1}


In [12]:
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor()
bagging_grid = dict()
bagging_grid['n_estimators'] = np.arange(1, 5, 1)

bagging_search = GridSearchCV(bagging, bagging_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
bagging_results = bagging_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , bagging_results.best_score_)
print('Best params are : %s'% bagging_results.best_params_)

Negative MAPE: -0.09613776726474814
Best params are : {'n_estimators': 4}


In [13]:
from sklearn.ensemble import ExtraTreesRegressor
exTree = ExtraTreesRegressor()
exTree_grid = dict()
exTree_grid['n_estimators'] = np.arange(65, 95, 10)
exTree_grid['max_depth'] = np.arange(20, 50, 5)

exTree_search = GridSearchCV(exTree, exTree_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
exTree_results = exTree_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , exTree_results.best_score_)
print('Best params are : %s'% exTree_results.best_params_)



Negative MAPE: -0.0973132237104423
Best params are : {'max_depth': 30, 'n_estimators': 85}


In [14]:
# xgboost,lightGBM,Catboost
import xgboost as xgb 
xg_reg = xgb.XGBRegressor()
xg_reg_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
xg_reg_grid = dict()
xg_reg_grid['max_depth'] = np.arange(5, 20, 5)
xg_reg_grid['eta'] = np.arange(0.01, 0.25, 0.05)

xg_reg_search = GridSearchCV(xg_reg, xg_reg_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
xg_reg_results = xg_reg_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , xg_reg_results.best_score_)
print('Best params are : %s'% xg_reg_results.best_params_)

Negative MAPE: -0.09555962228834022
Best params are : {'eta': 0.21000000000000002, 'max_depth': 15}
