In [18]:
import matplotlib
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, mean_absolute_percentage_error



In [19]:
data = pd.read_csv ('/Users/nina/Downloads/mlproject/data/Clean_Dataset.csv')
# drop index column and check the datatype
data = data.drop(['Unnamed: 0'], axis=1)
# label encode three categorical columns
le = LabelEncoder()
data["airline_label"] = le.fit_transform(data['airline'])
data["source_city_label"] = le.fit_transform(data['source_city'])
data["destination_city_label"] = le.fit_transform(data['destination_city'])
# category time and stops according to sequence

def time_label(value):
    if value == "Early_Morning":
        return 0
    elif value == "Morning":
        return 1
    elif value == "Afternoon":
        return 2
    elif value == "Evening":
        return 3
    elif value == "Night":
        return 4
    elif value == "Late_Night":
        return 5

def stops_label(value):
    if value == "zero":
        return 0
    elif value == "one":
        return 1
    elif value == "two_or_more":
        return 2
    

data['departure_time_label'] = data['departure_time'].map(time_label)
data['arrival_time_label'] = data['arrival_time'].map(time_label)
data['stops_label'] = data['stops'].map(stops_label)

# Split Dataframe using groupby()
# grouping by economy and business class
data['class_label'] = np.where(data['class'] == "Economy", True, False)
grouped = data.groupby(data.class_label)
economyData = grouped.get_group(True)
economyData=economyData.drop(['class_label'],axis=1)
print(economyData.head())
data=economyData.drop(['airline', 'flight', 'source_city','departure_time','stops','arrival_time', 'destination_city','class'],axis=1)


    airline   flight source_city departure_time stops   arrival_time  \
0  SpiceJet  SG-8709       Delhi        Evening  zero          Night   
1  SpiceJet  SG-8157       Delhi  Early_Morning  zero        Morning   
2   AirAsia   I5-764       Delhi  Early_Morning  zero  Early_Morning   
3   Vistara   UK-995       Delhi        Morning  zero      Afternoon   
4   Vistara   UK-963       Delhi        Morning  zero        Morning   

  destination_city    class  duration  days_left  price  airline_label  \
0           Mumbai  Economy      2.17          1   5953              4   
1           Mumbai  Economy      2.33          1   5953              4   
2           Mumbai  Economy      2.17          1   5956              0   
3           Mumbai  Economy      2.25          1   5955              5   
4           Mumbai  Economy      2.33          1   5955              5   

   source_city_label  destination_city_label  departure_time_label  \
0                  2                       5        

In [20]:
X=data.drop(['price'],axis=1)
y=data[['price']]
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state = 42)
rav_train_Y = np.ravel(Train_Y)
rav_test_Y = np.ravel(Test_Y)
print(X.head())
print(y.head())

   duration  days_left  airline_label  source_city_label  \
0      2.17          1              4                  2   
1      2.33          1              4                  2   
2      2.17          1              0                  2   
3      2.25          1              5                  2   
4      2.33          1              5                  2   

   destination_city_label  departure_time_label  arrival_time_label  \
0                       5                     3                   4   
1                       5                     0                   1   
2                       5                     0                   0   
3                       5                     1                   2   
4                       5                     1                   1   

   stops_label  
0            0  
1            0  
2            0  
3            0  
4            0  
   price
0   5953
1   5953
2   5956
3   5955
4   5955


# models

In [21]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

bagging model tuning

In [11]:
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor()
bagging_grid = dict()
bagging_grid['n_estimators'] = np.arange(5, 25, 5)

bagging_search = GridSearchCV(bagging, bagging_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
bagging_results = bagging_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , bagging_results.best_score_)
print('Best params are : %s'% bagging_results.best_params_)

Negative MAPE: -0.09000369583456301
Best params are : {'n_estimators': 20}


In [12]:

from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor()
bagging_grid = dict()
bagging_grid['n_estimators'] = np.array([18,19,21,22,23])
bagging_search = GridSearchCV(bagging, bagging_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
bagging_results = bagging_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , bagging_results.best_score_)
print('Best params are : %s'% bagging_results.best_params_)

Negative MAPE: -0.0897292210267557
Best params are : {'n_estimators': 22}


In [13]:
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor()
bagging_grid = dict()
bagging_grid['n_estimators'] = np.array([20,21,22,23,24])
bagging_search = GridSearchCV(bagging, bagging_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
bagging_results = bagging_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , bagging_results.best_score_)
print('Best params are : %s'% bagging_results.best_params_)

Negative MAPE: -0.089633515698997
Best params are : {'n_estimators': 23}


In [15]:
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor()
bagging_grid = dict()
bagging_grid['n_estimators'] = np.array([22,23,25,30,35,40])
bagging_search = GridSearchCV(bagging, bagging_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
bagging_results = bagging_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , bagging_results.best_score_)
print('Best params are : %s'% bagging_results.best_params_)

Negative MAPE: -0.08885907860489971
Best params are : {'n_estimators': 40}


In [17]:
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor()
bagging_grid = dict()
bagging_grid['n_estimators'] = np.array([40,50,60])
bagging_search = GridSearchCV(bagging, bagging_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
bagging_results = bagging_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , bagging_results.best_score_)
print('Best params are : %s'% bagging_results.best_params_)

Negative MAPE: -0.08858018920634736
Best params are : {'n_estimators': 60}


In [18]:
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor()
bagging_grid = dict()
bagging_grid['n_estimators'] = np.array([70,80,90,100])
bagging_search = GridSearchCV(bagging, bagging_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
bagging_results = bagging_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , bagging_results.best_score_)
print('Best params are : %s'% bagging_results.best_params_)

Negative MAPE: -0.0882315163891861
Best params are : {'n_estimators': 100}


In [19]:
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor()
bagging_grid = dict()
bagging_grid['n_estimators'] = np.array([110,120,140])
bagging_search = GridSearchCV(bagging, bagging_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
bagging_results = bagging_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , bagging_results.best_score_)
print('Best params are : %s'% bagging_results.best_params_)



Negative MAPE: -0.08810972922242126
Best params are : {'n_estimators': 140}


In [20]:
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor()
bagging_grid = dict()
bagging_grid['n_estimators'] = np.array([150,160])
bagging_search = GridSearchCV(bagging, bagging_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
bagging_results = bagging_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:' , bagging_results.best_score_)
print('Best params are : %s'% bagging_results.best_params_)



Negative MAPE: -0.0880960256885574
Best params are : {'n_estimators': 150}


after comparsion, the best parameter is n_estimators = 150, the Negative MAPE is -0.0880960256885574

random forest tuning

In [32]:
#Random forest regression
RandomForest_model = RandomForestRegressor()
RandomForest_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define RandomForest_grid
RandomForest_grid = dict()
RandomForest_grid['max_depth'] = np.array([5,10,15])
RandomForest_grid['n_estimators'] = np.array([110,120])

RandomForest_search = GridSearchCV(RandomForest_model, RandomForest_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
RandomForest_results = RandomForest_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:', RandomForest_results.best_score_)
print('Best params are : %s'% RandomForest_results.best_params_)

Negative MAPE: -0.1340710444499974
Best params are : {'max_depth': 15, 'n_estimators': 120}


In [22]:
#Random forest regression
RandomForest_model = RandomForestRegressor()
RandomForest_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define RandomForest_grid
RandomForest_grid = dict()
RandomForest_grid['max_depth'] = np.array([1,2,3])
RandomForest_grid['random_state'] = np.array([None, 1, 2])

RandomForest_search = GridSearchCV(RandomForest_model, RandomForest_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
RandomForest_results = RandomForest_search.fit(Train_X, rav_train_Y)
print('Negative MAPE:', RandomForest_results.best_score_)
print('Best params are : %s'% RandomForest_results.best_params_)

Negative MAPE: -0.28881683688872434
Best params are : {'max_depth': 3, 'random_state': 1}


decision tree tuning

In [25]:
DecisionTree_model = DecisionTreeRegressor()
DecisionTree_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define DecisionTree_grid
DecisionTree_grid = dict()

DecisionTree_grid['max_depth'] = [None, 5, 10, 15, 25, 30]
DecisionTree_grid['max_leaf_nodes'] = [None, 1,2,3]
DecisionTree_grid['random_state'] = [None, 1, 2]
DecisionTree_search = GridSearchCV(DecisionTree_model, DecisionTree_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
DecisionTree_results = DecisionTree_search.fit(Train_X, Train_Y)
print('Negative MAPE:' , DecisionTree_results.best_score_)
print('Best params are : %s'% DecisionTree_results.best_params_)

540 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nina/opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nina/opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/Users/nina/opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/Users/nina/opt/anaconda3/envs/tensorflow/lib/python3.10/site-packages/sklearn/base.

Negative MAPE: -0.09308148659075996
Best params are : {'max_depth': None, 'max_leaf_nodes': None, 'random_state': None}


In [30]:
DecisionTree_model = DecisionTreeRegressor()
DecisionTree_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define DecisionTree_grid
DecisionTree_grid = dict()

DecisionTree_grid['max_depth'] = [None, 5]
DecisionTree_grid['min_samples_split'] = [1,2,3,4,5]
DecisionTree_grid['random_state'] = [None, 1, 2]

DecisionTree_search = GridSearchCV(DecisionTree_model, DecisionTree_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
DecisionTree_results = DecisionTree_search.fit(Train_X, Train_Y)
print('Negative MAPE:' , DecisionTree_results.best_score_)
print('Best params are : %s'% DecisionTree_results.best_params_)

Negative MAPE: -0.09286160783203476
Best params are : {'max_depth': None, 'min_samples_split': 3, 'random_state': None}


In [43]:
DecisionTree_model = DecisionTreeRegressor()
DecisionTree_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define DecisionTree_grid
DecisionTree_grid = dict()


DecisionTree_grid['max_depth'] = [None, 5, 10, 15]
DecisionTree_grid['min_samples_split'] = [1,2,3,4,5]
DecisionTree_grid['min_samples_leaf'] = [1, 2, 3, 4, 5]

DecisionTree_search = GridSearchCV(DecisionTree_model, DecisionTree_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
DecisionTree_results = DecisionTree_search.fit(Train_X, Train_Y)
print('Negative MAPE:' , DecisionTree_results.best_score_)
print('Best params are : %s'% DecisionTree_results.best_params_)

Negative MAPE: -0.09283874293326487
Best params are : {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3}


In [44]:
DecisionTree_model = DecisionTreeRegressor()
DecisionTree_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define DecisionTree_grid
DecisionTree_grid = dict()


DecisionTree_grid['min_samples_split'] = [1,2,3,4,5]
DecisionTree_grid['min_samples_leaf'] = [1, 2, 3, 4, 5]
DecisionTree_grid['ccp_alpha'] = [0, 0.01, 0.1, 1]
DecisionTree_grid['max_features'] = ['sqrt', 'log2', None]

DecisionTree_search = GridSearchCV(DecisionTree_model, DecisionTree_grid, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
DecisionTree_results = DecisionTree_search.fit(Train_X, Train_Y)
print('Negative MAPE:' , DecisionTree_results.best_score_)
print('Best params are : %s'% DecisionTree_results.best_params_)


Negative MAPE: -0.09289034906808695
Best params are : {'ccp_alpha': 0.01, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3}


# voting

Using decision tree, random forest, and bagging

voting before tuning

In [11]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor


DecisionTree_model = DecisionTreeRegressor()
RandomForest_model = RandomForestRegressor()
bagging = BaggingRegressor(n_estimators =150)

votingModel = VotingRegressor([('DecisionTree_model', DecisionTree_model), ('RandomForest_model', RandomForest_model), ('bagging', bagging)])
votingModel=votingModel.fit(Train_X, rav_train_Y)

voting_pred = votingModel.predict(Test_X)

print('Negative MAPE: - ', mean_absolute_percentage_error( rav_test_Y,voting_pred))


Negative MAPE: -  0.0833328056555178


tune voting algorithm

In [12]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor

# Create base estimators
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
bagging = BaggingRegressor(n_estimators=150)

# Create voting regressor ensemble
ensemble = VotingRegressor(estimators=[('dt', dt), ('rf', rf), ('bagging', bagging)])

# Define weight values to test
weight_list = [(0.4, 0.3, 0.3),(0.3, 0.3, 0.4),(0.3, 0.4, 0.3),(0.3333, 0.3333, 0.3334),(0.2, 0.2, 0.6),
              (0.6, 0.2, 0.2),(0.2, 0.6, 0.2),(0.4, 0.4, 0.2),(0.4, 0.2, 0.4),(0.2, 0.4, 0.4)]

# Train and evaluate ensemble for each weight combination
for weights in weight_list:
  # Set weights in ensemble
  ensemble.weights = weights
  
  # Train ensemble on training data
  ensemble.fit(Train_X, rav_train_Y)
  
  # Evaluate ensemble on test data
  voting_tuning_pred = ensemble.predict(Test_X)
  mape = mean_absolute_percentage_error( rav_test_Y,voting_tuning_pred)
  # Print weight values and RMSE score
  print(f"Weights: {weights}, NMAPE:- {mape:.8f}")


Weights: (0.4, 0.3, 0.3), NMAPE:- 0.08368354
Weights: (0.3, 0.3, 0.4), NMAPE:- 0.08330798
Weights: (0.3, 0.4, 0.3), NMAPE:- 0.08363307
Weights: (0.3333, 0.3333, 0.3334), NMAPE:- 0.08359534
Weights: (0.2, 0.2, 0.6), NMAPE:- 0.08355618
Weights: (0.6, 0.2, 0.2), NMAPE:- 0.08430581
Weights: (0.2, 0.6, 0.2), NMAPE:- 0.08349566
Weights: (0.4, 0.4, 0.2), NMAPE:- 0.08348641
Weights: (0.4, 0.2, 0.4), NMAPE:- 0.08372747
Weights: (0.2, 0.4, 0.4), NMAPE:- 0.08356614


The best negative MAPE is -0.08330798, and the weights are (0.3,0.3,0.4)

# stacking

stacking model tune meta regressor

In [38]:
import xgboost as xgb 
from mlxtend.regressor import StackingCVRegressor


stacking_grid = dict()
# etr = ExtraTreesRegressor(n_estimators = 80)
# bagging = BaggingRegressor(n_estimators= 9)
# xgb = xgb.XGBRegressor(eta= 0.21000000000000002, max_depth= 15)
lr = LinearRegression()
Ridge_model = Ridge()

DecisionTree_model = DecisionTreeRegressor()
DecisionTree_model = RandomForestRegressor()
bagging = BaggingRegressor(n_estimators =150)


regressors = [DecisionTree_model,bagging,DecisionTree_model]
stregr = StackingCVRegressor(regressors=regressors, meta_regressor=lr)



Stacking_search = GridSearchCV(estimator=stregr, param_grid={'meta_regressor': [lr,Ridge_model]} ,
                    cv=5, scoring='neg_mean_absolute_percentage_error', n_jobs=-1, verbose = 4,
                    refit=True)



Stacking_results = Stacking_search.fit(Train_X, rav_train_Y)
stacking_pred = Stacking_search.predict(Test_X)

print(pd.DataFrame(Stacking_search.cv_results_))
print('Negative MAPE: ', Stacking_results.best_score_)
print('Best param are : ' , Stacking_results.best_params_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




[CV 5/5] END meta_regressor=LinearRegression();, score=-0.091 total time= 8.2min
[CV 3/5] END ...........meta_regressor=Ridge();, score=-0.089 total time= 8.2min
[CV 5/5] END ...........meta_regressor=Ridge();, score=-0.091 total time= 8.2min
[CV 4/5] END meta_regressor=LinearRegression();, score=-0.090 total time= 8.2min
[CV 1/5] END ...........meta_regressor=Ridge();, score=-0.090 total time= 8.2min
[CV 4/5] END ...........meta_regressor=Ridge();, score=-0.090 total time= 8.2min
[CV 1/5] END meta_regressor=LinearRegression();, score=-0.090 total time= 8.2min
[CV 3/5] END meta_regressor=LinearRegression();, score=-0.089 total time= 8.2min
[CV 2/5] END meta_regressor=LinearRegression();, score=-0.090 total time= 8.2min
[CV 2/5] END ...........meta_regressor=Ridge();, score=-0.090 total time= 8.2min




   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0     476.145979      0.823954        13.526047        0.736364   
1     476.200717      0.683234        13.434675        0.610314   

  param_meta_regressor                                  params  \
0   LinearRegression()  {'meta_regressor': LinearRegression()}   
1              Ridge()             {'meta_regressor': Ridge()}   

   split0_test_score  split1_test_score  split2_test_score  split3_test_score  \
0          -0.090359          -0.090075          -0.089300          -0.089959   
1          -0.090255          -0.090019          -0.089134          -0.089932   

   split4_test_score  mean_test_score  std_test_score  rank_test_score  
0          -0.090728        -0.090084        0.000473                2  
1          -0.090748        -0.090018        0.000525                1  
Negative MAPE:  -0.09001769776008903
Best param are :  {'meta_regressor': Ridge()}


# best performance model analysis

In [17]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

DecisionTree_model = DecisionTreeRegressor()
RandomForest_model = RandomForestRegressor()
bagging = BaggingRegressor(n_estimators =150)

# define weights
weights = [0.3, 0.3, 0.4]

votingModel = VotingRegressor([('DecisionTree_model', DecisionTree_model), ('RandomForest_model', RandomForest_model), ('bagging', bagging)],
                             weights=weights)

votingModel=votingModel.fit(Train_X, rav_train_Y)

voting_pred_best_para = votingModel.predict(Test_X)

mape = mean_absolute_percentage_error(rav_test_Y,voting_pred_best_para)
mse = mean_squared_error(rav_test_Y,voting_pred_best_para)
rmse = np.sqrt(mse)
mae = mean_absolute_error(rav_test_Y,voting_pred_best_para)


print('Negative MAPE: - ', mape)
print('MSE: ', mse)
print('RMSE ' , rmse)
print('MAE ' , mae)

Negative MAPE: -  0.08349298590643765
MSE:  1892035.0417698033
RMSE  1375.5126468956232
MAE  593.9537599454592
