In [1]:
import matplotlib
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, mean_absolute_percentage_error



In [2]:
data = pd.read_csv ('/Users/nina/Downloads/mlproject/data/Clean_Dataset.csv')
# drop index column and check the datatype
data = data.drop(['Unnamed: 0'], axis=1)
# label encode three categorical columns
le = LabelEncoder()
data["airline_label"] = le.fit_transform(data['airline'])
data["source_city_label"] = le.fit_transform(data['source_city'])
data["destination_city_label"] = le.fit_transform(data['destination_city'])
# category time and stops according to sequence

def time_label(value):
    if value == "Early_Morning":
        return 0
    elif value == "Morning":
        return 1
    elif value == "Afternoon":
        return 2
    elif value == "Evening":
        return 3
    elif value == "Night":
        return 4
    elif value == "Late_Night":
        return 5

def stops_label(value):
    if value == "zero":
        return 0
    elif value == "one":
        return 1
    elif value == "two_or_more":
        return 2
    

data['departure_time_label'] = data['departure_time'].map(time_label)
data['arrival_time_label'] = data['arrival_time'].map(time_label)
data['stops_label'] = data['stops'].map(stops_label)

# Split Dataframe using groupby()
# grouping by economy and business class
data['class_label'] = np.where(data['class'] == "Economy", True, False)
grouped = data.groupby(data.class_label)
economyData = grouped.get_group(True)
economyData=economyData.drop(['class_label'],axis=1)
print(economyData.head())
data=economyData.drop(['airline', 'flight', 'source_city','departure_time','stops','arrival_time', 'destination_city','class'],axis=1)


    airline   flight source_city departure_time stops   arrival_time  \
0  SpiceJet  SG-8709       Delhi        Evening  zero          Night   
1  SpiceJet  SG-8157       Delhi  Early_Morning  zero        Morning   
2   AirAsia   I5-764       Delhi  Early_Morning  zero  Early_Morning   
3   Vistara   UK-995       Delhi        Morning  zero      Afternoon   
4   Vistara   UK-963       Delhi        Morning  zero        Morning   

  destination_city    class  duration  days_left  price  airline_label  \
0           Mumbai  Economy      2.17          1   5953              4   
1           Mumbai  Economy      2.33          1   5953              4   
2           Mumbai  Economy      2.17          1   5956              0   
3           Mumbai  Economy      2.25          1   5955              5   
4           Mumbai  Economy      2.33          1   5955              5   

   source_city_label  destination_city_label  departure_time_label  \
0                  2                       5        

In [3]:
X=data.drop(['price'],axis=1)
y=data[['price']]
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state = 42)
rav_train_Y = np.ravel(Train_Y)
rav_test_Y = np.ravel(Test_Y)
print(X.head())
print(y.head())

   duration  days_left  airline_label  source_city_label  \
0      2.17          1              4                  2   
1      2.33          1              4                  2   
2      2.17          1              0                  2   
3      2.25          1              5                  2   
4      2.33          1              5                  2   

   destination_city_label  departure_time_label  arrival_time_label  \
0                       5                     3                   4   
1                       5                     0                   1   
2                       5                     0                   0   
3                       5                     1                   2   
4                       5                     1                   1   

   stops_label  
0            0  
1            0  
2            0  
3            0  
4            0  
   price
0   5953
1   5953
2   5956
3   5955
4   5955


# models

In [4]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [5]:
Ridge_model = Ridge()

Ridge_model.fit(Train_X, Train_Y)

y_pred_ridge = Ridge_model.predict(Test_X)

ridge_mape = mean_absolute_percentage_error(Test_Y, y_pred_ridge)

print('Negative MAPE: -' , y_pred_ridge)

Negative MAPE: - 0.36838702384428623


In [6]:
Lasso_model = Lasso()

Lasso_model.fit(Train_X, Train_Y)

y_pred_Lasso = Lasso_model.predict(Test_X)

Lasso_mape = mean_absolute_percentage_error(Test_Y, y_pred_Lasso)

print('Negative MAPE: -' , Lasso_mape)

Negative MAPE: - 0.36836396755014034


In [9]:
ElasticNet_model = ElasticNet()

ElasticNet_model.fit(Train_X, Train_Y)

y_pred_Elastic = ElasticNet_model.predict(Test_X)

elastic_mape = mean_absolute_percentage_error(Test_Y, y_pred_Elastic)

print('Negative MAPE: -' , elastic_mape)

Negative MAPE: - 0.37407777647917334


In [15]:
DecisionTree_model = DecisionTreeRegressor()

DecisionTree_model.fit(Train_X, Train_Y)

y_pred_DecisionTree = DecisionTree_model.predict(Test_X)

decisionTree_mape = mean_absolute_percentage_error(Test_Y, y_pred_DecisionTree)

print('Negative MAPE: -' , decisionTree_mape)

Negative MAPE: - 0.08899790671833943


In [11]:
RandomForest_model = RandomForestRegressor()

RandomForest_model.fit(Train_X, Train_Y)

y_pred_randomForest = RandomForest_model.predict(Test_X)

randomForest_mape = mean_absolute_percentage_error(Test_Y, y_pred_randomForest)

print('Negative MAPE: -' , randomForest_mape)

  RandomForest_model.fit(Train_X, Train_Y)


Negative MAPE: - 0.08473876432575116


In [12]:
BayesianRidge_model = BayesianRidge()

BayesianRidge_model.fit(Train_X, Train_Y)

y_pred_BayesianRidge = BayesianRidge_model.predict(Test_X)

BayesianRidge_mape = mean_absolute_percentage_error(Test_Y, y_pred_BayesianRidge)

print('Negative MAPE: -' , BayesianRidge_mape)

Negative MAPE: - 0.3683813024091547


  y = column_or_1d(y, warn=True)


In [13]:
from sklearn.ensemble import AdaBoostRegressor
adaBoost = AdaBoostRegressor()

adaBoost.fit(Train_X, Train_Y)

y_pred_adaBoost = adaBoost.predict(Test_X)

adaBoost_mape = mean_absolute_percentage_error(Test_Y, y_pred_adaBoost)

print('Negative MAPE: -' , adaBoost_mape)

  y = column_or_1d(y, warn=True)


Negative MAPE: - 0.5350788691803386


In [19]:
from sklearn.ensemble import BaggingRegressor

bagging = BaggingRegressor()

bagging.fit(Train_X, rav_train_Y)

y_pred_bagging = bagging.predict(Test_X)

bagging_mape = mean_absolute_percentage_error(Test_Y, y_pred_bagging)

print('Negative MAPE: -' , bagging_mape)

Negative MAPE: - 0.08823164721758261


In [18]:
from sklearn.ensemble import ExtraTreesRegressor

exTree = ExtraTreesRegressor()

exTree.fit(Train_X, rav_train_Y)

y_pred_exTree = exTree.predict(Test_X)

exTree_mape = mean_absolute_percentage_error(Test_Y, y_pred_exTree)

print('Negative MAPE: -' , exTree_mape)

Negative MAPE: - 0.09367492055711514


In [14]:
import xgboost as xgb 
xg_reg_model = xgb.XGBRegressor()

xg_reg_model.fit(Train_X, Train_Y)

y_pred_xg = xg_reg_model.predict(Test_X)

xgb_mape = mean_absolute_percentage_error(Test_Y, y_pred_xg)

print('Negative MAPE: -' , xgb_mape)

Negative MAPE: - 0.1619964439386442
