In [32]:
import matplotlib
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, mean_absolute_percentage_error



In [33]:
# Load the cleaned flight dataset and immediately discard 'Unnamed: 0',
# the column the original notebook identified as a leftover index column.
data = (
    pd.read_csv('/Users/nina/Downloads/mlproject/data/Clean_Dataset.csv')
    .drop(columns=['Unnamed: 0'])
)


In [34]:
# Label-encode the three nominal columns into integer "<column>_label" features.
# One LabelEncoder is fitted per column and kept in `label_encoders`: refitting
# a single shared encoder overwrites its fitted classes, so the codes of the
# earlier columns could no longer be mapped back to their original names.
label_encoders = {}
for column in ("airline", "source_city", "destination_city"):
    encoder = LabelEncoder()
    data[column + "_label"] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Backward compatibility: `le` used to end up fitted on 'destination_city'.
le = label_encoders["destination_city"]

In [35]:
# Encode the time-of-day and stops categories as integers that follow their natural order

def time_label(value):
    """Translate a time-of-day category name into its ordinal code.

    The codes follow the order of the day: "Early_Morning" -> 0, "Morning" -> 1,
    "Afternoon" -> 2, "Evening" -> 3, "Night" -> 4, "Late_Night" -> 5.

    Returns None when `value` matches none of the six category names.
    """
    day_order = (
        "Early_Morning",
        "Morning",
        "Afternoon",
        "Evening",
        "Night",
        "Late_Night",
    )
    # The position in the tuple is the ordinal code.
    for code, slot in enumerate(day_order):
        if value == slot:
            return code
    return None

def stops_label(value):
    """Translate a stops category name into its ordinal code.

    "zero" -> 0, "one" -> 1, "two_or_more" -> 2.

    Returns None when `value` matches none of the three category names.
    """
    stop_order = ("zero", "one", "two_or_more")
    # The position in the tuple is the ordinal code.
    for code, name in enumerate(stop_order):
        if value == name:
            return code
    return None
    

# Add an ordinal "<column>_label" feature for each ordered categorical column,
# using the matching encoder function defined above.
for source_column, to_code in (
    ('departure_time', time_label),
    ('arrival_time', time_label),
    ('stops', stops_label),
):
    data[source_column + '_label'] = data[source_column].map(to_code)

In [36]:
# Keep only the Economy-class rows, then reduce the frame to the numeric
# features plus the 'price' target.
# A boolean mask replaces the earlier helper-column + groupby/get_group detour:
# it selects the same rows in the same order without first adding a temporary
# 'class_label' column to the full dataset.
economyData = data.loc[data['class'] == "Economy"]

# Drop the raw text columns: the categorical ones were already encoded into
# *_label columns, and 'flight' / 'class' are not used as features.
data = economyData.drop(
    columns=[
        'airline', 'flight', 'source_city', 'departure_time', 'stops',
        'arrival_time', 'destination_city', 'class',
    ]
)


# StandardScaler is applied below, after the train/test split.

In [37]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Cross-validation scheme passed to the grid searches below: 10 folds,
# repeated 3 times, with a fixed seed so the splits are reproducible.
cv = RepeatedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

In [38]:
# Separate the target from the features; `y` stays a one-column DataFrame
# (double brackets), not a Series.
y = data[['price']]
X = data.drop(columns=['price'])

# 80/20 train/test split with a fixed seed.
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

# Preview the first training rows of the features, then of the target.
for preview in (Train_X, Train_Y):
    print(preview.head())

        duration  days_left  airline_label  source_city_label  \
179214     12.25         35              1                  3   
187031      9.17         46              2                  1   
117650     13.17         27              1                  0   
167716     12.33         23              5                  3   
51603       2.17         44              2                  5   

        destination_city_label  departure_time_label  arrival_time_label  \
179214                       1                     1                   4   
187031                       2                     2                   4   
117650                       1                     3                   0   
167716                       0                     1                   4   
51603                        2                     4                   4   

        stops_label  
179214            1  
187031            1  
117650            1  
167716            2  
51603             0  
        price
179214

In [41]:
# Standardise the features. The scaler learns its mean/std from the training
# split only; the test split is then transformed with those same statistics.
# (Calling fit_transform on the test split, as before, refits the scaler on the
# test data, so train and test end up standardised with different parameters.)
scaler = StandardScaler()
Train_X = scaler.fit_transform(Train_X)
Test_X = scaler.transform(Test_X)

# Flatten the one-column target frames into 1-D arrays.
rav_train_Y = np.ravel(Train_Y)
rav_test_Y = np.ravel(Test_Y)

In [42]:
# Grid-search Ridge regression over alpha and max_iter, scored by negative
# MAPE with the shared `cv` splitter.
# NOTE(review): with Ridge's default solver, max_iter may have no effect on
# dense input -- confirm before keeping it in the grid.
Ridge_grid = {
    'alpha': np.arange(0.01, 1, 0.01),        # 0.01 steps, upper bound 1 excluded
    'max_iter': np.array([500, 1000, 2000]),
}
Ridge_model = Ridge()
Ridge_search = GridSearchCV(
    estimator=Ridge_model,
    param_grid=Ridge_grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=cv,
    n_jobs=-1,
)
Ridge_results = Ridge_search.fit(Train_X, Train_Y)

# The score is a negated MAPE, so values closer to zero are better.
print('Negative MAPE: ' , Ridge_results.best_score_)
print('Best param are : %s' % Ridge_results.best_params_)

Negative MAPE:  -0.36924663325923107
Best param are : {'alpha': 0.99, 'max_iter': 500}


In [43]:
# Grid-search Lasso regression over alpha and max_iter, scored by negative
# MAPE with the shared `cv` splitter.
Lasso_grid = dict(
    alpha=np.arange(0.01, 1, 0.01),         # 0.01 steps, upper bound 1 excluded
    max_iter=np.array([500, 1000, 2000]),
)
Lasso_model = Lasso()
Lasso_search = GridSearchCV(
    estimator=Lasso_model,
    param_grid=Lasso_grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=cv,
    n_jobs=-1,
)
Lasso_results = Lasso_search.fit(Train_X, Train_Y)

# The score is a negated MAPE, so values closer to zero are better.
print('Negative MAPE: ' ,Lasso_results.best_score_)
print('Best params are : %s' % Lasso_results.best_params_)

Negative MAPE:  -0.3692281473572322
Best params are : {'alpha': 0.99, 'max_iter': 500}


In [46]:
# Grid-search ElasticNet over alpha and max_iter, scored by negative MAPE.
ElasticNet_model = ElasticNet()

# Splitter for this search. It used to be created but never used (the search
# was handed the global `cv` instead); it is now the one passed to
# GridSearchCV. Its settings are the same as those of `cv`
# (10 folds x 3 repeats, random_state=1).
ElasticNet_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Candidate hyper-parameters: regularisation strength and iteration cap.
ElasticNet_grid = {
    'alpha': np.arange(0.01, 1, 0.01),
    'max_iter': np.array([500, 1000, 2000]),
}

ElasticNet_search = GridSearchCV(
    ElasticNet_model,
    ElasticNet_grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=ElasticNet_cv,
    n_jobs=-1,
)
ElasticNet_results = ElasticNet_search.fit(Train_X,Train_Y)

# The score is a negated MAPE, so values closer to zero are better.
print('Negative MAPE: ' , ElasticNet_results.best_score_)
print('Best params are : %s'% ElasticNet_results.best_params_)

Negative MAPE:  -0.3643543515547674
Best params are : {'alpha': 0.33, 'max_iter': 500}


In [45]:
# Grid-search a decision-tree regressor over tree depth and leaf count,
# scored by negative MAPE.
# random_state is fixed so that ties between equally good splits are broken
# the same way on every run (otherwise results can vary between executions).
DecisionTree_model = DecisionTreeRegressor(random_state=42)

# Splitter for this search. It used to be created but never used (the search
# was handed the global `cv` instead); it is now the one passed to
# GridSearchCV. Its settings are the same as those of `cv`
# (10 folds x 3 repeats, random_state=1).
DecisionTree_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Candidate hyper-parameters: max_depth in {5, 10, 15, 20, 25} and
# max_leaf_nodes in {10, 100, 1000}.
DecisionTree_grid = {
    'max_depth': np.arange(5, 30, 5),
    'max_leaf_nodes': np.array([10, 100, 1000]),
}

DecisionTree_search = GridSearchCV(
    DecisionTree_model,
    DecisionTree_grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=DecisionTree_cv,
    n_jobs=-1,
)
DecisionTree_results = DecisionTree_search.fit(Train_X, Train_Y)

# The score is a negated MAPE, so values closer to zero are better.
print('Negative MAPE:' , DecisionTree_results.best_score_)
print('Best params are : %s'% DecisionTree_results.best_params_)

Negative MAPE: -0.1682979294819324
Best params are : {'max_depth': 25, 'max_leaf_nodes': 1000}
