In [1]:
import matplotlib
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, mean_absolute_percentage_error



In [2]:
# Read the flight-price CSV and discard its 'Unnamed: 0' index column in one step.
# NOTE(review): the path below is absolute and machine-specific; the notebook
# only runs where this exact file exists.
data = pd.read_csv('/Users/nina/Downloads/mlproject/data/Clean_Dataset.csv').drop(columns=['Unnamed: 0'])


In [3]:
# Label-encode the three nominal columns (airline, source city, destination city).
# Each column gets its own fitted LabelEncoder, kept in `label_encoders`, so the
# integer -> category mapping can still be inspected or inverted later.
# Refitting a single shared encoder would overwrite the earlier mappings.
label_encoders = {}
for column in ("airline", "source_city", "destination_city"):
    le = LabelEncoder()
    data[f"{column}_label"] = le.fit_transform(data[column])
    label_encoders[column] = le
# `le` is left bound to the destination_city encoder, the last one fitted.

In [4]:
# Encode the time-of-day and number-of-stops categories as integers that follow their natural order

def time_label(value):
    """Map a time-of-day category name to its position in the day (0-5).

    The order is Early_Morning, Morning, Afternoon, Evening, Night,
    Late_Night. Any other value yields None.
    """
    day_order = (
        "Early_Morning",
        "Morning",
        "Afternoon",
        "Evening",
        "Night",
        "Late_Night",
    )
    for position, name in enumerate(day_order):
        if value == name:
            return position
    return None

def stops_label(value):
    """Map a stops category ("zero", "one", "two_or_more") to 0, 1 or 2.

    Any other value yields None.
    """
    stop_order = ("zero", "one", "two_or_more")
    for position, name in enumerate(stop_order):
        if value == name:
            return position
    return None
    

# Add one ordinal-encoded column per ordered categorical column.
# Each triple is (new column, source column, encoder function).
for target_col, source_col, encode in (
    ('departure_time_label', 'departure_time', time_label),
    ('arrival_time_label', 'arrival_time', time_label),
    ('stops_label', 'stops', stops_label),
):
    data[target_col] = data[source_col].map(encode)

In [5]:
# Keep only economy-class fares.
# A boolean flag column marks economy rows; the frame is grouped on that flag
# and the True group is taken as the economy subset.
data['class_label'] = data['class'].eq("Economy")
grouped = data.groupby(data['class_label'])
economyData = grouped.get_group(True).drop(columns=['class_label'])
print(economyData.head())

# Drop the raw categorical columns, leaving the numeric/encoded features and price.
# NOTE: `data` is rebound here, so re-running this cell on its own fails
# because the 'class' column no longer exists.
data = economyData.drop(
    columns=[
        'airline', 'flight', 'source_city', 'departure_time',
        'stops', 'arrival_time', 'destination_city', 'class',
    ]
)


    airline   flight source_city departure_time stops   arrival_time  \
0  SpiceJet  SG-8709       Delhi        Evening  zero          Night   
1  SpiceJet  SG-8157       Delhi  Early_Morning  zero        Morning   
2   AirAsia   I5-764       Delhi  Early_Morning  zero  Early_Morning   
3   Vistara   UK-995       Delhi        Morning  zero      Afternoon   
4   Vistara   UK-963       Delhi        Morning  zero        Morning   

  destination_city    class  duration  days_left  price  airline_label  \
0           Mumbai  Economy      2.17          1   5953              4   
1           Mumbai  Economy      2.33          1   5953              4   
2           Mumbai  Economy      2.17          1   5956              0   
3           Mumbai  Economy      2.25          1   5955              5   
4           Mumbai  Economy      2.33          1   5955              5   

   source_city_label  destination_city_label  departure_time_label  \
0                  2                       5        

In [6]:
# Target: price, kept as a one-column DataFrame. Features: every other column.
y = data[['price']]
X = data.drop(columns=['price'])

# 80/20 train/test split with a fixed seed so the split is repeatable.
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

# Flattened 1-D arrays of the target values.
rav_train_Y = Train_Y.to_numpy().ravel()
rav_test_Y = Test_Y.to_numpy().ravel()

print(X.head())
print(y.head())

   duration  days_left  airline_label  source_city_label  \
0      2.17          1              4                  2   
1      2.33          1              4                  2   
2      2.17          1              0                  2   
3      2.25          1              5                  2   
4      2.33          1              5                  2   

   destination_city_label  departure_time_label  arrival_time_label  \
0                       5                     3                   4   
1                       5                     0                   1   
2                       5                     0                   0   
3                       5                     1                   2   
4                       5                     1                   1   

   stops_label  
0            0  
1            0  
2            0  
3            0  
4            0  
   price
0   5953
1   5953
2   5956
3   5955
4   5955


# One-layer neural network

In [11]:
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Input 
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# One-layer neural network: a single Dense unit fed directly by the features.
model = Sequential()

# The input width comes from the training frame rather than a hard-coded 8.
# NOTE(review): ReLU on the only (output) unit clamps predictions at >= 0 and
# gives zero gradient whenever the unit is inactive. create_model() below uses
# a linear output for the same target; confirm ReLU is intended here.
model.add(Dense(1, input_dim=Train_X.shape[1], activation='relu'))

# Define the optimizer (Adam with its default settings)
optimizer = tf.keras.optimizers.Adam()

# Compile the model.
# 'accuracy' is a classification metric and is meaningless for a continuous
# price target, so mean absolute error is tracked instead.
# Keras' built-in MAPE loss is on a 0-100 scale, unlike the fractional value
# returned by the mape() helper used for evaluation.
model.compile(
    optimizer=optimizer,
    loss='mean_absolute_percentage_error',
    metrics=['mean_absolute_error'],
)

# Train the model for 10 epochs
model.fit(Train_X, Train_Y, epochs=10)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x299be6d70>

In [15]:
# Predict prices for the test set with the fitted `model`.
# This is a regression output (one predicted price per row), not class labels.
onelayer_y_pred = model.predict(Test_X)

# Define the MAPE loss function
def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Computes mean(|(y_true - y_pred) / y_true|) over all elements.
    The division is by ``y_true`` itself, with no guard for zero targets.
    """
    relative_error = (y_true - y_pred) / y_true
    return tf.reduce_mean(tf.abs(relative_error))

# Evaluate the one-layer model: MAPE (as a fraction) of its test-set predictions.
onelayer_mape_value = mape(Test_Y, onelayer_y_pred).numpy()

# The value is printed after a literal "- " so it reads as a negated score,
# in the same form as the grid-search report.
print(
    "One-layer model evaluation metrics:",
    "negative mape: - {:.4f}".format(onelayer_mape_value),
    sep="\n",
)


One-layer model evaluation metrics:
negative mape: - 0.5164


# Grid search over a 3-layer neural network

In [16]:
# Define the 3-layer neural network model
def create_model(n_hidden1, n_hidden2, n_hidden3, learning_rate):
    """Build and compile a network with three ReLU hidden layers and one linear output.

    Parameters
    ----------
    n_hidden1, n_hidden2, n_hidden3
        Unit counts of the first, second and third hidden layers.
    learning_rate
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    A compiled ``tf.keras.Sequential`` model that takes 8 input features and
    uses the module-level ``mape`` function as its loss.
    """
    dense = tf.keras.layers.Dense

    net = tf.keras.Sequential()
    # Only the first layer declares the input width (8 features).
    net.add(dense(units=n_hidden1, input_dim=8, activation='relu'))
    for width in (n_hidden2, n_hidden3):
        net.add(dense(units=width, activation='relu'))
    # Linear output unit for the regression target.
    net.add(dense(units=1, activation='linear'))

    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=mape,
    )
    return net

In [20]:
# Hyperparameter grid for the search: two candidate widths for each hidden
# layer and two learning rates (2 * 2 * 2 * 2 = 16 combinations).
param_grid = dict(
    n_hidden1=[64, 128],
    n_hidden2=[64, 128],
    n_hidden3=[64, 128],
    learning_rate=[0.01, 0.1],
)

In [23]:
# Wrap the Keras model builder so scikit-learn can drive it as a regressor.
# NOTE(review): the recorded output shows a warning pointing at this line,
# presumably the deprecation notice for tf.keras.wrappers.scikit_learn.
# Confirm, and consider migrating to a maintained wrapper.
# NOTE: this rebinds `model`, which previously held the one-layer network.
model = tf.keras.wrappers.scikit_learn.KerasRegressor(verbose=0, build_fn=create_model)

# Exhaustive search over param_grid with 3-fold cross-validation.
# No `scoring` is passed, so the estimator's own score is used.
grid_search = GridSearchCV(model, param_grid, cv=3)

# Fit every combination; features and target are cast to float64 first.
grid_result = grid_search.fit(
    Train_X.astype('float64'),
    Train_Y.astype('float64'),
)

# Report the best score (magnitude shown after a literal "-") and its parameters.
print('Best negative MAPE: -', abs(grid_result.best_score_))
print('Best parameters:', grid_result.best_params_)


  model = tf.keras.wrappers.scikit_learn.KerasRegressor(build_fn=create_model, verbose=0)


Best negative MAPE: - 0.25448765357335407
Best parameters: {'learning_rate': 0.01, 'n_hidden1': 64, 'n_hidden2': 128, 'n_hidden3': 128}
