In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import set_config

set_config(display='diagram')

In [2]:
df = pd.read_excel('../../dataset/Data_Train.xlsx')

In [3]:
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
df_test = pd.read_excel('../../dataset/Test_set.xlsx')

In [5]:
df_test.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [6]:
df = df.drop_duplicates()

In [34]:
# Import the required library
from geopy.geocoders import Nominatim

# Initialize Nominatim API
geolocator = Nominatim(user_agent="MyApp")


for index, row in df.iterrows():
    location = geolocator.geocode(row['Source'])
    df.loc[index, "origin_one_latitude"] = location.latitude
    df.loc[index, "origin_one_longitude"] = location.longitude
    location = geolocator.geocode(row['Destination'])
    df.loc[index, "origin_two_latitude"] = location.latitude
    df.loc[index, "origin_two_longitude"] = location.longitude

KeyboardInterrupt: 

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4751 entries, 0 to 4800
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Airline               4751 non-null   object 
 1   Date_of_Journey       4751 non-null   object 
 2   Source                4751 non-null   object 
 3   Destination           4751 non-null   object 
 4   Route                 4751 non-null   object 
 5   Dep_Time              4751 non-null   object 
 6   Arrival_Time          4751 non-null   object 
 7   Duration              4751 non-null   object 
 8   Total_Stops           4751 non-null   object 
 9   Additional_Info       4751 non-null   object 
 10  Price                 4751 non-null   int64  
 11  origin_one_latitude   4751 non-null   float64
 12  origin_one_longitude  4751 non-null   float64
 13  origin_two_latitude   4751 non-null   float64
 14  origin_two_longitude  4751 non-null   float64
dtypes: float64(4), int64(

In [56]:
class DateFormatter(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xdate = X.apply(pd.to_datetime)
        return Xdate

In [57]:
date_pipe = DateFormatter()
date_pipe.fit_transform(df[['Date_of_Journey', 'Dep_Time', 'Arrival_Time']])

Unnamed: 0,Date_of_Journey,Dep_Time,Arrival_Time
0,2019-03-24,2022-03-07 22:20:00,2022-03-22 01:10:00
1,2019-01-05,2022-03-07 05:50:00,2022-03-07 13:15:00
2,2019-09-06,2022-03-07 09:25:00,2022-06-10 04:25:00
3,2019-12-05,2022-03-07 18:05:00,2022-03-07 23:30:00
4,2019-01-03,2022-03-07 16:50:00,2022-03-07 21:35:00
...,...,...,...
4796,2019-06-06,2022-03-07 14:05:00,2022-03-07 23:35:00
4797,2019-05-18,2022-03-07 17:00:00,2022-03-07 19:45:00
4798,2019-06-27,2022-03-07 18:45:00,2022-06-28 19:15:00
4799,2019-04-27,2022-03-07 10:30:00,2022-03-07 13:20:00


In [58]:
class DateEncoder(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        new_cols = []
        for col in X_copy.columns: 
            new_cols.append(X_copy[col].dt.month)
            new_cols.append(X_copy[col].dt.day)
        return pd.concat(new_cols, axis=1)

In [59]:
date_pipe = Pipeline([
        ('date_format', DateFormatter()),
        ('date_enc', DateEncoder())
    ])
date_pipe.fit_transform(df[['Date_of_Journey', 'Dep_Time', 'Arrival_Time']])

Unnamed: 0,Date_of_Journey,Date_of_Journey.1,Dep_Time,Dep_Time.1,Arrival_Time,Arrival_Time.1
0,3,24,3,7,3,22
1,1,5,3,7,3,7
2,9,6,3,7,6,10
3,12,5,3,7,3,7
4,1,3,3,7,3,7
...,...,...,...,...,...,...
4796,6,6,3,7,3,7
4797,5,18,3,7,3,7
4798,6,27,3,7,6,28
4799,4,27,3,7,3,7


In [60]:
class TimeFeaturesEncoder(BaseEstimator, TransformerMixin):
    """Extract the day of week (dow), the hour, the month and the year from a time column."""

    def __init__(self, time_zone_name='UTC'):
#         self.time_column = time_column
        self.time_zone_name = time_zone_name

    def extract_time_features(self, X):
        timezone_name = self.time_zone_name
#         time_column = self.time_column
        df = X.copy()
#         df.index = df[time_column].apply(pd.to_datetime)
        old_cols = list(df.columns)
        for col in old_cols:
            df[col] = pd.to_datetime(df[col])
            df[col] = df[col].dt.tz_localize(timezone_name)
            df[f"{col}_hour"] = df[col].dt.hour
            df[f"{col}_minute"] = df[col].dt.minute
        df.drop(columns=old_cols, inplace=True)
        return df
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        """Returns a copy of the DataFrame X with only four columns: 'hour', 'month'"""
        return self.extract_time_features(X)#.reset_index(drop=True)

In [61]:
time_enc = TimeFeaturesEncoder()
time_features = time_enc.fit_transform(df[['Dep_Time', 'Arrival_Time']])
time_features

Unnamed: 0,Dep_Time_hour,Dep_Time_minute,Arrival_Time_hour,Arrival_Time_minute
0,22,20,1,10
1,5,50,13,15
2,9,25,4,25
3,18,5,23,30
4,16,50,21,35
...,...,...,...,...
4796,14,5,23,35
4797,17,0,19,45
4798,18,45,19,15
4799,10,30,13,20


In [62]:
def haversine_vectorized(df,
                         start_lat="origin_one_latitude",
                         start_lon="origine_one_longitude",
                         end_lat="origin_two_latitude",
                         end_lon="origin_two_longitude"):
    """ 
        Calculates the great circle distance between two points 
        on the earth (specified in decimal degrees).
        Vectorized version of the haversine distance for pandas df.
        Computes the distance in kms.
    """

    lat_1_rad, lon_1_rad = np.radians(df[start_lat].astype(float)), np.radians(
        df[start_lon].astype(float))
    lat_2_rad, lon_2_rad = np.radians(df[end_lat].astype(float)), np.radians(
        df[end_lon].astype(float))
    dlon = lon_2_rad - lon_1_rad
    dlat = lat_2_rad - lat_1_rad

    a = np.sin(dlat / 2.0)**2 + np.cos(lat_1_rad) * np.cos(lat_2_rad) * np.sin(
        dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return 6371 * c

In [63]:
# create a DistanceTransformer
class DistanceTransformer(BaseEstimator, TransformerMixin):
    """
        Computes the haversine distance between two GPS points.
        Returns a copy of the DataFrame X with only one column: 'distance'.
    """

    def __init__(self,
                 start_lat="origin_one_latitude",
                 start_lon="origin_one_longitude",
                 end_lat="origin_two_latitude",
                 end_lon="origin_two_longitude"):
        self.start_lat = start_lat
        self.start_lon = start_lon
        self.end_lat = end_lat
        self.end_lon = end_lon

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        assert isinstance(X, pd.DataFrame)
        X_ = X.copy()
        X_["distance"] = haversine_vectorized(X_,
                                              start_lat=self.start_lat,
                                              start_lon=self.start_lon,
                                              end_lat=self.end_lat,
                                              end_lon=self.end_lon)
        return X_[['distance']]

In [64]:
dist_transformer = DistanceTransformer("origin_one_latitude", "origin_one_longitude","origin_two_latitude","origin_two_longitude")
dist_df = dist_transformer.fit_transform(df[["origin_one_latitude", "origin_one_longitude","origin_two_latitude","origin_two_longitude"]])

In [65]:
dist_df.head(20)

Unnamed: 0,distance
0,1741.146394
1,1561.809022
2,2083.994365
3,1561.809022
4,1741.146394
5,1561.809022
6,1741.146394
7,1741.146394
8,1741.146394
9,2083.994365


In [66]:
def duration_process(df):
    duration_obj = df.values.reshape(-1)
    print(duration_obj)
    col = 'duration'
    for i in range(len(duration_obj)):
        if len(duration_obj[i].split(' ')) == 2:
            pass
        else:
            if 'h' in duration_obj[i]:
                duration_obj[i] = duration_obj[i] + ' ' + '0m'
            else:
                duration_obj[i] = '0h' + ' ' + duration_obj[i]
    df = pd.DataFrame(duration_obj, dtype='str')
    df[f'{col}_hour'] = df[0].str.split("h ", n=1,
                                          expand=True)[0].astype('int64')
    df[f'{col}_min'] = df[0].str.split(
        " ", n=2, expand=True)[1].str.strip('m').astype('int64')
    df.drop(columns=0, inplace=True)
    return df

In [67]:
transformer = FunctionTransformer(duration_process)
transformer.fit_transform(df['Duration']) #df['Duration']

['2h 50m' '7h 25m' '19h 0m' ... '24h 30m' '2h 50m' '2h 55m']


Unnamed: 0,duration_hour,duration_min
0,2,50
1,7,25
2,19,0
3,5,25
4,4,45
...,...,...
4746,9,30
4747,2,45
4748,24,30
4749,2,50


In [68]:

def set_preproc_pipeline():
    # create date pipeline
    date_pipe = Pipeline([
        ('date_format', DateFormatter()),
        ('date_enc', DateEncoder())
    ])
        
    # create time pipeline
    time_pipe = Pipeline([
        ('time_enc', TimeFeaturesEncoder()),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ])
    
    dist_pipe = Pipeline([
        ('dist_trans', DistanceTransformer()),
        ('stdscaler', StandardScaler())
    ])
    
    duration_pipe = FunctionTransformer(duration_process)
    
    preproc_pipe = ColumnTransformer([('date_pipe', date_pipe, ['Date_of_Journey', 'Dep_Time', 'Arrival_Time']),
                       ('time_pipe', time_pipe, ['Dep_Time', 'Arrival_Time']), 
                       ('dist_pipe', dist_pipe, ["origin_one_latitude", "origin_one_longitude","origin_two_latitude","origin_two_longitude"]),
                       ('duration_pipe', duration_pipe, ['Duration'])
                                     ], remainder='drop')
    final_pipe = Pipeline([
        ('preproc', preproc_pipe),
        ('stdscaler', StandardScaler())
    ])

# display time pipeline
    return final_pipe

In [69]:
preproc_pipeline = set_preproc_pipeline()
preproc_pipeline

In [70]:
final_pipe = Pipeline([
        ('preproc', preproc_pipeline),
        ('random_forest_model', RandomForestRegressor())
    ])

In [71]:
final_pipe

In [72]:
# set X and y
y = df["Price"]
X = df.drop("Price", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

preproc_pipeline.fit_transform(X)

['2h 50m' '7h 25m' '19h 0m' ... '24h 30m' '2h 50m' '2h 55m']


array([[-0.85049609,  1.29231854,  0.        , ..., -0.03079157,
        -0.97741325,  1.28106698],
       [-1.52434032, -0.87054675,  0.        , ..., -0.50263212,
        -0.3820836 , -0.20689244],
       [ 1.1710366 , -0.75671174,  0.        , ...,  0.87124893,
         1.04670757, -1.69485187],
       ...,
       [ 0.16027025,  1.63382359,  0.        , ...,  0.87124893,
         1.64203722,  0.09069944],
       [-0.51357398,  1.63382359,  0.        , ..., -0.01980042,
        -0.97741325,  1.28106698],
       [-0.85049609,  0.60930845,  0.        , ...,  0.87124893,
        -0.97741325,  1.57865887]])

In [73]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3800 entries, 4334 to 2740
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Airline               3800 non-null   object 
 1   Date_of_Journey       3800 non-null   object 
 2   Source                3800 non-null   object 
 3   Destination           3800 non-null   object 
 4   Route                 3800 non-null   object 
 5   Dep_Time              3800 non-null   object 
 6   Arrival_Time          3800 non-null   object 
 7   Duration              3800 non-null   object 
 8   Total_Stops           3800 non-null   object 
 9   Additional_Info       3800 non-null   object 
 10  origin_one_latitude   3800 non-null   float64
 11  origin_one_longitude  3800 non-null   float64
 12  origin_two_latitude   3800 non-null   float64
 13  origin_two_longitude  3800 non-null   float64
dtypes: float64(4), object(10)
memory usage: 445.3+ KB


In [74]:
# implement train() function
def train(X_train, y_train, pipeline):
    '''returns a trained pipelined model'''
    pipeline.fit(X_train, y_train)
    return pipeline

In [78]:
def compute_rmse(y_pred, y_true):
    return np.sqrt(((y_pred - y_true)**2).mean())

In [75]:
# implement evaluate() function
def evaluate(X_test, y_test, pipeline):
    '''returns the value of the RMSE'''
    y_pred = pipeline.predict(X_test)
    rmse = compute_rmse(y_pred, y_test)
    print(rmse)
    return rmse

In [79]:
# train the pipeline
train(X_train, y_train, final_pipe)

# evaluate the pipeline
rmse = evaluate(X_test, y_test, final_pipe)

['2h 25m' '2h 25m' '1h 25m' ... '11h 5m' '26h 35m' '2h 50m']
['1h 15m' '9h 50m' '7h 30m' '9h 10m' '25h 15m' '6h 35m' '10h 0m' '2h 50m'
 '2h 30m' '26h 10m' '2h 45m' '2h 55m' '9h 0m' '9h 25m' '15h 35m' '4h 35m'
 '10h 25m' '6h 35m' '4h 35m' '2h 15m' '5h 30m' '13h 15m' '22h 0m'
 '10h 30m' '11h 30m' '2h 30m' '2h 50m' '2h 50m' '24h 35m' '9h 30m'
 '8h 30m' '8h 40m' '5h 0m' '2h 45m' '4h 55m' '2h 40m' '7h 15m' '12h 0m'
 '7h 15m' '24h 0m' '2h 35m' '12h 10m' '11h 25m' '16h 45m' '10h 25m'
 '8h 10m' '14h 55m' '14h 50m' '2h 50m' '14h 25m' '2h 40m' '1h 25m'
 '33h 15m' '22h 15m' '1h 30m' '15h 10m' '1h 30m' '3h 0m' '24h 55m'
 '23h 35m' '2h 50m' '14h 25m' '14h 25m' '2h 45m' '6h 20m' '21h 5m'
 '2h 50m' '5h 30m' '26h 40m' '22h 35m' '3h 5m' '4h 30m' '25h 50m'
 '15h 15m' '15h 45m' '2h 20m' '2h 50m' '7h 20m' '7h 35m' '2h 35m' '2h 45m'
 '26h 30m' '3h 0m' '3h 0m' '10h 15m' '11h 0m' '2h 30m' '26h 30m' '13h 20m'
 '7h 40m' '23h 10m' '9h 35m' '3h 5m' '10h 15m' '26h 0m' '9h 30m' '14h 35m'
 '13h 15m' '2h 55m' '3h 15