In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Ask scikit-learn to render estimators/pipelines with its 'diagram' display mode.
set_config(display='diagram')

In [2]:
# Load the training workbook into a DataFrame.
# NOTE(review): the path is relative to the working directory; the recorded run of
# this cell failed with FileNotFoundError, so every later cell had no `df`.
df = pd.read_excel('../../dataset/Data_Train.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: '../../dataset/Data_Train.xlsx'

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
# Load the test workbook into its own DataFrame (same relative-path caveat as the training file).
df_test = pd.read_excel('../../dataset/Test_set.xlsx')

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# Remove fully duplicated rows; `df` is rebound, so the original frame is no longer reachable.
df = df.drop_duplicates()

In [None]:
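# Geocoding step (currently disabled): look up latitude/longitude for each row's
# 'Source' and 'Destination' with geopy's Nominatim geocoder.
# NOTE: the distance features further down read the origin_* columns that only this
# loop creates, so they are unavailable while the code below stays commented out.
# Import the required library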
#from geopy.geocoders import Nominatim

# Initialize Nominatim API
#geolocator = Nominatim(user_agent="MyApp")


#for index, row in df.iterrows():
#    location = geolocator.geocode(row['Source'])
#    df.loc[index, "origin_one_latitude"] = location.latitude
#    df.loc[index, "origin_one_longitude"] = location.longitude
#    location = geolocator.geocode(row['Destination'])
#    df.loc[index, "origin_two_latitude"] = location.latitude
#    df.loc[index, "origin_two_longitude"] = location.longitude

In [None]:
# Summarise column dtypes and non-null counts of the de-duplicated training frame.
df.info()

In [None]:
class DateFormatter(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses every column of a DataFrame as datetimes."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X):
        """Return X with ``pd.to_datetime`` applied column by column.

        X is expected to be a DataFrame, because ``DataFrame.apply`` is used.
        """
        return X.apply(pd.to_datetime)

In [None]:
# Try DateFormatter on its own on the three date/time columns and display the parsed result.
# `date_pipe` is rebound to a full Pipeline in a later cell.
date_pipe = DateFormatter()
date_pipe.fit_transform(df[['Date_of_Journey', 'Dep_Time', 'Arrival_Time']])

In [None]:
class DateEncoder(BaseEstimator, TransformerMixin):
    """Expand datetime columns into month and day-of-month features.

    For every input column ``c`` the output holds two integer columns, in this
    order: ``c_month`` and ``c_day``. The suffixes keep the output labels
    unique, mirroring the ``_hour`` / ``_minute`` naming used by the other
    encoders in this notebook.
    """

    def fit(self, X, y=None):
        """Stateless: nothing is learned, the transformer is returned as is."""
        return self

    def transform(self, X):
        """Return a DataFrame of month/day features.

        Parameters
        ----------
        X : pandas.DataFrame
            Every column must have a datetime dtype (the ``.dt`` accessor is used).

        Returns
        -------
        pandas.DataFrame
            Two columns per input column, sharing X's index.
        """
        parts = []
        for col in X.columns:
            stamps = X[col]
            # Name each feature explicitly; otherwise both Series keep the
            # source column name and the result has duplicate labels.
            parts.append(stamps.dt.month.rename(f"{col}_month"))
            parts.append(stamps.dt.day.rename(f"{col}_day"))
        if not parts:
            # pd.concat refuses an empty list; return an empty frame on X's index instead.
            return pd.DataFrame(index=X.index)
        return pd.concat(parts, axis=1)

In [None]:
# Chain parsing (DateFormatter) and feature extraction (DateEncoder), then display the
# result on the three date/time columns.
date_pipe = Pipeline([
        ('date_format', DateFormatter()),
        ('date_enc', DateEncoder())
    ])
date_pipe.fit_transform(df[['Date_of_Journey', 'Dep_Time', 'Arrival_Time']])

In [None]:
class TimeFeaturesEncoder(BaseEstimator, TransformerMixin):
    """Extract the hour and the minute from every column of a time DataFrame.

    Each input column ``c`` is parsed with ``pd.to_datetime``, localized to
    ``time_zone_name`` and replaced by two columns, ``c_hour`` and ``c_minute``.

    Parameters
    ----------
    time_zone_name : str, default 'UTC'
        Time zone passed to ``Series.dt.tz_localize``.
    """

    def __init__(self, time_zone_name='UTC'):
        self.time_zone_name = time_zone_name

    def extract_time_features(self, X):
        """Return a new DataFrame holding only the ``*_hour`` / ``*_minute`` columns.

        X itself is left untouched; the result keeps X's index.
        """
        features = X.copy()
        source_cols = list(features.columns)
        for col in source_cols:
            stamps = pd.to_datetime(features[col]).dt.tz_localize(self.time_zone_name)
            features[f"{col}_hour"] = stamps.dt.hour
            features[f"{col}_minute"] = stamps.dt.minute
        # Only the derived features are returned, so drop the source columns.
        return features.drop(columns=source_cols)

    def fit(self, X, y=None):
        """Stateless: nothing is learned, the transformer is returned as is."""
        return self

    def transform(self, X, y=None):
        """Return the hour and minute features for every column of X."""
        return self.extract_time_features(X)

In [None]:
# Run the time encoder on the departure/arrival columns and display the derived features.
time_enc = TimeFeaturesEncoder()
time_features = time_enc.fit_transform(df[['Dep_Time', 'Arrival_Time']])
time_features

In [None]:
def haversine_vectorized(df,
                         start_lat="origin_one_latitude",
                         start_lon="origin_one_longitude",
                         end_lat="origin_two_latitude",
                         end_lon="origin_two_longitude"):
    """
        Calculates the great circle distance between two points
        on the earth (specified in decimal degrees).
        Vectorized version of the haversine distance for pandas df.
        Computes the distance in kms.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding the four coordinate columns; values are cast to float.
        start_lat, start_lon, end_lat, end_lon : str
            Names of the latitude/longitude columns of the two points. The
            defaults match the column names used by DistanceTransformer.

        Returns
        -------
        pandas.Series
            Distance in kilometres for every row, on df's index.
    """

    lat_1_rad = np.radians(df[start_lat].astype(float))
    lon_1_rad = np.radians(df[start_lon].astype(float))
    lat_2_rad = np.radians(df[end_lat].astype(float))
    lon_2_rad = np.radians(df[end_lon].astype(float))
    dlon = lon_2_rad - lon_1_rad
    dlat = lat_2_rad - lat_1_rad

    # Haversine formula; 6371 is the earth radius in kilometres.
    a = np.sin(dlat / 2.0)**2 + np.cos(lat_1_rad) * np.cos(lat_2_rad) * np.sin(
        dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return 6371 * c

In [None]:
# create a DistanceTransformer
class DistanceTransformer(BaseEstimator, TransformerMixin):
    """
        Turns two pairs of latitude/longitude columns into a haversine distance.
        The output is a one-column DataFrame named 'distance'; X is not modified.
    """

    def __init__(self,
                 start_lat="origin_one_latitude",
                 start_lon="origin_one_longitude",
                 end_lat="origin_two_latitude",
                 end_lon="origin_two_longitude"):
        # Column names of the two points, stored unchanged as estimator parameters.
        self.start_lat = start_lat
        self.start_lon = start_lon
        self.end_lat = end_lat
        self.end_lon = end_lon

    def fit(self, X, y=None):
        """Stateless: nothing is learned, the transformer is returned as is."""
        return self

    def transform(self, X, y=None):
        """Return the row-wise haversine distance as a 'distance' column on X's index."""
        assert isinstance(X, pd.DataFrame)
        distance = haversine_vectorized(X,
                                        start_lat=self.start_lat,
                                        start_lon=self.start_lon,
                                        end_lat=self.end_lat,
                                        end_lon=self.end_lon)
        return distance.to_frame(name="distance")

In [None]:
# Compute the haversine distance from the four coordinate columns.
# NOTE(review): in the visible notebook the origin_* columns are only created by the
# commented-out geocoding cell, so this selection presumably raises KeyError — confirm.
dist_transformer = DistanceTransformer("origin_one_latitude", "origin_one_longitude","origin_two_latitude","origin_two_longitude")
dist_df = dist_transformer.fit_transform(df[["origin_one_latitude", "origin_one_longitude","origin_two_latitude","origin_two_longitude"]])

In [None]:
# Inspect the first 20 computed distances.
dist_df.head(20)

In [None]:
def duration_process(df):
    """Split duration strings such as ``'2h 50m'`` into integer hour/minute columns.

    Accepted forms are ``'<H>h <M>m'``, ``'<H>h'`` (minutes default to 0) and
    ``'<M>m'`` (hours default to 0).

    Parameters
    ----------
    df : pandas.Series, one-column pandas.DataFrame or array-like of str
        The duration values. The input is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``duration_hour`` and ``duration_min`` (int64) on a fresh
        RangeIndex, one row per input value.

    Raises
    ------
    ValueError
        If a value contains neither an hour nor a minute component.
    """
    col = 'duration'
    # Flatten to 1-D and wrap in a new Series: nothing below writes into the
    # caller's data (the element-wise rewrite used before did).
    raw = pd.Series(np.asarray(df).reshape(-1), dtype='str')

    # Digits directly preceding 'h' / 'm'; a missing component yields NaN.
    hours = raw.str.extract(r'(\d+)\s*h', expand=False)
    minutes = raw.str.extract(r'(\d+)\s*m', expand=False)

    unparsed = hours.isna() & minutes.isna()
    if unparsed.any():
        raise ValueError(
            f"Unrecognised duration value(s): {raw[unparsed].unique().tolist()}")

    return pd.DataFrame({
        f'{col}_hour': hours.fillna('0').astype('int64'),
        f'{col}_min': minutes.fillna('0').astype('int64'),
    })

In [None]:
# Wrap duration_process as a scikit-learn transformer and try it on the 'Duration' column.
transformer = FunctionTransformer(duration_process)
transformer.fit_transform(df['Duration']) #df['Duration']

In [None]:

def set_preproc_pipeline():
    """Build the (unfitted) preprocessing pipeline.

    A ColumnTransformer routes the raw columns to four branches and drops
    every other column:

    * ``date_pipe``     - 'Date_of_Journey', 'Dep_Time', 'Arrival_Time' through
      DateFormatter and DateEncoder;
    * ``time_pipe``     - 'Dep_Time', 'Arrival_Time' through TimeFeaturesEncoder,
      then one-hot encoded (dense output, unknown categories ignored);
    * ``dist_pipe``     - the four origin_* coordinate columns through
      DistanceTransformer, then standardized;
    * ``duration_pipe`` - 'Duration' through duration_process.

    The combined output is standardized by a final StandardScaler.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``'preproc'`` (the ColumnTransformer) and ``'stdscaler'``.
    """
    # create date pipeline
    date_pipe = Pipeline([
        ('date_format', DateFormatter()),
        ('date_enc', DateEncoder())
    ])

    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
    # and 1.4 removed the old name. Try the new spelling first and fall back for
    # older releases, so a dense array is produced on every version.
    try:
        one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)

    # create time pipeline
    time_pipe = Pipeline([
        ('time_enc', TimeFeaturesEncoder()),
        ('ohe', one_hot)
    ])

    # create distance pipeline
    dist_pipe = Pipeline([
        ('dist_trans', DistanceTransformer()),
        ('stdscaler', StandardScaler())
    ])

    duration_pipe = FunctionTransformer(duration_process)

    preproc_pipe = ColumnTransformer([('date_pipe', date_pipe, ['Date_of_Journey', 'Dep_Time', 'Arrival_Time']),
                       ('time_pipe', time_pipe, ['Dep_Time', 'Arrival_Time']),
                       ('dist_pipe', dist_pipe, ["origin_one_latitude", "origin_one_longitude","origin_two_latitude","origin_two_longitude"]),
                       ('duration_pipe', duration_pipe, ['Duration'])
                                     ], remainder='drop')
    final_pipe = Pipeline([
        ('preproc', preproc_pipe),
        ('stdscaler', StandardScaler())
    ])

    return final_pipe

In [None]:
# Build the preprocessing pipeline and display it.
preproc_pipeline = set_preproc_pipeline()
preproc_pipeline

In [None]:
# Full model: preprocessing followed by a random-forest regressor.
# NOTE(review): RandomForestRegressor() gets no random_state, so fitted models (and the
# reported RMSE) are not reproducible from run to run.
final_pipe = Pipeline([
        ('preproc', preproc_pipeline),
        ('random_forest_model', RandomForestRegressor())
    ])

In [None]:
# Display the full model pipeline.
final_pipe

In [None]:
# set X and y
# 'Price' is the regression target; every other column stays in the feature frame.
y = df["Price"]
X = df.drop("Price", axis=1)

# Hold out 20% of the rows for evaluation.
# NOTE(review): no random_state is passed, so the split changes on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fits the preprocessing on the full X (train and test rows) and displays the whole result.
# NOTE(review): presumably only a smoke test of the pipeline — confirm.
preproc_pipeline.fit_transform(X)

In [None]:
# Summarise column dtypes and non-null counts of the training split.
X_train.info()

In [None]:
# implement train() function
def train(X_train, y_train, pipeline):
    """Fit ``pipeline`` on the training data and hand the same object back.

    The pipeline is fitted in place via its ``fit`` method; no copy is made.
    """
    pipeline.fit(X_train, y_train)
    return pipeline

In [None]:
def compute_rmse(y_pred, y_true):
    """Root-mean-squared error between predictions and true values."""
    squared_errors = (y_pred - y_true) ** 2
    return np.sqrt(squared_errors.mean())

In [None]:
# implement evaluate() function
def evaluate(X_test, y_test, pipeline):
    """Predict on ``X_test`` and score the predictions against ``y_test``.

    The RMSE is printed and also returned.
    """
    score = compute_rmse(pipeline.predict(X_test), y_test)
    print(score)
    return score

In [None]:
# train the pipeline
# train() fits final_pipe in place and returns that same object.
trained_model = train(X_train, y_train, final_pipe)

## saving model to .joblib
# Persist the fitted pipeline to 'rf_model.joblib' in the current working directory.
# NOTE(review): this call needs `joblib` to be imported — confirm the imports cell provides it.
joblib.dump(trained_model,'rf_model.joblib')

In [None]:
# evaluate the pipeline
# evaluate() prints the RMSE on the held-out split and returns it.
rmse = evaluate(X_test, y_test, final_pipe)