#### What is the Prediction Pipeline?
Right now, your trained SGD Regressor is locked inside a .pkl file in your artifacts folder. It is smart, but it is deaf and blind. It cannot speak to users.

The Prediction Pipeline is the translator. It takes raw input from a user (e.g., a taxi driver typing their trip details into an app), formats that data into the exact mathematical shape your model expects, and returns the predicted tip amount.

INFERENCE PART:
 { RAW DATA(UI) -> DATA TRANSFORM(data cleaning + feature engineering ) } -> PREDICT(from .pkl file)  -> output(Tip amount)

Raw Data:
['vendorid', 'ratecodeid', 'pulocationid','dolocationid', 'passenger_count',
       'extra', 'tolls_amount','Date & time',
       'congestion_surcharge', 'airport_fee',
       'trip_distance', 'fare_amount',
       ]  13 Columns

Tranformed Data:
['onehot__vendorid_2', 'onehot__ratecodeid_2.0',
       'onehot__ratecodeid_3.0', 'onehot__ratecodeid_4.0',
       'onehot__ratecodeid_5.0', 'onehot__pickup_dow_1',
       'onehot__pickup_dow_2', 'onehot__pickup_dow_3', 'onehot__pickup_dow_4',
       'onehot__pickup_dow_5', 'onehot__pickup_dow_6', 'target__pulocationid',
       'target__dolocationid', 'standardScaler__passenger_count',
       'standardScaler__extra', 'standardScaler__tolls_amount',
       'standardScaler__congestion_surcharge', 'standardScaler__airport_fee',
       'standardScaler__trip_distance', 'standardScaler__fare_amount',
       'standardScaler__farePerMile', 'standardScaler__pre_tip_amount',
       'standardScaler__avg_speed', 'standardScaler__trip_duration_mins',
       'is_weekend', 'pickup_ampm', 'Is_Airport_Trip', 'is_airport_peak_hour',
       'is_pm_peak_hour', 'PickUpHr_sin', 'PickUpHr_cos'] 30 coLUMNS



In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
%pwd

'e:\\dsProject\\nycTaxiProject\\research'

In [3]:
os.chdir("..")

In [4]:
%pwd

'e:\\dsProject\\nycTaxiProject'

In [5]:
# how to show all the columns in the dataframe
pd.set_option('display.max_columns', None)

In [12]:
df.columns

Index(['onehot__vendorid_2', 'onehot__ratecodeid_2.0',
       'onehot__ratecodeid_3.0', 'onehot__ratecodeid_4.0',
       'onehot__ratecodeid_5.0', 'onehot__pickup_dow_1',
       'onehot__pickup_dow_2', 'onehot__pickup_dow_3', 'onehot__pickup_dow_4',
       'onehot__pickup_dow_5', 'onehot__pickup_dow_6', 'target__pulocationid',
       'target__dolocationid', 'standardScaler__passenger_count',
       'standardScaler__extra', 'standardScaler__tolls_amount',
       'standardScaler__congestion_surcharge', 'standardScaler__airport_fee',
       'standardScaler__trip_distance', 'standardScaler__fare_amount',
       'standardScaler__farePerMile', 'standardScaler__pre_tip_amount',
       'standardScaler__avg_speed', 'standardScaler__trip_duration_mins',
       'is_weekend', 'pickup_ampm', 'Is_Airport_Trip', 'is_airport_peak_hour',
       'is_pm_peak_hour', 'PickUpHr_sin', 'PickUpHr_cos'],
      dtype='object')

In [None]:
['vendorid', 'onehot__ratecodeid_2.0','onehot__pickup_dow_1',
       'pulocationid','dolocationid', 'passenger_count',
       'extra', 'tolls_amount',
       'congestion_surcharge', 'airport_fee',
       'trip_distance', 'fare_amount',
       'farePerMile', 'pre_tip_amount',
       'avg_speed', 'trip_duration_mins',
       'is_weekend', 'pickup_ampm', 'Is_Airport_Trip', 'is_airport_peak_hour',
       'is_pm_peak_hour', 'PickUpHr_sin', 'PickUpHr_cos']

In [13]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler,TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

##### Feature Engineering : Pickup Date time Insight + Encoding

In [None]:
# raw_df.head()

Unnamed: 0,vendorid,passenger_count,trip_distance,ratecodeid,pulocationid,dolocationid,fare_amount,extra,tolls_amount,congestion_surcharge,airport_fee,pickup_dow,is_weekend,pickup_ampm,trip_duration_mins,pre_tip_amount,Is_Airport_Trip,is_airport_peak_hour,is_pm_peak_hour,avg_speed,farePerMile,PickUpHr_sin,PickUpHr_cos
0,2,1.0,1.75,1.0,162,236,12.1,2.5,0.0,2.5,0.0,2,0,1,10.366667,18.6,0,0,1,0.16881,6.914286,-0.965926,-0.258819
1,2,3.0,1.09,1.0,237,142,9.3,1.0,0.0,2.5,0.0,6,1,1,7.483333,14.3,0,0,0,0.145657,8.53211,-0.707107,0.707107
2,1,1.0,3.4,1.0,161,125,21.9,5.0,0.0,2.5,0.0,1,0,1,23.116667,30.9,0,0,1,0.14708,6.441176,-0.866025,-0.5
3,2,1.0,0.59,1.0,239,142,5.1,1.0,0.0,2.5,0.0,6,1,1,2.55,10.1,0,0,0,0.231373,8.644068,-0.866025,0.5
4,2,2.0,19.91,2.0,132,246,70.0,0.0,6.55,2.5,1.25,0,0,1,50.433333,81.8,1,0,0,0.394779,3.515821,-0.707107,-0.707107


In [61]:
import pandas as pd
import numpy as np
import joblib

class Inference_transform:
    def __init__(self, preprocessor_path="artifacts/data_transformation/train/preprocessor.pkl"):
        # Load the preprocessor once when the object is created
        self.preprocessor = joblib.load(preprocessor_path)

    def dateTime_extraction(self, df: pd.DataFrame) -> pd.DataFrame:
        """ Extracting Date & time features """
        df['pickupDateTime'] = pd.to_datetime(df['pickupDateTime'], format='ISO8601')
        df['dropoffDateTime'] = pd.to_datetime(df['dropoffDateTime'], format='ISO8601')

        df['pickup_dow'] = df['pickupDateTime'].dt.dayofweek

        df['is_weekend'] = df['pickup_dow'].isin([5, 6]).astype(int)

        df['pickup_hr'] = df['pickupDateTime'].dt.hour

        df['pickup_ampm'] = (df['pickup_hr'] >= 12).astype(int)

        df['trip_duration_mins'] = (df['dropoffDateTime'] - df['pickupDateTime']).dt.total_seconds() / 60
    
        df = df.drop(columns=['pickupDateTime','dropoffDateTime'])
        return df
    
    def feature_extraction(self, df: pd.DataFrame) -> pd.DataFrame:
        """ Business logic feature engineering """
        df['farePerMile'] = df['fare_amount'] / df['trip_distance']
        
        # Safely handle division by zero if duration is exactly 0
        df['avg_speed'] = df['trip_distance'] / (df['trip_duration_mins'] / 60).replace(0, 0.001)
        
        df['pre_tip_amount'] = (
            df['fare_amount'] + df['extra'] + df['mta_tax'] + 
            df['tolls_amount'] + df['improvement_surcharge'] + 
            df['congestion_surcharge'] + df['airport_fee'] 
        )
        
        df = df.drop(columns=['mta_tax','improvement_surcharge'])
        
        aritport_zones = [1, 132, 138, 139] 
        df['Is_Airport_Trip'] = (
            df['pulocationid'].isin(aritport_zones) | df['dolocationid'].isin(aritport_zones)
        ).astype(int)
        
        df['is_airport_peak_hour'] = df['pickup_hr'].apply(lambda hr: 1 if 4 <= hr <= 6 else 0)
        df['is_pm_peak_hour'] = df['pickup_hr'].apply(lambda hr: 1 if 16 <= hr <= 18 else 0) 
        
        df['PickUpHr_sin'] = np.sin(2 * np.pi * df['pickup_hr'] / 24.0)
        df['PickUpHr_cos'] = np.cos(2 * np.pi * df['pickup_hr'] / 24.0)
        
        df = df.drop(columns=['pickup_hr'])
        return df

    def encode_feature(self, df: pd.DataFrame) -> pd.DataFrame:
        """ Apply the loaded preprocessor """
        # Only transform the data!
        encode_array = self.preprocessor.transform(df)
        names = self.preprocessor.get_feature_names_out()
        encoded_df = pd.DataFrame(encode_array, columns=names)

        # Passthrough columns
        additional_cols = ['is_weekend', 'pickup_ampm', 'Is_Airport_Trip', 'is_airport_peak_hour', 'is_pm_peak_hour', 'PickUpHr_sin', 'PickUpHr_cos']
        transformed_df = pd.concat([encoded_df, df[additional_cols].reset_index(drop=True)], axis=1)

        return transformed_df

    def process(self, input_dict: dict) -> pd.DataFrame:
        """ The master function to orchestrate the pipeline """
        df = pd.DataFrame(input_dict)
        df = self.dateTime_extraction(df)
        df = self.feature_extraction(df)
        final_df = self.encode_feature(df)
        return final_df

# ==========================================
# How to execute this code
# ==========================================

# 1. Note the proper datetime strings
input_dict = {
    'vendorid': [2],
    'ratecodeid': [1.0],
    'pulocationid': [138],
    'dolocationid': [33],
    'passenger_count': [2.0],
    'extra': [6.0],
    'tolls_amount': [0.0],
    'congestion_surcharge': [0.0],
    'improvement_surcharge': [1.0],
    'mta_tax': [0.5],
    'airport_fee': [1.25],
    'trip_distance': [11.43],
    'fare_amount': [44.3],
    'pickupDateTime': ["2023-01-01T00:09:29.000"], 
    'dropoffDateTime': ["2023-01-01T00:29:23.000"]
}


# 2. Instantiate the class
transformer = Inference_transform()

# 3. Call the master process function
model_ready_data = transformer.process(input_dict)

print("Data successfully transformed and ready for the model!")
# prediction = model.predict(model_ready_data)

Data successfully transformed and ready for the model!


In [62]:
model_ready_data.shape

(1, 31)

In [42]:
model_ready_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 31 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   onehot__vendorid_2                    1 non-null      float64
 1   onehot__ratecodeid_2.0                1 non-null      float64
 2   onehot__ratecodeid_3.0                1 non-null      float64
 3   onehot__ratecodeid_4.0                1 non-null      float64
 4   onehot__ratecodeid_5.0                1 non-null      float64
 5   onehot__pickup_dow_1                  1 non-null      float64
 6   onehot__pickup_dow_2                  1 non-null      float64
 7   onehot__pickup_dow_3                  1 non-null      float64
 8   onehot__pickup_dow_4                  1 non-null      float64
 9   onehot__pickup_dow_5                  1 non-null      float64
 10  onehot__pickup_dow_6                  1 non-null      float64
 11  target__pulocationid   

In [43]:
df.shape

(71364, 31)

In [63]:
class Inference_prediction:
    def prediction(df,model=joblib.load("artifacts/model_training/sgdModel.pkl")):
        tip=model.predict(df)
        print(f"Tip amount is {tip}")


In [64]:
Inference_prediction.prediction(model_ready_data)

Tip amount is [14.4826019]


In [57]:
original_df=pd.read_csv("artifacts\data_ingestion\data.csv")
original_df.head()

  original_df=pd.read_csv("artifacts\data_ingestion\data.csv")


Unnamed: 0,vendorid,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecodeid,store_and_fwd_flag,pulocationid,dolocationid,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01T00:32:10.000,2023-01-01T00:40:36.000,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01T00:55:08.000,2023-01-01T01:01:27.000,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01T00:25:04.000,2023-01-01T00:37:49.000,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01T00:03:48.000,2023-01-01T00:13:25.000,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01T00:10:29.000,2023-01-01T00:21:19.000,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [None]:
testing_df = {
    'vendorid': [2],
    'ratecodeid': [1.0],
    'pulocationid': [138],
    'dolocationid': [33],
    'passenger_count': [2.0],
    'extra': [6.0],
    'tolls_amount': [0.0],
    'congestion_surcharge': [0.0],
    'improvement_surcharge': [1.0],
    'mta_tax': [0.5],
    'airport_fee': [1.25],
    'trip_distance': [11.43],
    'fare_amount': [44.3],
    'pickupDateTime': ["2023-01-01T00:09:29.000"], 
    'dropoffDateTime': ["2023-01-01T00:29:23.000"]
}

#  pre_tipamount(66.31-13.26):53.05 ,  y_pred:14.48 , y_true:13.26 , TIP Diff: $1.22
