### This notebook illustrates the inference/scoring process


In [52]:
import os
import math
import pickle
import sys
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from xgboost import plot_importance
import pickle
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict,cross_validate,GridSearchCV,RandomizedSearchCV
from xgboost import plot_importance
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler,PowerTransformer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from vincenty import vincenty
from datetime import datetime
import reverse_geocoder as rg
from shapely.geometry import mapping, shape
from shapely.prepared import prep
from shapely.geometry import Point
import requests

### Inference/Scoring pipeline

- During inference we reuse the objects stored during the training phase (models, transformers, etc.); they must not be fitted again, so the scoring data is only transformed with what was learned in training
- This pipeline would be shorter

Loading data frame and having a look at the data 

In [53]:
# Names of the coordinate columns; the feature-engineering class reads these globals.
lat_1, lon_1 = 'origin_latitude', 'origin_longitude'
lat_2, lon_2 = 'destination_latitude', 'destination_longitude'

# Semicolon-separated scoring data; shipping_date is parsed to datetime on load.
test_df = pd.read_csv(
    '../data/test_data.csv',
    sep=';',
    parse_dates=['shipping_date'],
)
test_df.head()

Unnamed: 0,origin_latitude,origin_longitude,destination_latitude,destination_longitude,weight,loading_meters,is_adr,shipping_date
0,25.33,110.99,22.66,109.6,0.205,0.195,True,2017-01-02
1,26.06,106.17,24.87,111.16,0.155,0.195,True,2017-01-02
2,24.71,107.21,23.42,106.87,0.484,0.195,True,2017-01-02
3,25.76,109.61,23.2,107.02,0.445,0.069,True,2017-01-02
4,26.91,108.67,23.26,109.42,0.497,0.067,True,2017-01-02


### We need to apply the identical treatment, feature engineering and transformations as in training, except that nothing is fitted


In [54]:
class FeatenggTransformer(BaseEstimator, RegressorMixin, TransformerMixin):

    """Stateless feature-engineering step for the shipment data.

    ``transform`` derives date, travel and country features from the raw
    columns and then drops the raw columns. Nothing is learned from the data,
    so ``fit`` is a no-op. The coordinate column names are read from the
    module-level ``lat_1``, ``lon_1``, ``lat_2`` and ``lon_2`` variables.
    """

    # Source of the country polygons used by get_country_feats.
    COUNTRIES_GEOJSON_URL = (
        "https://raw.githubusercontent.com/datasets/geo-countries/"
        "master/data/countries.geojson"
    )
    # Seconds to wait for the GeoJSON download before giving up.
    REQUEST_TIMEOUT = 30
    # Class-level cache of {country name: prepared geometry}. It is kept on
    # the class (not the instance) so the polygons are downloaded and prepared
    # once per session and never end up inside a pickled transformer.
    _country_shapes = None

    def fit(self, X, y=None):

        """No-op fit so that ``fit_transform`` and pipelines work.

        Returns:
            self
        """

        return self

    def transform(self, X):

        """Return a new frame with the engineered features.

        Args:
            X: DataFrame holding ``shipping_date`` (datetime), ``is_adr`` and
                the four coordinate columns.

        Returns:
            A DataFrame with the engineered columns added and the raw
            date/ADR/coordinate columns removed. The caller's frame is left
            untouched.
        """

        # Work on a copy: the helper methods below add columns in place.
        X = X.copy()
        X = self.add_date_feature(X)
        X = self.add_travel_features(X)
        X = self.get_country_feats(X)
        X = self.drop_unrequired_fields(X)
        return X

    @staticmethod
    def _bearing_degrees(lat1, lng1, lat2, lng2):

        """Initial bearing from point 1 to point 2, in degrees.

        Inputs are in degrees; the result comes from ``arctan2`` and is
        therefore in the range [-180, 180].
        """

        lng_delta_rad = np.radians(lng2 - lng1)
        lat1_rad, lat2_rad = np.radians(lat1), np.radians(lat2)
        y = np.sin(lng_delta_rad) * np.cos(lat2_rad)
        x = (np.cos(lat1_rad) * np.sin(lat2_rad)
             - np.sin(lat1_rad) * np.cos(lat2_rad) * np.cos(lng_delta_rad))
        return np.degrees(np.arctan2(y, x))

    def add_travel_features(self, X):

        """Add distance, bearing and midpoint features (modifies ``X`` in place).

        Adds ``abs_diff_longitude``, ``abs_diff_latitude``,
        ``Vincenty_distance`` (computed with ``miles=True``), ``Bearing``,
        ``center_latitude`` and ``center_longitude``.
        """

        X['abs_diff_longitude'] = (X[lon_2] - X[lon_1]).abs()
        X['abs_diff_latitude'] = (X[lat_2] - X[lat_1]).abs()
        # Iterate the four coordinate columns directly instead of building a
        # Series per row with DataFrame.apply(axis=1).
        X['Vincenty_distance'] = [
            vincenty((o_lat, o_lon), (d_lat, d_lon), miles=True)
            for o_lat, o_lon, d_lat, d_lon
            in zip(X[lat_1], X[lon_1], X[lat_2], X[lon_2])
        ]
        X['Bearing'] = self._bearing_degrees(
            X[lat_1], X[lon_1], X[lat_2], X[lon_2])
        X['center_latitude'] = (X[lat_1].values + X[lat_2].values) / 2
        X['center_longitude'] = (X[lon_1].values + X[lon_2].values) / 2
        return X

    def add_date_feature(self, X):

        """Add calendar features from ``shipping_date`` (modifies ``X`` in place).

        Adds ``weekday`` (Monday=0), ``is_weekend``, ``week_in_month``
        (ceil(day / 7)), ``month`` and ``day_count`` (days since 2017-01-01,
        as float).
        """

        ref_date = pd.Timestamp('2017-01-01')
        shipping = X['shipping_date']
        X['weekday'] = shipping.dt.day_of_week
        X["is_weekend"] = np.where(X["weekday"] < 5, 0, 1)
        X['week_in_month'] = np.ceil(shipping.dt.day / 7).astype('int64')
        X['month'] = shipping.dt.month
        X['day_count'] = (shipping - ref_date) / np.timedelta64(1, 'D')
        return X

    def _load_country_shapes(self):

        """Download the country polygons once and return the cached mapping.

        Raises:
            requests.RequestException: on timeout, connection failure or a
                non-2xx HTTP status.
        """

        cls = type(self)
        if cls._country_shapes is None:
            response = requests.get(self.COUNTRIES_GEOJSON_URL,
                                    timeout=self.REQUEST_TIMEOUT)
            # Fail loudly on an HTTP error page instead of trying to parse it.
            response.raise_for_status()
            shapes = {}
            for feature in response.json()["features"]:
                props = feature["properties"]
                # NOTE(review): the URL tracks ``master``; newer revisions of
                # the dataset appear to expose the country name as ``name``
                # rather than ``ADMIN`` - confirm, and consider pinning the
                # URL to the revision used during training.
                country = props["ADMIN"] if "ADMIN" in props else props["name"]
                shapes[country] = prep(shape(feature["geometry"]))
            cls._country_shapes = shapes
        return cls._country_shapes

    def get_country_feats(self, X):

        """Add origin/destination country features (modifies ``X`` in place).

        Adds ``destn_country`` and ``origin_country`` (the name of the first
        polygon containing the point, or ``"unknown"``) and ``diff_country``
        (0 when both names are equal, else 1).
        """

        countries = self._load_country_shapes()

        def country_of(lon, lat):
            # Point takes (x, y), i.e. (longitude, latitude).
            point = Point(lon, lat)
            for country, geom in countries.items():
                if geom.contains(point):
                    return country
            return "unknown"

        # Explicit (lon, lat) pairs avoid positional ``row[0]`` lookups on a
        # label-indexed Series, which pandas has deprecated.
        X['destn_country'] = [
            country_of(lon, lat) for lon, lat in zip(X[lon_2], X[lat_2])]
        X['origin_country'] = [
            country_of(lon, lat) for lon, lat in zip(X[lon_1], X[lat_1])]
        X['diff_country'] = np.where(
            X['origin_country'] == X['destn_country'], 0, 1)
        return X

    def drop_unrequired_fields(self, X):

        """Return ``X`` without the raw date, ADR flag and coordinate columns."""

        return X.drop(['shipping_date','is_adr',lat_1,lon_1,lat_2,lon_2], axis=1)

In [55]:
# Derive the engineered features for the scoring data and preview the result.
fet = FeatenggTransformer()
X = fet.transform(test_df)
X.head()

Unnamed: 0,weight,loading_meters,weekday,is_weekend,week_in_month,month,day_count,abs_diff_longitude,abs_diff_latitude,Vincenty_distance,Bearing,center_latitude,center_longitude,destn_country,origin_country,diff_country
0,0.205,0.195,0,0,1,1,1.0,1.39,2.67,203.685107,-154.277977,23.995,110.295,China,China,0
1,0.155,0.195,0,0,1,1,1.0,4.99,1.19,322.374715,103.713538,25.465,108.665,China,China,0
2,0.484,0.195,0,0,1,1,1.0,0.34,1.29,91.343822,-166.399004,24.065,107.04,China,China,0
3,0.445,0.069,0,0,1,1,1.0,2.59,2.56,240.108206,-136.817956,24.48,108.315,China,China,0
4,0.497,0.067,0,0,1,1,1.0,0.75,3.65,255.595228,169.296974,25.085,109.045,China,China,0


- Please make sure all necessary objects (models, transformers, etc.) are stored at the designated path

In [56]:
# Columns passed to the persisted one-hot encoder.
cat_cols = [
    'week_in_month', 'weekday', 'is_weekend', 'month',
    'destn_country', 'origin_country',
]
# Every remaining column of X, in its original order.
num_cols = [name for name in X.columns if name not in cat_cols]

class FeatureTransformer(BaseEstimator, RegressorMixin, TransformerMixin):

    """Inference-time feature transform.

    One-hot encodes the categorical columns with the encoder that was
    pickled as ``ohe.pkl`` under ``path`` and returns them next to the
    numeric columns. Nothing is fitted here.

    Args:
        cat_cols: names of the columns passed to the stored encoder.
        num_cols: names of the columns passed through unchanged.
        path: directory that holds ``ohe.pkl``.
    """
    
    def __init__(self,cat_cols,num_cols,path):
        self.cat_cols,self.num_cols,self.path = cat_cols,num_cols,path
        # Kept from the original: the artifact directory is created when it is
        # missing. makedirs also copes with nested paths and with the
        # directory appearing between the check and the call.
        if not os.path.exists(self.path):
            os.makedirs(self.path, exist_ok=True)

    def transform(self, X):

        """Return the numeric columns followed by the one-hot encoded columns."""

        X = self.encode_categorical(X)
        return X

    def encode_categorical(self, X):

        """One-hot encode ``cat_cols`` with the unpickled encoder.

        Args:
            X: DataFrame containing ``self.num_cols`` and ``self.cat_cols``.

        Returns:
            DataFrame with ``self.num_cols`` first, then the encoded columns
            named by ``get_feature_names_out``, sharing ``X``'s index.

        Raises:
            FileNotFoundError: if ``ohe.pkl`` is not present under ``path``.
        """

        # NOTE: pickle executes arbitrary code on load; only point ``path`` at
        # artifacts produced by your own training run.
        with open(os.path.join(self.path,'ohe.pkl') ,'rb') as fin:
            ohe = pickle.load(fin)
        encoded = ohe.transform(X[self.cat_cols])
        # An encoder configured for sparse output returns a scipy matrix.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        # Reuse X's index so concat pairs rows correctly even when X does not
        # have a default RangeIndex (e.g. after filtering).
        x_cat_df = pd.DataFrame(
            encoded,
            columns=ohe.get_feature_names_out(self.cat_cols),
            index=X.index,
        )
        # Use the instance's column list, not the notebook-level global.
        return pd.concat([X[self.num_cols], x_cat_df], axis=1)

In [57]:
# Encode the categorical columns with the encoder stored in the artifact folder.
ft = FeatureTransformer(
    cat_cols=cat_cols,
    num_cols=num_cols,
    path='../artifacts_final/',
)
X_transformed = ft.transform(X)
X_transformed.head()

Unnamed: 0,weight,loading_meters,day_count,abs_diff_longitude,abs_diff_latitude,Vincenty_distance,Bearing,center_latitude,center_longitude,diff_country,...,month_7,month_8,month_9,month_10,month_11,month_12,destn_country_China,destn_country_Vietnam,origin_country_China,origin_country_Vietnam
0,0.205,0.195,1.0,1.39,2.67,203.685107,-154.277977,23.995,110.295,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.155,0.195,1.0,4.99,1.19,322.374715,103.713538,25.465,108.665,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.484,0.195,1.0,0.34,1.29,91.343822,-166.399004,24.065,107.04,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.445,0.069,1.0,2.59,2.56,240.108206,-136.817956,24.48,108.315,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.497,0.067,1.0,0.75,3.65,255.595228,169.296974,25.085,109.045,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


### Finally the Model prediction pipeline

- Please make sure all necessary objects (models, transformers, etc.) are stored at the designated path

In [58]:
class Prediction(BaseEstimator,TransformerMixin):

    """Scores a feature matrix with a pickled model and stores the result.

    Args:
        model_path: location of the pickled model.
        data_path: passed to ``DataFrame.to_csv`` as the output target.
    """

    def __init__(self, model_path = None, data_path = None):
        self.data_path = data_path
        self.model_path = model_path

    def prediction(self,X):

        """Load the model, predict on ``X`` and write the predictions as CSV.

        The unpickled model is kept on ``self.model``. The predictions are
        returned as a DataFrame with a single ``cost`` column, and the same
        frame is written to ``data_path`` without its index.
        """

        # NOTE: unpickling runs arbitrary code; load trusted artifacts only.
        with open(self.model_path, 'rb') as model_file:
            self.model = pickle.load(model_file)
        scores = self.model.predict(X)
        result = pd.DataFrame({'cost': list(scores)})
        result.to_csv(self.data_path, index=False)
        return result

In [59]:
# Location of the trained model and of the CSV that receives the predictions.
model_path = os.path.join('../artifacts_final/', 'stacked_model_final.pkl')
data_path = '../cost_prediction.csv'

p = Prediction(model_path, data_path)
pred = p.prediction(X_transformed)


##                                                           Q&A