In [90]:
# importing required libraries
import os
import pickle
from time import time

import numpy as np
import pandas as pd

In [2]:
# loading test data: building metadata first, then the two files that
# carry a "timestamp" column which has to be parsed as datetime
building_metadata = pd.read_csv("building_metadata.csv")
test_data, weather_test_data = (
    pd.read_csv(csv_file,parse_dates=["timestamp"])
    for csv_file in ("test.csv","weather_test_data_processed.csv")
)

# Data Pipeline

In this section we build the final data pipeline. It **processes the data, creates features, scales them and predicts using the given model.**


## Defining Utility Functions

In [36]:
def datetime_sine(x,num_unique):
    '''
    Function maps a cyclic datetime value onto a circle split into
    num_unique equal steps and returns the sine of the resulting angle

    x can be a scalar or an array-like (numpy array, pandas Series)
    '''

    # size of one step of the cycle, in degrees
    degrees_per_step = 360/num_unique

    # angle of x on the circle, converted to radians for np.sin
    angle = np.deg2rad(degrees_per_step * x)

    return np.sin(angle)


def datetime_cosine(x,num_unique):
    '''
    Function maps a cyclic datetime value onto a circle split into
    num_unique equal steps and returns the cosine of the resulting angle

    x can be a scalar or an array-like (numpy array, pandas Series)
    '''

    # size of one step of the cycle, in degrees
    degrees_per_step = 360/num_unique

    # angle of x on the circle, converted to radians for np.cos
    angle = np.deg2rad(degrees_per_step * x)

    return np.cos(angle)


def weekend_binary(x):
    '''
    Function returns a binary day-type flag: 0 when x is one of the
    day numbers 0-4 and 1 for any other value
    (with pandas dayofweek numbering, 0-4 are Monday to Friday)
    '''

    weekday_numbers = (0,1,2,3,4)

    return 0 if x in weekday_numbers else 1

## Final Pipeline

In [96]:
def _load_artifact(meter_id,name):
    '''
    Function loads one pickled artifact (encoder, scaler or model)
    from the folder of the given meter, i.e. meter_<meter_id>/<name>
    '''

    # os.path.join keeps the path portable; the previous backslash
    # literals ("meter_0\encoder.pkl") only resolved on Windows and
    # relied on invalid escape sequences (\e, \s, \l)
    file = os.path.join(f"meter_{meter_id}",name)

    # NOTE: pickle.load can execute arbitrary code, so these files must
    # come from a trusted source
    with open(file,"rb") as f:
        return pickle.load(f)


def _add_datetime_features(X):
    '''
    Function adds the cyclic (sine/cosine) datetime features and the
    is_weekend flag, then drops the timestamp and the helper columns
    '''

    timestamps = X["timestamp"].dt

    # (column name, raw values, number of unique values in the cycle);
    # day and month are shifted by one so that they start at 0
    datetime_parts = (
        ("hour",timestamps.hour,24),
        ("day",timestamps.day - 1,31),
        ("dayofweek",timestamps.dayofweek,7),
        ("month",timestamps.month - 1,12),
    )

    # columns are created in the same order as before (raw part, _x, _y)
    # so the column order of the resulting frame does not change
    for name,values,num_unique in datetime_parts:
        X[name] = values
        X[f"{name}_x"] = datetime_sine(X[name],num_unique)
        X[f"{name}_y"] = datetime_cosine(X[name],num_unique)

    # binary feature is_weekend
    X["is_weekend"] = X["dayofweek"].apply(func=weekend_binary)

    # dropping columns not required further
    return X.drop(labels=["timestamp","hour","day","dayofweek","month"],axis=1)


def final_pipeline(X): 
    '''
    Final data pipeline that processes a single data point,
    performs the necessary transformations and prints the
    predicted value together with the run time

    The encoder, scaler and model are chosen by the meter id of
    the first row, so X is expected to hold a single data point
    (or rows that all share one meter)

    =====================================================
    Parameters:
    X: single datapoint with raw features (dataframe with at
       least building_id, timestamp, meter and row_id; it is
       joined with the module-level building_metadata and
       weather_test_data frames)

    =====================================================
    Returns:
    None (the run time and the prediction are printed)

    =====================================================
    Raises:
    IndexError: if the inner merges leave no row, e.g. no
                building or weather record matches X

    =====================================================
    '''

    # starting clock
    start = time()

    ## processing and merging data

    # merging with the building metadata
    X = X.merge(building_metadata,how="inner",on=["building_id"])

    # merging with the weather data
    X = X.merge(weather_test_data,how="inner",on=["timestamp","site_id"])

    # inner joins silently drop unmatched rows; fail with a clear message
    # (same exception type as the bare .values[0] lookup raised before)
    if X.empty:
        raise IndexError("no rows left after merging X with the building metadata and weather data")

    # getting the meter id
    meter_id = X["meter"].values[0]

    # dropping columns not required
    # NOTE(review): "index" presumably comes from the processed weather file - confirm
    X = X.drop(labels=["row_id","year_built","floor_count","index","meter"],axis=1)

    ## engineering date time features
    X = _add_datetime_features(X)

    ## encoding categorical features

    # loading the encoder and transforming the categorical features
    encoder = _load_artifact(meter_id,"encoder.pkl")
    X = encoder.transform(X)

    ## scaling numerical features

    # loading the scaler, standardizing and storing as dataframe
    # under the same column names
    scaler = _load_artifact(meter_id,"scaler.pkl")
    X = pd.DataFrame(data=scaler.transform(X),columns=X.columns)

    ## predicting using best model (lgbm regressor)

    # loading the model and predicting for the first row
    model = _load_artifact(meter_id,"lgbm.pkl")
    y = model.predict(X)[0]

    # exp to reverse log transformation
    # (presumably the target was log1p-transformed for training - confirm)
    y = np.expm1(y)

    # ending clock
    end = time()

    # finding time for function run
    time_difference = round((end - start),2)

    # printing results 
    print(f"Time to run function: {time_difference}s")
    print(f"Predicted Energy Consumption: {y}")

In [102]:
# sampling point from test data; random_state pins the sampled row so
# that re-running the notebook scores the same data point
sample = test_data.sample(n=1,random_state=42)

# running the pipeline
final_pipeline(X=sample)

Time to run function: 0.05s
Predicted Energy Consumption: 74.59897364902699
