In [28]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up live without restarting the kernel.
%reload_ext autoreload
%autoreload 2

from google.cloud import bigquery
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import calendar
from sklearn.linear_model import LinearRegression, Ridge
from datetime import datetime
import numpy as np
import calendar
from tqdm.auto import tqdm
import time
from ts_diagnostics import cross_validation, performance_metrics
from mlflow.models.signature import infer_signature

# Point the MLflow client at the tracking server, then select the experiment
# under which the runs started from this notebook are recorded.
import mlflow
TRACKING_SERVER_HOST = "10.128.0.2:5000"
mlflow.set_tracking_uri("http://" + TRACKING_SERVER_HOST)
mlflow.set_experiment("linear_regression_model")

<Experiment: artifact_location='gs://mlflow-runs-mlops-zoomcamp-354700/2', experiment_id='2', lifecycle_stage='active', name='linear_regression_model', tags={}>

In [44]:
def pull_training_data(data_start, data_end,
                       project_id='mlops-zoomcamp-354700',
                       table='mlops-zoomcamp-354700.energy_data_prod.joined_temp_and_demand'):
    """Load the joined temperature/demand table from BigQuery and keep one time window.

    The whole table is downloaded first; the date filter is applied
    afterwards in pandas on the ``energy_timestamp_mtn`` column.

    Parameters
    ----------
    data_start, data_end
        Lower and upper bound of the window, passed to ``Series.between``
        (both ends inclusive under the pandas default).
    project_id : str, optional
        Google Cloud project used to run the query.
    table : str, optional
        Fully qualified ``project.dataset.table`` name to read from.

    Returns
    -------
    pandas.DataFrame
        Rows of the table whose ``energy_timestamp_mtn`` lies in the window.
    """
    # Table identifiers cannot be bound as query parameters, hence the
    # interpolation; `table` is expected to come from trusted notebook config.
    query = f"SELECT * FROM `{table}`"
    df_raw = pd.read_gbq(query, project_id=project_id)

    in_window = df_raw['energy_timestamp_mtn'].between(data_start, data_end)
    return df_raw[in_window]
    

def trim_data(df, min_val, max_val):
    """Drop rows whose ``energy_demand`` is outside ``[min_val, max_val]``.

    The surviving rows are returned indexed by ``energy_timestamp_mtn``;
    the input frame is left untouched.
    """
    demand_in_range = df['energy_demand'].between(min_val, max_val)
    kept_rows = df.loc[demand_in_range]
    return kept_rows.set_index('energy_timestamp_mtn')


def make_features(df, min_y_val, max_y_val):
    """Build the modelling frame: trimmed target plus calendar and temperature features.

    Steps:
      1. Keep rows whose ``energy_demand`` lies in ``[min_y_val, max_y_val]``
         (via ``trim_data``).
      2. Drop rows with a missing ``energy_demand`` or ``temp_F``.
      3. Derive calendar features from ``energy_timestamp_mtn`` and polynomial
         terms of ``temp_F`` and ``hour``.
      4. Add one 0/1 indicator column per calendar month, named after the
         entries of ``calendar.month_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least ``energy_timestamp_mtn`` (datetime-like),
        ``energy_demand`` and ``temp_F`` columns.
    min_y_val, max_y_val
        Inclusive bounds applied to ``energy_demand``.

    Returns
    -------
    pandas.DataFrame
        Feature frame indexed by ``energy_timestamp_mtn``.
    """
    df_train = (
        trim_data(df, min_y_val, max_y_val)
        .reset_index()
        .dropna(subset=['energy_demand', 'temp_F'])
        .assign(
            year=lambda df_: df_['energy_timestamp_mtn'].dt.year,
            day_of_year=lambda df_: df_['energy_timestamp_mtn'].dt.day_of_year,
            hour=lambda df_: df_['energy_timestamp_mtn'].dt.hour,
            # Saturday has day_of_week == 5, Sunday == 6.
            is_weekend=lambda df_: df_['energy_timestamp_mtn'].dt.day_of_week >= 5,
            is_summer=lambda df_: df_['energy_timestamp_mtn'].dt.month.between(5, 9, inclusive='both'),
            month=lambda df_: df_['energy_timestamp_mtn'].dt.month,
            temp_F_squared=lambda df_: df_['temp_F'] * df_['temp_F'],
            # These two rely on `hour` having been assigned earlier in this call.
            hour_squared=lambda df_: df_['hour'] ** 2,
            hour_cubed=lambda df_: df_['hour'] ** 3,
        )
        .set_index('energy_timestamp_mtn')
    )

    # One indicator column per month. The previous code compared the bound
    # method `index.month_name` (never called) with a string, which is always
    # a scalar False, so every month column was constant. Comparing month
    # numbers also avoids depending on calendar.month_name and pandas using
    # the same locale for month names.
    month_numbers = df_train.index.month
    for month_number, month_label in enumerate(calendar.month_name[1:], start=1):
        df_train[month_label] = (month_numbers == month_number).astype(int)

    return df_train

    

In [41]:
# --- Window of data pulled from BigQuery ---
data_start = datetime(2015, 8, 1)
data_end = datetime(2021, 6, 1)

# --- Window of data the model is actually trained on ---
train_start_date = datetime(2015, 8, 1)
train_end_date = datetime(2021, 1, 1)

# --- Bounds applied to energy_demand before feature building ---
min_y_val = 2_000
max_y_val = 11_000

# --- Columns fed to the model ---
features_to_include = [
    'temp_F',
    'year',
    'day_of_year',
    'hour',
    'is_weekend',
    'is_summer',
    'month',
    'temp_F_squared',
    'hour_squared',
    'hour_cubed',
]

# --- Cross-validation settings (duration strings) ---
cv_horizon = '2 days'
cv_initial = '{} days'.format(4 * 365)
cv_period = '55 days'

# Pull the raw data, then derive the feature frame from it.
df_raw = pull_training_data(data_start, data_end)
df_train = make_features(df_raw, min_y_val, max_y_val)

# Restrict to the training window; both bounds are exclusive.
time_filter = (df_train.index < train_end_date) & (df_train.index > train_start_date)
X = df_train.loc[time_filter, features_to_include]
y = df_train.loc[time_filter, 'energy_demand']
 

In [43]:
with mlflow.start_run():
    model = LinearRegression(fit_intercept=True)

    # Wall-clock time of the fit on the full training window.
    train_start = time.time()
    model.fit(X, y)
    train_end = time.time()

    # Wall-clock time of cross-validation plus metric computation.
    cv_start = time.time()
    df_cv = cross_validation(model, X, y, cv_horizon, cv_period, cv_initial)
    df_p = performance_metrics(df_cv, rolling_window=1)
    # Store the horizon as seconds (a plain number) instead of a timedelta.
    df_p = df_p.assign(horizon=df_p['horizon'].dt.total_seconds())
    cv_end = time.time()

    # Settings recorded alongside the run.
    params = {
        'train_data_start_date': train_start_date,
        'train_data_end_date': train_end_date,
        'cv_initial': cv_initial,
        'cv_horizon': cv_horizon,
        'cv_period': cv_period,
        'features': features_to_include,
    }

    # First row of the metrics table, extended with the two timings.
    metrics = {
        **df_p.to_dict('records')[0],
        'train_duration_minutes': (train_end - train_start) / 60,
        'cv_duration_minutes': (cv_end - cv_start) / 60,
    }

    mlflow.log_params(params)
    mlflow.log_metrics(metrics)

    # Log the fitted model together with its input/output signature.
    signature = infer_signature(X, model.predict(X))
    mlflow.sklearn.log_model(model, artifact_path="models", signature=signature)



Making 10 forecasts with cutoffs between 2019-08-22 23:00:00 and 2020-12-29 23:00:00


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:24<00:00,  2.47s/it]
  inputs = _infer_schema(model_input)


In [34]:
# Display the coefficients of the fitted linear model.
# NOTE(review): presumably ordered like the columns of X passed to fit() — confirm.
# The saved output below has 8 values while the feature list defined earlier
# has 10 entries, so it looks stale; re-run after a kernel restart.
model.coef_

array([ -88.04979886,  137.40104012,    0.66634504,   61.96212739,
       -340.23553386,   51.6999897 ,   -4.07105147,    0.95781905])

In [35]:
# Show the feature names used for the model.
# NOTE(review): the saved output lists 8 names, but the list defined earlier
# in this notebook has 10 — this output appears to predate that edit.
print(features_to_include)

['temp_F', 'year', 'day_of_year', 'hour', 'is_weekend', 'is_summer', 'month', 'temp_F_squared']
