# Generate PKLs for weekly testing

1. Import sensor and IBM data
2. Sync the data in time
3. Create PKLs for training windows of 1 to 60 weeks (or the maximum time length available)
4. Test across 30 of them 
5. Export and save PKLs

In [1]:
import sqlalchemy as db
import pandas as pd
import numpy as np
import datetime
import pickle

from Namelist import db_info
from Namelist import Account_Station
from Namelist import query_build

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import sys
sys.path.insert(0, '/Users/mborrus/Benchmark/Downscaling#2/Downscaling/')
from olm import *

sys.path.insert(0, '/Users/mborrus/Benchmark/GCP/ds-common/')
import API_Function_Library.SQL_Functions as SQL_bl

# Select which account/station pair from the Namelist this run processes.
AS_Index = 0
AccountUid = Account_Station.AccountUid[AS_Index]
StationUid = Account_Station.StationUid[AS_Index]
Provider = Account_Station.Provider[AS_Index]

# Database credentials come from the Namelist configuration dictionary.
hostname, dbname, uname, pwd = (
    db_info[key] for key in ('hostname', 'dbname', 'uname', 'pwd')
)

# Create SQLAlchemy engine to connect to MySQL Database
engine = db.create_engine(
    "mysql+pymysql://{user}:{pw}@{host}/{db}".format(
        user=uname, pw=pwd, host=hostname, db=dbname
    )
)
connection = engine.connect()

# Load the definitions of the two source tables from the database itself.
metadata = db.MetaData()
Station_Data_table, IBM_Data_table = (
    db.Table(table_name, metadata, autoload=True, autoload_with=engine)
    for table_name in ('Station_Data', 'IBM_Data')
)

#### Import Data
Import sensor data, then import IBM data for one station

In [2]:
## Station Data
# Fetch every Station_Data row recorded for the selected station.
query = db.select([Station_Data_table]).where(Station_Data_table.columns.stationUid == StationUid)
ResultProxy = connection.execute(query)
# Take the column names from the result object rather than from the first row,
# so a station without any rows gives an empty (but correctly labelled) frame
# instead of an IndexError on ResultSet[0].
column_names = list(ResultProxy.keys())
ResultSet = ResultProxy.fetchall()
Station_Data = pd.DataFrame(ResultSet, columns=column_names)
# Chronological order.
Station_Data = Station_Data.sort_values(by=['Local_datetime'])

In [3]:
## IBM Data
# Fetch every IBM_Data row recorded for the selected station.
query = db.select([IBM_Data_table]).where(IBM_Data_table.columns.stationUid == StationUid)
ResultProxy = connection.execute(query)
# Take the column names from the result object rather than from the first row,
# so a station without any rows gives an empty (but correctly labelled) frame
# instead of an IndexError on ResultSet[0].
column_names = list(ResultProxy.keys())
ResultSet = ResultProxy.fetchall()
IBM_Data = pd.DataFrame(ResultSet, columns=column_names)
# Chronological order.
IBM_Data = IBM_Data.sort_values(by=['Local_datetime'])

#### Sync the Data

In [4]:
# Keep only the timestamps present in both sources; columns that exist in both
# frames are disambiguated with the _Sensor / _IBM suffixes.
Training_Data_All = Station_Data.merge(
    IBM_Data,
    on='Local_datetime',
    how='inner',
    suffixes=('_Sensor', '_IBM'),
)

#### Create the pkls

In [48]:
def Train_Model(Input_Data, Training_var, hidden_neurons=15, n_blocks=5):
    """Train an ELM that maps the IBM series onto the sensor series and score it.

    Parameters
    ----------
    Input_Data : pandas.DataFrame
        Must contain the columns ``'<Training_var>_IBM'``,
        ``'<Training_var>_Sensor'`` and ``'Local_datetime'``.  Rows with a
        missing value in *any* column are dropped before training.
    Training_var : str
        Variable prefix, e.g. ``'TEMPERATURE'``.
    hidden_neurons : int, default 15
        Number of hidden neurons handed to ``Elm_train``.
    n_blocks : int, default 5
        The first ``len(data) % n_blocks`` rows are discarded so the number of
        training samples is a multiple of ``n_blocks``.  This appears to
        support the block-wise ``Elm_cross_val`` variant, which is currently
        disabled.

    Returns
    -------
    tuple
        ``(TrainedElm, df_performance_temp)``: the object returned by
        ``Elm_train`` with ``'q1'`` / ``'q3'`` entries added (10th and 90th
        percentile of the sensor values), and a one-row DataFrame of
        MSE / MAE / r2 for the model and for the raw IBM series, plus
        ``weeks``, ``Model_Date`` and ``Days_Since``.

    Raises
    ------
    ValueError
        If ``n_blocks`` is smaller than 1.
    """
    if n_blocks < 1:
        raise ValueError("n_blocks must be a positive integer")

    # NOTE(review): dropna() looks at every column, so a NaN in an unrelated
    # column also removes the row -- confirm this is intended.
    Input_Data = Input_Data.dropna()

    comparison = Input_Data['%s_IBM' % (Training_var)].astype(float)
    comparison_lectures = Input_Data['%s_Sensor' % (Training_var)].astype(float)

    n_rows = len(Input_Data)
    rowstodrop = n_rows % n_blocks

    # Column vectors of shape (n, 1), trimmed to a multiple of n_blocks.
    XData = comparison.to_numpy().reshape(n_rows, 1)[rowstodrop:]
    YData = comparison_lectures.to_numpy().reshape(n_rows, 1)[rowstodrop:]

    # The two trailing 1s are passed through unchanged from the original call;
    # their meaning is defined by Elm_train in the olm module.
    TrainedElm = Elm_train(XData, YData, hidden_neurons, 1, 1)
    # TrainedElm=Elm_cross_val(XData,YData,hn)
    logits = Elm_predict(TrainedElm, XData)
    predictions = logits[0]

    # Sensor percentiles are taken over all non-NaN rows (before trimming) and
    # stored alongside the model.
    TrainedElm['q1'] = comparison_lectures.quantile(0.10)
    TrainedElm['q3'] = comparison_lectures.quantile(0.90)

    # Explicit positional slicing: the frame's index labels are not guaranteed
    # to start at zero after dropna().
    used_times = Input_Data['Local_datetime'].iloc[rowstodrop:]
    weeks = abs(round((used_times.min() - used_times.max()).days / 7))
    ibm_used = comparison.iloc[rowstodrop:]

    # Read the clock once so Model_Date and Days_Since cannot disagree if the
    # call happens to straddle midnight.
    today = datetime.date.today()
    model_date = today
    # model_date = today - datetime.timedelta(days = 1)

    performance_stats = {
        'MSE_Forecast': [mean_squared_error(YData, predictions)],
        'MSE_IBM': [mean_squared_error(YData, ibm_used)],
        'MAE_Forecast': [mean_absolute_error(YData, predictions)],
        'MAE_IBM': [mean_absolute_error(YData, ibm_used)],
        'r2_Forecast': [r2_score(YData, predictions)],
        'r2_IBM': [r2_score(YData, ibm_used)],
        'weeks': weeks,
        'Model_Date': model_date,
        'Days_Since': (today - today).days,
    }
    df_performance_temp = pd.DataFrame(
        performance_stats,
        columns=['MSE_Forecast', 'MSE_IBM', 'MAE_Forecast', 'MAE_IBM',
                 'r2_Forecast', 'r2_IBM', 'weeks', 'Model_Date', 'Days_Since'],
    )
    return (TrainedElm, df_performance_temp)

def Gladiator_Ring(Input_Data, compare_count=20, training_var=None):
    """Train repeatedly on the same data and keep the model with the lowest MSE.

    Parameters
    ----------
    Input_Data : pandas.DataFrame
        Training frame handed unchanged to ``Train_Model`` on every round.
    compare_count : int, default 20
        Number of challenger models trained after the initial candidate.
    training_var : str, optional
        Variable prefix passed to ``Train_Model``.  When omitted, the
        notebook-level ``Training_var`` is used, as before.

    Returns
    -------
    tuple
        ``(model_candidate, perf_candidate)`` of the winning model, in the
        same form ``Train_Model`` returns them.
    """
    if training_var is None:
        # Backward compatible: earlier callers relied on the global variable.
        training_var = Training_var

    model_candidate, perf_candidate = Train_Model(Input_Data, training_var)
    for _ in range(compare_count):
        # Challengers must be trained on the data that was passed in; the
        # previous code referenced an undefined name (ST_cut) here.
        model_gladiator, perf_gladiator = Train_Model(Input_Data, training_var)
        # Ties go to the challenger (<=), matching the original comparison.
        if perf_gladiator['MSE_Forecast'].item() <= perf_candidate['MSE_Forecast'].item():
            model_candidate = model_gladiator
            perf_candidate = perf_gladiator
            print("Are you not entertained")
    return (model_candidate, perf_candidate)

In [49]:
# Length of the synced data set, expressed in whole weeks (rounded).
timestamps = Training_Data_All.Local_datetime
first_date = timestamps.min()
last_date = timestamps.max()
span_days = (last_date - first_date).days
date_range = round(span_days / 7)

In [50]:
# Choose the training-window lengths (in weeks) to build models for.
#
# The previous condition `date_range>=0 & date_range<=8` was parsed as
# `date_range >= (0 & date_range) <= 8` because `&` binds tighter than the
# comparisons, so it was true for every positive span and the final branch
# (which never defined weeks_range) could not be reached.
if date_range <= 0:
    print("issues with dates (0 or neg)")
    weeks_range = []
else:
    # Never look back further than 60 weeks.
    weeks_count = min(date_range, 60)
    if weeks_count <= 8:
        # Short history: one model per week.
        weeks_range = [*range(1, weeks_count + 1)]
    else:
        # Longer history: weekly for the first 7 weeks, then every 2 weeks.
        # NOTE(review): for spans of 9-59 weeks this follows the same pattern
        # as the >=60 case; the original left that case undefined -- confirm.
        weeks_range = [1, 2, 3, 4, 5, 6, 7, *range(8, weeks_count + 1, 2)]


In [51]:
# For every window length: train a candidate plus `compare_count` challengers,
# keep the one with the lowest forecast MSE, pickle it, and record its scores.

# weeks_count = 5; weeks_range = [*range(1,weeks_count+1)];
Training_var = 'TEMPERATURE'
compare_count = 20

# One single-row performance frame per window; joined after the loop.
# (DataFrame.append was removed in pandas 2.0.)
performance_frames = []
File_Names = []

# Fixed once so every window and every file name of this run share the same
# reference day.
Start_date = datetime.date.today()
# Start_date = datetime.date.today() - datetime.timedelta(days = 1)
row_dates = Training_Data_All.Local_datetime.dt.date

for week_length in weeks_range:
    print(str(week_length))
    End_date = Start_date - datetime.timedelta(weeks=week_length)
    # Rows strictly between End_date and Start_date (both ends excluded).
    in_window = (row_dates > End_date) & (row_dates < Start_date)
    Data_cut = Training_Data_All[in_window].reset_index()

    model_candidate, perf_candidate = Train_Model(Data_cut, Training_var)
    for _ in range(compare_count):
        model_gladiator, perf_gladiator = Train_Model(Data_cut, Training_var)
        # Ties go to the challenger (<=).
        if perf_gladiator['MSE_Forecast'].item() <= perf_candidate["MSE_Forecast"].item():
            model_candidate = model_gladiator
            perf_candidate = perf_gladiator
            print("Are you not entertained")
    performance_frames.append(perf_candidate)

    filename = AccountUid+'_'+StationUid+'_'+Training_var+'_'+str(Start_date)+'_'+str(week_length)+".pkl"
    File_Names.append(filename)
    # Context manager closes the file even if pickling fails.
    with open('./pkls/'+filename, "wb") as f:
        pickle.dump(model_candidate, f)

# pd.concat rejects an empty list, so fall back to an empty frame when no
# window was processed.
if performance_frames:
    df_performance_all = pd.concat(performance_frames, ignore_index=True)
else:
    df_performance_all = pd.DataFrame()

df_performance_all['stationUid'] = StationUid
df_performance_all['Metric'] = Training_var
df_performance_all['pkl_file_name'] = File_Names

1
Are you not entertained
Are you not entertained
Are you not entertained
Are you not entertained
Are you not entertained
2
Are you not entertained
Are you not entertained
Are you not entertained
Are you not entertained
3
Are you not entertained
Are you not entertained
Are you not entertained
Are you not entertained
4
Are you not entertained
Are you not entertained
5
Are you not entertained
Are you not entertained
Are you not entertained
Are you not entertained
Are you not entertained
6
Are you not entertained
Are you not entertained
Are you not entertained
7
Are you not entertained
Are you not entertained
8
10
Are you not entertained
Are you not entertained
12
Are you not entertained
Are you not entertained
14
Are you not entertained
Are you not entertained
16
Are you not entertained
Are you not entertained
Are you not entertained
Are you not entertained
18
Are you not entertained
20
Are you not entertained
Are you not entertained
Are you not entertained
Are you not entertained
22
Are

In [35]:
# Display the accumulated per-window performance table.
df_performance_all

Unnamed: 0,MSE_Forecast,MSE_IBM,MAE_Forecast,MAE_IBM,r2_Forecast,r2_IBM,weeks,Model_Date,Days_Since,stationUid,Metric,pkl_file_name
0,1.037224,4.860121,0.785196,1.881674,0.961541,0.819791,1,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
1,3.652501,7.444932,1.322877,2.093707,0.889282,0.774321,2,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
2,7.147951,12.499962,2.157424,2.748317,0.836783,0.714574,3,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
3,6.473779,12.95002,2.03952,2.828847,0.8549,0.709745,4,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
4,5.568011,11.177891,1.813099,2.587185,0.867949,0.734906,5,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
5,5.228374,10.094475,1.769668,2.4568,0.876635,0.761819,6,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
6,4.835717,9.382609,1.673199,2.363026,0.889162,0.784943,7,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
7,5.326162,9.153275,1.717275,2.318663,0.889557,0.810198,8,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
8,4.524101,7.973762,1.502126,2.122982,0.909345,0.840219,10,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
9,4.449808,7.914021,1.54478,2.111611,0.921394,0.860198,12,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...


In [52]:
# Create mysql.connection to do merge tables
connection_out = SQL_bl.create_db_connection(hostname, uname, pwd, dbname) # Connect to the Database

# Stage this run's rows in a scratch table, replacing whatever it held before.
df_performance_all.to_sql(
    'Pkl_Performance_Data_Temp',
    engine,
    if_exists='replace',
    index=False,
)

# Copy the staged rows into <name>_Data, skipping any row that already exists
# with the same weeks, Model_Date, Days_Since, Metric and stationUid.
# The template has fourteen %s slots, all filled with the same table prefix.
name = "Pkl_Performance"
query_merge = '''
INSERT INTO %s_Data
SELECT %s_Data_Temp.*
FROM %s_Data_Temp
WHERE NOT EXISTS(SELECT * FROM %s_Data 
WHERE %s_Data_Temp.weeks = %s_Data.weeks 
AND %s_Data_Temp.Model_Date = %s_Data.Model_Date 
AND %s_Data_Temp.Days_Since = %s_Data.Days_Since
AND %s_Data_Temp.Metric = %s_Data.Metric
AND %s_Data_Temp.stationUid = %s_Data.stationUid);
''' % ((name,) * 14)

SQL_bl.execute_query(connection_out, query_merge) # Execute our defined query

MySQL Database connection successful
Query successful
