# Calculate the error statistics on the new forecasts

1. Calculate error stats for forecast

In [1]:
from Namelist import db_info
from Namelist import Account_Station

import sys
sys.path.insert(0, '/Users/mborrus/Benchmark/GCP/ds-common/')
import API_Function_Library.Forecast_Functions as api_bl
import API_Function_Library.SQL_Functions as SQL_bl
from olm import *

import sqlalchemy as db
from sqlalchemy import create_engine
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import datetime

from Namelist import Training_vars, Lead_Time_bins

In [2]:
AS_Index = 0;
AccountUid = Account_Station.AccountUid[AS_Index]
StationUid = Account_Station.StationUid[AS_Index]
Provider = Account_Station.Provider[AS_Index]

hostname = db_info['hostname']
dbname = db_info['dbname']
uname = db_info['uname']
pwd = db_info['pwd']

# Create SQLAlchemy engine to connect to MySQL Database
engine = db.create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}"
                       .format(host=hostname, db=dbname, user=uname, pw=pwd))

# in for data from database/table, out for data to database/table 
connection_in = engine.connect() 
connection_out = SQL_bl.create_db_connection(hostname, uname, pwd, dbname) # Connect to the Database

###############

MySQL Database connection successful


#### 2. Import data tables

I want to calculate the errors for the pkls, but I can't compare them to the original statistics because those don't factor in lead time, so I need to create a new table with Lead time averages  

In [3]:
# +---------------------------+
# | Tables_in_training        |
# +---------------------------+
# | BL_Forecast_Data          |
# | BL_Forecast_Data_Temp     |
# | IBM_Data                  |
# | IBM_Data_Temp             |
# | IBM_Forecast_Data         |
# | IBM_Forecast_Data_Temp    |
# | Pkl_Performance_Data      |
# | Pkl_Performance_Data_Temp |
# | Station_Data              |
# | Station_Data_Temp         |
# +---------------------------+
# Pkl_Performance_Data
# +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+------------+------------+--------------------------------------+-------------+--------------------------------------------------------------------------------------------------------+
# | MSE_Forecast       | MSE_IBM            | MAE_Forecast       | MAE_IBM            | r2_Forecast        | r2_IBM             | weeks | Model_Date | Days_Since | stationUid                           | Metric      | pkl_file_name                                                                                          |
# +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+------------+------------+--------------------------------------+-------------+--------------------------------------------------------------------------------------------------------+
# | 1.2481721589836794 | 4.0062474647215645 | 0.9280550517769642 | 1.5798015873014286 | 0.9434939804122955 | 0.8186331139615587 |     1 | 2022-12-06 |          0 | c8a9d548-7fc4-40bb-954e-389988526593 | TEMPERATURE | 2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-7fc4-40bb-954e-389988526593_TEMPERATURE_2022-12-06_1.pkl |

metadata = db.MetaData()

Pkl_table = db.Table('Pkl_Performance_Data', metadata, autoload=True, autoload_with=engine)


#### 3. Calculate Error stats based on training method (no lead time)

The way we currently do this is select all the zero hour forecasts and calculate MAE

#### 4. Calculate Error at different Lead Times

The Pkl_Performance_Data table will be just for the initial statistics (the hindcast), and PKL_Online_Performance_Data will be for the various lead times (a new column), where Days_Since is the days since the model was created (based on Model_Date)

In [4]:
query = db.select([Pkl_table]).where(Pkl_table.columns.stationUid == StationUid)
query = db.select([Pkl_table]).where(db.and_(Pkl_table.columns.stationUid == StationUid,Pkl_table.columns.Model_Date < datetime.date.today()))

Pkl_table_df = SQL_bl.query_to_df(connection_in, query)

In [5]:
# I can query this via pandas by the pkl file name and get the info about model date and days since from there.
Pkl_table_df

Unnamed: 0,MSE_Forecast,MSE_IBM,MAE_Forecast,MAE_IBM,r2_Forecast,r2_IBM,weeks,Model_Date,Days_Since,stationUid,Metric,pkl_file_name
0,1.2481721590,4.0062474647,0.9280550518,1.5798015873,0.9434939804,0.8186331140,1,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
1,5.1473850482,9.4490917214,1.7223712443,2.3821236559,0.8743362457,0.7693181431,2,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
2,7.3664167355,14.2822735179,2.2299767505,2.9556076389,0.8446768573,0.6988539086,3,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
3,6.2766885744,12.8391052373,1.9976866940,2.8114685616,0.8596019467,0.7128126781,4,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
4,5.5141811853,11.0736432440,1.7937908316,2.5655964554,0.8687504773,0.7364231712,5,2022-12-06,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
...,...,...,...,...,...,...,...,...,...,...,...,...
403,6.4350864926,9.5582242703,1.8181300091,2.1250202765,0.9107980693,0.8675057344,52,2022-12-18,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
404,6.2999810052,9.3755908563,1.7962790684,2.1049425967,0.9130363152,0.8705812085,54,2022-12-18,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
405,6.4712589450,9.4340159080,1.8304777994,2.1197285751,0.9095714350,0.8681702389,56,2022-12-18,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
406,6.6451738867,9.4011561868,1.8483352026,2.1159345550,0.9055438521,0.8663696369,58,2022-12-18,0,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...


In [6]:
pkl_index = 0
first_test_pkl = Pkl_table_df.pkl_file_name[pkl_index]
Training_var = Pkl_table_df.Metric[pkl_index]
Training_var

'TEMPERATURE'

In [7]:
# BL_Forecast_Data
# +-----------+---------------------+-------------------+--------------+-------------------+--------------------------------------+-------------+--------------------------------------------------------------------------------------------------------+
# | Lead_Time | Local_datetime      | dirty_forecast    | IBM_forecast | forecast          | stationUid                           | metric      | pkl_file_name                                                                                          |
# +-----------+---------------------+-------------------+--------------+-------------------+--------------------------------------+-------------+--------------------------------------------------------------------------------------------------------+
# |         0 | 2022-12-07 17:00:00 | 11.50144818238914 |           12 | 11.50144818238914 | c8a9d548-7fc4-40bb-954e-389988526593 | TEMPERATURE | 2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-7fc4-40bb-954e-389988526593_TEMPERATURE_2022-12-06_1.pkl |

BL_Forecast_table = db.Table('BL_Forecast_Data', metadata, autoload=True, autoload_with=engine)
query = db.select([BL_Forecast_table]).where(BL_Forecast_table.columns.pkl_file_name == first_test_pkl )
BL_Forecast_table_df = SQL_bl.query_to_df(connection_in, query)

In [9]:
BL_Forecast_table_df

Unnamed: 0,Lead_Time,Local_datetime,dirty_forecast,IBM_forecast,forecast,stationUid,metric,pkl_file_name
0,0,2022-12-08 01:00:00,11.5014481824,12,11.5014481824,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
1,1,2022-12-08 02:00:00,9.4817644618,10,9.4817644618,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
2,2,2022-12-08 03:00:00,7.1034906767,9,7.1034906767,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
3,3,2022-12-08 04:00:00,7.9920253828,8,7.9920253828,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
4,4,2022-12-08 05:00:00,7.9920253828,8,7.9920253828,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
...,...,...,...,...,...,...,...,...
8635,122,2022-12-24 13:00:00,120.1835937500,23,23.0000000000,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
8636,139,2022-12-25 06:00:00,9.4838867188,10,9.4838867188,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
8637,41,2022-12-21 04:00:00,-99265312.8256835938,4,4.0000000000,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
8638,81,2022-12-22 20:00:00,9.4838867188,10,9.4838867188,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...


In [28]:
Training_var = Training_vars[0]
lead_time_bin = 0; #for loop this
query = db.select([BL_Forecast_table]).where(db.and_(BL_Forecast_table.columns.pkl_file_name == first_test_pkl,BL_Forecast_table.columns.Lead_Time.in_(Lead_Time_bins[lead_time_bin])))
Forecast_data_binned = SQL_bl.query_to_df(connection_in, query)
Forecast_data_binned

Unnamed: 0,Lead_Time,Local_datetime,dirty_forecast,IBM_forecast,forecast,stationUid,metric,pkl_file_name
0,0,2022-12-08 01:00:00,11.5014481824,12,11.5014481824,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
1,1,2022-12-08 02:00:00,9.4817644618,10,9.4817644618,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
2,2,2022-12-08 03:00:00,7.1034906767,9,7.1034906767,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
3,3,2022-12-08 04:00:00,7.9920253828,8,7.9920253828,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
4,4,2022-12-08 05:00:00,7.9920253828,8,7.9920253828,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
...,...,...,...,...,...,...,...,...
571,19,2022-12-20 06:00:00,11242354.2807617188,3,3.0000000000,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
572,7,2022-12-19 18:00:00,9.4838867188,10,9.4838867188,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
573,15,2022-12-20 02:00:00,-99265312.8256835938,4,4.0000000000,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
574,11,2022-12-19 22:00:00,-175560.4033203125,6,6.0000000000,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...


In [30]:
conditions = (BL_Forecast_table_df['Lead_Time'] >= min(Lead_Time_bins[lead_time_bin])) & (BL_Forecast_table_df['Lead_Time'] <= max(Lead_Time_bins[lead_time_bin]))
Forecast_data_binned = BL_Forecast_table_df.loc[conditions]

In [31]:
# Station_Data
# +---------------------+------------------------+-------------------+--------------------+----------------+------------+--------------------------------------+------------+
# | Local_datetime      | PRECIPITATION_QUANTITY | RELATIVE_HUMIDITY | TEMPERATURE        | WIND_DIRECTION | WIND_SPEED | stationUid                           | identifier |
# +---------------------+------------------------+-------------------+--------------------+----------------+------------+--------------------------------------+------------+
# | 2022-11-01 00:00:00 |                      0 | 90.96666666666665 | 13.962962962962962 |              0 |          0 | c8a9d548-7fc4-40bb-954e-389988526593 | 80318      |
Station_Data_table = db.Table('Station_Data', metadata, autoload=True, autoload_with=engine)

date_overlap = Forecast_data_binned['Local_datetime']

query = db.select([Station_Data_table]).where(db.and_(Station_Data_table.columns.stationUid == StationUid, Station_Data_table.columns.Local_datetime.in_(date_overlap)))
Sensor_data_binned = SQL_bl.query_to_df(connection_in, query)
Sensor_data_binned

Unnamed: 0,Local_datetime,PRECIPITATION_QUANTITY,RELATIVE_HUMIDITY,TEMPERATURE,WIND_DIRECTION,WIND_SPEED,stationUid,identifier
0,2022-12-08 06:00:00,0E-10,94.6000000000,4.2361111111,0E-10,0E-10,c8a9d548-7fc4-40bb-954e-389988526593,80318
1,2022-12-08 07:00:00,0E-10,95.0500000000,3.9027777778,0E-10,0E-10,c8a9d548-7fc4-40bb-954e-389988526593,80318
2,2022-12-08 16:00:00,0E-10,97.4000000000,3.0833333333,0E-10,0E-10,c8a9d548-7fc4-40bb-954e-389988526593,80318
3,2022-12-08 19:00:00,0E-10,52.4750000000,18.7361111111,203.5000000000,0.3743853375,c8a9d548-7fc4-40bb-954e-389988526593,80318
4,2022-12-08 01:00:00,0E-10,79.0750000000,11.3333333333,97.5000000000,0.0134108181,c8a9d548-7fc4-40bb-954e-389988526593,80318
...,...,...,...,...,...,...,...,...
270,2022-12-19 11:00:00,0E-10,34.5000000000,19.4583333333,170.7500000000,0.0894054537,c8a9d548-7fc4-40bb-954e-389988526593,80318
271,2022-12-19 07:00:00,0E-10,97.1000000000,-0.1111111111,0E-10,0E-10,c8a9d548-7fc4-40bb-954e-389988526593,80318
272,2022-12-19 08:00:00,0E-10,97.4750000000,1.0555555556,0E-10,0E-10,c8a9d548-7fc4-40bb-954e-389988526593,80318
273,2022-12-19 05:00:00,0E-10,96.7750000000,0.2916666667,0E-10,0E-10,c8a9d548-7fc4-40bb-954e-389988526593,80318


In [32]:
# MSE_Forecast       |
#  MSE_IBM            |
#  MAE_Forecast       |
#  MAE_IBM            |
#  r2_Forecast        |
#  r2_IBM             |
#  weeks |
#  Model_Date |
#  Days_Since |
#  stationUid                           |
#  Metric      |
#  pkl_file_name    

Data_All = pd.merge(Sensor_data_binned,Forecast_data_binned, how='inner', on = 'Local_datetime')
IBM = Data_All["IBM_forecast"]
BL = Data_All["forecast"]
Sensor = Data_All[Training_var]

week_og = Pkl_table_df.weeks[pkl_index]
day_og = Pkl_table_df.Model_Date[pkl_index]

# Insert a clause that requires there to be > 24 data points

performance_stats = {
            'MSE_Forecast': [mean_squared_error(Sensor, BL)],
            'MSE_IBM': [mean_squared_error(Sensor, IBM)],
            'MAE_Forecast':[mean_absolute_error(Sensor, BL)],
            'MAE_IBM':[mean_absolute_error(Sensor, IBM)],
            'r2_Forecast':[r2_score(Sensor, BL)],
            'r2_IBM':[r2_score(Sensor, IBM)],
            'weeks': week_og,
            'Model_Date': day_og,
            'Days_Since': abs(day_og - Data_All["Local_datetime"].max().date())
        }
df_performance_temp = pd.DataFrame(performance_stats, columns=['MSE_Forecast','MSE_IBM','MAE_Forecast','MAE_IBM','r2_Forecast','r2_IBM','weeks','Model_Date','Days_Since'])

df_performance_temp['stationUid'] = Pkl_table_df.stationUid[pkl_index] ;
df_performance_temp['Metric'] = Pkl_table_df.Metric[pkl_index];
df_performance_temp['pkl_file_name'] = Pkl_table_df.pkl_file_name[pkl_index];
df_performance_temp['Lead_Time_Bin'] = lead_time_bin; 

df_performance_temp
# Sensor_data_binned[Training_var] 

Unnamed: 0,MSE_Forecast,MSE_IBM,MAE_Forecast,MAE_IBM,r2_Forecast,r2_IBM,weeks,Model_Date,Days_Since,stationUid,Metric,pkl_file_name,Lead_Time_Bin
0,11.527534,12.304233,2.468785,2.673934,0.685491,0.6643,1,2022-12-06,13 days,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,0


In [74]:
abs(day_og - Data_All["Local_datetime"].max().date())

datetime.timedelta(days=2)

In [34]:

# pkl_list = Pkl_table_df.pkl_file_name.unique()
df_performance_all = pd.DataFrame()
BL_Forecast_table = db.Table('BL_Forecast_Data', metadata, autoload=True, autoload_with=engine)
Station_Data_table = db.Table('Station_Data', metadata, autoload=True, autoload_with=engine)

# for pkl_index in range(0,len(Pkl_table_df.pkl_file_name.unique())):
for pkl_index in range(0,2):
    # pkl_index = 0
    print(pkl_index)
    first_test_pkl = Pkl_table_df.pkl_file_name[pkl_index]
    Pkl_table_df

    query = db.select([BL_Forecast_table]).where(BL_Forecast_table.columns.pkl_file_name == first_test_pkl )
    BL_Forecast_table_df = SQL_bl.query_to_df(connection_in, query)

    Training_var = Training_vars[0]
    
    for lead_time_bin in range(0,15):
    # lead_time_bin = 0; #for loop this
        conditions = (BL_Forecast_table_df['Lead_Time'] >= min(Lead_Time_bins[lead_time_bin])) & (BL_Forecast_table_df['Lead_Time'] <= max(Lead_Time_bins[lead_time_bin]))
        Forecast_data_binned = BL_Forecast_table_df.loc[conditions]

        date_overlap = Forecast_data_binned['Local_datetime']

        query = db.select([Station_Data_table]).where(db.and_(Station_Data_table.columns.stationUid == StationUid, Station_Data_table.columns.Local_datetime.in_(date_overlap)))
        Sensor_data_binned = SQL_bl.query_to_df(connection_in, query)
        if len(Sensor_data_binned) > 15:
            Data_All = pd.merge(Sensor_data_binned,Forecast_data_binned, how='inner', on = 'Local_datetime')
            IBM = Data_All["IBM_forecast"]
            BL = Data_All["forecast"]
            Sensor = Data_All[Training_var]

            week_og = Pkl_table_df.weeks[pkl_index]
            day_og = Pkl_table_df.Model_Date[pkl_index]

            # Insert a clause that requires there to be > 24 data points

            performance_stats = {
                        'MSE_Forecast': [mean_squared_error(Sensor, BL)],
                        'MSE_IBM': [mean_squared_error(Sensor, IBM)],
                        'MAE_Forecast':[mean_absolute_error(Sensor, BL)],
                        'MAE_IBM':[mean_absolute_error(Sensor, IBM)],
                        'r2_Forecast':[r2_score(Sensor, BL)],
                        'r2_IBM':[r2_score(Sensor, IBM)],
                        'weeks': week_og,
                        'Model_Date': day_og,
                        'Days_Since': abs((day_og - Data_All["Local_datetime"].max().date()).days)
                    }
            df_performance_temp = pd.DataFrame(performance_stats, columns=['MSE_Forecast','MSE_IBM','MAE_Forecast','MAE_IBM','r2_Forecast','r2_IBM','weeks','Model_Date','Days_Since'])

            df_performance_temp['stationUid'] = Pkl_table_df.stationUid[pkl_index] ;
            df_performance_temp['Metric'] = Pkl_table_df.Metric[pkl_index];
            df_performance_temp['pkl_file_name'] = Pkl_table_df.pkl_file_name[pkl_index];
            df_performance_temp['Lead_Time_Bin'] = lead_time_bin; 
            df_performance_all = df_performance_all.append(df_performance_temp, ignore_index = True)

df_performance_all
# Sensor_data_binned[Training_var] 

0
No rows returned
No rows returned
No rows returned
1
No rows returned
No rows returned
No rows returned
No rows returned


Unnamed: 0,MSE_Forecast,MSE_IBM,MAE_Forecast,MAE_IBM,r2_Forecast,r2_IBM,weeks,Model_Date,Days_Since,stationUid,Metric,pkl_file_name,Lead_Time_Bin
0,11.527534,12.304233,2.468785,2.673934,0.685491,0.6643,1,2022-12-06,13,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,0
1,20.044059,19.951418,3.119765,3.307231,0.460381,0.462875,1,2022-12-06,13,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,1
2,36.888794,33.963966,4.085668,4.164557,-0.032213,0.049629,1,2022-12-06,13,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,2
3,50.024992,44.927408,5.151659,5.045024,-0.42983,-0.284129,1,2022-12-06,13,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,3
4,67.983126,59.678528,6.408894,6.115686,-0.83184,-0.608068,1,2022-12-06,13,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,4
5,74.860196,64.816052,7.038666,6.667724,-0.792835,-0.552286,1,2022-12-06,13,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,5
6,82.716555,74.254967,7.838853,7.528239,-0.879212,-0.686975,1,2022-12-06,13,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,6
7,79.50041,73.644876,7.82907,7.612042,-0.757171,-0.627748,1,2022-12-06,13,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,7
8,86.125011,81.190242,8.300356,8.209483,-0.695954,-0.59878,1,2022-12-06,13,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,8
9,65.93699,66.499382,7.265846,7.394187,-0.468561,-0.481086,1,2022-12-06,13,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...,9


In [119]:
# Create mysql.connection to do merge tables
connection_out = SQL_bl.create_db_connection(hostname, uname, pwd, dbname) # Connect to the Database

df_performance_all.to_sql('Pkl_Online_Performance_Data', engine, index=False, if_exists = 'replace')

name = "Pkl_Online_Performance";
query_merge = '''
INSERT INTO %s_Data
SELECT %s_Data_Temp.*
FROM %s_Data_Temp
WHERE NOT EXISTS(SELECT * FROM %s_Data 
WHERE %s_Data_Temp.pkl_file_name = %s_Data.pkl_file_name 
AND %s_Data_Temp.Days_Since = %s_Data.Days_Since 
AND %s_Data_Temp.Lead_Time_Bin = %s_Data.Lead_Time_Bin);
''' % (name,name,name,name,name,name,name,name,name,name)

SQL_bl.execute_query(connection_out, query_merge) # Execute our defined query

MySQL Database connection successful
Query successful


In [108]:
pkl_index = 3
first_test_pkl = Pkl_table_df.pkl_file_name[pkl_index]
print(first_test_pkl)
query = db.select([BL_Forecast_table]).where(BL_Forecast_table.columns.pkl_file_name == first_test_pkl )
BL_Forecast_table_df = SQL_bl.query_to_df(connection_in, query)
BL_Forecast_table_df

2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-7fc4-40bb-954e-389988526593_TEMPERATURE_2022-12-06_4.pkl


Unnamed: 0,Lead_Time,Local_datetime,dirty_forecast,IBM_forecast,forecast,stationUid,metric,pkl_file_name
0,0,2022-12-08 13:00:00,14.7843322754,15,14.7843322754,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
1,1,2022-12-08 14:00:00,14.7843322754,15,14.7843322754,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
2,2,2022-12-08 15:00:00,13.4525756836,14,13.4525756836,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
3,3,2022-12-08 16:00:00,13.4525756836,14,13.4525756836,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
4,4,2022-12-08 17:00:00,8.2496337891,11,8.2496337891,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
...,...,...,...,...,...,...,...,...
355,355,2022-12-23 08:00:00,7.4783020020,10,7.4783020020,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
356,356,2022-12-23 09:00:00,8.9312133789,12,8.9312133789,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
357,357,2022-12-23 10:00:00,11.1656188965,13,11.1656188965,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...
358,358,2022-12-23 11:00:00,14.7843322754,15,14.7843322754,c8a9d548-7fc4-40bb-954e-389988526593,TEMPERATURE,2a6997a7-a9a2-4b2e-b87a-656d6e85cbaf_c8a9d548-...


In [None]:
len(Pkl_table_df.pkl_file_name.unique())

102