In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import MaxPooling1D,Flatten, Dense, LSTM, Conv1D, TimeDistributed, GRU
from tensorflow.keras.optimizers import Adam
import pandas as pd
import model_fun
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
#https://machinelearningmastery.com/cnn-long-short-term-memory-networks/


from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

import tensorflow as tf
from pathlib import Path

from sklearn.model_selection import GridSearchCV

import itertools

import model_fun as fun

from adapt.feature_based import CORAL
from adapt.instance_based import KLIEP
from adapt.instance_based import KMM
from adapt.instance_based import TrAdaBoostR2
from adapt.instance_based import NearestNeighborsWeighting
from adapt.feature_based import FA

import tensorflow as tf

from sklearn.neural_network import MLPRegressor

import time
import json

import calendar

!python --version

### Required Functions

In [65]:
def calc_kpi(test_X,test_y,pred_result, title="NOTDEFINED",additional_info=None,output=False):
    """Compute RMSE, NRMSE and MAPE for a prediction and store them in ``DICT_METRICS``.

    Parameters
    ----------
    test_X : unused; kept so that existing call sites keep working.
    test_y : observed target values. May be a single-column DataFrame, a Series
        or an array; for 2-D input the first column defines the value range
        used for the NRMSE.
    pred_result : predicted values, aligned with ``test_y``.
    title : key under which the metrics are stored in ``DICT_METRICS``.
    additional_info : optional tuple appended to the stored metrics tuple.
    output : when True, the metrics are also printed.

    Side effects
    ------------
    Sets ``DICT_METRICS[title]`` (module-level dict) to
    ``(rmse, nrmse, mape)`` or ``(rmse, nrmse, mape) + additional_info``.
    Nothing is returned.
    """
    # All metrics are rounded to three decimals; the NRMSE is derived from the
    # already rounded RMSE.
    rmse = float(format(np.sqrt(mean_squared_error(test_y, pred_result)), '.3f'))
    # sklearn reports MAPE as a fraction (it is not multiplied by 100).
    mape = float(format(mean_absolute_percentage_error(test_y, pred_result), '.3f'))

    # Range (max - min) of the first target column, ignoring NaN. Working on a
    # plain array avoids positional indexing on a label-indexed Series and also
    # accepts Series / ndarray targets.
    y_values = np.asarray(test_y, dtype=float)
    if y_values.ndim > 1:
        y_values = y_values[:, 0]
    value_range = np.nanmax(y_values) - np.nanmin(y_values)
    nrmse = float(format(rmse / value_range * 100, '.3f'))

    if output:
        print("-----------------------------------------")
        print(title)
        print("RMSE:\t\t\t\t"+ str(rmse))
        print("NRMSE:\t\t\t\t"+ str(nrmse))
        print("MAPE:\t\t\t\t"+ str(mape))
        print("-----------------------------------------")

    metrics = (rmse,nrmse,mape)
    if additional_info is not None:
        metrics = metrics + additional_info
    DICT_METRICS[title] = metrics
    
# Helpers for finding the features shared by several stations (only the common features are selected)
def find_intersection(lists)-> list:
    """Return the items that occur in every list of ``lists``.

    The result keeps the order of first appearance in ``lists[0]`` (without
    duplicates), so it is identical from run to run; iterating a plain set of
    strings would not guarantee that. An empty ``lists`` yields ``[]``.
    """
    if not lists:
        return []
    common = set(lists[0]).intersection(*lists[1:])
    # dict.fromkeys removes duplicates while preserving the original order.
    return [item for item in dict.fromkeys(lists[0]) if item in common]

def get_colname_intersection(dict_stations, li_station_names):
    """Return the column names that all requested stations have in common.

    ``dict_stations`` maps a station key to an object exposing ``.keys()``
    (the notebook passes DataFrames, whose keys are the column names);
    ``li_station_names`` lists the station keys to compare.
    """
    columns_per_station = [list(dict_stations[name].keys()) for name in li_station_names]
    return find_intersection(lists=columns_per_station)

def select_by(df,y,m=None,d=None):
    """Return the rows of ``df`` whose datetime index matches the given filters.

    Parameters
    ----------
    df : DataFrame or Series with a DatetimeIndex.
    y : iterable of years to keep (required).
    m : optional iterable of months (1-12); ``None`` keeps every month.
    d : optional iterable of days of month; ``None`` keeps every day.

    All supplied filters are combined with a logical AND.
    """
    mask = df.index.year.isin(y)
    # ``is not None`` (instead of ``== None``) so that array-like filters such
    # as numpy arrays do not trigger an ambiguous truth-value error.
    if m is not None:
        mask = mask & df.index.month.isin(m)
    if d is not None:
        mask = mask & df.index.day.isin(d)
    return df[mask]

def show_prediction_plot(df_y, dict_predictions=None, li_specific_cols = None, title=""):
    """Plot observed values together with optional predictions.

    Parameters
    ----------
    df_y : observed target values; copied, never modified.
    dict_predictions : optional mapping ``column name -> predictions``; every
        entry is added as a column next to the observed values.
    li_specific_cols : optional list of columns to keep in the plot.
    title : plot title forwarded to the plotting helper.

    Returns whatever ``model_fun.create_prediction_plot`` returns.
    """
    df_result = df_y.copy()

    # ``is not None`` so that array-like arguments are not compared element-wise.
    if dict_predictions is not None:
        for key, prediction in dict_predictions.items():
            df_result[key] = prediction
    if li_specific_cols is not None:
        df_result = df_result[li_specific_cols]
    return model_fun.create_prediction_plot(df=df_result,add_dots=True,period="M",title=title)
def save_dict_to_json(dictionary, file_path):
    """Append ``dictionary`` as one JSON object to the log file ``file_path``.

    The first entry is written as ``{...}`` followed by a newline; every later
    entry is prefixed with a comma. The file therefore holds comma-separated
    objects and has to be wrapped in ``[`` ... ``]`` before it can be parsed as
    a single JSON document.

    I/O errors are reported on stdout and not re-raised.
    """
    try:
        with open(file_path, 'a+') as handle:
            # Rewind to find out whether the file already holds an entry.
            handle.seek(0)
            already_has_entries = bool(handle.read())
            if already_has_entries:
                handle.seek(0, 2)  # return to the end of the file before writing
                handle.write(',')
            json.dump(dictionary, handle)
            handle.write('\n')
    except IOError as e:
        print(f"Error writing to JSON file: {str(e)}")
def saveDF(df, exp_desc, filename="UNDEFINED"):
    """Store ``df`` as a time-stamped CSV and log its description.

    The CSV is written to ``../datasets/experiments_results/`` under the name
    ``<unix timestamp>_<filename>.csv``; the pair ``{csv name: exp_desc}`` is
    then appended to ``exp_description.json`` in the same directory.
    """
    current_timestamp = int(time.time())
    csv_name = str(current_timestamp)+'_'+filename+'.csv'
    df.to_csv(Path('../datasets/experiments_results/'+csv_name))
    save_dict_to_json({csv_name: exp_desc}, "../datasets/experiments_results/exp_description.json")


### Read data per measurement station

In [None]:
PATH_DATA = '../datasets/data_per_station'


# Station key -> numeric id, passed to model_fun.read_all.
DICT_DF_STATIONS_ID = {'d':1,'w':2,'s':3,'n':4,'e':5,'z':6}
# One empty placeholder frame per station; replaced by read_all's return value.
DICT_DF_STATIONS = {station_key: pd.DataFrame() for station_key in DICT_DF_STATIONS_ID}
DICT_DF_STATIONS = model_fun.read_all(path=PATH_DATA,DICT_DF_STATIONS=DICT_DF_STATIONS,DICT_DF_STATIONS_ID=DICT_DF_STATIONS_ID,use_lags=True)
ALL_POLLUTANTS = ["pm10","nox","no","no2","pm2.5","pm1","o3"]

# drop features with a high number of NaN which are non-imputeable
# errors="ignore": each guard only tests a single column, so a column that is
# absent from the frame being modified must not abort the cell with a KeyError.
if "windsp" in DICT_DF_STATIONS["e"].columns:
    DICT_DF_STATIONS["e"] = DICT_DF_STATIONS["e"].drop(columns=["windDirDeg","windsp","windPeak","windDirClass"],errors="ignore")
# NOTE(review): the guard inspects station "e" but the column is removed from
# station "n" - confirm which station is intended.
if "radiation" in DICT_DF_STATIONS["e"].columns:
    DICT_DF_STATIONS["n"] = DICT_DF_STATIONS["n"].drop(columns=["radiation"],errors="ignore")


DICT_DF_STATIONS = model_fun.use_traffic_data(dict_stations=DICT_DF_STATIONS, use_traffic_bins=False, use_traffic_continous=False)
DICT_DF_STATIONS = model_fun.use_traffic_lags(dict_stations=DICT_DF_STATIONS, only_use_lags=False, use_traffic_lags=False)

# Drop few NaN values found in pre-processing
for station in DICT_DF_STATIONS:
    DICT_DF_STATIONS[station] = DICT_DF_STATIONS[station].dropna()

### Select Features according to EDA
- to get a baseline with pm10lags, set use_pm10_lag=True
- to encode cyclic nature of dayOfYear in sine, set dayOfYear_sine_transform = True

In [None]:
# --- Feature-selection switches -------------------------------------------------
use_categorical = True
use_degree = False
use_zagreb_year = False


use_traffic_continous = False
use_traffic_bins = False


use_traffic_lags = False
only_use_lags = False

use_pm10_lags = False

delete_holiday_features = False

# Set to a year (e.g. 2020) to remove it from every station; None keeps all years.
exclude_year = None

exclude_years_zagreb = True

dayOfYear_sine_transform = False

###################################################################################################################################################

if exclude_year is not None:
    for station in DICT_DF_STATIONS:
        df_temp = DICT_DF_STATIONS[station]
        DICT_DF_STATIONS[station] = df_temp[df_temp.index.year != exclude_year]
        print(set(DICT_DF_STATIONS[station].index.year))

# to exclude 2009,2010,2011,2012,2013
if exclude_years_zagreb:
    for year in [2009,2010,2011,2012,2013]:
        df_temp = DICT_DF_STATIONS["z"]
        DICT_DF_STATIONS["z"] = df_temp[df_temp.index.year != year]
        print("Station Zagreb: delete year: "+ str(year))

# Column-removal rules, applied top to bottom:
#   (rule active?, stations to visit, columns to drop, message prefix)
# "{}" in the prefix is replaced by the column name. Within a rule the stations
# are visited in order and, per station, the columns in order, so the printed
# log matches the sequence of the individual switches above.
li_all_stations = list(DICT_DF_STATIONS)
li_drop_rules = [
    (not use_categorical,       li_all_stations, ["windDirClass"], "DELETE [{}] done for station '"),
    (not use_degree,            li_all_stations, ["windDirDeg"],   "DELETE [{}] done for station '"),
    (not use_zagreb_year,       ["z"],           ["year"],         "DELETE [{}] done for station '"),
    (not use_traffic_continous, li_all_stations, ["traffic"],      "DELETE [{}] done for station '"),
    (not use_traffic_bins,      li_all_stations, ["trafficClass"], "DELETE [{}] done for station '"),
    (only_use_lags,             li_all_stations, ["traffic"],      "DELETE [{}] done for station '"),
    (not use_traffic_lags,      li_all_stations, ["trafficLag1","trafficLag2","trafficLag3","trafficLag4"], "DELETE{}done for station '"),
    (not use_pm10_lags,         li_all_stations, ["pm10Lag"],      "DELETE done for station '"),
    (delete_holiday_features,   li_all_stations, ["holiday","dayBeforeHoliday","dayAfterHoliday"], "DELETE [{}] done for station '"),
]

for rule_active, li_rule_stations, li_rule_columns, message_prefix in li_drop_rules:
    if not rule_active:
        continue
    for station in li_rule_stations:
        for column in li_rule_columns:
            # Columns that are not present are skipped silently.
            if column in DICT_DF_STATIONS[station].columns:
                DICT_DF_STATIONS[station] = DICT_DF_STATIONS[station].drop(columns=column)
                print(message_prefix.format(column)+fun.get_station_name_by_indice(station)+"'")

# Function to get the day of the year dynamically considering leap years
def get_day_of_year_adjusted(date):
    """Return ``(day_of_year, days_in_year)`` for ``date``.

    ``day_of_year`` is 1-based (1 January -> 1); ``days_in_year`` is 366 for
    leap years and 365 otherwise.
    """
    year_length = 365 + int(calendar.isleap(date.year))
    return date.timetuple().tm_yday, year_length

# Replace the "dayOfYear" column by sin(2*pi*day/days_in_year) so that the end
# of one year lies next to the start of the following year.
# NOTE(review): a sine on its own maps two different days of a year to the same
# value; adding the matching cosine column would make the encoding unique.
if dayOfYear_sine_transform:
    for station in DICT_DF_STATIONS:
        li_day_of_year_sine = []
        for date in DICT_DF_STATIONS[station].index.to_pydatetime():
            # One helper call per timestamp yields both the day and the year length.
            day_of_year, days_in_year = get_day_of_year_adjusted(date)
            li_day_of_year_sine.append(np.sin(2 * np.pi * day_of_year / days_in_year))
        DICT_DF_STATIONS[station]["dayOfYear"] = li_day_of_year_sine

### Impute RH and Temp with TL from stations West,North,DB,South to station East
- only done if new_TL_imputations_required = True

In [72]:
# Set to True to recompute the transfer-learning imputations for "temp" and "rh".
new_TL_imputations_required = False

if new_TL_imputations_required:
    DICT_METRICS = {}


    li_all_station_graz = ['w', 'n', 's' ,'e']

    dict_predictions = {}

    liFeaturesToImpute = ["temp","rh"]

    # Station "e" is the station whose values are imputed: it is used as the
    # test station of the split below, so it is the target and every other
    # listed station is a source.
    target = "e"
    li_source = [x for x in li_all_station_graz if x != target]

    print(str([fun.get_station_name_by_indice(station) for station in li_source]) +" -> "+fun.get_station_name_by_indice(target))

    # The global frame and the train/test split do not depend on the feature
    # being imputed, so they are built once instead of once per feature.
    df_global = model_fun.create_global_model(dict_stations=DICT_DF_STATIONS,li_station_combination=li_all_station_graz)

    df_s_train_X, df_s_train_y, df_t_test_X, df_t_test_y, _, _, dict_info = model_fun.split_train_test_by_station(
        df=df_global,
        li_stations=li_all_station_graz,
        pollutant_to_predict="pm10",
        station_test=target,
        station_validation=None,
        use_validation = False,
        output=True)

    # Predictors: every feature except the two that are imputed, plus pm10.
    X_TRAIN = df_s_train_X.drop(columns=["temp", "rh"], errors="ignore")
    X_TRAIN["pm10"] = df_s_train_y

    X_TEST = df_t_test_X.drop(columns=["temp", "rh"], errors="ignore")
    X_TEST["pm10"] = df_t_test_y

    # Target-station rows of these years are handed to TrAdaBoostR2 as labelled target samples.
    years = [2018,2019,2020]
    Xt_samples_for_TL = select_by(df=X_TEST,y=years)

    print("Years",str(years))
    print("No. of injections: ", len(Xt_samples_for_TL))
    no_injections = len(Xt_samples_for_TL)

    for featureToImpute in liFeaturesToImpute:

        Y_TRAIN = df_s_train_X.loc[:, df_s_train_X.columns == featureToImpute]
        Y_TEST = df_t_test_X.loc[:, df_t_test_X.columns == featureToImpute]

        yt_samples_for_TL = select_by(df=Y_TEST,y=years)

        # Random Forest
        regr = RandomForestRegressor(random_state=42,n_estimators=180,min_samples_split=5, min_samples_leaf=2, max_features="sqrt", max_depth=60, bootstrap=True)
        regr.fit(X=X_TRAIN, y=Y_TRAIN.values.ravel())
        predictions_RF = regr.predict(X_TEST)

        # Random Forest with TrAdaBoostR2
        model = TrAdaBoostR2(regr, Xt=Xt_samples_for_TL, yt=yt_samples_for_TL,n_estimators=20, random_state=0,verbose=0,lr=2)
        model.fit(X_TRAIN, Y_TRAIN)
        predictions_RF_TL = model.predict(X_TEST)



        #dict_predictions["Oo_DG_"+featureToImpute] = predictions_RF
        dict_predictions["TrABR2_"+featureToImpute] = predictions_RF_TL.flatten()



    # only uncomment if new TL computations are required! 
    #df = pd.DataFrame(data=dict_predictions,index=X_TEST.index)
    #df.to_csv("../datasets/MissingValImputeTL.csv")

### Read TL imputed data and combine with dataset 

In [None]:
PATH_DATA = '../datasets/data_per_station_TL_impute_station_east'


# Station key -> numeric id, passed to model_fun.read_all.
DICT_DF_STATIONS_ID = {'d':1,'w':2,'s':3,'n':4,'e':5,'z':6}
# One empty placeholder frame per station; replaced by read_all's return value.
DICT_DF_STATIONS = {station_key: pd.DataFrame() for station_key in DICT_DF_STATIONS_ID}
DICT_DF_STATIONS = model_fun.read_all(path=PATH_DATA,DICT_DF_STATIONS=DICT_DF_STATIONS,DICT_DF_STATIONS_ID=DICT_DF_STATIONS_ID,use_lags=True)
ALL_POLLUTANTS = ["pm10","nox","no","no2","pm2.5","pm1","o3"]

# drop features with a high number of NaN which are non-imputeable
# errors="ignore": each guard only tests a single column, so a column that is
# absent from the frame being modified must not abort the cell with a KeyError.
if "windsp" in DICT_DF_STATIONS["e"].columns:
    DICT_DF_STATIONS["e"] = DICT_DF_STATIONS["e"].drop(columns=["windDirDeg","windsp","windPeak","windDirClass"],errors="ignore")
# NOTE(review): the guard inspects station "e" but the column is removed from
# station "n" - confirm which station is intended.
if "radiation" in DICT_DF_STATIONS["e"].columns:
    DICT_DF_STATIONS["n"] = DICT_DF_STATIONS["n"].drop(columns=["radiation"],errors="ignore")


# Predictions written by the TL imputation cell (columns TrABR2_temp / TrABR2_rh).
dfTLImputed_RH_temp = pd.read_csv("../datasets/MissingValImputeTL.csv",index_col=0, parse_dates=True)


def set_value_if_none(row):
    """Return the fallback value when the measured value is the marker ``-99``.

    ``row`` holds exactly two entries ordered ``(fallback, measured)``. It may
    be a pandas Series (as passed by ``DataFrame.apply(..., axis=1)``) or any
    plain sequence. The measured value is returned unless it equals ``-99``,
    in which case the fallback is returned.
    """
    # Positional access: integer keys on a label-indexed Series are deprecated
    # in recent pandas releases, so ``.iloc`` is used whenever it is available.
    values = row.iloc if hasattr(row, "iloc") else row
    fallback, measured = values[0], values[1]
    return fallback if measured == -99 else measured
    

# Attach the TL predictions (columns TrABR2_temp / TrABR2_rh) to station "e".
DICT_DF_STATIONS["e"] = DICT_DF_STATIONS["e"].combine_first(dfTLImputed_RH_temp)


# Wherever a measured value equals -99 (the marker tested by
# set_value_if_none), take the prediction for the SAME feature instead:
# "temp" is filled from TrABR2_temp and "rh" from TrABR2_rh.
for feature in ["temp", "rh"]:
    DICT_DF_STATIONS['e'][feature] = DICT_DF_STATIONS['e'][["TrABR2_"+feature, feature]].apply(set_value_if_none,axis=1)


# The helper columns are no longer needed once the values are merged.
DICT_DF_STATIONS['e'] = DICT_DF_STATIONS['e'].drop(columns=["TrABR2_rh","TrABR2_temp"])


# Drop few NaN values found in pre-processing
for station in DICT_DF_STATIONS:
    DICT_DF_STATIONS[station] = DICT_DF_STATIONS[station].dropna()



### Select Features according to EDA (repeated for the TL-imputed dataset)

In [None]:
# --- Feature-selection switches -------------------------------------------------
use_categorical = True
use_degree = False
use_zagreb_year = False


use_traffic_continous = False
use_traffic_bins = False


use_traffic_lags = False
only_use_lags = False

use_pm10_lags = False

delete_holiday_features = False

# Set to a year (e.g. 2020) to remove it from every station; None keeps all years.
exclude_year = None

exclude_years_zagreb = True

dayOfYear_sine_transform = False

###################################################################################################################################################

if exclude_year is not None:
    for station in DICT_DF_STATIONS:
        df_temp = DICT_DF_STATIONS[station]
        DICT_DF_STATIONS[station] = df_temp[df_temp.index.year != exclude_year]
        print(set(DICT_DF_STATIONS[station].index.year))

# to exclude 2009,2010,2011,2012,2013
if exclude_years_zagreb:
    for year in [2009,2010,2011,2012,2013]:
        df_temp = DICT_DF_STATIONS["z"]
        DICT_DF_STATIONS["z"] = df_temp[df_temp.index.year != year]
        print("Station Zagreb: delete year: "+ str(year))

# Column-removal rules, applied top to bottom:
#   (rule active?, stations to visit, columns to drop, message prefix)
# "{}" in the prefix is replaced by the column name. Within a rule the stations
# are visited in order and, per station, the columns in order, so the printed
# log matches the sequence of the individual switches above.
li_all_stations = list(DICT_DF_STATIONS)
li_drop_rules = [
    (not use_categorical,       li_all_stations, ["windDirClass"], "DELETE [{}] done for station '"),
    (not use_degree,            li_all_stations, ["windDirDeg"],   "DELETE [{}] done for station '"),
    (not use_zagreb_year,       ["z"],           ["year"],         "DELETE [{}] done for station '"),
    (not use_traffic_continous, li_all_stations, ["traffic"],      "DELETE [{}] done for station '"),
    (not use_traffic_bins,      li_all_stations, ["trafficClass"], "DELETE [{}] done for station '"),
    (only_use_lags,             li_all_stations, ["traffic"],      "DELETE [{}] done for station '"),
    (not use_traffic_lags,      li_all_stations, ["trafficLag1","trafficLag2","trafficLag3","trafficLag4"], "DELETE{}done for station '"),
    (not use_pm10_lags,         li_all_stations, ["pm10Lag"],      "DELETE done for station '"),
    (delete_holiday_features,   li_all_stations, ["holiday","dayBeforeHoliday","dayAfterHoliday"], "DELETE [{}] done for station '"),
]

for rule_active, li_rule_stations, li_rule_columns, message_prefix in li_drop_rules:
    if not rule_active:
        continue
    for station in li_rule_stations:
        for column in li_rule_columns:
            # Columns that are not present are skipped silently.
            if column in DICT_DF_STATIONS[station].columns:
                DICT_DF_STATIONS[station] = DICT_DF_STATIONS[station].drop(columns=column)
                print(message_prefix.format(column)+fun.get_station_name_by_indice(station)+"'")

# Function to get the day of the year dynamically considering leap years
def get_day_of_year_adjusted(date):
    """Return ``(day_of_year, days_in_year)`` for ``date``.

    ``day_of_year`` is 1-based (1 January -> 1); ``days_in_year`` is 366 for
    leap years and 365 otherwise.
    """
    year_length = 365 + int(calendar.isleap(date.year))
    return date.timetuple().tm_yday, year_length

# Replace the "dayOfYear" column by sin(2*pi*day/days_in_year) so that the end
# of one year lies next to the start of the following year.
# NOTE(review): a sine on its own maps two different days of a year to the same
# value; adding the matching cosine column would make the encoding unique.
if dayOfYear_sine_transform:
    for station in DICT_DF_STATIONS:
        li_day_of_year_sine = []
        for date in DICT_DF_STATIONS[station].index.to_pydatetime():
            # One helper call per timestamp yields both the day and the year length.
            day_of_year, days_in_year = get_day_of_year_adjusted(date)
            li_day_of_year_sine.append(np.sin(2 * np.pi * day_of_year / days_in_year))
        DICT_DF_STATIONS[station]["dayOfYear"] = li_day_of_year_sine

### Station level OODG (e->z) using newly imputed missing values

In [None]:
DICT_METRICS = {}

# Each entry is [source station, target station]: train on source, test on target.
li_combinations = [["e","z"]]

dict_predictions = {}

df_results = pd.DataFrame()

for combination in li_combinations:

    source = combination[0]
    target = combination[1]

    print(fun.get_station_name_by_indice(source)+" -> "+fun.get_station_name_by_indice(target))

    # split_ratio=1.0 is used for both stations; only the train part of the
    # source and the test part of the target are kept.
    df_s_train_X, df_s_train_y, _, _, dict_s_info = fun.split_train_test_by_year(df=DICT_DF_STATIONS[source],pollutant_to_predict="pm10",split_ratio=1.0,output=False)
    _, _, df_t_test_X, df_t_test_y, dict_t_info = fun.split_train_test_by_year(df=DICT_DF_STATIONS[target],pollutant_to_predict="pm10",split_ratio=1.0,output=False)

    # only consider equal features
    li_columns = get_colname_intersection(dict_stations=DICT_DF_STATIONS,li_station_names=[source,target])

    # Sort the shared, non-pollutant columns: the intersection may come back
    # in set order, which differs between kernel sessions, and the column
    # order can change the forest that is fitted (max_features="sqrt").
    li_columns = sorted(x for x in li_columns if x not in ALL_POLLUTANTS)
    df_s_train_X = df_s_train_X[li_columns]
    df_t_test_X =  df_t_test_X[li_columns]


    # Random Forest 
    regr = RandomForestRegressor(random_state=42,n_estimators=180,min_samples_split=5, min_samples_leaf=2, max_features="sqrt", max_depth=60, bootstrap=True)    
    regr.fit(X=df_s_train_X, y=df_s_train_y.values.ravel())
    predictions_RF = regr.predict(df_t_test_X)

    # Stores (rmse, nrmse, mape) under DICT_METRICS["RF_pred"].
    calc_kpi(pred_result=predictions_RF,test_X=df_t_test_X,test_y=df_t_test_y,title="RF_pred",output=False)

    dict_predictions["("+source+"->"+target+")"] = predictions_RF

    df_results["("+source+"->"+target+")"] = pd.DataFrame(DICT_METRICS, index=['rmse', 'nrmse', 'mape'])
    display(df_results)


show_prediction_plot(df_y=df_t_test_y,dict_predictions=dict_predictions)

### Station-level TL 

In [None]:
DICT_METRICS = {}

# Each entry is [source station, target station].
li_combinations = [["e","z"]]

for combination in li_combinations:


    dict_predictions = {}

    source = combination[0]
    target = combination[1]

    print(fun.get_station_name_by_indice(source)+" -> "+fun.get_station_name_by_indice(target))

    df_s_train_X, df_s_train_y, _, _, dict_s_info = fun.split_train_test_by_year(df=DICT_DF_STATIONS[source],pollutant_to_predict="pm10",split_ratio=1.0,output=False)
    _, _, df_t_test_X, df_t_test_y, dict_t_info = fun.split_train_test_by_year(df=DICT_DF_STATIONS[target],pollutant_to_predict="pm10",split_ratio=1.0,output=False)

    # only consider equal features
    li_columns = get_colname_intersection(dict_stations=DICT_DF_STATIONS,li_station_names=[source,target])

    # Sort the shared, non-pollutant columns: the intersection may come back
    # in set order, which differs between kernel sessions, and the column
    # order can change the forest that is fitted (max_features="sqrt").
    li_columns = sorted(x for x in li_columns if x not in ALL_POLLUTANTS)
    df_s_train_X = df_s_train_X[li_columns]
    df_t_test_X =  df_t_test_X[li_columns]

    # Labelled target samples handed to TrAdaBoostR2: January and July of the listed years.
    # NOTE(review): these rows are taken from the target test set and stay part
    # of the data the KPIs below are computed on - confirm this is intended.
    years = [2014,2015,2016,2017,2018,2019,2020]
    months = [1,7]

    yt_samples_for_TL = select_by(df=df_t_test_y,y=years,m=months)
    Xt_samples_for_TL = select_by(df=df_t_test_X,y=years,m=months)

    print("No. of injections: ", len(Xt_samples_for_TL))

    # Random Forest 
    regr = RandomForestRegressor(random_state=42,n_estimators=180,min_samples_split=5, min_samples_leaf=2, max_features="sqrt", max_depth=60, bootstrap=True)    
    regr.fit(X=df_s_train_X, y=df_s_train_y.values.ravel())
    predictions_RF = regr.predict(df_t_test_X)

    # Random Forest with TrAdaBoostR2
    model_ada_RF = TrAdaBoostR2(regr, Xt=Xt_samples_for_TL, yt=yt_samples_for_TL,n_estimators=20, random_state=0,verbose=0,lr=1)
    model_ada_RF.fit(df_s_train_X, df_s_train_y)
    predictions_RF_TL = model_ada_RF.predict(df_t_test_X)



    # Each call stores (rmse, nrmse, mape) in DICT_METRICS under its title.
    calc_kpi(pred_result=predictions_RF,test_X=df_t_test_X,test_y=df_t_test_y,title="RF_pred",output=False)
    calc_kpi(pred_result=predictions_RF_TL,test_X=df_t_test_X,test_y=df_t_test_y,title="RF_ada_pred",output=False)


    dict_predictions["OoDG_pred"] = predictions_RF
    dict_predictions["TrAdaBR2_pred"] = predictions_RF_TL



    df_results = pd.DataFrame(DICT_METRICS, index=['rmse', 'nrmse', 'mape'])
    display(df_results)

#dict_predictions

show_prediction_plot(df_y=df_t_test_y,dict_predictions=dict_predictions)

### City-level OODG

In [None]:
from tensorflow.keras.optimizers.legacy import Adam

DICT_METRICS = {}

# Station combinations to evaluate; "z" is the test station in each of them
# and the remaining stations provide the training data.
li_all_station_graz = [['z','n', 'e', 's', 'w', 'd'],['z','e','s', 'w', 'd'],['z','n','s','w']]
#li_all_station_graz = ['z','n','d']
df_results = pd.DataFrame()

dict_predictions = {}


for li_combinations in li_all_station_graz:

    target =  "z"
    li_source = [x for x in li_combinations if x!="z"]

    print(str([fun.get_station_name_by_indice(station) for station in li_source]) +" -> "+fun.get_station_name_by_indice(target))

    # One frame holding all stations of the combination, then split by station.
    df_global = model_fun.create_global_model(dict_stations=DICT_DF_STATIONS,li_station_combination=li_combinations)

    df_s_train_X, df_s_train_y, df_t_test_X, df_t_test_y, _, _, dict_info = model_fun.split_train_test_by_station(
        df=df_global,
        li_stations=li_combinations,
        pollutant_to_predict="pm10",
        station_test=target,
        station_validation=None,
        use_validation = False,
        output=True,
    )

    # Random Forest
    regr = RandomForestRegressor(
        random_state=42,
        n_estimators=180,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features="sqrt",
        max_depth=60,
        bootstrap=True,
    )
    regr.fit(X=df_s_train_X, y=df_s_train_y.values.ravel())
    predictions_RF = regr.predict(df_t_test_X)

    # Overwrites DICT_METRICS["RF_pred"] with the metrics of this combination.
    calc_kpi(pred_result=predictions_RF,test_X=df_t_test_X,test_y=df_t_test_y,title="RF_pred",output=False)

    # Result column / prediction key, e.g. "['n', 's', 'w'] -> z".
    strUniqueKey = str(li_source) +" -> "+target
    dict_predictions[strUniqueKey] = predictions_RF

    df_results[strUniqueKey] = pd.DataFrame(DICT_METRICS, index=['rmse', 'nrmse', 'mape'])
    display(df_results)



show_prediction_plot(df_y=df_t_test_y,dict_predictions=dict_predictions)

### City-level TL

In [None]:
from tensorflow.keras.optimizers.legacy import Adam

DICT_METRICS = {}


# All stations of the experiment; "z" is the test station, the others are sources.
li_all_station_graz = ['z','w', 'n', 's' ,'e','d']

dict_predictions = {}


target =  "z"
li_source = [x for x in li_all_station_graz if x!="z"]

print(str([fun.get_station_name_by_indice(station) for station in li_source]) +" -> "+fun.get_station_name_by_indice(target))


df_global = model_fun.create_global_model(dict_stations=DICT_DF_STATIONS,li_station_combination=li_all_station_graz)
df_s_train_X, df_s_train_y, df_t_test_X, df_t_test_y, _, _, dict_info = model_fun.split_train_test_by_station(
    df=df_global,
    li_stations=li_all_station_graz,
    pollutant_to_predict="pm10",
    station_test=target,
    station_validation=None,
    use_validation = False,
    output=True,
)


# Labelled target samples handed to TrAdaBoostR2: January and July of the listed years.
years = [2014,2015,2016,2017,2018,2019,2020]
months = [1,7]

Xt_samples_for_TL = select_by(df=df_t_test_X,y=years,m=months)
yt_samples_for_TL = select_by(df=df_t_test_y,y=years,m=months)

no_injections = len(Xt_samples_for_TL)
print("Years",str(years))
print("No. of injections: ", len(Xt_samples_for_TL))

# Random Forest
regr = RandomForestRegressor(
    random_state=42,
    n_estimators=180,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features="sqrt",
    max_depth=60,
    bootstrap=True,
)
regr.fit(X=df_s_train_X, y=df_s_train_y.values.ravel())
predictions_RF = regr.predict(df_t_test_X)


# Random Forest with TrAdaBoostR2
model = TrAdaBoostR2(regr, Xt=Xt_samples_for_TL, yt=yt_samples_for_TL,n_estimators=20, random_state=0,verbose=0,lr=2)
model.fit(df_s_train_X, df_s_train_y)
predictions_RF_TL = model.predict(df_t_test_X)


# Each call stores (rmse, nrmse, mape) in DICT_METRICS under its title.
calc_kpi(pred_result=predictions_RF,test_X=df_t_test_X,test_y=df_t_test_y,title="RF_pred",output=False)
calc_kpi(pred_result=predictions_RF_TL,test_X=df_t_test_X,test_y=df_t_test_y,title="RF_ada_pred",output=False)


dict_predictions["Oo_DG"] = predictions_RF
dict_predictions["TrABR2"] = predictions_RF_TL
# dict_predictions["CORAL_pred"] = predictions_coral_RF
# dict_predictions["NNW_pred"] = predictions_NN_RF
# dict_predictions["KLIEP_pred"] = predictions_kliep_RF


df_results = pd.DataFrame(DICT_METRICS, index=['rmse', 'nrmse', 'mape'])
display(df_results)



#show_prediction_plot(df_y=df_t_test_y,dict_predictions=dict_predictions)