In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import os
import csv

from datetime import datetime
from datetime import timedelta

#machine learning preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from pandas.api.types import is_object_dtype, is_numeric_dtype


In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and
# SettingWithCopy warnings raised by the cells below - consider narrowing it
# to specific warning categories.
warnings.filterwarnings("ignore")

In [None]:
def rmse(y, yhat):
    """Return the square root of the mean squared error between ``y`` and ``yhat``."""
    squared_error = mean_squared_error(y, yhat)
    return np.sqrt(squared_error)

def dtype_as_category(df):
    """Convert every object-typed column of ``df`` to the pandas ``category`` dtype.

    The frame is modified in place and also returned, so the call can be used
    as ``df = dtype_as_category(df)``.
    """
    object_columns = [name for name in df.columns if is_object_dtype(df[name])]

    for name in object_columns:
        df[name] = df[name].astype('category')

    return df

## 5 Correct biases and predictions (normal models)
### 5.1 Create the dataframes

In [None]:
# Directory names of the data sources; they are joined with file names below,
# so they resolve relative to the notebook's working directory.
g_dir: str = "g_drive_data"
ip_dir: str = "ip_forecast_data"
main_dir: str = "main_data"

In [None]:
# Load the main dataset from the CSV file inside main_dir
main_csv_path = os.path.join(main_dir, "df_main_interpolated_fe.csv")
df = pd.read_csv(main_csv_path)


In [None]:
# Integer codes for the textual "position" labels
pos_mapping : dict = {
    "front" : 0,
    "back" : 1,
}

# Only map while the column still holds the raw labels. Re-running this cell on an
# already encoded (numeric) column would otherwise turn every value into NaN,
# because 0 and 1 are not keys of pos_mapping.
if not is_numeric_dtype(df["position"]):
    df["position"] = df["position"].map(pos_mapping)

# Convert the remaining object-typed columns to pandas categoricals (modifies df in place)
df = dtype_as_category(df)

In [None]:
#define the relevant columns for dataframes

# columns shared by every forecast-offset frame
relevant_all : list = [
    "wt_id", "power_kw",
    "day_sin", "day_cos",
    "year_sin", "year_cos",
    "position", "month",
]

# Offset-specific columns are selected by a substring match on the column name;
# columns containing "deviation" are excluded up front.
# NOTE(review): this is a plain substring test, so "3" also selects any column
# whose name merely contains the character 3 - confirm against the real schema.
non_deviation_columns : list = [name for name in df.columns.tolist() if "deviation" not in name]

relevant_3 : list       = [name for name in non_deviation_columns if "3" in name]
relevant_6 : list       = [name for name in non_deviation_columns if "6" in name]
relevant_12 : list      = [name for name in non_deviation_columns if "12" in name]

In [None]:
# One frame per forecast offset. ``.copy()`` turns each selection into an independent
# frame, so the in-place column/row drops performed in later cells neither depend on
# ``df`` nor trigger pandas' SettingWithCopyWarning.
df_3        = df[relevant_all + relevant_3].copy()
df_6        = df[relevant_all + relevant_6].copy()
df_12       = df[relevant_all + relevant_12].copy()

df_3.columns.to_list()

In [None]:
#create a validation set for later use

In [None]:
#creat dfs with raw forecast data

def get_raw_df(df : pd.DataFrame, g_dir : str = g_dir, t : int = None) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``init_<t>`` timestamp occurs in the forecast file.

    Parameters
    ----------
    df : DataFrame that contains a column named ``init_<t>``.
    g_dir : directory that holds ``forecasts_temp.csv``; the file must provide
        an ``init`` column. The default is bound when this function is defined.
    t : forecast offset used to build the column name ``init_<t>``. Although it
        defaults to None, a real value is required (``init_None`` is looked up
        otherwise).

    Returns
    -------
    An independent copy of the matching rows, so callers can modify the result
    in place without touching ``df``. The shape of the result is printed.
    """

    init_col = f"init_{t}"

    relevant_timestamps = pd.read_csv(os.path.join(g_dir, "forecasts_temp.csv"))

    # both sides are compared as timezone-naive datetimes
    data_points = pd.to_datetime(relevant_timestamps["init"]).dt.tz_localize(None)
    init_times = pd.to_datetime(df[init_col]).dt.tz_localize(None)

    # .copy(): the caller drops columns/rows of the result in place later on
    df_raw = df[init_times.isin(data_points)].copy()

    print(f"df_{t}_raw:\t{df_raw.shape}")
    return df_raw

df_3_raw    = get_raw_df(df_3, g_dir = g_dir, t = 3)
df_6_raw    = get_raw_df(df_6, g_dir = g_dir, t = 6)
df_12_raw   = get_raw_df(df_12, g_dir = g_dir, t = 12)


def _drop_init_columns(frame):
    """Return a new frame without the columns whose name contains "init"."""
    init_cols = [name for name in frame.columns.tolist() if "init" in name]
    return frame.drop(columns = init_cols)


# The init timestamps were only needed for the filter above. The names are re-bound
# to new frames instead of dropping in place: the frames are selections of other
# frames, and in-place edits on such selections raise SettingWithCopyWarning.
df_3, df_6, df_12 = (
    _drop_init_columns(frame) for frame in (df_3, df_6, df_12)
)
df_3_raw, df_6_raw, df_12_raw = (
    _drop_init_columns(frame) for frame in (df_3_raw, df_6_raw, df_12_raw)
)

### 5.2 Define a ML model
For a first try the random forest regressor is used

### 5.3 Train a ML model with

Question: Should the error of the machine learning model be measured and optimised for mean deviation, MSE, or RMSE?

In [None]:
#benchmark metrics used to judge the predictions:
#error of the forecast column power_<offset>.00 against the measured power_kw

bench_mark : dict = {
    "fc_range" : [],
    "type" : [],
    "mse" : [],
    "rmse" : [],
}

bench_frames : list     = [df_3, df_6, df_12, df_3_raw, df_6_raw, df_12_raw]
bench_types : list      = ["ip", "ip", "ip", "raw", "raw", "raw"]
bench_ranges : list     = ["3", "6", "12", "3", "6", "12"]

# The loop variables deliberately avoid the names ``df`` and ``type``: re-using them
# would overwrite the main dataframe and shadow the builtin for all later cells.
for frame, fc_type, fc in zip(bench_frames, bench_types, bench_ranges):

    # y_true first, y_pred second (sklearn convention; the MSE value is symmetric)
    mse_bench : float       = mean_squared_error(frame["power_kw"], frame[f"power_{fc}.00"])
    rmse_bench : float      = np.sqrt(mse_bench)

    bench_mark["fc_range"].append(fc)
    bench_mark["type"].append(fc_type)
    bench_mark["mse"].append(mse_bench)
    bench_mark["rmse"].append(rmse_bench)

df_bench = pd.DataFrame(bench_mark)

In [None]:
#framework for autmated training (6 models)

class RFR(): #Random Forest Regressor
    """Hold-out training harness around sklearn's ``RandomForestRegressor``.

    On construction the frame is split into a training and a validation part.
    ``train`` fits one forest per (n_estimators, min_samples_leaf) combination,
    scores it on the validation part and appends one row per fitted model to
    the CSV file ``RFR.log_file``.
    """

    # CSV file (relative path) that all instances append their results to
    log_file : str = "random_forest_results.csv"

    # column order of the log file
    _log_header : tuple = ("offset", "raw", "n_estimators", "n_leafs", "random_state", "mse", "rmse")

    def __init__ (self, df, random_state : int = 42, test_size : float = 0.3, y_col : str = "power_kw", offset : int = None, raw : bool = None, n_jobs = 1) -> None:
        """Store the configuration and split ``df`` into train and validation data.

        df:             frame holding the features and the target column
        random_state:   seed used for the split and for every forest
        test_size:      share of rows that goes into the validation part
        y_col:          name of the target column
        offset, raw:    labels that are only written to the log file
        n_jobs:         forwarded to ``RandomForestRegressor``
        """

        self.df                     = df
        self.y_col : str            = y_col

        self.random_state : int     = random_state
        self.test_size : float      = test_size
        self.n_jobs : int           = n_jobs

        self.offset = offset
        self.raw = raw

        self._split()

    def _split(self) -> None:
        """Split the frame into ``X_train``/``X_valid`` and ``y_train``/``y_valid``."""

        y = self.df[self.y_col]
        x = self.df.drop(labels = self.y_col, axis = 1)

        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(
            x,
            y,
            test_size = self.test_size,
            random_state = self.random_state
        )

    def _log_results(self, estimator : int, leaf : int, mse : float, rmse : float) -> None:
        """Append one result row to ``RFR.log_file``.

        The header row is written first when the file is missing or empty. The
        file is only inspected through the file system, so an existing log is
        never parsed just to find out whether it exists.
        """

        needs_header : bool = (
            not os.path.exists(RFR.log_file)
            or os.path.getsize(RFR.log_file) == 0
        )

        row : list = [self.offset, self.raw, estimator, leaf, self.random_state, mse, rmse]

        # the context manager closes the file even if writing fails
        with open(RFR.log_file, "a", newline = "") as f:
            writer_obj = csv.writer(f)
            if needs_header:
                writer_obj.writerow(RFR._log_header)
            writer_obj.writerow(row)

    @staticmethod
    def _generate_lists(opt_estimator : bool, n_fib : int, n_estimators : int, n_leafs : int) -> tuple:
        """Return ``(estimators, leafs)``, the two parameter lists to iterate over.

        opt_estimator True:  sweep the estimators, keep ``n_leafs`` fixed
        opt_estimator False: sweep the leafs, keep ``n_estimators`` fixed
        opt_estimator None:  no sweep, use ``n_estimators`` and ``n_leafs`` as given

        The swept list starts with [1, 2] and is extended by ``n_fib`` further
        Fibonacci numbers, so it holds ``n_fib + 2`` values.

        Raises TypeError if a sweep is requested while ``n_fib`` is None.
        """

        if opt_estimator is None:
            return [n_estimators], [n_leafs]

        if n_fib is None:
            raise TypeError("n_fib must be an integer when opt_estimator is True or False")

        fib_list : list = [1, 2]
        for _ in range(n_fib):
            fib_list.append(fib_list[-1] + fib_list[-2])

        if opt_estimator:
            return fib_list, [n_leafs]

        return [n_estimators], fib_list

    def train(self, opt_estimator : bool = True, n_fib : int = None, n_estimators : int = 1, n_leafs : int = 1):
        """Fit and score one forest per parameter combination and log every result.

        opt_estimator:  which parameter is swept, see ``_generate_lists``
        n_fib:          number of Fibonacci steps of the sweep (required for a sweep)
        n_estimators:   number of trees while the leafs are swept
        n_leafs:        ``min_samples_leaf`` while the estimators are swept
        """

        estimators, leafs = RFR._generate_lists(
                opt_estimator = opt_estimator,
                n_fib = n_fib,
                n_estimators = n_estimators,
                n_leafs = n_leafs,
        )

        # one model is fitted per (estimator, leaf) pair
        total_itterations : int     = len(estimators) * len(leafs)
        current_itteration : int    = 0

        for estimator in estimators:
            for leaf in leafs:

                #print to see progress
                current_itteration += 1
                progress : float = round(current_itteration/total_itterations,4)
                print(f"progress:\t{progress}", end = "\r")

                # NOTE(review): with bootstrap = False and sklearn's default
                # max_features for regressors every tree is built from the same rows
                # and features, so adding estimators may change little - confirm
                # that this is intended.
                model = RandomForestRegressor(
                    n_estimators    = estimator,
                    bootstrap       = False, #not necessary, because there is enough data
                    oob_score       = False,
                    random_state    = self.random_state,
                    n_jobs = self.n_jobs,
                    min_samples_leaf= leaf,
                )

                #fit model and make predictions
                model.fit(self.X_train, self.y_train)
                y_hat_valid = model.predict(self.X_valid)

                mse_valid = mean_squared_error(self.y_valid, y_hat_valid)
                rmse_valid = np.sqrt(mse_valid)

                self._log_results(estimator = estimator, leaf = leaf, mse = mse_valid, rmse = rmse_valid)

    def cross_validate(self):
        """Placeholder - not implemented yet."""
        pass

    def fitting_graph(self):
        """Placeholder - not implemented yet."""
        pass

In [None]:
#preformance test
import time

def performance_comp():
    """Time the estimator sweep on ``df_3_raw`` with one core and with all cores.

    Prints the elapsed seconds of each run. Both runs go through ``RFR.train``,
    so every fitted model also appends a row to the RFR log file.
    """

    # n_jobs = 1 -> single core, n_jobs = -1 -> all available cores
    for n_jobs in (1, -1):

        # perf_counter is monotonic, so the result cannot be distorted by clock changes
        start = time.perf_counter()

        model_3 = RFR(df = df_3_raw, offset = 3, raw = True, n_jobs = n_jobs)
        model_3.train(opt_estimator = True, n_fib = 10)

        print(f"runtime:\t{int(time.perf_counter() - start)}\t sec")

#performance_comp()

In [None]:
#testing the class

#model_3 = RFR(df = df_3_raw, offset = 3, raw = True)
#model_3.train(opt_estimator = True, n_fib = 10)
#df_3_raw.head()

In [None]:
#create validation dfs for later use
valid_size : float      = 0.1 #fraction of the rows held out per frame
random_state : int      = 42
df_valid_dict : dict    = {} #container for the validation frames, keyed by frame name

# NOTE: this cell is not idempotent - every execution removes another
# ``valid_size`` share of rows from the six frames.
# The loop variable is not called ``df`` so the main dataframe is not overwritten.
for frame, name in zip([df_3,df_3_raw,df_6,df_6_raw,df_12,df_12_raw],
                       "df_3,df_3_raw,df_6,df_6_raw,df_12,df_12_raw".split(",")):

    print (f"{name}\nInitial df size:\t{frame.shape}")

    # sample the hold-out rows with the configured fraction, then remove them
    # from the frame that is used for training
    df_subset : pd.DataFrame = frame.sample(frac = valid_size, replace = False, random_state = random_state)
    frame.drop(index = df_subset.index, inplace = True)

    df_valid_dict[name] = df_subset
    print(f"cropped df size:\t{frame.shape}\nvalidation df size:\t{df_subset.shape}\n")

In [None]:
#runtime ca. 5 hours (?)
#sweeps the number of estimators for the three offsets, once with and once without interpolated data points
#def train(self, opt_estimator : bool = True, n_fib : int = None, n_estimators : int = 1, n_leafs : int = 1)

run_automation : bool = False

if run_automation:

    print("Training with ip data")

    # the loop variable is not called ``df`` so the main dataframe is not overwritten
    for frame, offset in zip([df_3,df_6,df_12],[3,6,12]):

        print(f"{offset} in porgress")
        model = RFR(df = frame, offset = offset, raw = False)
        model.train(opt_estimator = True, n_fib = 10)

    # this loop trains on the raw frames (the message used to say "ip")
    print("Training with raw data")

    for frame, offset in zip([df_3_raw,df_6_raw,df_12_raw],[3,6,12]):

        print(f"{offset} in porgress")
        model = RFR(df = frame, offset = offset, raw = True)
        model.train(opt_estimator = True, n_fib = 10)

In [None]:
# Load everything that has been logged so far: RFR appends one row per fitted
# model to this file, so it accumulates the rows of all previous runs.
df_results : object     = pd.read_csv(RFR.log_file)

In [None]:
#fitting graph: validation RMSE over the number of estimators

# The log file is shared by every sweep. Restrict the plot to the estimator sweep
# (it runs with the default n_leafs = 1), keep one row per configuration and order
# the points along the x-axis; otherwise rows of the leaf sweep and of repeated
# runs are drawn into the same lines.
df_estimator_sweep = (
    df_results[df_results["n_leafs"] == 1]
    .drop_duplicates(subset = ["offset", "raw", "n_estimators"], keep = "last")
    .sort_values(["raw", "offset", "n_estimators"])
)

fig = px.line(
    data_frame = df_estimator_sweep,
    y = "rmse",
    x = "n_estimators",
    color = "offset",
    facet_col = "raw",
    title = "Validation RMSE by number of estimators",
)
fig.show()

The following n_estimators values are chosen:

In [None]:
# chosen number of estimators, keyed by the name of the training frame
estimators : dict = dict(
    df_3_raw    = 55,
    df_6_raw    = 55,
    df_12_raw   = 34,
    df_3        = 34,
    df_6        = 34,
    df_12       = 34,
)

In [None]:
#optimize for leafs: sweep min_samples_leaf with the chosen number of estimators

run_automation : bool = True

if run_automation:

    print("Training with ip data")

    # The number of estimators is read from the ``estimators`` dict instead of a
    # second hard-coded list, so both places cannot drift apart.
    # The loop variable is not called ``df`` so the main dataframe is not overwritten.
    for frame, offset, name in zip([df_3,df_6,df_12],[3,6,12],["df_3","df_6","df_12"]):

        print(f"{offset} in porgress")
        model = RFR(df = frame, offset = offset, raw = False)
        model.train(opt_estimator = False, n_fib = 10, n_estimators = estimators[name])

    # this loop trains on the raw frames (the message used to say "ip")
    print("Training with raw data")

    for frame, offset, name in zip([df_3_raw,df_6_raw,df_12_raw],[3,6,12],["df_3_raw","df_6_raw","df_12_raw"]):

        print(f"{offset} in porgress")
        model = RFR(df = frame, offset = offset, raw = True)
        model.train(opt_estimator = False, n_fib = 10, n_estimators = estimators[name])