# Data Preparation

In [24]:
import pandas as pd
import numpy as np
import glob
import copy
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [25]:
# Country name -> two-letter country code used for the output file names and dictionary keys.
countries = {
    'Austria': 'AT',
    'Belgium': 'BE',
    'Bulgaria': 'BG',
    'Switzerland': 'CH',
    'Czech Republic': 'CZ',
    'Germany': 'DE',
    'Denmark': 'DK',
    'Estonia': 'EE',
    'Spain': 'ES',
    'Finland': 'FI',
    'France': 'FR',
    'Greece': 'GR',
    'Hungary': 'HU',
    'Ireland': 'IE',
    'Italy': 'IT',
    'Lithuania': 'LT',
    'Latvia': 'LV',
    'Montenegro': 'ME',
    'Netherlands': 'NL',
    'Norway': 'NO',
    'Poland': 'PL',
    'Portugal': 'PT',
    'Serbia': 'RS',
    'Sweden': 'SE',
    'Slovenia': 'SI',
    'Slovakia': 'SK',
    'United Kingdom': 'UK',
}

# The country codes alone, in the same order as the dictionary above.
abbr_list = [code for code in countries.values()]

## Making the first, second, third and fourth columns of the dataframe the day, month, year and time

In [26]:
# data=pd.DataFrame()
# temp=pd.read_csv('../Data Sources/ENTSO-E/2018/Load/Croatia.csv')
# data['Day']=temp['Time (CET)'].str[:2]
# data['Month']=temp['Time (CET)'].str[3:5]
# data['Year']=temp['Time (CET)'].str[6:10]
# data['Time']=temp['Time (CET)'].str[11:16]+' - '+temp['Time (CET)'].str[29:35]
# data = data.drop(range(1994, 1995)).reset_index(drop=True)

In [8]:
def polynomial(selected_values, selected_index, random_state=None):
    """Predict the value at ``selected_index`` with a polynomial regression.

    Parameters
    ----------
    selected_values : sequence of ``[index, value]`` pairs
        The neighbouring observations. A null ``value`` is replaced by the
        mean of the non-null values before fitting.
    selected_index : number
        The index for which a value is predicted.
    random_state : int, RandomState instance or None, optional
        Passed to ``train_test_split``. The default ``None`` keeps the
        original (unseeded) behaviour; pass an int for reproducible fills.

    Returns
    -------
    The predicted value, or ``0`` when the prediction is negative.

    Method
    ------
    The pairs are split into 2/3 training and 1/3 test data. For every degree
    from 1 to 10 a polynomial linear regression is fitted on the training
    data and its root mean squared error (RMSE) is measured on the test data.
    The degree with the lowest RMSE is then fitted again on the training data
    and used to predict the value at ``selected_index``.
    """
    X = np.array([pair[0] for pair in selected_values]).reshape(len(selected_values), 1)
    y = [pair[1] for pair in selected_values]

    # Replace null targets by the mean of the non-null ones so the regression
    # can be fitted. (Named ``fill_value`` so it does not shadow ``mean()``.)
    fill_value = np.nanmean(y)
    y = [fill_value if pd.isna(value) else value for value in y]

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=1/3, random_state=random_state)

    # Start from an infinite error and degree 1. With the previous sentinel
    # (1e10, degree 0) a series whose RMSE never dropped below 1e10 left the
    # degree at 0, and PolynomialFeatures(degree=0, include_bias=False) raises.
    min_rmse, min_deg = np.inf, 1

    for degree in range(1, 11):

        # Polynomial features of the training data for this degree
        poly_features = PolynomialFeatures(degree=degree, include_bias=False)
        x_poly_train = poly_features.fit_transform(x_train)

        # Polynomial linear regression based on the training data
        poly_reg = LinearRegression()
        poly_reg.fit(x_poly_train, y_train)

        # RMSE on the held-out data; the already fitted transformer is reused
        # (transform, not fit_transform) for the test features.
        x_poly_test = poly_features.transform(x_test)
        poly_predict = poly_reg.predict(x_poly_test)
        poly_rmse = np.sqrt(mean_squared_error(y_test, poly_predict))

        # Keep the degree with the lowest RMSE (first one wins on ties)
        if poly_rmse < min_rmse:
            min_rmse = poly_rmse
            min_deg = degree

    # Fit the regression again with the selected degree
    poly_features = PolynomialFeatures(degree=min_deg, include_bias=False)
    x_poly_train = poly_features.fit_transform(x_train)
    poly_reg = LinearRegression()
    poly_reg.fit(x_poly_train, y_train)

    prediction = poly_reg.predict(poly_features.transform([[selected_index]]))[0]

    # Negative predictions are clamped to zero
    if prediction < 0:
        prediction = 0

    return prediction

def mean(selected_values):
    """Return the mean of ``selected_values`` after filling its null entries.

    Null entries are replaced by the mean of the non-null entries and the
    mean of the filled values is returned. When every entry is null (this
    includes an empty input) the result is ``0``.
    """
    null_count = pd.isnull(selected_values).sum()
    if null_count == len(selected_values):
        # Nothing to average
        return 0

    fill_value = np.nanmean(selected_values)
    filled = np.array([fill_value if pd.isna(item) else item for item in selected_values])
    return np.mean(filled)


In [10]:
def fill_missing_data(df,length):
    """Fill the null values of every column of ``df`` and average it over blocks of ``length`` rows.

    ``length`` is used as the number of rows per hour (the callers in this
    file pass the ``divider`` returned by ``omit_dst``), so an offset of
    ``k*length`` rows is treated as ``k`` hours.

    Each column is first copied into a ``{position: value}`` dictionary. The
    neighbours used for a fill are always read from that copy, so values that
    were filled earlier in the same pass are never reused as neighbours. The
    fills themselves are written into ``df`` with ``df.loc``, i.e. the
    dataframe passed in is modified in place.

    For a null value at position ``p`` in a column of ``n`` rows:

    1. ``3*length <= p < n - 3*length``: collect the 6 ``[position, value]``
       pairs 1, 2 and 3 hours before and after ``p``.

       a. Fewer than 3 of them null: fill with ``polynomial``.
       b. At least 3 null but ``p`` outside ``[27*length, n - 27*length)``:
          fill with ``mean`` of the 6 values.
       c. At least 3 null and ``p`` inside that range: collect 18 pairs
          instead (the same +-1..3 hour offsets, taken around ``p`` itself
          and around 24 hours before and after ``p``).

          - Fewer than 12 null: fill with ``polynomial``.
          - At least 12 null but ``p`` outside
            ``[51*length, n - 51*length)``: fill with ``mean``.
          - At least 12 null and ``p`` inside that range: collect 30 pairs
            (offsets around ``p`` and around 24 and 48 hours before and
            after ``p``). Fewer than 24 null: ``polynomial``; at least 24 but
            not all null: ``mean``; all 30 null: fill with 0.

    2. ``p < 3*length``: fill with ``mean`` of the first ``3*length`` values
       of the column.
    3. ``p >= n - 3*length``: fill with ``mean`` of the last ``3*length``
       values of the column.

    Returns a tuple ``(df, counter)``: the dataframe averaged over consecutive
    blocks of ``length`` rows, and the number of values that were filled over
    all columns.
    """

    counter =0  
    for column in df.columns.values:
        # Snapshot of the column keyed by row position (0, 1, 2, ...)
        column_data = {}
        for index, value in enumerate(df.loc[:, column]):
            column_data[index] = value

        for selected_index, selected_value in column_data.items():

            # Null value with at least 3 hours of data on both sides
            if pd.isnull(column_data[selected_index]) and selected_index in range(3*length, len(df[column])-3*length):
                # 6 neighbours: 1, 2 and 3 hours before and after the null value
                selected_values = []
                for i in [x for x in range(-3,4) if x!=0]:
                    selected_values.append([selected_index + i*length,column_data[selected_index + i*length]])
                # The positions are never null, so the count below is the number of null neighbour values
                if pd.isnull(selected_values).sum() >=3 and selected_index in range(27*length, len(df[column])-27*length):
                    # 18 neighbours: the same offsets around the null value and around +-24 hours
                    selected_values = []
                    for i in [x for x in range(-3, 4) if x != 0]:
                        for j in [-24,0,24]:
                            selected_values.append([selected_index + (i+j)*length,column_data[selected_index + (i+j)*length]])
                    if pd.isnull(selected_values).sum() >= 12 and selected_index in range(51*length, len(df[column])-51*length):
                        # 30 neighbours: the same offsets around the null value and around +-24 and +-48 hours
                        selected_values = []
                        for i in [x for x in range(-3, 4) if x != 0]:
                            for j in [-48,-24,0, 24,48]:
                                selected_values.append([selected_index + (i+j)*length,column_data[selected_index + (i+j)*length]])
                        # print(selected_values)
                        # At least 24 (but not all 30) neighbours are null: mean of the values
                        if pd.isnull(selected_values).sum() >= 24 and pd.isnull(selected_values).sum() < len(selected_values):
                            prediction = mean([i[1] for i in selected_values])
                            df.loc[selected_index,column] = prediction
                            counter +=1

                        # Fewer than 24 neighbours are null: polynomial regression
                        elif pd.isnull(selected_values).sum() < 24:
                            prediction = polynomial(selected_values,selected_index)
                            df.loc[selected_index, column] = prediction
                            counter += 1

                        # All 30 neighbours are null: nothing to estimate from, use 0
                        else:
                            df.loc[selected_index, column] = 0
                            counter += 1

                    # At least 12 of the 18 neighbours are null but the 30-value window does not fit
                    elif pd.isnull(selected_values).sum() >= 12:
                        prediction = mean([i[1] for i in selected_values])
                        df.loc[selected_index, column] = prediction
                        counter += 1

                    # Fewer than 12 of the 18 neighbours are null
                    else:
                        prediction = polynomial(selected_values,selected_index)
                        df.loc[selected_index, column] = prediction
                        counter += 1

                # At least 3 of the 6 neighbours are null but the 18-value window does not fit
                elif pd.isnull(selected_values).sum() >= 3:
                    prediction = mean([i[1] for i in selected_values])
                    df.loc[selected_index, column] = prediction
                    counter += 1

                # Fewer than 3 of the 6 neighbours are null
                else:
                    prediction = polynomial(selected_values, selected_index)
                    df.loc[selected_index, column] = prediction
                    counter += 1

            # Null value within the first 3 hours: mean of the first 3 hours of the column
            elif pd.isnull(column_data[selected_index]) and selected_index < 3*length:
                selected_values =  [column_data[i] for i in range(3*length)]
                prediction = mean(selected_values)
                df.loc[selected_index, column] = prediction
                counter += 1

            # Null value within the last 3 hours: mean of the last 3 hours of the column
            elif pd.isnull(column_data[selected_index]) and selected_index >= (len(df[column])-3*length): 
                selected_values = [column_data[i] for i in range(len(df[column])-3*length,len(df[column]))]
                prediction = mean(selected_values)
                df.loc[selected_index, column] = prediction
                counter += 1

    # NOTE: replacing 'n/e' and any remaining null values with 0 is currently disabled:
    # df = df.replace(['n/e', np.nan], 0)
  
    # np.arange(len(df))//length gives one group number per row position.
    # For example, when length=4 this array is [0,0,0,0,1,1,1,1,2,2,2,2,....].
    # Grouping by it and taking the mean therefore replaces every block of
    # 'length' consecutive rows by one row holding the mean of that block.

    df = df.groupby(np.arange(len(df))//length).mean()
    return df,counter


In [29]:
# In the country-specific Load dataset, Austria, Belgium, Germany, Hungary and the Netherlands report data every 15 minutes.
# Therefore, these countries have 35044 data points per year.
# The UK and Ireland report data every 30 minutes, hence these countries have 17522 data points per year.
# All the others report every hour and hence have 8761 data points per year.
# In the Generation dataset, the situation is the same as above, except that Belgium reports hourly data and hence has 8761 data points.
# In the Transmission dataset, all countries report data hourly except Germany, which reports every 15 minutes.
# Therefore, it is easier if all the data are converted to hourly data.
# To do that, in the countries with 35044 data points, the mean is calculated over every 4 successive data points.
# In the countries with 17522 data points, the mean is calculated over every 2 successive data points.

# Due to daylight saving time, all the datasets have null values on 25th March from 02:00 - 03:00.
# Depending on the interval at which each country reports its data, 4, 2 or 1 rows are dropped from the 25th March, 02:00 - 03:00 time interval.
# This also returns an integer ('divider') based on the file length, which is used to get the hourly values in a later step. In 15 min interval files this is 4, in 30 min interval files this is 2 and in 1 hour interval files this is 1.

def omit_dst(df):
    """Drop a fixed block of rows from ``df`` and return it with its divider.

    The rows to drop and the divider depend only on the number of rows:

    * 35044 rows: rows 7976-7979 are dropped, divider 4
    * 17522 rows: rows 3988-3989 are dropped, divider 2
    * any other length: row 1994 is dropped, divider 1

    The index of the returned dataframe is renumbered from 0.

    Returns a tuple ``(df, divider)``.
    """
    # number of rows -> (first row to drop, divider); 'divider' rows are dropped
    layouts = {35044: (7976, 4), 17522: (3988, 2)}
    first_row, divider = layouts.get(len(df.index), (1994, 1))

    rows_to_drop = range(first_row, first_row + divider)
    trimmed = df.drop(rows_to_drop).reset_index(drop=True)
    return trimmed, divider


# 1. Preparing Load Data

In [30]:
def load(countries):
    """Build the gap-filled load data of every country.

    ``countries`` maps a country name (the name of its csv file) to its
    country code. For every country:

    1. the load csv is read and its first two columns are discarded,
    2. ``omit_dst`` removes the DST rows and returns the divider,
    3. ``fill_missing_data`` fills the null values and averages the rows,
    4. the column is named 'demand' (this assumes exactly one column is left),
    5. the result is written to the output folder as ``<code>.csv``.

    Returns a dictionary ``{country code: dataframe}``.
    """
    load_dic = {}

    for country, abbr in countries.items():
        raw = pd.read_csv(f'../Data Sources/ENTSO-E/2018/Load/{country}.csv')
        values = raw.iloc[:, 2:]

        without_dst, length = omit_dst(values)
        filled, counter = fill_missing_data(without_dst, length)

        # 'display' is not imported here; presumably the one provided by Jupyter/IPython
        display(f'{country} - load: {counter} missing data filled')

        filled.columns = ['demand']
        filled.to_csv(f'../Data Sources/output/Polynomial Linear Regression/Load/{abbr}.csv')
        load_dic[abbr] = filled

    return load_dic


# 2. Preparing Generation Data

In [31]:
def generation(countries):
    """Build the gap-filled generation data of every country.

    ``countries`` maps a country name (the name of its csv file) to its
    country code. For every country:

    1. the generation csv is read and its first two columns are discarded,
    2. the 'Hydro Pumped Storage  - Actual Consumption [MW]' column is removed,
    3. ``omit_dst`` removes the DST rows and returns the divider,
    4. ``fill_missing_data`` fills the null values and averages the rows,
    5. columns that contain only zeros are removed,
    6. the last 26 characters are cut from every column name,
    7. the result is written to the output folder as ``<code>.csv``.

    Returns a dictionary ``{country code: dataframe}``.
    """
    generation_dic = {}

    for country, abbr in countries.items():
        raw = pd.read_csv(f'../Data Sources/ENTSO-E/2018/Generation/{country}.csv', low_memory=False)
        values = raw.iloc[:, 2:].drop(columns=['Hydro Pumped Storage  - Actual Consumption [MW]'])

        without_dst, length = omit_dst(values)
        filled, counter = fill_missing_data(without_dst, length)

        # 'display' is not imported here; presumably the one provided by Jupyter/IPython
        display(f'{country} - Generation: {counter} missing data filled')

        # Columns holding nothing but zeros are dropped
        all_zero = [name for name in filled.columns.values if (filled[name] == 0).all()]
        filled = filled.drop(columns=all_zero)

        # Cut the 26-character suffix from the column names
        filled.columns = [name[:-26] for name in filled.columns.values]

        filled.to_csv(f'../Data Sources/output/Polynomial Linear Regression/Generation/{abbr}.csv')
        generation_dic[abbr] = filled

    return generation_dic


# 3. Preparing Cross-border Transmission Data

In [16]:
def cross_border():
    """Build one dataframe with the gap-filled data of all transmission files.

    Every csv in the Transmission folder is processed as follows:

    1. the csv is read and its first column is discarded,
    2. ``omit_dst`` removes the DST rows and returns the divider,
    3. the first two columns are renamed after the two country codes taken
       from the file path: characters 42-43 and 45-46 of the path, which for
       the folder used here are the 1st-2nd and 4th-5th characters of the
       file name. With codes ``XX`` and ``YY`` the first column becomes
       'YY - > XX' and the second 'XX - > YY',
    4. ``fill_missing_data`` fills the null values and averages the rows,
    5. the columns are appended to the combined dataframe.

    The combined dataframe is written to 'all_transmissions.csv' and returned.
    """
    transmission_data = pd.DataFrame()

    for path in glob.glob("../Data Sources/ENTSO-E/2018/Transmission/*.csv"):
        # Country codes by fixed position in the path
        first_code = path[42:44]
        second_code = path[45:47]

        raw = pd.read_csv(path).iloc[:, 1:]
        temp, length = omit_dst(raw)

        new_names = {
            temp.columns[0]: f'{second_code} - > {first_code}',
            temp.columns[1]: f'{first_code} - > {second_code}',
        }
        temp = temp.rename(columns=new_names)

        temp, counter = fill_missing_data(temp, length)
        transmission_data = pd.concat([transmission_data, temp], axis=1)

        # 'display' is not imported here; presumably the one provided by Jupyter/IPython
        display(f'{first_code} - > {second_code} - transmission: {counter} missing data filled')

    transmission_data.to_csv(f'../Data Sources/output//Polynomial Linear Regression/Transmission/all_transmissions.csv')

    return transmission_data


# 4. Internal sigma calculation

## 4.1 Sigma calculation

In [33]:
def calculate_sigma(load_data, generation_data, transmission_data, abbr_list):
    """Compute the sigma values ``100 / max(value, 0.1)`` of all data.

    Parameters
    ----------
    load_data : dict
        ``{country code: dataframe}`` with the load data.
    generation_data : dict
        ``{country code: dataframe}`` with the generation data. Its keys
        decide which countries are processed.
    transmission_data : pandas.DataFrame
        The transmission data.
    abbr_list : list
        Not used; kept so that existing calls keep working.

    Returns
    -------
    dict
        One entry per country code holding the sigma values of its generation
        columns followed by its load columns, plus the entry
        'transmission_data' holding the sigma values of the transmission data.

    Notes
    -----
    The inputs are left untouched. Previously ``transmission_data`` was
    overwritten in place with its sigma values, so any later use of the
    transmission data by the caller silently worked on sigma values.
    """
    eph = 0.1  # lower bound applied to every value before the division
    A = 100    # numerator of the sigma value
    sigma = {}

    for abbr, df in generation_data.items():
        # Generation columns and load columns of the country side by side
        combined = pd.concat([df, load_data[abbr]], axis=1)
        # Values below eph are raised to eph (null values stay null), then sigma = A / value
        sigma[abbr] = A / combined.clip(lower=eph)

    sigma["transmission_data"] = A / transmission_data.clip(lower=eph)

    return sigma


## 4.2 Internal data consolidation

In [34]:
def data_consolidation(generation_dic, load_dic, transmission_data, intermediary_var, unit_var):
    """Consolidate generation, load and transmission data and save them as csv.

    Every value is replaced by ``intermediary + value * unit``, where the
    two terms are looked up as follows:

    * generation: ``intermediary_var["generation"][abbr][column]`` and
      ``unit_var["generation"][abbr][column]``
    * load: ``intermediary_var["load"][abbr]['demand']`` and
      ``unit_var["load"][abbr]['demand']``
    * transmission: ``intermediary_var["transmission"][column]`` and
      ``unit_var["transmission"][column]``

    The countries processed are the keys of ``generation_dic``; ``load_dic``
    must contain the same keys. The consolidated generation and load data of
    every country and the consolidated transmission data are written to the
    'output/Sigma' folders.

    Returns a tuple ``(consolidated_gen_data, consolidated_load_data,
    consolidated_transmission_data)``: two dictionaries keyed by country code
    and one dataframe.

    The dataframes passed in are left untouched. Previously the generation
    dataframes and ``transmission_data`` were overwritten in place, so the
    caller lost the original values after the call.
    """
    consolidated_gen_data = {}
    consolidated_load_data = {}

    for abbr, df in generation_dic.items():
        # Work on a copy so that the caller's generation data is not overwritten
        consolidated = df.copy()
        for column in df.columns:
            consolidated[column] = intermediary_var["generation"][abbr][column] + \
                df[column] * unit_var["generation"][abbr][column]
        consolidated_gen_data[abbr] = consolidated

        consolidated_load_data[abbr] = intermediary_var["load"][abbr]['demand'] + \
            load_dic[abbr]['demand'] * unit_var["load"][abbr]['demand']

        consolidated_gen_data[abbr].to_csv(f"../Data Sources/output/Sigma/Generation/{abbr}.csv")
        consolidated_load_data[abbr].to_csv(f"../Data Sources/output/Sigma/Load/{abbr}.csv")

    # Same for the transmission data: consolidate a copy, not the caller's dataframe
    consolidated_transmission_data = transmission_data.copy()
    for column in transmission_data.columns:
        consolidated_transmission_data[column] = intermediary_var["transmission"][column] + \
            transmission_data[column] * unit_var["transmission"][column]
    consolidated_transmission_data.to_csv('../Data Sources/output/Sigma/Transmission/all_transmissions.csv')

    return (consolidated_gen_data, consolidated_load_data, consolidated_transmission_data)


In [35]:
# sigma(load(countries)[1], generation(countries)[1], cross_border(abbr_list)[1],abbr_list)