# Data Preparation

In [3]:
import copy
import glob
import os

import numpy as np
import pandas as pd

In [4]:
# Country name -> two-letter code. The names are used to build the CSV file
# names and column headers read further down, so they must match the source
# data exactly.
countries = {
    'Austria': 'AT',
    'Belgium': 'BE',
    'Bulgaria': 'BG',
    'Switzerland': 'CH',
    'Czech Republic': 'CZ',
    'Germany': 'DE',
    'Denmark': 'DK',
    'Estonia': 'EE',
    'Spain': 'ES',
    'Finland': 'FI',
    'France': 'FR',
    'Greece': 'GR',
    'Hungary': 'HU',
    'Ireland': 'IE',
    'Italy': 'IT',
    'Lithuania': 'LT',
    'Latvia': 'LV',
    'Montenegro': 'ME',
    'Netherlands': 'NL',
    'Norway': 'NO',
    'Poland': 'PL',
    'Portugal': 'PT',
    'Serbia': 'RS',
    'Sweden': 'SE',
    'Slovenia': 'SI',
    'Slovakia': 'SK',
    'United Kingdom': 'UK',
}

# Country codes, in the same order as the mapping above.
abbr_list = [code for code in countries.values()]

## Making the first, second, third and fourth columns of the dataframe the day, month, year and time

In [5]:
# One load export is used only as a template for the time columns that every
# result table below starts with. The 'Time (CET)' strings are cut at fixed
# character positions (presumably "dd.mm.yyyy HH:MM - dd.mm.yyyy HH:MM" --
# confirm against the CSV).
temp = pd.read_csv('../Data Sources/ENTSO-E/2018/Load/Croatia.csv')
data = pd.DataFrame({
    'Day': temp['Time (CET)'].str[:2],
    'Month': temp['Time (CET)'].str[3:5],
    'Year': temp['Time (CET)'].str[6:10],
    # Start time of the interval, then the end time taken from the second timestamp.
    'Time': temp['Time (CET)'].str[11:16] + ' - ' + temp['Time (CET)'].str[29:35],
})

##

In [6]:
# In the country-specific Load dataset, Austria, Belgium, Germany, Hungary and the Netherlands report data every 15 minutes.
# Therefore, these countries have 35044 data points per year.
# UK and Ireland report data every 30 minutes, hence these countries have 17522 data points per year.
# All the others report every hour and hence have 8761 data points per year.
# In the Generation dataset, the situation is the same as above, except that Belgium reports hourly data and hence has 8761 data points.
# In the Transmission dataset, all countries report data hourly except Germany, which reports every 15 minutes.
# Therefore, it is easiest if all the data are converted to hourly data.
# To do that, in the countries with 35044 data points the mean of every 4 successive data points is calculated.
# In the countries with 17522 data points the mean of every 2 successive data points is calculated.

def hourly_data(df):
    """Average sub-hourly readings into hourly values.

    The sampling resolution is inferred only from the number of rows:
    35044 rows are treated as 15-minute data (4 rows per hour) and 17522 rows
    as 30-minute data (2 rows per hour). Any other length is grouped one row
    per group, i.e. the values are kept as they are.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Readings in chronological row order.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        Group means, indexed 0, 1, 2, ... by hour position. For a DataFrame,
        non-numeric columns (such as the timestamp strings) are left out of
        the result, so columns that should be averaged must be numeric before
        calling this function.
    """
    length = len(df.index)
    rows_per_hour = {35044: 4, 17522: 2}.get(length, 1)

    # Floor division turns the row positions into hour labels, e.g. for
    # rows_per_hour=4: [0, 0, 0, 0, 1, 1, 1, 1, 2, ...]. Rows sharing a label
    # are averaged together.
    hour_labels = np.arange(length) // rows_per_hour
    grouped = df.groupby(hour_labels)

    if isinstance(df, pd.DataFrame):
        # Without numeric_only=True, pandas >= 2.0 raises a TypeError as soon
        # as the frame contains a string column; older versions dropped such
        # columns silently. Being explicit gives the same result everywhere.
        return grouped.mean(numeric_only=True)
    return grouped.mean()

# 1. Preparing Load Data

In [7]:
def load(countries):
    """Read the 2018 load file of every country and return hourly demand.

    Parameters
    ----------
    countries : dict
        Maps country name to country code. The name selects the CSV file
        '../Data Sources/ENTSO-E/2018/Load/{country}.csv' and, together with
        the code, the column 'Actual Total Load [MW] - {country} ({code})'.

    Returns
    -------
    tuple
        ``(load_data, load_dic)``. ``load_data`` is one DataFrame holding the
        shared Day/Month/Year/Time columns of the module-level ``data`` frame
        plus one demand column per country code. ``load_dic`` maps each
        country code to a single-column DataFrame named 'demand'.
    """
    time_columns = ['Day', 'Month', 'Year', 'Time']

    load_dic = {}
    load_data = pd.DataFrame()
    load_data[time_columns] = data[time_columns]

    for country, abbr in countries.items():
        raw = pd.read_csv(f'../Data Sources/ENTSO-E/2018/Load/{country}.csv')
        load_column = f'Actual Total Load [MW] - {country} ({abbr})'

        # Missing readings ('n/e' or empty cells) count as 0. A column that
        # contained 'n/e' is read as text, so it is converted to numbers
        # explicitly; otherwise the hourly mean would drop it or fail.
        demand = pd.to_numeric(raw[load_column].replace(['n/e', np.nan], 0))

        # Only the load column is averaged, so the timestamp strings in the
        # file never reach the mean calculation.
        demand = hourly_data(demand)

        load_data[f'{abbr}'] = demand
        load_dic[abbr] = demand.to_frame('demand')

    return(load_data, load_dic)


# 2. Preparing Generation Data

In [8]:
def generation(load_dic,countries):
    """Read the 2018 generation file of every country and return hourly output per fuel.

    For each country the CSV is read, 'n/e' and empty cells are set to 0, the
    'Hydro Pumped Storage  - Actual Consumption [MW]' column is dropped, the
    rows are averaged to hourly values, columns that are 0 in every row are
    removed, and the last 26 characters (the '  - Actual Aggregated [MW]'
    suffix) are cut from the remaining column names.

    Parameters
    ----------
    load_dic : dict
        Per-country load frames; only deep-copied here.
    countries : dict
        Maps country name (used in the file name) to country code.

    Returns
    -------
    tuple
        ``(generation_data, generation_dic, load_dic_copy)``.
        ``generation_data`` holds the shared Day/Month/Year/Time columns of
        the module-level ``data`` frame plus one '{code} - {fuel}' column per
        country and fuel. ``generation_dic`` maps each country code to its
        own frame with one column per fuel. ``load_dic_copy`` is a deep copy
        of ``load_dic``; nothing is added to it in this function.
    """
    load_dic_copy = copy.deepcopy(load_dic)

    time_columns = ['Day','Month','Year','Time']
    generation_data = pd.DataFrame()
    generation_data[time_columns] = data[time_columns]
    generation_dic = {}

    for country, abbr in countries.items():
        raw = pd.read_csv(f'../Data Sources/ENTSO-E/2018/Generation/{country}.csv', low_memory=False)

        # Missing readings become 0, then the pumped-storage consumption
        # column is discarded before averaging.
        cleaned = raw.replace(['n/e', np.nan], 0)
        cleaned = cleaned.drop(['Hydro Pumped Storage  - Actual Consumption [MW]'], axis=1)

        hourly = hourly_data(cleaned)

        # Fuels for which no value was ever recorded are all-zero columns.
        never_recorded = [name for name in hourly.columns.values if (hourly[name] == 0).all()]
        hourly = hourly.drop(never_recorded, axis=1)

        # Keep only the fuel name from each column header.
        fuels = [name[:-26] for name in hourly.columns.values]
        hourly.columns = fuels

        for fuel in fuels:
            generation_data[f'{abbr} - {fuel}'] = hourly[fuel]

        generation_dic[abbr] = hourly

    return(generation_data, generation_dic, load_dic_copy)


# 3. Preparing Cross-border Transmission Data

In [9]:
def cross_border(abbr_list):
    """Read all 2018 cross-border flow files and return net positions and line flows.

    Every CSV in '../Data Sources/ENTSO-E/2018/Transmission/' is expected to
    be named '{XX}?{YY}...csv': the first two characters and characters 4-5
    of the file name are taken as the two country codes (the separator
    character is not checked). Each file contributes two hourly columns:
    '{XX} - > {YY}', filled from the file's third column, and
    '{YY} - > {XX}', filled from its second column. 'n/e' and empty cells
    count as 0.

    Parameters
    ----------
    abbr_list : list of str
        Two-letter country codes to report on.

    Returns
    -------
    tuple
        ``(cross_border_data, transmission_data)``. ``cross_border_data``
        holds the shared Day/Month/Year/Time columns of the module-level
        ``data`` frame plus one column per code with the net flow of that
        country: flows leaving it count positive, flows entering it count
        negative. Flows to and from countries outside ``abbr_list`` are
        included in this net value. ``transmission_data`` holds only the
        directed flow columns whose two ends are both in ``abbr_list``.
    """
    transmission_data = pd.DataFrame()
    cross_border_data = pd.DataFrame()
    cross_border_data[['Day','Month','Year','Time']] = data[['Day','Month','Year','Time']]

    # Sorted so that the column order does not depend on the order in which
    # the operating system happens to list the files.
    csvs = sorted(glob.glob("../Data Sources/ENTSO-E/2018/Transmission/*.csv"))

    for csv in csvs:
        temp = pd.read_csv(csv)
        temp = temp.replace(['n/e', np.nan], 0)

        # The codes come from the file name itself rather than from fixed
        # offsets into the full path, so moving the data directory does not
        # silently produce wrong column names.
        file_name = os.path.basename(csv)
        first, second = file_name[:2], file_name[3:5]

        # pd.to_numeric converts numbers that were read as text; hourly_data
        # then averages the files that are not already hourly.
        transmission_data[f'{first} - > {second}'] = hourly_data(pd.to_numeric(temp.iloc[:, 2]))
        transmission_data[f'{second} - > {first}'] = hourly_data(pd.to_numeric(temp.iloc[:, 1]))

    for abbr in abbr_list:
        # Column names read '{from} - > {to}'.
        imports = [x for x in transmission_data.columns.values if x[-2:] == abbr]
        exports = [x for x in transmission_data.columns.values if x[:2] == abbr]

        cross_border_data[f'{abbr}'] = transmission_data[exports].sum(axis=1) - transmission_data[imports].sum(axis=1)

    # Keep only the lines that connect two countries of abbr_list.
    transmission_lines = [x for x in transmission_data.columns.values if x[:2] in abbr_list and x[-2:] in abbr_list]
    transmission_data = transmission_data[transmission_lines]

    return cross_border_data, transmission_data


## 3.1 Calculating net imports/exports based on generation and load data 

In [11]:
def import_export_using_load_gen(load_data, generation_data, abbr_list):
    """Estimate each country's net position as total generation minus load.

    Parameters
    ----------
    load_data : pandas.DataFrame
        Must contain 'Day', 'Month', 'Year', 'Time' and one load column per
        country code.
    generation_data : pandas.DataFrame
        Generation columns named '{code} - {fuel}'.
    abbr_list : list of str
        Country codes to evaluate.

    Returns
    -------
    pandas.DataFrame
        The four time columns of ``load_data`` followed by one column per
        country code holding (sum of that country's generation columns) minus
        (its load). A positive value means generation exceeded load in that
        time step. A country without any generation column gets 0 minus load.
    """
    time_columns = ['Day', 'Month', 'Year', 'Time']
    import_export_data = load_data[time_columns].copy()

    for abbr in abbr_list:
        # Match the exact '{code} - ' prefix. A plain substring search would
        # also pick up columns of other countries whose name merely contains
        # the same two letters.
        prefix = f'{abbr} - '
        own_columns = [name for name in generation_data.columns if str(name).startswith(prefix)]
        import_export_data[f'{abbr}'] = generation_data[own_columns].sum(axis=1) - load_data[f'{abbr}']

    return(import_export_data)


## 3.2 Calculating net imports/exports based on cross-border transmission data 

In [12]:
def import_export_using_crossborder(crossborder_data, abbr_list):
    """Return the net cross-border position of the requested countries.

    Parameters
    ----------
    crossborder_data : pandas.DataFrame
        Must contain 'Day', 'Month', 'Year', 'Time' and one column per
        country code in ``abbr_list``.
    abbr_list : list of str
        Country codes to copy over.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four time columns followed by the column of
        each requested country code, copied unchanged from
        ``crossborder_data``.
    """
    time_columns = ['Day', 'Month', 'Year', 'Time']
    import_export_data = crossborder_data[time_columns].copy()

    # The values are taken over as they are; no further calculation happens here.
    for code in abbr_list:
        column_name = f'{code}'
        import_export_data[column_name] = crossborder_data[column_name]

    return(import_export_data)


# 4. Internal sigma calculation

## 4.1 Sigma calculation

In [13]:
def _inverse_moving_average(series, eph, A, window_days=10, steps_per_day=24):
    """Return A divided by the floored same-hour moving average of one series."""
    # Average of the value at the same hour over the current and the
    # preceding (window_days - 1) days.
    moving_average = 0
    for day in range(window_days):
        moving_average = moving_average + series.shift(steps_per_day * day) / window_days

    # The first (window_days - 1) days have no complete window and come out
    # as NaN; they fall back to the original value of that time step.
    moving_average = moving_average.fillna(series)

    # Floor at eph so the division below can never be by zero (or by a
    # negative number).
    return A / moving_average.clip(lower=eph)


def sigma(load_data,generation_data,transmission_data,abbr_list):
    """Compute the sigma value of every load, generation and transmission series.

    For each series, the value at a time step is
    ``A / max(moving_average, eph)`` with ``A = 100`` and ``eph = 0.1``.
    The moving average is taken over the same hour of the current and the
    nine preceding days (steps of 24 rows); where that window is incomplete
    the original value of the time step is used instead.

    Parameters
    ----------
    load_data : dict
        Maps country code to a frame with that country's demand column(s).
    generation_data : dict
        Maps country code to a frame with one column per fuel.
    transmission_data : pandas.DataFrame
        One column per directed transmission line. It is not modified.
    abbr_list : list of str
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    dict
        One entry per country code of ``generation_data`` holding the sigma
        frame of its generation and demand columns, plus the key
        'transmission_data' holding the sigma frame of the transmission
        lines. Each frame keeps the original column names.
    """
    eph = 0.1
    A = 100
    sigma_dic = {}

    for abbr, df in generation_data.items():
        # Generation by fuel and demand of one country side by side.
        combined = pd.concat([df, load_data[abbr]], axis=1)
        country_sigma = pd.DataFrame(
            {column: _inverse_moving_average(combined[column], eph, A) for column in combined.columns.values},
            index=combined.index,
        )
        # NOTE(review): assigning a nested list turns the columns into a
        # one-level MultiIndex. It is kept because code outside this
        # function may already rely on it -- confirm whether a plain list of
        # names was intended.
        country_sigma.columns = [list(country_sigma.columns)]
        sigma_dic[abbr] = country_sigma

    # Built in a new frame so the caller's transmission_data gets no extra columns.
    transmission_sigma = pd.DataFrame(
        {line: _inverse_moving_average(transmission_data[line], eph, A) for line in transmission_data.columns.values},
        index=transmission_data.index,
    )
    transmission_sigma.columns = [list(transmission_sigma.columns)]
    sigma_dic["transmission_data"] = transmission_sigma

    return(sigma_dic)


## 4.2 Internal data consolidation

In [14]:
def data_consolidation(generation_dic_copy, load_dic, transmission_data_copy, intermediary_var, unit_var):
    """Apply ``intermediary_var + value * unit_var`` to every series and save the results.

    The frames in ``generation_dic_copy`` and ``transmission_data_copy`` are
    overwritten column by column with the consolidated values, which is
    presumably why the caller is expected to pass copies.

    Parameters
    ----------
    generation_dic_copy : dict
        Maps country code to a generation frame (one column per fuel).
    load_dic : dict
        Maps country code to a frame with a 'demand' column.
    transmission_data_copy : pandas.DataFrame
        One column per transmission line.
    intermediary_var, unit_var : mapping
        Looked up as ``[...]["generation"][code][fuel]``,
        ``[...]["load"][code]['demand']`` and ``[...]["transmission"][line]``.

    Returns
    -------
    tuple
        ``(consolidated_gen_data, consolidated_load_data,
        consolidated_transmission_data)``: a dict of generation frames, a
        dict of load series, and the transmission frame. Each is also written
        to a CSV file below
        '../Data Sources/output/Internal Sigma/Reconciled data/'.
    """
    consolidated_gen_data = {}
    consolidated_load_data = {}

    for abbr, gen_frame in generation_dic_copy.items():
        # Generation: one fuel column at a time, in place.
        for fuel in gen_frame.columns:
            offset = intermediary_var["generation"][abbr][fuel]
            gen_frame[fuel] = offset + gen_frame[fuel] * unit_var["generation"][abbr][fuel]
        consolidated_gen_data[abbr] = gen_frame

        # Load: a new series, the load_dic frame itself is not changed.
        load_offset = intermediary_var["load"][abbr]['demand']
        consolidated_load_data[abbr] = load_offset + load_dic[abbr]['demand'] * unit_var["load"][abbr]['demand']

        consolidated_gen_data[abbr].to_csv(f"../Data Sources/output/Internal Sigma/Reconciled data/Generation/{abbr}.csv")
        consolidated_load_data[abbr].to_csv(f"../Data Sources/output/Internal Sigma/Reconciled data/Load/{abbr}.csv")

    # Transmission: one line column at a time, in place.
    for line in transmission_data_copy.columns:
        line_offset = intermediary_var["transmission"][line]
        transmission_data_copy[line] = line_offset + transmission_data_copy[line] * unit_var["transmission"][line]

    consolidated_transmission_data = transmission_data_copy
    consolidated_transmission_data.to_csv('../Data Sources/output/Internal Sigma/Reconciled data/Transmission/all_transmissions.csv')

    return(consolidated_gen_data, consolidated_load_data, consolidated_transmission_data)


In [15]:
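# Example call (generation() takes the load dictionary as its first argument):
# sigma(load(countries)[1], generation(load(countries)[1], countries)[1], cross_border(abbr_list)[1], abbr_list)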