# Data Preparation

In [6]:
import pandas as pd
import numpy as np
import glob

In [4]:
# Mapping from the country name (as used in the ENTSO-E file names and column
# headers) to the two-letter code used for the output column names.
countries = {
    'Austria': 'AT',
    'Belgium': 'BE',
    'Bulgaria': 'BG',
    'Switzerland': 'CH',
    'Czech Republic': 'CZ',
    'Germany': 'DE',
    'Denmark': 'DK',
    'Estonia': 'EE',
    'Spain': 'ES',
    'Finland': 'FI',
    'France': 'FR',
    'Greece': 'GR',
    'Hungary': 'HU',
    'Ireland': 'IE',
    'Italy': 'IT',
    'Lithuania': 'LT',
    'Latvia': 'LV',
    'Montenegro': 'ME',
    'Netherlands': 'NL',
    'Norway': 'NO',
    'Poland': 'PL',
    'Portugal': 'PT',
    'Serbia': 'RS',
    'Sweden': 'SE',
    'Slovenia': 'SI',
    'Slovakia': 'SK',
    'United Kingdom': 'UK',
}

# Country codes only, in the same order as the mapping above.
abbr_list = [*countries.values()]

## Making the first, second, third and fourth columns of the dataframe the day, month, year and time

In [8]:
# Build the shared Day/Month/Year/Time columns by slicing fixed character
# positions out of the 'Time (CET)' text column of one load file. Every other
# output dataframe copies these four columns from `data`.
temp = pd.read_csv('../Data Sources/ENTSO-E/2018/Load/Croatia.csv')
# NOTE(review): the end-of-interval slice starts at position 29. If the stamps
# look like 'DD.MM.YYYY HH:MM - DD.MM.YYYY HH:MM' this keeps the space in front
# of the end hour, giving a double space in 'Time' - confirm before changing,
# since downstream code may rely on the exact strings.
data = pd.DataFrame({
    'Day': temp['Time (CET)'].str.slice(0, 2),
    'Month': temp['Time (CET)'].str.slice(3, 5),
    'Year': temp['Time (CET)'].str.slice(6, 10),
    'Time': temp['Time (CET)'].str.slice(11, 16) + ' - ' + temp['Time (CET)'].str.slice(29, 35),
})

##

In [10]:
# In the country-specific Load dataset, Austria, Belgium, Germany, Hungary and the Netherlands report data every 15 minutes.
# Therefore, these countries have 35044 data points per year.
# The UK and Ireland report data every 30 minutes, hence these countries have 17522 data points per year.
# All the others report every hour and hence have 8761 data points per year.
# In the Generation dataset, the situation is the same as above, except that Belgium reports hourly data and hence has 8761 data points.
# In the Transmission dataset, all countries report data hourly except Germany, which reports every 15 minutes.
# Therefore, it is easiest if all the data are converted to hourly data.
# To do that, in the countries with 35044 data points the mean of every 4 successive data points is calculated.
# In the countries with 17522 data points the mean of every 2 successive data points is calculated.

def hourly_data(df):
    """Down-sample a year of sub-hourly readings to hourly means.

    The resolution is inferred from the number of rows:

    * 35044 rows -> every 4 consecutive rows are averaged,
    * 17522 rows -> every 2 consecutive rows are averaged,
    * any other length -> rows are grouped one-by-one (values are unchanged,
      but the result still goes through ``groupby(...).mean()``).

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Readings in chronological order. For a DataFrame only the numeric
        columns are averaged; text columns (such as a timestamp column) are
        left out of the result.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        Same kind of object as the input, indexed 0..n-1 by group number.
    """
    length = len(df.index)
    if length == 35044:
        divider = 4
    elif length == 17522:
        divider = 2
    else:
        divider = 1

    # Integer-dividing the row positions by `divider` gives one label per
    # output hour, e.g. for divider=4: [0, 0, 0, 0, 1, 1, 1, 1, 2, ...].
    # Rows sharing a label are averaged together.
    group_labels = np.arange(length) // divider
    grouped = df.groupby(group_labels)

    if isinstance(df, pd.DataFrame):
        # Be explicit about skipping text columns: recent pandas versions
        # raise a TypeError instead of silently dropping them.
        return grouped.mean(numeric_only=True)
    return grouped.mean()

# 1. Preparing Load Data

In [5]:
def load(countries):
    """Collect the hourly actual total load of every country in one dataframe.

    Parameters
    ----------
    countries : dict
        Maps country name to country code. The name selects the CSV file and,
        together with the code, the source column
        'Actual Total Load [MW] - {name} ({code})'.

    Returns
    -------
    pandas.DataFrame
        The Day/Month/Year/Time columns copied from the module-level `data`
        frame, followed by one column per country named by its code
        (e.g. 'DE').
    """
    stamp_columns = ['Day', 'Month', 'Year', 'Time']

    load_data = pd.DataFrame()
    load_data[stamp_columns] = data[stamp_columns]

    for country, abbr in countries.items():
        source_column = f'Actual Total Load [MW] - {country} ({abbr})'
        # Read the country's file and reduce it to hourly resolution before
        # picking out the load column.
        hourly = hourly_data(pd.read_csv(f'../Data Sources/ENTSO-E/2018/Load/{country}.csv'))
        load_data[f'{abbr}'] = hourly[source_column]

    return load_data

# 2. Preparing Generation Data

In [22]:
def generation(countries):
    """Collect hourly generation per fuel type, plus a total, for every country.

    For each country the generation CSV is read, reduced to hourly values with
    `hourly_data`, and every remaining column is copied into the result under
    the name '{code} - {fuel}', where the fuel name is the source column
    header with its last 26 characters removed (e.g. for Germany,
    'Biomass  - Actual Aggregated [MW]' becomes 'DE - Biomass'). A
    '{code} - Total' column then holds the row-wise sum of that country's
    columns.

    Parameters
    ----------
    countries : dict
        Maps country name (used in the file name) to country code (used as the
        column prefix).

    Returns
    -------
    pandas.DataFrame
        The Day/Month/Year/Time columns copied from the module-level `data`
        frame, followed by the per-fuel and total columns of each country.
    """
    generation_data = pd.DataFrame()
    generation_data[['Day', 'Month', 'Year', 'Time']] = data[['Day', 'Month', 'Year', 'Time']]

    for country, abbr in countries.items():
        temp = pd.read_csv(f'../Data Sources/ENTSO-E/2018/Generation/{country}.csv', low_memory=False)
        temp = hourly_data(temp)

        # Remember exactly which columns belong to this country so the total
        # is built from them alone. Matching on the country code as a
        # substring could also pick up unrelated columns whose name happens
        # to contain the same two letters.
        country_columns = []
        for fuel in temp.columns:
            # Strip the fixed-width suffix of the source header.
            # NOTE(review): headers whose suffix is not exactly 26 characters
            # long keep or lose a few characters here - confirm against data.
            edited_fuel = fuel[:-26]
            column = f'{abbr} - {edited_fuel}'
            generation_data[column] = temp[fuel]
            if column not in country_columns:
                country_columns.append(column)

        generation_data[f'{abbr} - Total'] = generation_data[country_columns].sum(axis=1)

    return generation_data

# 3. Preparing Cross-border Transmission Data

In [16]:
def cross_border(abbr_list):
    """Build per-border hourly flows and the net cross-border flow per country.

    Every CSV in the transmission directory whose file name starts with one of
    the given country codes is read. The first two characters of the file name
    are taken as the reporting country and characters 3-4 (0-based) as the
    neighbour (presumably files are named like 'DE-AT...'; verify against the
    data directory). For each file:

    * 'n/e' entries and missing values are replaced by 0,
    * the third column (position 2) is stored as '{country} -- > {neighbour}',
    * the second column (position 1) is stored as '{country} < -- {neighbour}',
    * both are converted to numbers and reduced to hourly means with
      `hourly_data`.

    The net value of a country is the sum of its '>' columns minus the sum of
    its '<' columns, so flows labelled '>' count as positive and flows
    labelled '<' as negative.

    Parameters
    ----------
    abbr_list : iterable of str
        Two-letter country codes to process.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``cross_border_data``: Day/Month/Year/Time plus one net-flow column
        per country code.
        ``transmission_data``: Day/Month/Year/Time plus the two directional
        columns of every border that was read.
    """
    # Local import so this cell does not depend on a change to the import cell.
    import os

    time_columns = ['Day', 'Month', 'Year', 'Time']
    transmission_data = data[time_columns]
    cross_border_data = pd.DataFrame()
    cross_border_data[time_columns] = data[time_columns]

    csvs = glob.glob("../Data Sources/ENTSO-E/2018/Transmission/*.csv")

    # Derive the two country codes from the file name itself rather than from
    # fixed offsets into the full path, so the directory can change without
    # silently breaking the matching.
    borders = []
    for path in csvs:
        file_name = os.path.basename(path)
        borders.append((path, file_name[:2], file_name[3:5]))

    for abbr in abbr_list:
        transmission_data_temp = pd.DataFrame()
        for path, origin, neighbour in borders:
            if origin != abbr:
                continue

            temp = pd.read_csv(path)
            temp = temp.replace(['n/e', np.nan], 0)

            # pd.to_numeric covers numbers that were stored as text; some
            # borders are reported sub-hourly, hence hourly_data.
            transmission_data_temp[f'{origin} -- > {neighbour}'] = hourly_data(pd.to_numeric(temp.iloc[:, 2]))
            transmission_data_temp[f'{origin} < -- {neighbour}'] = hourly_data(pd.to_numeric(temp.iloc[:, 1]))

        transmission_data = pd.concat([transmission_data, transmission_data_temp], axis=1)
        # Net flow: '>' columns positive, '<' columns negative.
        cross_border_data[f'{abbr}'] = transmission_data_temp.filter(like='>').sum(axis=1) + (transmission_data_temp.filter(like='<').sum(axis=1)) * -1

    return cross_border_data, transmission_data

## 3.1 Calculating net imports/exports based on generation and load data 

In [8]:
def import_export_using_load_gen(load_data, generation_data, abbr_list):
    """Compute generation minus load for each country and time step.

    Parameters
    ----------
    load_data : pandas.DataFrame
        Must contain the Day/Month/Year/Time columns and one '{code}' column
        per requested country.
    generation_data : pandas.DataFrame
        Must contain one '{code} - Total' column per requested country.
    abbr_list : iterable of str
        Country codes to process.

    Returns
    -------
    pandas.DataFrame
        Day/Month/Year/Time copied from `load_data`, followed by one '{code}'
        column per country holding ``generation total - load``. Positive
        values mean generation exceeds load. No 'Net Export' / 'Net Import'
        label column is produced.
    """
    stamp_columns = ['Day', 'Month', 'Year', 'Time']

    balance = pd.DataFrame()
    balance[stamp_columns] = load_data[stamp_columns]

    for code in abbr_list:
        produced = generation_data[f'{code} - Total']
        consumed = load_data[f'{code}']
        balance[f'{code}'] = produced.sub(consumed)

    return balance


## 3.2 Calculating net imports/exports based on cross-border transmission data 

In [9]:
def import_export_using_crossborder(crossborder_data, abbr_list):
    """Extract the net cross-border flow columns of the requested countries.

    Parameters
    ----------
    crossborder_data : pandas.DataFrame
        Must contain the Day/Month/Year/Time columns and one '{code}' column
        per requested country.
    abbr_list : iterable of str
        Country codes to copy.

    Returns
    -------
    pandas.DataFrame
        Day/Month/Year/Time followed by the '{code}' columns, copied unchanged
        from `crossborder_data` in the order given by `abbr_list`. No
        'Net Export' / 'Net Import' label column is produced.
    """
    stamp_columns = ['Day', 'Month', 'Year', 'Time']

    net_flows = pd.DataFrame()
    net_flows[stamp_columns] = crossborder_data[stamp_columns]

    for code in abbr_list:
        column = f'{code}'
        net_flows[column] = crossborder_data[column]

    return net_flows
