# Internal Inconsistencies

In [27]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from IPython.display import display


In [6]:
import import_ipynb
import data_preperation as dp

# Country name -> two-letter code used in the ENTSO-E column and file names.
# The insertion order determines the row/bar order of every table and chart below.
countries = {
    'Austria': 'AT',
    'Belgium': 'BE',
    'Bulgaria': 'BG',
    'Switzerland': 'CH',
    'Czech Republic': 'CZ',
    'Croatia': 'HR',
    'Germany': 'DE',
    'Denmark': 'DK',
    'Estonia': 'EE',
    'Spain': 'ES',
    'Finland': 'FI',
    'France': 'FR',
    'Greece': 'GR',
    'Hungary': 'HU',
    'Ireland': 'IE',
    'Italy': 'IT',
    'Lithuania': 'LT',
    'Latvia': 'LV',
    'Montenegro': 'ME',
    'Netherlands': 'NL',
    'Norway': 'NO',
    'Poland': 'PL',
    'Portugal': 'PT',
    'Serbia': 'RS',
    'Sweden': 'SE',
    'Slovenia': 'SI',
    'Slovakia': 'SK',
    'United Kingdom': 'UK',
}

# Country codes only, in the same order as the dictionary above.
abbr_list = list(countries.values())

# Data prepared by the data_preperation notebook (imported as `dp`).
load_data = dp.load(countries)
generation_data = dp.generation(countries)
# Only the first element returned by dp.cross_border is used in this notebook.
import_export_using_crossborder_data = dp.cross_border(abbr_list)[0]

importing Jupyter notebook from data_preperation.ipynb


In [2]:
def plot(x1,x2,labels,color1,color2,label1,label2,title):
    """Draw a grouped bar chart that compares two series side by side.

    x1, x2         -- bar heights of the first and second series (one value per label)
    labels         -- tick labels on the x axis; their count fixes the number of groups
    color1, color2 -- fill colours of the first and second series
    label1, label2 -- legend entries of the first and second series
    title          -- chart title

    The figure is shown immediately; nothing is returned.
    """
    bar_width = 0.35
    half_width = bar_width / 2
    group_positions = np.arange(len(labels))

    plt.figure(figsize=(20,10))

    # The first series sits left of each tick, the second one right of it.
    series = (
        (-half_width, x1, color1, label1),
        (half_width, x2, color2, label2),
    )
    for shift, heights, fill, legend_entry in series:
        plt.bar(group_positions + shift, heights, bar_width,
                color=fill, edgecolor='black', label=legend_entry)

    plt.xlabel('Countries')
    plt.ylabel('Energy [GWh]')
    plt.title(title)
    plt.xticks(group_positions, labels)
    # Solid zero line so positive and negative bars are easy to tell apart.
    plt.axhline(y=0, color='black', linestyle='-')
    plt.legend()
    plt.grid()
    plt.show()


## 1. Calculating missing values in data files

In [45]:
def omit_dst(df):
    """Return a copy of `df` without the rows of the daylight-saving hour.

    All datasets have null values on 31st March from 02:00 - 03:00. Depending
    on how often a country reports, that hour covers 4, 2 or 1 rows; the
    reporting interval is recognised from the total number of rows:

      35044 rows -> rows 8552..8555 are dropped
      17522 rows -> rows 4276..4277 are dropped
      otherwise  -> row 2138 is dropped

    Rows are dropped by index label, so `df` is expected to carry the default
    0..n-1 index. The returned frame is re-indexed from 0.
    """
    # total row count -> (first row to drop, number of rows to drop)
    dst_rows_by_length = {
        35044: (8552, 4),
        17522: (4276, 2),
    }
    first_row, row_count = dst_rows_by_length.get(len(df), (2138, 1))

    dst_rows = range(first_row, first_row + row_count)
    return df.drop(dst_rows).reset_index(drop=True)

def calculate_missing_values(countries):
    """Count the null values in the ENTSO-E load, generation and transmission files.

    countries -- mapping of country name -> country code. The name selects the
                 load/generation CSV, the code selects the transmission CSVs.

    Returns a tuple (temp1, temp2):
      temp1 -- one row per country with the number of nulls in its load file
               and in its generation file
      temp2 -- one row per transmission file whose name starts with one of the
               country codes, labelled '<code> --> <code>', with its null count

    Every file is passed through `omit_dst` first, so the nulls caused by the
    daylight-saving hour are not counted.
    """
    import os  # local import: only needed to split the file name off the path

    # glob returns paths in arbitrary order; sorting keeps the table reproducible.
    transmission_paths = sorted(glob.glob("../Data Sources/ENTSO-E/Transmission/*.csv"))

    # Take the codes from the file name itself rather than from fixed offsets
    # of the whole path, so the result does not depend on the directory prefix.
    # Characters 0-1 hold the first code and characters 3-4 the second one
    # (one separator character in between).
    transmission_codes = []
    for path in transmission_paths:
        file_name = os.path.basename(path)
        transmission_codes.append((path, file_name[0:2], file_name[3:5]))

    load_missing_data = []
    generation_missing_data = []
    transmission_missing_data = []
    transmission_files = []

    for country, abbr in countries.items():
        load_df = omit_dst(pd.read_csv(f'../Data Sources/ENTSO-E/Load/{country}.csv'))
        generation_df = omit_dst(pd.read_csv(f'../Data Sources/ENTSO-E/Generation/{country}.csv', low_memory=False))
        load_missing_data.append(load_df.isnull().sum().sum())
        generation_missing_data.append(generation_df.isnull().sum().sum())

        # Only the transmission files whose first code is this country.
        for path, first_code, second_code in transmission_codes:
            if first_code == abbr:
                transmission_df = omit_dst(pd.read_csv(path))
                transmission_missing_data.append(transmission_df.isnull().sum().sum())
                transmission_files.append(f'{first_code} --> {second_code}')

    temp1 = pd.DataFrame(columns=pd.MultiIndex.from_product([['Load Data', 'Generation Data'], ['File', 'No. of missing data']]))
    temp2 = pd.DataFrame(columns=pd.MultiIndex.from_product([['Transmission Data'], ['File', 'No. of missing data']]))

    country_codes = list(countries.values())
    temp1['Load Data', 'File'] = country_codes
    temp1['Load Data', 'No. of missing data'] = load_missing_data
    temp1['Generation Data', 'File'] = country_codes
    temp1['Generation Data', 'No. of missing data'] = generation_missing_data

    temp2['Transmission Data', 'File'] = transmission_files
    temp2['Transmission Data', 'No. of missing data'] = transmission_missing_data
    return (temp1, temp2)


In [44]:
calculate_missing_values(countries)


Unnamed: 0_level_0,Load Data,Load Data,Generation Data,Generation Data
Unnamed: 0_level_1,File,No. of missing data,File,No. of missing data
0,AT,0,AT,0
1,BE,0,BE,7793
2,BG,1,BG,8760
3,CH,0,CH,8760
4,CZ,4,CZ,221
5,HR,0,HR,5687
6,DE,1,DE,20924
7,DK,0,DK,0
8,EE,29,EE,59
9,ES,0,ES,8860


Unnamed: 0_level_0,Transmission Data,Transmission Data
Unnamed: 0_level_1,File,No. of missing data
0,AT --> CH,0
1,AT --> CZ,0
2,AT --> DE,0
3,AT --> HU,0
4,AT --> IT,0
...,...,...
130,UK --> BE,0
131,UK --> FR,40
132,UK --> IE,0
133,UK --> NL,0


## 2. Calculating internal inconsistencies in ENTSO-E data

## 2.1 Comparison of net imports/exports based on generation and load data with net imports/exports based on cross-border transmission data

In [6]:
def entsoe_comparission_transmission_data(import_export_using_load_gen, import_export_using_crossborder_data, labels):
    """Compare annual net imports/exports derived from two ENTSO-E sources.

    import_export_using_load_gen         -- frame with one '<code> - [gen - load]' column per country
    import_export_using_crossborder_data -- frame with one '<code> - [exp - imp]' column per country
    labels                               -- country codes to compare

    For every country both columns are summed and divided by 1000 (the table
    and chart label the result as GWh). The table also holds the difference
    between the two sums as a percentage of the cross-border value. The table
    is displayed and a grouped bar chart is drawn; nothing is returned.
    """
    rows = []
    for code in labels:
        from_load_gen = import_export_using_load_gen[f'{code} - [gen - load]'].sum()/1000
        from_crossborder = import_export_using_crossborder_data[f'{code} - [exp - imp]'].sum()/1000
        # Difference relative to the cross-border value, in percent.
        pct_difference = round((from_crossborder - from_load_gen)*100/from_crossborder,2)
        rows.append([code, from_crossborder, from_load_gen, pct_difference])

    table_entsoe_comparission_transmission_data = pd.DataFrame(
        rows,
        columns=['Country','Value based on (exp - imp)[GWh]','Value based on (gen - load)[GWh]','% difference with respect to Value based on (exp - imp) [%]'])
    display(table_entsoe_comparission_transmission_data)

    # The chart uses the same yearly sums as the table.
    crossborder_data = [row[1] for row in rows]
    load_gen_data = [row[2] for row in rows]
    plot(load_gen_data, crossborder_data, labels, 'magenta', 'aqua',
         'based on generation/load data', 'based on cross-border export/import data', 'Comparission of annual net imports/exports based on generation/load & cross-border transmission data')
    

## 2.2 Comparison of data between ENTSO-E & Eurostat

### In the Eurostat data, the generation CSV contains both gross and net electricity production. The transmission CSV lists the imports and exports of each country separately.

### 2.2.1 Comparison of Generation data between ENTSO-E & Eurostat

In [7]:
def eurostat_comparission_gen_data(generation_data, countries):
    """Compare annual electricity generation in ENTSO-E with Eurostat.

    generation_data -- ENTSO-E frame with one '<code> - Total' column per country
    countries       -- mapping of country name -> country code

    Only countries whose name occurs in the 2019 rows of the Eurostat file are
    compared. The Eurostat figure is the first 'Value' of the country's
    'Net electricity production' row, converted with pd.to_numeric. The
    ENTSO-E figure is the column sum divided by 1000. The table also holds the
    difference as a percentage of the ENTSO-E figure. The table is displayed
    and a grouped bar chart is drawn; nothing is returned.
    """
    eurostat_all_years = pd.read_csv('../Data Sources/Eurostat/Gross & Net Generation .csv')
    eurostat_2019 = eurostat_all_years.loc[eurostat_all_years['TIME']==2019]

    rows = []
    for country, abbr in countries.items():
        # Skip countries that do not occur in the Eurostat data at all.
        if country not in eurostat_2019.values:
            continue

        is_net_production = (eurostat_2019['GEO'] == country) & (eurostat_2019['NRG_BAL'] == 'Net electricity production')
        eurostat_value = pd.to_numeric(eurostat_2019.loc[is_net_production, 'Value'].iloc[0])
        entsoe_value = generation_data[f'{abbr} - Total'].sum()/1000
        pct_difference = round((entsoe_value - eurostat_value)*100/entsoe_value,2)
        rows.append([abbr, entsoe_value, eurostat_value, pct_difference])

    table_eurostat_comparission_gen_data = pd.DataFrame(
        rows,
        columns=['Country','Gen data based on ENTSO-E[GWh]','Gen data based on Eurostat[GWh]','% difference with respect to Value based on ENTSO-E [%]'])
    display(table_eurostat_comparission_gen_data)

    # The chart uses the same values as the table.
    labels = [row[0] for row in rows]
    gen_entsoe = [row[1] for row in rows]
    gen_eurostat = [row[2] for row in rows]
    plot(gen_entsoe, gen_eurostat, labels, 'gold', 'lime',
         'ENTSO-E data', 'Eurostat data', 'Comparission of annual electricity generation data in ENTSO-E and Eurostat data')


### 2.2.2 Comparison of Load data between ENTSO-E & Eurostat

In [8]:
def eurostat_comparission_load_data(load_data, countries):
    """Compare annual electricity load in ENTSO-E with Eurostat.

    load_data -- ENTSO-E frame with one '<code>' column per country
    countries -- mapping of country name -> country code

    Only countries whose name occurs in the 2019 rows of the Eurostat file are
    compared. The Eurostat figure is the first 'Value' found for the country,
    converted with pd.to_numeric. The ENTSO-E figure is the column sum divided
    by 1000. The table also holds the difference as a percentage of the
    ENTSO-E figure. The table is displayed and a grouped bar chart is drawn;
    nothing is returned.
    """
    eurostat_all_years = pd.read_csv('../Data Sources/Eurostat/Load.csv')
    eurostat_2019 = eurostat_all_years.loc[eurostat_all_years['TIME']==2019]

    rows = []
    for country, abbr in countries.items():
        # Skip countries that do not occur in the Eurostat data at all.
        if country not in eurostat_2019.values:
            continue

        is_country = eurostat_2019['GEO'] == country
        eurostat_value = pd.to_numeric(eurostat_2019.loc[is_country,'Value'].iloc[0])
        entsoe_value = load_data[f'{abbr}'].sum()/1000
        pct_difference = round((entsoe_value - eurostat_value)*100/entsoe_value,2)
        rows.append([abbr, entsoe_value, eurostat_value, pct_difference])

    table_eurostat_comparission_load_data = pd.DataFrame(
        rows,
        columns=['Country','Load data based on ENTSO-E[GWh]','Load data based on Eurostat[GWh]','% difference with respect to Value based on ENTSO-E [%]'])
    display(table_eurostat_comparission_load_data)

    # The chart uses the same values as the table.
    labels = [row[0] for row in rows]
    load_entsoe = [row[1] for row in rows]
    load_eurostat = [row[2] for row in rows]
    plot(load_entsoe, load_eurostat, labels, 'lightcoral', 'deepskyblue',
         'ENTSO-E data', 'Eurostat data', 'Comparission of annual electricity load data in ENTSO-E and Eurostat data')


### 2.2.3 Comparison of Net Imports/Exports data between ENTSO-E & Eurostat

In [9]:
def eurostat_comparission_transmission_data(import_export_using_crossborder_data, countries):
    """Compare annual net exports/imports in ENTSO-E with Eurostat.

    import_export_using_crossborder_data -- ENTSO-E frame with one
                                            '<code> - [exp - imp]' column per country
    countries                            -- mapping of country name -> country code

    Only countries whose name occurs in the 2019 rows of the Eurostat file are
    compared. The Eurostat figure is the first 'Exports' value minus the first
    'Imports' value of the country, each converted with pd.to_numeric. The
    ENTSO-E figure is the column sum divided by 1000. The table also holds the
    difference as a percentage of the ENTSO-E figure. The table is displayed
    and a grouped bar chart is drawn; nothing is returned.
    """
    eurostat_all_years = pd.read_csv('../Data Sources/Eurostat/Transmission.csv')
    eurostat_2019 = eurostat_all_years.loc[eurostat_all_years['TIME']==2019]

    rows = []
    for country, abbr in countries.items():
        # Skip countries that do not occur in the Eurostat data at all.
        if country not in eurostat_2019.values:
            continue

        is_export = (eurostat_2019['GEO'] == country) & (eurostat_2019['NRG_BAL'] == 'Exports')
        exports = pd.to_numeric(eurostat_2019.loc[is_export, 'Value'].iloc[0])
        is_import = (eurostat_2019['GEO'] == country) & (eurostat_2019['NRG_BAL'] == 'Imports')
        imports = pd.to_numeric(eurostat_2019.loc[is_import, 'Value'].iloc[0])
        eurostat_value = exports - imports

        entsoe_value = import_export_using_crossborder_data[f'{abbr} - [exp - imp]'].sum()/1000
        pct_difference = round((entsoe_value - eurostat_value)*100/entsoe_value,2)
        rows.append([abbr, entsoe_value, eurostat_value, pct_difference])

    table_eurostat_comparission_transmission_data = pd.DataFrame(
        rows,
        columns=['Country','Net imports/exports data based on ENTSO-E[GWh]','Net imports/exports data based on Eurostat[GWh]','% difference with respect to Value based on ENTSO-E [%]'])
    display(table_eurostat_comparission_transmission_data)

    # The chart uses the same values as the table.
    labels = [row[0] for row in rows]
    transmission_entsoe = [row[1] for row in rows]
    transmission_eurostat = [row[2] for row in rows]
    plot(transmission_entsoe, transmission_eurostat, labels, 'magenta', 'aqua',
         'ENTSO-E data', 'Eurostat data', 'Comparission of annual electricity net export/import data in ENTSO-E and Eurostat data')
