## Notebook for processing files from Toktarova supplementary materials

In [2]:
import pandas
import os

In [183]:
import pycountry


def search_pycountry(country_name: str) -> str | None:
    try:
        # Try to find the country
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.alpha_2
    except LookupError:
        return None


def get_country_codes(country_names):
    country_codes = []
    for country_name in country_names:
        found_country_code = search_pycountry(country_name)
        if not (found_country_code):
            # print("Not Found:", country_name)
            found_country_code = "None"
            pass

        country_codes.append(found_country_code)
    return country_codes

In [185]:
def get_correct_columns(df_current_year):
    
    df_country_codes = pandas.DataFrame(df_current_year.loc[1]).reset_index(drop=True)
    # Remove the first row as it contains the country names
    df_country_codes.drop(index=0, inplace=True)
    df_country_codes = df_country_codes.reset_index(drop=True)
    df_country_codes.columns = ['Country Name']
    df_country_codes['Country Code'] = get_country_codes(df_country_codes['Country Name'])
    print(df_current_year.head())

    df_country_codes.loc[21,"Country Code"] = "BA"
    df_country_codes.loc[26,"Country Code"] = "MM"
    df_country_codes.loc[28,"Country Code"] = "KH"
    df_country_codes.loc[31,"Country Code"] = "CV"
    df_country_codes.loc[38,"Country Code"] = "CD"
    df_country_codes.loc[67,"Country Code"] = "GW"
    df_country_codes.loc[81,"Country Code"] = "CI"
    df_country_codes.loc[88,"Country Code"] = "KP"
    df_country_codes.loc[89,"Country Code"] = "KR"
    df_country_codes.loc[118,"Country Code"] = "VU"
    df_country_codes.loc[151,"Country Code"] = "VC"
    df_country_codes.loc[153,"Country Code"] = "SZ"
    df_country_codes.loc[158,"Country Code"] = "TZ"
    df_country_codes.loc[163,"Country Code"] = "TR"

    return ["timestep"] + df_country_codes["Country Code"].values.tolist()

In [None]:
def extract_data_for_year(year: int) -> pandas.DataFrame:
    df_current_year = pandas.read_csv(f"./comparison/Toktarova/{year}.csv", delimiter=";")

    df_country_codes = pandas.DataFrame(df_current_year.loc[1]).reset_index(drop=True)
    df_current_year.columns = get_correct_columns(df_current_year)

    # Drop rows of different 
    df_current_year.drop(index=[0,1,2,3], inplace=True)
    df_current_year.reset_index(drop=True, inplace=True)

    print(df_current_year["timestep"])
    df_current_year["timestep"] = [int(str_timestep.split(" ")[0].split("_")[-1]) - 1 for str_timestep in df_current_year["timestep"]]

    df_current_year["year"] = year

    return df_current_year
    

In [190]:
test_2020 = extract_data_for_year(2020)

                           Countrynumber            1            2  \
0                            Countryname  Afghanistan      Albania   
1  annual electricity consumption in TWh   9,19186995  10,11532643   
2                   synthetic peak in MW  1532,950566  1639,319734   
3                           Hour_1 in MW  708,6741472  932,6792476   
4                           Hour_2 in MW  686,6610016  934,3454279   

             3            5                    7            9           10  \
0      Algeria       Angola  Antigua and Barbuda    Argentina      Armenia   
1  99,80658614   33,2079296          0,401350638  196,1392119  9,165144217   
2  17861,83257  5065,420414          58,90868234  26404,98624  1548,742169   
3  6078,437955  3431,831925          39,54652336  20992,73843  942,7135548   
4  5619,712832  3363,349699          37,78816308  20591,13511  902,0642141   

            11           12  ...             193            194          196  \
0    Australia      Austria  .

In [179]:
years_available = sorted([file.split(".")[0] for file in os.listdir("./comparison/Toktarova/") if file.endswith(".csv")])
print(years_available)

years_available.remove("2050")
years_available.remove("2060")
years_available.remove("2070")


['2020', '2030', '2040', '2050', '2060', '2070', '2080', '2090', '2100']


In [180]:
from tqdm import tqdm


for year in tqdm(years_available, desc="Processing Toktarova data per year"):
    print(f"Collecting Toktarova data for year {year}")
    df_current_year = extract_data_for_year(year)

Processing Toktarova data per year:   0%|                                                                                 | 0/6 [00:00<?, ?it/s]

Collecting Toktarova data for year 2020


  df_country_codes.loc[21,"Country Code"] = "BA"
Processing Toktarova data per year:  17%|████████████▏                                                            | 1/6 [00:01<00:09,  1.98s/it]

0          Hour_1 in MW
1          Hour_2 in MW
2          Hour_3 in MW
3          Hour_4 in MW
4          Hour_5 in MW
             ...       
8755    Hour_8756 in MW
8756    Hour_8757 in MW
8757    Hour_8758 in MW
8758    Hour_8759 in MW
8759    Hour_8760 in MW
Name: timestep, Length: 8760, dtype: object
Collecting Toktarova data for year 2030





ValueError: Length mismatch: Expected axis has 1 elements, new values have 15 elements