## Notebook for processing files from Toktarova supplementary materials

In [None]:
import os

import pandas

In [None]:
import pycountry


def search_pycountry(country_name: str) -> str | None:
    """Look up ``country_name`` with pycountry's fuzzy search.

    Args:
        country_name: Free-text country name to search for.

    Returns:
        The ``alpha_2`` attribute of the first fuzzy-search hit, or ``None``
        when the lookup raises ``LookupError``.
    """
    try:
        # Indexing stays inside the try: an empty result raises IndexError,
        # which is itself a LookupError.
        best_match = pycountry.countries.search_fuzzy(country_name)[0]
    except LookupError:
        return None
    else:
        return best_match.alpha_2


def get_country_codes(country_names):
    """Translate each country name into a country code.

    Args:
        country_names: Iterable of country names.

    Returns:
        A list with one entry per input name, in the same order. Names for
        which ``search_pycountry`` returns a falsy value are recorded as the
        literal string ``"None"``.
    """
    return [search_pycountry(name) or "None" for name in country_names]

In [None]:
def get_correct_columns(df_current_year):
    """Build the column labels for a raw yearly frame.

    The row labelled ``1`` of ``df_current_year`` is read as the list of
    country names. Its first entry is discarded (presumably the label of the
    timestep column -- confirm against the CSV), the remaining names are
    converted with ``get_country_codes``, and a fixed set of positions is
    then overwritten by hand.

    Args:
        df_current_year: Raw frame as loaded from one of the yearly CSV files.

    Returns:
        A list starting with ``"timestep"`` followed by one country code per
        remaining entry of row ``1``.
    """
    # Codes assigned by hand. The keys are row positions in the name table
    # built below, so they depend on the column order of the input file.
    manual_codes = {
        21: "BA",
        26: "MM",
        28: "KH",
        31: "CV",
        38: "CD",
        67: "GW",
        81: "CI",
        88: "KP",
        89: "KR",
        118: "VU",
        121: "NE",  # misdetected country code
        151: "VC",
        153: "SZ",
        158: "TZ",
        163: "TR",
    }

    # Turn row 1 into a single-column table and discard its first entry.
    country_table = (
        pandas.DataFrame(df_current_year.loc[1])
        .reset_index(drop=True)
        .drop(index=0)
        .reset_index(drop=True)
    )
    country_table.columns = ["Country Name"]
    country_table["Country Code"] = get_country_codes(country_table["Country Name"])

    for row_position, iso_code in manual_codes.items():
        country_table.loc[row_position, "Country Code"] = iso_code

    return ["timestep", *country_table["Country Code"].values.tolist()]

In [None]:
def extract_data_for_year(year: int) -> pandas.DataFrame:
    """Load ``./comparison/Toktarova/{year}.csv`` and reshape it.

    The columns are relabelled with ``get_correct_columns``, the rows
    labelled 0-3 are removed, the ``timestep`` labels are converted to
    zero-based integers and a constant ``year`` column is inserted first.

    Args:
        year: Year whose CSV file should be loaded.

    Returns:
        The reshaped frame with columns ``year``, ``timestep`` and one column
        per country code.
    """
    df_current_year = pandas.read_csv(f"./comparison/Toktarova/{year}.csv")
    df_current_year.columns = get_correct_columns(df_current_year)

    # Drop the rows labelled 0-3 (presumably header/metadata rows; row 1 is
    # the one get_correct_columns reads the country names from).
    df_current_year = df_current_year.drop(index=[0, 1, 2, 3]).reset_index(drop=True)

    # Take the text before the first space, then the piece after the last
    # underscore, and shift that number from 1-based to 0-based.
    df_current_year["timestep"] = [
        int(timestep_label.split(" ")[0].split("_")[-1]) - 1
        for timestep_label in df_current_year["timestep"]
    ]

    df_current_year.insert(0, "year", year)

    return df_current_year

In [None]:
# File stems (text before the first ".") of every CSV in the data folder, sorted.
years_available = sorted(
    entry.split(".")[0]
    for entry in os.listdir("./comparison/Toktarova/")
    if entry.endswith(".csv")
)
print(years_available)

#### 2020

In [None]:
# Load and reshape the 2020 file, then show its dimensions and first rows.
extract_2020 = extract_data_for_year(2020)
print(extract_2020.shape)
extract_2020.head()

#### 2030

In [None]:
# Load and reshape the 2030 file, then show its dimensions and first rows.
extract_2030 = extract_data_for_year(2030)
print(extract_2030.shape)
extract_2030.head()

#### 2040

In [None]:
# Load and reshape the 2040 file, then show its dimensions and first rows.
extract_2040 = extract_data_for_year(2040)
print(extract_2040.shape)
extract_2040.head()

#### 2050

In [None]:
# Load and reshape the 2050 file, then show its dimensions and first rows.
extract_2050 = extract_data_for_year(2050)
print(extract_2050.shape)
extract_2050.head()

#### 2060

In [None]:
# Load and reshape the 2060 file, then show its dimensions and first rows.
extract_2060 = extract_data_for_year(2060)
print(extract_2060.shape)
extract_2060.head()

#### 2070

In [None]:
# Load and reshape the 2070 file, then show its dimensions and first rows.
extract_2070 = extract_data_for_year(2070)
print(extract_2070.shape)
extract_2070.head()

#### 2080

In [None]:
# Load and reshape the 2080 file, then show its dimensions and first rows.
extract_2080 = extract_data_for_year(2080)
print(extract_2080.shape)
extract_2080.head()

#### 2090

In [None]:
# Load and reshape the 2090 file, then show its dimensions and first rows.
extract_2090 = extract_data_for_year(2090)
print(extract_2090.shape)
extract_2090.head()

#### 2100

In [None]:
# Load and reshape the 2100 file, then show its dimensions and first rows.
extract_2100 = extract_data_for_year(2100)
print(extract_2100.shape)
extract_2100.head()

#### Combine years

In [None]:
# Stack the per-year frames on top of each other and renumber the rows from 0.
extract_all_years = pandas.concat(
    objs=[
        extract_2020,
        extract_2030,
        extract_2040,
        extract_2050,
        extract_2060,
        extract_2070,
        extract_2080,
        extract_2090,
        extract_2100,
    ],
    ignore_index=True,
)

In [None]:
# Dimensions (rows, columns) of the combined frame.
extract_all_years.shape

In [None]:
# Column dtypes of the combined frame before the integer conversion.
extract_all_years.dtypes

In [None]:
# Preview the first rows of the combined frame.
extract_all_years.head()

In [None]:
# Convert every column to integers by passing each value through int().
# The columns are iterated directly: tqdm only provided a progress bar and is
# never imported in this notebook, so wrapping the loop in it raised NameError.
for col in extract_all_years.columns:
    extract_all_years[col] = extract_all_years[col].apply(int)

In [None]:
# Column dtypes of the combined frame after the integer conversion.
extract_all_years.dtypes

In [None]:
# Preview the first rows again after the integer conversion.
extract_all_years.head()

In [None]:
# Largest value in the whole frame: per-column maxima, then the maximum of those.
extract_all_years.max().max()

In [None]:
# Persist the combined frame as a Parquet file next to the source CSVs.
extract_all_years.to_parquet(path="./comparison/Toktarova/all_years.parquet")

In [None]:
# Read the 2020 CSV as-is, without any relabelling or row removal.
df_current_year = pandas.read_csv(f"./comparison/Toktarova/{2020}.csv")

In [None]:
# Preview the first rows of the frame exactly as read from the CSV.
df_current_year.head()

In [None]:
# Inline version of the steps performed by extract_data_for_year, for 2020.
df_current_year = pandas.read_csv(f"./comparison/Toktarova/{2020}.csv")

# Row 1 as a single-column table (kept as a module-level name for inspection).
df_country_codes = pandas.DataFrame(df_current_year.loc[1]).reset_index(drop=True)
df_current_year.columns = get_correct_columns(df_current_year)

# Drop the rows labelled 0-3 and renumber the remaining rows from 0.
df_current_year = df_current_year.drop(index=[0, 1, 2, 3]).reset_index(drop=True)

# Text before the first space -> piece after the last "_" -> zero-based integer.
df_current_year["timestep"] = [
    int(timestep_label.split(" ")[0].split("_")[-1]) - 1
    for timestep_label in df_current_year["timestep"]
]

df_current_year.insert(0, "year", 2020)

In [None]:
# Display the processed 2020 frame.
df_current_year