In [52]:
import pandas as pd
from thefuzz import fuzz, process

In [53]:
# Load the reference list of standard country names (taken from GeoNames)
countries = pd.read_csv("../data/country.csv").loc[:, "Country"]

In [54]:
# Load the three datasets, renaming each generic "value" column to the name
# of the measure it holds so the columns stay distinct later on.
co2_df, climate_change_df, energy_df = (
    pd.read_csv(csv_path).rename(columns={"value": measure})
    for csv_path, measure in (
        ("../data/co2.csv", "co2"),
        ("../data/climate_change.csv", "climate_change"),
        ("../data/energy.csv", "energy"),
    )
)

In [55]:
def find_best_match_and_score(name):
    """Return the entry of ``countries`` closest to ``name`` and its score.

    The comparison uses ``fuzz.WRatio`` through ``process.extractOne``.

    Returns
    -------
    tuple
        ``(best_matching_name, match_score)``.
    """
    # extractOne may return more than two fields; only the first two are used.
    candidate, score, *_ = process.extractOne(name, countries, scorer=fuzz.WRatio)
    return candidate, score

In [56]:
def check_match_score(df, threshold=90):
    """List the distinct country names whose best fuzzy match scores below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``country`` column.
    threshold : int, default 90
        Names whose match score is strictly below this value are reported.

    Returns
    -------
    pandas.DataFrame
        One row per reported name with the columns ``country``,
        ``normalized_country`` and ``match_score``, sorted by ascending score.
    """
    # One row per distinct raw name, keeping the index of its first occurrence.
    unique_names = df.loc[:, ["country"]].drop_duplicates(subset="country")

    scored_pairs = unique_names["country"].apply(find_best_match_and_score)
    scored = pd.DataFrame(scored_pairs.to_list(), index=unique_names.index)
    unique_names[["normalized_country", "match_score"]] = scored

    below_threshold = unique_names["match_score"] < threshold
    return unique_names[below_threshold].sort_values(by="match_score")

In [57]:
# co2_df_country = check_match_score(co2_df)
# co2_df_country

In [58]:
# energy_df_country = check_match_score(energy_df)

In [59]:
# climate_change_df_country = check_match_score(climate_change_df)

In [60]:
# Labels found in the "country" column that denote regions or aggregates
# rather than individual countries.
not_country = [
    "International",
    "World",
    "Global",
    "OECD",
    "OPEC",
    "Europe",
    "Central & South America",
    "U.S. Pacific Islands",
    "Former Yugoslavia",
    "Persian Gulf",
    "North America",
    "IEO - Africa",
    "Other Americas",
    "Asia",
    "Hawaiian Trade Zone",
    "Eurasia",
    "Middle East",
]


# Remove region / aggregate rows
def remove_area_entries(df):
    """Drop the rows whose ``country`` value names a region instead of a country.

    A row is dropped when any entry of ``not_country`` occurs, ignoring case,
    as a substring of its ``country`` value. Rows whose ``country`` is not a
    string (for example a missing value) are kept instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``country`` column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the remaining rows, so later column
        assignments on the result do not touch (or warn about) ``df``.
    """
    # Lower-case the area labels once instead of once per row.
    lowered_areas = [area.lower() for area in not_country]

    def _is_area(name):
        # Non-string values cannot name an area.
        if not isinstance(name, str):
            return False
        lowered_name = name.lower()
        return any(area in lowered_name for area in lowered_areas)

    # astype(bool) keeps the mask boolean even when the frame is empty.
    is_area = df["country"].map(_is_area).astype(bool)
    return df.loc[~is_area].copy()


# Apply the region filter to every dataset
co2_df, energy_df, climate_change_df = (
    remove_area_entries(frame) for frame in (co2_df, energy_df, climate_change_df)
)

In [61]:
# Manually normalise the country names that the fuzzy matcher cannot resolve
def normalize_country_name(df, column_name="country"):
    """Replace known alternative country spellings with a fixed target name.

    Only values that are exactly equal to a key of the mapping are replaced;
    every other value is left as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw names.
    column_name : str, default "country"
        Column whose values are replaced.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the replacements applied. The frame passed in
        is not modified.
    """
    country_mapping = {
        "USA": "United States",
        "Côte d'Ivoire": "Ivory Coast",
        "Ryukyu Islands": "Japan",
        "State of Palestine": "Palestinian Territory",
        "Holy See": "Vatican",
        "Korea, Dem. People's Rep. of": "North Korea",
        "Korea, Rep. of": "South Korea",
        # The original key carried a trailing tab; it is kept in case the raw
        # data really contains that stray character.
        "Korea, Rep. of\t": "South Korea",
        "São Tomé and Príncipe, Dem. Rep. of": "Sao Tome and Principe",
        "Congo, Dem. Rep. of the": "Democratic Republic of the Congo",
        "Congo, Rep. of": "Republic of the Congo",
        "Congo-Brazzaville": "Republic of the Congo",
        "Congo-Kinshasa": "Democratic Republic of the Congo",
        "U.S. Territories": "United States",
        "Burma": "Myanmar",
    }
    # Work on a copy so the caller's frame (possibly a filtered slice) is
    # neither mutated nor the source of a SettingWithCopyWarning.
    normalized = df.copy()
    # Replace the country names according to the dictionary
    normalized[column_name] = normalized[column_name].replace(country_mapping)
    return normalized


# Apply the manual name normalisation to every dataset
co2_df, energy_df, climate_change_df = (
    normalize_country_name(frame)
    for frame in (co2_df, energy_df, climate_change_df)
)


In [62]:
def find_best_match(name):
    """Return the entry of ``countries`` that is closest to ``name``.

    The comparison uses ``fuzz.WRatio`` through ``process.extractOne``. No
    minimum score is applied here, so the best candidate is returned however
    weak the match is.
    """
    # Only the matched name is needed; the remaining fields are discarded.
    matched_name, *_ = process.extractOne(name, countries, scorer=fuzz.WRatio)
    return matched_name


def _match_distinct_names(names):
    """Fuzzy-match every distinct value of ``names`` once and map it back.

    The match depends only on the name itself, so matching the distinct
    values and reusing the result gives the same column as matching row by
    row, without repeating the expensive comparison for every row.
    """
    lookup = {raw_name: find_best_match(raw_name) for raw_name in names.unique()}
    return names.map(lookup)


# assign() builds a new frame instead of writing into a possibly filtered
# slice, which avoids SettingWithCopyWarning.
co2_df = co2_df.assign(country=_match_distinct_names(co2_df["country"]))
energy_df = energy_df.assign(country=_match_distinct_names(energy_df["country"]))
climate_change_df = climate_change_df.assign(
    country=_match_distinct_names(climate_change_df["country"])
)

In [63]:
# Outer-join the three datasets on (country, year) so that a row is kept
# whenever at least one dataset has a value for that pair.
# NOTE(review): several raw names are mapped to the same country upstream
# (e.g. "Ryukyu Islands" -> "Japan"), which may leave duplicate
# (country, year) keys and multiply rows in these joins. Confirm.
integrated_df = co2_df.merge(
    energy_df,
    on=["country", "year"],
    how="outer",
    suffixes=("_co2", "_energy"),
).merge(
    climate_change_df,
    on=["country", "year"],
    how="outer",
    suffixes=("", "_temperature"),
)
integrated_df

Unnamed: 0,country,year,co2,energy,climate_change
0,Afghanistan,1750,,,
1,Afghanistan,1751,,,
2,Afghanistan,1752,,,
3,Afghanistan,1753,,,
4,Afghanistan,1754,,,
...,...,...,...,...,...
62301,Zimbabwe,2019,10.262950,143153.470318,0.925
62302,Zimbabwe,2020,8.494503,120908.574685,0.389
62303,Zimbabwe,2021,10.203630,135489.014556,-0.125
62304,Zimbabwe,2022,10.424940,210167.937360,
