1.处理世界CO2的排放量和热力图，以及相关系数的计算

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import os

plt.rcParams["font.family"] = ["SimHei", "Microsoft YaHei", "PingFang SC"]
sns.set_style("whitegrid")
plt.rcParams["axes.unicode_minus"] = False
plt.switch_backend('Agg') 



1. 数据加载与初步处理
API_NY.GDP.MKTP.KD_DS2_en_csv_v2_2160...：世界银行GDP数据
API_SP.POP.TOTL_DS2_en_csv_v2_246068：世界银行总人口数据
owid-energy-data：Our World in Data能源数据集

In [64]:
TARGET_START = 2010
TARGET_END = 2025

SAVE_DIR = "./data/"
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)
print(f" 保存目录：{os.path.abspath(SAVE_DIR)}")


 保存目录：d:\OneDrive - HKUST (Guangzhou)\桌面\icslab\data


记录下来打开的文件和保存的地方

In [65]:
energy_df = pd.read_csv(os.path.join(SAVE_DIR, "owid-energy-data.csv"))
gdp_df = pd.read_csv(os.path.join(SAVE_DIR, "API_NY.GDP.MKTP.KD_DS2_en_csv_v2_216051.csv"), skiprows=4)
pop_df = pd.read_csv(os.path.join(SAVE_DIR, "API_SP.POP.TOTL_DS2_en_csv_v2_246068.csv"), skiprows=4)


energy_clean = energy_df[
    ["country", "year", "primary_energy_consumption", "greenhouse_gas_emissions"]
].rename(
    columns={
        "country": "Country",
        "year": "Year",
        "primary_energy_consumption": "Total_Energy",
        "greenhouse_gas_emissions": "Total_GHG"
    }
)

energy_clean["Year"] = pd.to_numeric(energy_clean["Year"], errors="coerce")
energy_clean = energy_clean[
    (energy_clean["Year"] >= TARGET_START) & 
    (energy_clean["Year"] <= TARGET_END)
].dropna(subset=["Total_Energy", "Total_GHG"])
existing_countries = energy_clean["Country"].unique()

missing_count = 20 - len(existing_countries)
if missing_count > 0:
    backup_available = [c for c in BACKUP_COUNTRIES if c in energy_df["country"].unique()]
    supplement_countries = backup_available[:missing_count]
    supplement_data = energy_df[
        (energy_df["country"].isin(supplement_countries)) &
        (energy_df["year"] >= TARGET_START) & 
        (energy_df["year"] <= TARGET_END)
    ].rename(columns={"country": "Country", "year": "Year"})
    energy_clean = pd.concat([energy_clean, supplement_data], ignore_index=True)

selected_countries = list(energy_clean["Country"].unique())[:20]


上面初步处理清洗能源数据.接下来准备清洗数据，并且挑选其中的国家，优先选择合适的国家.接下来则是处理GDP和人口数据

In [66]:

gdp_clean = gdp_df.melt(
    id_vars=["Country Name"],
    var_name="Year",
    value_name="Total_GDP"
).rename(columns={"Country Name": "Country"})
gdp_clean["Year"] = pd.to_numeric(gdp_clean["Year"], errors="coerce")
gdp_clean = gdp_clean[
    (gdp_clean["Year"] >= TARGET_START) & 
    (gdp_clean["Year"] <= TARGET_END)
].dropna(subset=["Year", "Total_GDP"]) #去除之间

pop_clean = pop_df.melt(
    id_vars=["Country Name"],
    var_name="Year",
    value_name="Total_Population"
).rename(columns={"Country Name": "Country"})
pop_clean["Year"] = pd.to_numeric(pop_clean["Year"], errors="coerce")
pop_clean = pop_clean[
    (pop_clean["Year"] >= TARGET_START) & 
    (pop_clean["Year"] <= TARGET_END)
].dropna(subset=["Year", "Total_Population"])


final_df = pd.merge(energy_clean, gdp_clean, on=["Country", "Year"], how="outer")
final_df = pd.merge(final_df, pop_clean, on=["Country", "Year"], how="outer")

final_df["Per_Capita_Energy"] = final_df["Total_Energy"] / final_df["Total_Population"].replace(0, 1)
final_df["Per_Capita_GHG"] = final_df["Total_GHG"] / final_df["Total_Population"].replace(0, 1)
final_df["Per_Capita_GDP"] = final_df["Total_GDP"] / final_df["Total_Population"].replace(0, 1)
final_df = final_df.fillna(0)

final_df.to_csv(os.path.join(SAVE_DIR, "final_cleaned_data.csv"), index=False)
print(f"国家数：{final_df['Country'].nunique()}数据量：{final_df.shape}")

国家数：315数据量：(4664, 9)



Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



去除之间不合理的数据
GDP数据处理表格宽表,人口数据的处理,数据合并,计算人均数值,保存清洗之后的数据.

In [67]:

full_trend = final_df[final_df["Year"].between(2010, 2020)].copy()
full_trend = full_trend.sort_values(by=["Country", "Year"])

full_trend[["Per_Capita_Energy", "Per_Capita_GHG"]] = full_trend.groupby("Country")[
    ["Per_Capita_Energy", "Per_Capita_GHG"]
].transform(lambda x: x.interpolate(method="linear")).fillna(0)

global_trend = full_trend.groupby("Year")[["Per_Capita_Energy", "Per_Capita_GHG"]].mean().reset_index()
global_trend["Per_Capita_Energy"] = global_trend["Per_Capita_Energy"] * 10000 

global_trend["Per_Capita_GHG"] = global_trend["Per_Capita_GHG"] * 10000

plt.figure(figsize=(8, 5), dpi=150)
ax1 = plt.gca()

ax1.plot(
    global_trend["Year"], 
    global_trend["Per_Capita_Energy"], 
    color="#2A9D8F", 
    linewidth=2, 
    marker="o", 
    markersize=5,
    label="人均能源消耗（吨/人）"
)
ax1.set_xlabel("年份", fontsize=11)
ax1.set_ylabel("人均能源消耗（吨/人）", color="#2A9D8F", fontsize=10)
ax1.tick_params(axis="y", labelcolor="#2A9D8F", labelsize=9)
ax1.set_xticks(range(2010, 2021, 2))  
ax1.set_xticklabels(range(2010, 2021, 2), fontsize=9)
ax1.grid(alpha=0.2, axis="y")

ax1.spines["top"].set_visible(False)
ax1.spines["right"].set_visible(False)

ax2 = ax1.twinx()
ax2.plot(
    global_trend["Year"], 
    global_trend["Per_Capita_GHG"], 
    color="#E76F51", 
    linewidth=2, 
    marker="s", 
    markersize=5,
    label="人均温室气体排放（吨/人）"
)
ax2.set_ylabel("人均温室气体排放（吨/人）", color="#E76F51", fontsize=10)
ax2.tick_params(axis="y", labelcolor="#E76F51", labelsize=9)
ax2.spines["top"].set_visible(False)
ax2.spines["left"].set_visible(False)

ax1.legend(
    loc="upper right", 
    fontsize=9,
    bbox_to_anchor=(1, 1),
    frameon=False
)
plt.title("2010-2020 人均能源与温室气体排放趋势", fontsize=12, pad=10)

plt.subplots_adjust(top=0.9, bottom=0.15, left=0.1, right=0.9)

trend_path = os.path.join(SAVE_DIR, "trend_2010_2020_final.png")
plt.savefig(
    trend_path, 
    dpi=150, 
    facecolor="white",
    bbox_inches=None  
)
plt.close()
print(f"路径：{trend_path}")

路径：./data/trend_2010_2020_final.png



Glyph 24180 (\N{CJK UNIFIED IDEOGRAPH-5E74}) missing from font(s) Arial.


Glyph 20221 (\N{CJK UNIFIED IDEOGRAPH-4EFD}) missing from font(s) Arial.


Glyph 20154 (\N{CJK UNIFIED IDEOGRAPH-4EBA}) missing from font(s) Arial.


Glyph 22343 (\N{CJK UNIFIED IDEOGRAPH-5747}) missing from font(s) Arial.


Glyph 33021 (\N{CJK UNIFIED IDEOGRAPH-80FD}) missing from font(s) Arial.


Glyph 28304 (\N{CJK UNIFIED IDEOGRAPH-6E90}) missing from font(s) Arial.


Glyph 28040 (\N{CJK UNIFIED IDEOGRAPH-6D88}) missing from font(s) Arial.


Glyph 32791 (\N{CJK UNIFIED IDEOGRAPH-8017}) missing from font(s) Arial.


Glyph 65288 (\N{FULLWIDTH LEFT PARENTHESIS}) missing from font(s) Arial.


Glyph 21544 (\N{CJK UNIFIED IDEOGRAPH-5428}) missing from font(s) Arial.


Glyph 65289 (\N{FULLWIDTH RIGHT PARENTHESIS}) missing from font(s) Arial.


Glyph 28201 (\N{CJK UNIFIED IDEOGRAPH-6E29}) missing from font(s) Arial.


Glyph 23460 (\N{CJK UNIFIED IDEOGRAPH-5BA4}) missing from font(s) Arial.


Glyph 27668 (\N{CJK UNI

加入趋势图,宽高比8,5,使用低dpi,有奇怪长空白,手动删除,(使用尽可能对比鲜明的颜色来对比)
人均温室气体排放,人均能源消耗,双坐标轴,存储路径

In [68]:
ghg_start = final_df[
    (final_df["Year"] == 2010) & 
    (final_df["Per_Capita_GHG"].notna())
][["Country", "Per_Capita_GHG"]].rename(columns={"Per_Capita_GHG": "GHG_2010"})

ghg_end = final_df[
    (final_df["Year"] == 2020) & 
    (final_df["Per_Capita_GHG"].notna())
][["Country", "Per_Capita_GHG"]].rename(columns={"Per_Capita_GHG": "GHG_2020"})

ghg_change = pd.merge(ghg_start, ghg_end, on="Country", how="inner")
ghg_change["Change_Amount"] = (ghg_change["GHG_2020"] - ghg_change["GHG_2010"]) * 10000

ghg_change = ghg_change.reindex(ghg_change["Change_Amount"].abs().sort_values(ascending=False).index).head(10)

if len(ghg_change) > 0:

    plt.figure(figsize=(9, 6), dpi=150)
    colors = ["#F4A261" if x > 0 else "#2A9D8F" for x in ghg_change["Change_Amount"]]
    bars = plt.bar(
        ghg_change["Country"],
        ghg_change["Change_Amount"],
        color=colors,
        width=0.7,
        alpha=0.9
    )

    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2,
            height + (0.2 if height > 0 else -0.2),
            f"{height:.1f}",
            ha="center",
            fontsize=8
        )
    
    plt.title("2010-2020 人均温室气体排放变化（Top10）", fontsize=12, pad=10)
    plt.xlabel("国家", fontsize=11)
    plt.ylabel("变化量（吨/人）", fontsize=11)
    plt.axhline(y=0, color="#666", linestyle="--")
    plt.xticks(rotation=45, ha="right", fontsize=8)
    plt.subplots_adjust(top=0.9, bottom=0.2, left=0.1, right=0.95)

    bar_path = os.path.join(SAVE_DIR, "bar_2010_2020_final.png")
    plt.savefig(
        bar_path, 
        dpi=150, 
        facecolor="white",
        bbox_inches=None
    )
    plt.close()
    print(f"路径：{bar_path}显示国家数：{len(ghg_change)}")


路径：./data/bar_2010_2020_final.png显示国家数：10



Glyph 22269 (\N{CJK UNIFIED IDEOGRAPH-56FD}) missing from font(s) Arial.


Glyph 23478 (\N{CJK UNIFIED IDEOGRAPH-5BB6}) missing from font(s) Arial.


Glyph 21464 (\N{CJK UNIFIED IDEOGRAPH-53D8}) missing from font(s) Arial.


Glyph 21270 (\N{CJK UNIFIED IDEOGRAPH-5316}) missing from font(s) Arial.


Glyph 37327 (\N{CJK UNIFIED IDEOGRAPH-91CF}) missing from font(s) Arial.


Glyph 65288 (\N{FULLWIDTH LEFT PARENTHESIS}) missing from font(s) Arial.


Glyph 21544 (\N{CJK UNIFIED IDEOGRAPH-5428}) missing from font(s) Arial.


Glyph 20154 (\N{CJK UNIFIED IDEOGRAPH-4EBA}) missing from font(s) Arial.


Glyph 65289 (\N{FULLWIDTH RIGHT PARENTHESIS}) missing from font(s) Arial.


Glyph 22343 (\N{CJK UNIFIED IDEOGRAPH-5747}) missing from font(s) Arial.


Glyph 28201 (\N{CJK UNIFIED IDEOGRAPH-6E29}) missing from font(s) Arial.


Glyph 23460 (\N{CJK UNIFIED IDEOGRAPH-5BA4}) missing from font(s) Arial.


Glyph 27668 (\N{CJK UNIFIED IDEOGRAPH-6C14}) missing from font(s) Arial.


Glyph 20307 (\N{CJK UNI

柱状图,保留2010到2020年的数据(提取有效值),宽高比9,6,插入值


In [69]:
corr_df = final_df[(final_df["Year"] >= TARGET_START) & (final_df["Year"] <= TARGET_END)]

corr_matrix = corr_df[["Per_Capita_GDP", "Total_Population", "Per_Capita_Energy"]].corr()

# 绘图
plt.figure(figsize=(8, 6), dpi=300)
cmap = plt.cm.RdYlGn_r
im = plt.imshow(corr_matrix, cmap=cmap, vmin=-1, vmax=1)

# 颜色条与数值标注
plt.colorbar(im, label="相关系数（-1~1）", shrink=0.8)
for i in range(len(corr_matrix.columns)):
    for j in range(len(corr_matrix.columns)):
        text_color = "white" if abs(corr_matrix.iloc[i, j]) > 0.5 else "black"
        plt.text(
            j, i, 
            f"{corr_matrix.iloc[i, j]:.3f}",
            ha="center", va="center", 
            color=text_color, fontweight="bold"
        )

plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns, rotation=45, ha="right")
plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns)
plt.title(f"{TARGET_START}-{TARGET_END} 指标相关性", fontsize=14, pad=20)
plt.tight_layout()
# plt.show()

heatmap_path = os.path.join(SAVE_DIR, f"heatmap_{TARGET_START}_{TARGET_END}.png")
plt.savefig(heatmap_path, dpi=300)
plt.close()
print(f" 路径：{heatmap_path}")


Glyph 25351 (\N{CJK UNIFIED IDEOGRAPH-6307}) missing from font(s) Arial.


Glyph 26631 (\N{CJK UNIFIED IDEOGRAPH-6807}) missing from font(s) Arial.


Glyph 30456 (\N{CJK UNIFIED IDEOGRAPH-76F8}) missing from font(s) Arial.


Glyph 20851 (\N{CJK UNIFIED IDEOGRAPH-5173}) missing from font(s) Arial.


Glyph 24615 (\N{CJK UNIFIED IDEOGRAPH-6027}) missing from font(s) Arial.


Glyph 31995 (\N{CJK UNIFIED IDEOGRAPH-7CFB}) missing from font(s) Arial.


Glyph 25968 (\N{CJK UNIFIED IDEOGRAPH-6570}) missing from font(s) Arial.


Glyph 65288 (\N{FULLWIDTH LEFT PARENTHESIS}) missing from font(s) Arial.


Glyph 65289 (\N{FULLWIDTH RIGHT PARENTHESIS}) missing from font(s) Arial.


Glyph 25351 (\N{CJK UNIFIED IDEOGRAPH-6307}) missing from font(s) Arial.


Glyph 26631 (\N{CJK UNIFIED IDEOGRAPH-6807}) missing from font(s) Arial.


Glyph 30456 (\N{CJK UNIFIED IDEOGRAPH-76F8}) missing from font(s) Arial.


Glyph 20851 (\N{CJK UNIFIED IDEOGRAPH-5173}) missing from font(s) Arial.


Glyph 24615 (\N{CJK UNI

 路径：./data/heatmap_2010_2025.png


相关性热图,计算相关系数,绘制图案,标注颜色和数值

In [70]:
heatmap_df = final_df[(final_df["Year"] >= TARGET_START) & (final_df["Year"] <= TARGET_END)]

heatmap_df["Per_Capita_GHG_norm"] = (heatmap_df["Per_Capita_GHG"] - heatmap_df["Per_Capita_GHG"].min()) / (heatmap_df["Per_Capita_GHG"].max() - heatmap_df["Per_Capita_GHG"].min())

fig = px.choropleth(
    heatmap_df,
    locations="Country",
    locationmode="country names",
    color="Per_Capita_GHG_norm",  
    color_continuous_scale=px.colors.sequential.RdBu_r, 
    animation_frame="Year",
    title=f"2010-2025 人均温室气体排放热力图",
    labels={"Per_Capita_GHG_norm": "排放强度（归一化）"},
    width=1200, height=600
)

fig.update_layout(
    title_x=0.5,
    geo=dict(showframe=False, showcoastlines=True)
)

html_path = os.path.join(SAVE_DIR, f"heatmap_interactive_{TARGET_START}_{TARGET_END}.html")
fig.write_html(html_path)
print(f"路径：{html_path}")


print(f"文件列表（{SAVE_DIR}）：")
for file in os.listdir(SAVE_DIR):
    if file.endswith((".png", ".html", ".csv")):
        print(f"   - {file}")

路径：./data/heatmap_interactive_2010_2025.html
文件列表（./data/）：
   - API_NY.GDP.MKTP.KD_DS2_en_csv_v2_216051.csv
   - API_SP.POP.TOTL_DS2_en_csv_v2_246068.csv
   - bar_2010_2020_final.png
   - energy_cleaned.csv
   - final_cleaned_data.csv
   - gdp_cleaned.csv
   - heatmap_2010_2025.png
   - heatmap_interactive_2010_2025.html
   - interactive_multi_index_2010_2025.html
   - Metadata_Country_API_NY.GDP.MKTP.KD_DS2_en_csv_v2_216051.csv
   - Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2_246068.csv
   - Metadata_Indicator_API_NY.GDP.MKTP.KD_DS2_en_csv_v2_216051.csv
   - Metadata_Indicator_API_SP.POP.TOTL_DS2_en_csv_v2_246068.csv
   - owid-energy-data.csv
   - trend_2010_2020_final.png



The library used by the *country names* `locationmode` option is changing in an upcoming version. Country names in existing plots may not work in the new version. To ensure consistent behavior, consider setting `locationmode` to *ISO-3*.



交互式热力图,数据归一化,高对比度配色