In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# --- Plot configuration ---------------------------------------------------
# SimHei supplies the CJK glyphs used by the Chinese titles/labels below.
# NOTE(review): if SimHei is not installed on the target machine the labels
# may render as empty boxes — confirm or add a fallback font.
plt.rcParams["font.family"] = ["SimHei"]
# Draw the minus sign as an ASCII hyphen; the Unicode minus is often missing
# from CJK fonts.
plt.rcParams["axes.unicode_minus"] = False

# --- Paths ----------------------------------------------------------------
DATA_DIR = "./data/"
RESULT_DIR = "./results/"
# Later cells save figures into RESULT_DIR; savefig does not create missing
# directories, so make sure it exists when running from a fresh checkout.
os.makedirs(RESULT_DIR, exist_ok=True)

# --- Load and filter the modelling sample -----------------------------------
final_df = pd.read_csv(os.path.join(DATA_DIR, "final_cleaned_data.csv"))

# Columns every downstream cell relies on; fail early with a readable message
# instead of an opaque KeyError deep inside a later cell.
MODEL_COLUMNS = ["Country", "Year", "Per_Capita_GDP", "Per_Capita_Energy", "Per_Capita_GHG"]
missing_columns = [col for col in MODEL_COLUMNS if col not in final_df.columns]
if missing_columns:
    raise KeyError(f"final_cleaned_data.csv is missing required columns: {missing_columns}")

# Keep 2010-2020 (both ends inclusive), only the modelling columns, and drop
# any row with a missing value. The result is a new frame; final_df is untouched.
model_df = final_df.loc[final_df["Year"].between(2010, 2020), MODEL_COLUMNS].dropna()
print(f"样本量：{model_df.shape[0]}")

建模使用的字段：人均GDP、年份、人均能源消耗（仅保留 2010–2020 年且无缺失值的样本）。
目的是预测人均 CO₂ 排放（对应数据列 `Per_Capita_GHG`）。

In [None]:

def predict_ghg():
    """Fit a quadratic trend to yearly mean per-capita GHG and forecast 2021-2030.

    Reads the module-level ``model_df``, averages ``Per_Capita_GHG`` over all
    rows of each ``Year`` (an unweighted mean), fits a degree-2 polynomial
    regression of that mean on the year, prints the in-sample R², and saves a
    chart of history, fitted curve and forecast to
    ``RESULT_DIR/ghg_prediction.png``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year`` (2021..2030) and ``Predicted_Per_Capita_GHG``.

    Raises
    ------
    ValueError
        If fewer than three distinct years are available; a quadratic through
        fewer points is not a meaningful fit.
    """
    global_avg = model_df.groupby("Year")[["Per_Capita_GHG"]].mean().reset_index()
    if len(global_avg) < 3:
        raise ValueError(
            f"Need at least 3 distinct years to fit a quadratic trend, got {len(global_avg)}"
        )

    X = global_avg[["Year"]].values
    y = global_avg["Per_Capita_GHG"].values

    # LinearRegression already fits an intercept, so the constant column that
    # PolynomialFeatures adds by default would be redundant.
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly.fit_transform(X)
    reg = LinearRegression()
    reg.fit(X_poly, y)

    future_years = np.arange(2021, 2031).reshape(-1, 1)
    future_ghg = reg.predict(poly.transform(future_years))

    # R² is computed on the same points used for fitting (goodness of fit,
    # not an out-of-sample accuracy estimate).
    y_pred = reg.predict(X_poly)
    r2 = r2_score(y, y_pred)
    print(f"回归模型R²：{r2:.3f}")

    # NOTE(review): labels say CO₂ while the column is Per_Capita_GHG, and the
    # unit here (吨/人) differs from the clustering chart — confirm which is right.
    fig, ax = plt.subplots(figsize=(10, 6), dpi=150)
    ax.scatter(X, y, color="#2A9D8F", label="2010-2020历史数据", s=50)
    ax.plot(X, y_pred, color="#E76F51", linewidth=2.5, label=f"2次多项式拟合（R²={r2:.3f}）")
    ax.plot(future_years, future_ghg, color="#F4A261", linestyle="--", linewidth=2.5, marker="o", label="2021-2030预测")

    ax.set_title("2010-2030 全球人均CO₂排放趋势预测", fontsize=13)
    ax.set_xlabel("年份", fontsize=11)
    ax.set_ylabel("人均CO₂排放（吨/人）", fontsize=11)
    # Label every second year across history + forecast to avoid crowding.
    ax.set_xticks(np.concatenate([X.flatten(), future_years.flatten()], axis=0)[::2])
    ax.tick_params(axis="x", labelsize=9)
    ax.grid(alpha=0.2)
    ax.legend(fontsize=10)
    fig.tight_layout()

    # savefig does not create missing directories.
    os.makedirs(RESULT_DIR, exist_ok=True)
    save_path = os.path.join(RESULT_DIR, "ghg_prediction.png")
    fig.savefig(save_path, dpi=150, facecolor="white")
    plt.close(fig)
    print(f"路径：{save_path}")

    return pd.DataFrame({"Year": future_years.flatten(), "Predicted_Per_Capita_GHG": future_ghg})

# Run the forecast, then print a banner followed by the projected values
# rounded to four decimals.
pred_result = predict_ghg()
print("\n===== 2021-2030人均CO₂排放预测结果 =====", pred_result.round(4), sep="\n")

以全球数据为准：按年份对各国的人均排放取平均，得到全球逐年均值。
回归模型：先根据 2010–2020 年的历史数据建立预测模型，再预测 2021–2030 年的对应数据并进行可视化；这里采用二次多项式回归，它也许比简单的线性回归更准确。
最后计算模型对应的 $R^2$。

In [None]:

def cluster_countries():
    """Group countries into three K-means clusters by 2020 energy use and emissions.

    Reads the module-level ``model_df``, takes the 2020 rows, averages
    ``Per_Capita_Energy`` and ``Per_Capita_GHG`` per country, standardises both
    features, runs K-means with k=3, and saves an annotated scatter plot to
    ``RESULT_DIR/country_clustering.png``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Country`` and ``Cluster`` (integer label 0..2). The label
        numbers carry no ordering; they are whatever K-means assigned.

    Raises
    ------
    ValueError
        If fewer than three countries have complete 2020 data.
    """
    n_clusters = 3
    features = ["Per_Capita_Energy", "Per_Capita_GHG"]

    country_data = (
        model_df[model_df["Year"] == 2020]
        .groupby("Country")[features]
        .mean()
        .reset_index()
    )
    if len(country_data) < n_clusters:
        raise ValueError(
            f"Need at least {n_clusters} countries with 2020 data to form "
            f"{n_clusters} clusters, got {len(country_data)}"
        )

    # Standardise so both features contribute equally to the Euclidean distance.
    X_cluster = StandardScaler().fit_transform(country_data[features])

    # n_init is set explicitly: its default differs between scikit-learn
    # releases, which would otherwise make results (and warnings) vary by version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    country_data["Cluster"] = kmeans.fit_predict(X_cluster)

    fig, ax = plt.subplots(figsize=(10, 6), dpi=150)
    colors = ["#2A9D8F", "#E76F51", "#F4A261"]
    for cluster in range(n_clusters):
        members = country_data[country_data["Cluster"] == cluster]
        ax.scatter(
            members["Per_Capita_Energy"],
            members["Per_Capita_GHG"],
            color=colors[cluster],
            label=f"分组{cluster+1}",
            s=80,
            alpha=0.8,
        )
        # Label each point with its country name, nudged slightly off the marker.
        for energy, ghg, name in zip(
            members["Per_Capita_Energy"], members["Per_Capita_GHG"], members["Country"]
        ):
            ax.text(energy + 0.001, ghg + 0.0001, name, fontsize=8)

    # NOTE(review): the unit in the y-label (百万吨/万人) differs from the one
    # used in the forecast chart (吨/人) — confirm which is correct.
    ax.set_title("2020年国家能源消耗-排放强度聚类结果", fontsize=13)
    ax.set_xlabel("人均能源消耗（万吨油当量/万人）", fontsize=11)
    ax.set_ylabel("人均CO₂排放（百万吨/万人）", fontsize=11)
    ax.grid(alpha=0.2)
    ax.legend(fontsize=10)
    fig.tight_layout()

    # savefig does not create missing directories.
    os.makedirs(RESULT_DIR, exist_ok=True)
    save_path = os.path.join(RESULT_DIR, "country_clustering.png")
    fig.savefig(save_path, dpi=150, facecolor="white")
    plt.close(fig)
    print(f"路径：{save_path}")

    return country_data[["Country", "Cluster"]]

# Run the K-means grouping and print the Country / Cluster assignment table.
cluster_result = cluster_countries()

print(cluster_result)

使用 K-means 方法：取 2020 年各国的数据（人均能源消耗、人均排放），标准化后进行聚类，将国家分为三类（K 值的选取方法有待商榷）。
最终生成对应的聚类散点图。
