In [1]:
import chardet

INPUT_ATHLETES = r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\athletes_1.csv"
INPUT_SPORTS = r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\2025_Problem_C_Data\summerOly_programs.csv"

def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
    return result['encoding']

# 示例用法
print("运动员文件编码:", detect_encoding(INPUT_ATHLETES))
print("运动分类文件编码:", detect_encoding(INPUT_SPORTS))

运动员文件编码: utf-8
运动分类文件编码: Windows-1252


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import gc

# ========================================================
# 配置参数
# ========================================================
INPUT_ATHLETES = r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\athletes_1.csv"
INPUT_SPORTS = r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\2025_Problem_C_Data\summerOly_programs.csv"
OUTPUT_FILE = r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\athletes_with_scores.csv"

athletes_encoding = detect_encoding(INPUT_ATHLETES)
sports_encoding = detect_encoding(INPUT_SPORTS)

GAMMA_MAP = {
    "Athletics": 0.85, "Swimming": 0.87, "Weightlifting": 0.86,
    "Equestrian": 0.93, "Archery": 0.92, "Basketball": 0.88,
    "Football": 0.89, "_default": 0.88
}
YEAR_RANGE = (1896, 2024)

# ========================================================
# 数据预处理（增强校验）
# ========================================================
def load_and_preprocess():
    """加强数据校验的预处理"""
    # 加载运动员数据
    athletes = pd.read_csv(
        INPUT_ATHLETES,
        encoding=athletes_encoding,
        dtype={
            "Year": "int16",
            "Total": "int8",
            "Gold": "int8",
            "Silver": "int8",
            "Bronze": "int8"
        },
        usecols=["Name", "NOC", "Year", "Sport", "Total", "Gold", "Silver", "Bronze"]
    )
    
    # 数据清洗
    athletes = athletes[
        (athletes["Year"].between(*YEAR_RANGE)) &
        (athletes["Total"] >= 0) &
        (athletes["Gold"] >= 0) &
        (athletes["Silver"] >= 0) &
        (athletes["Bronze"] >= 0)
    ].dropna(subset=["Name", "NOC", "Sport"])
    
    # 加载运动分类数据
    sports = pd.read_csv(INPUT_SPORTS, 
                       usecols=["Sport", "Discipline"], 
                       encoding=sports_encoding).drop_duplicates().dropna()
    
    # 合并数据
    df = pd.merge(athletes, sports, on="Sport", how="left")
    df["Discipline"] = df["Discipline"].fillna("Other")
    
    return df[["Name", "NOC", "Year", "Sport", "Total", "Gold", "Silver", "Bronze"]]

# ========================================================
# 核心计算模块（内存优化版）
# ========================================================
def calculate_athlete_scores(group):
    """计算运动员得分（保持原始姓名）"""
    try:
        if group.empty:
            return pd.DataFrame()
            
        # 按年份排序
        group = group.sort_values("Year")
        
        # 使用numpy数组优化
        years = group["Year"].values.astype('int16')
        totals = group["Total"].values.astype('int8')
        golds = group["Gold"].values.astype('int8')
        silvers = group["Silver"].values.astype('int8')
        bronzes = group["Bronze"].values.astype('int8')
        sport = group["Sport"].iloc[0]
        
        gamma = GAMMA_MAP.get(sport, GAMMA_MAP["_default"])
        scores = []
        
        for i in range(len(years)):
            current_year = years[i]
            
            # 历史数据索引
            valid_indices = np.where(years[:i] < current_year)[0]
            
            if len(valid_indices) > 0:
                time_diff = current_year - years[valid_indices]
                decay_weights = np.power(gamma, time_diff)
                
                hist_total = np.dot(totals[valid_indices], decay_weights)
                hist_gold = np.dot(golds[valid_indices], decay_weights)
                hist_silver = np.dot(silvers[valid_indices], decay_weights)
                hist_bronze = np.dot(bronzes[valid_indices], decay_weights)
            else:
                hist_total = hist_gold = hist_silver = hist_bronze = 0.0
                
            # 计算得分
            score = (
                0.4 * (hist_total + totals[i]) +
                0.3 * (hist_gold + golds[i]) +
                0.2 * (hist_silver + silvers[i]) +
                0.1 * (hist_bronze + bronzes[i])
            )
            scores.append(score)
        
        return pd.DataFrame({
            "Name": group["Name"].values,  # 保持原始姓名
            "NOC": group["NOC"].values,
            "Year": years,
            "Sport": sport,
            "adjusted_score": scores
        })
        
    except Exception as e:
        print(f"处理 {group['Name'].iloc[0]} 时出错: {str(e)}")
        return pd.DataFrame()

# ========================================================
# 主处理流程（增强排序逻辑）
# ========================================================
def main():
    # 加载数据
    print("正在加载数据...")
    df = load_and_preprocess()
    if df.empty:
        print("数据加载失败，请检查输入文件")
        return
    
    # 分组处理
    print("正在计算运动员得分...")
    results = []
    for name, group in tqdm(df.groupby("Name"), desc="处理运动员", unit="athlete"):
        result = calculate_athlete_scores(group)
        if not result.empty:
            results.append(result)
        
        # 每处理100个运动员清理内存
        if len(results) % 100 == 0:
            _ = gc.collect()
    
    # 合并结果
    print("正在合并结果...")
    if not results:
        print("没有有效结果可供保存")
        return
        
    processed_df = pd.concat(results, ignore_index=True)
    
    # 归一化处理
    scaler = MinMaxScaler()
    processed_df["score"] = scaler.fit_transform(processed_df[["adjusted_score"]])
    
    # 合并原始数据
    final_df = pd.merge(
        df,
        processed_df,
        on=["Name", "NOC", "Year", "Sport"],
        how="left"
    )
    
    # 按国家、年份、得分排序
    final_df = final_df.sort_values(
        by=["NOC", "Year", "score"], 
        ascending=[True, True, False]
    )
    
    # 保存结果
    final_df.to_csv(OUTPUT_FILE, index=False)
    print(f"数据已保存至 {OUTPUT_FILE}")
    
    # 可视化示例
    visualize_results(final_df)

# ========================================================
# 可视化模块（按国家展示）
# ========================================================
def visualize_results(df):
    """生成国家维度可视化图表"""
    plt.figure(figsize=(16, 9))
    
    # 选择示例国家
    sample_nocs = ['USA', 'CHN', 'RUS', 'GER', 'JPN']
    
    for noc in sample_nocs:
        country_data = df[df["NOC"] == noc]
        if not country_data.empty:
            # 计算每届平均分
            year_avg = country_data.groupby("Year")["score"].mean().reset_index()
            plt.plot(year_avg["Year"], year_avg["score"], 
                    marker="o", linestyle="-", linewidth=2, 
                    markersize=8, label=noc)
    
    plt.title("National Performance Trend (Average Score)", fontsize=14)
    plt.xlabel("Olympic Year", fontsize=12)
    plt.ylabel("Average Score", fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.legend(title="National Olympic Committee", fontsize=10)
    plt.xticks(np.arange(YEAR_RANGE[0], YEAR_RANGE[1]+1, 8), rotation=45)
    
    # 保存图表
    plt.tight_layout()
    plt.savefig(r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\national_performance_trend.png", dpi=300)
    plt.close()
    print("国家维度可视化图表已保存")

# ========================================================
# 执行入口
# ========================================================
if __name__ == "__main__":
    # 配置环境
    os.environ["OPENBLAS_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"
    
    try:
        main()
    except Exception as e:
        print(f"程序运行失败: {str(e)}")
        import traceback
        traceback.print_exc()

正在加载数据...
正在计算运动员得分...


处理运动员: 100%|████████████████████████████████████████████████████████| 129992/129992 [11:57<00:00, 181.15athlete/s]


正在合并结果...
数据已保存至 C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\athletes_with_scores.csv
国家维度可视化图表已保存


In [3]:
import pandas as pd

# ========================================================
# 配置参数
# ========================================================
INPUT_FILE = r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\athletes_with_scores.csv"
OUTPUT_FILE = r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\athletes_with_scores_deduplicated_full.csv"

# ========================================================
# 完全去重函数
# ========================================================
def remove_full_duplicates(df):
    """
    去除所有列完全相同的重复行
    参数：
        df: 原始数据框
    返回：
        去重后的数据框
    """
    # 记录原始数据量
    original_count = len(df)
    
    # 执行完全去重（保留第一条记录）
    df_deduplicated = df.drop_duplicates(keep='first')
    
    # 记录处理结果
    removed_count = original_count - len(df_deduplicated)
    duplicate_rate = removed_count / original_count * 100
    
    print(f"原始记录数: {original_count:,}")
    print(f"去重后记录数: {len(df_deduplicated):,}")
    print(f"移除重复行数: {removed_count:,} ({duplicate_rate:.2f}%)")
    
    return df_deduplicated

# ========================================================
# 主处理流程
# ========================================================
def main():
    try:
        # 读取数据（低内存模式）
        df = pd.read_csv(INPUT_FILE, 
                        dtype={'Year': 'int16', 'score': 'float32'},
                        engine='c')
        print("数据加载成功")
    except Exception as e:
        print(f"文件读取失败: {str(e)}")
        return
    
    # 执行完全去重
    print("\n正在执行完全去重...")
    df_clean = remove_full_duplicates(df)
    
    # 保存结果
    df_clean.to_csv(OUTPUT_FILE, index=False)
    print(f"\n去重结果已保存至: {OUTPUT_FILE}")
    
    # 展示示例
    print("\n重复记录示例（前5条完全重复的记录）：")
    duplicates = df[df.duplicated(keep=False)]
    if not duplicates.empty:
        print(duplicates.sort_values(by=df.columns.tolist()).head(5))
    else:
        print("未发现完全重复的记录")

# ========================================================
# 执行入口
# ========================================================
if __name__ == "__main__":
    main()

数据加载成功

正在执行完全去重...
原始记录数: 2,523,115
去重后记录数: 213,667
移除重复行数: 2,309,448 (91.53%)

去重结果已保存至: C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\athletes_with_scores_deduplicated_full.csv

重复记录示例（前5条完全重复的记录）：
               Name  NOC  Year       Sport  Total  Gold  Silver  Bronze  \
27066  (jr) Larocca  ARG  2024  Equestrian      0     0       0       0   
27067  (jr) Larocca  ARG  2024  Equestrian      0     0       0       0   
27068  (jr) Larocca  ARG  2024  Equestrian      0     0       0       0   
27069  (jr) Larocca  ARG  2024  Equestrian      0     0       0       0   
27070  (jr) Larocca  ARG  2024  Equestrian      0     0       0       0   

       adjusted_score  score  
27066             0.0    0.0  
27067             0.0    0.0  
27068             0.0    0.0  
27069             0.0    0.0  
27070             0.0    0.0  


In [5]:
import pandas as pd
import matplotlib.pyplot as plt

# ========================================================
# 配置参数
# ========================================================
INPUT_FILE = r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\athletes_with_scores_deduplicated_full.csv"
OUTPUT_FILE = r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\atheletes_scores.csv"
VISUALIZATION_PATH = r"C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\national_performance_trend.png"

# ========================================================
# 核心计算函数
# ========================================================
def calculate_national_scores(df):
    """
    计算每个国家每届奥运会的平均运动员指数
    参数：
        df: 包含完整运动员数据的数据框
    返回：
        包含国家、年份、平均指数的数据框
    """
    # 分组计算平均值（自动跳过缺失值）
    national_avg = df.groupby(['NOC', 'Year'], as_index=False)['score'].mean()
    
    # 重命名列
    national_avg.columns = ['国家代码', '奥运年份', '平均运动员指数']
    
    # 排序：先按国家代码，再按年份
    national_avg = national_avg.sort_values(['国家代码', '奥运年份'])
    
    return national_avg.round({'平均运动员指数': 4})  # 保留4位小数

# ========================================================
# 可视化函数
# ========================================================
def visualize_national_trends(df):
    """
    生成国家表现趋势图
    参数：
        df: 包含国家平均指数的数据框
    """
    plt.figure(figsize=(18, 10))
    
    # 选择奖牌大国作为示例
    top_countries = ['USA', 'CHN', 'RUS', 'GBR', 'GER', 'JPN', 'AUS']
    
    for noc in top_countries:
        country_data = df[df['国家代码'] == noc]
        if not country_data.empty:
            plt.plot(country_data['奥运年份'], 
                    country_data['平均运动员指数'],
                    marker='o',
                    linestyle='-',
                    linewidth=2,
                    markersize=8,
                    label=noc)

    plt.title('各国奥运表现趋势 (1896-2024)', fontsize=16, pad=20)
    plt.xlabel('奥运年份', fontsize=12)
    plt.ylabel('平均运动员指数', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.legend(title="国家/地区", bbox_to_anchor=(1.05, 1), loc='upper left')
    
    # 设置x轴刻度
    plt.xticks(
        ticks=range(1896, 2025, 8),
        rotation=45,
        fontsize=10
    )
    
    # 保存高清图片
    plt.tight_layout()
    plt.savefig(VISUALIZATION_PATH, dpi=300, bbox_inches='tight')
    plt.close()

# ========================================================
# 主处理流程
# ========================================================
def main():
    try:
        # 读取数据（优化内存占用）
        dtype_spec = {
            'NOC': 'category',
            'Year': 'int16',
            'score': 'float32'
        }
        df = pd.read_csv(INPUT_FILE, usecols=dtype_spec.keys(), dtype=dtype_spec)
    except Exception as e:
        print(f"数据加载失败: {str(e)}")
        return

    # 执行计算
    print("正在计算国家平均指数...")
    national_scores = calculate_national_scores(df)
    
    # 保存结果
    national_scores.to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')
    print(f"计算结果已保存至: {OUTPUT_FILE}")
    
    # 生成可视化
    print("正在生成趋势图表...")
    visualize_national_trends(national_scores)
    print(f"可视化图表已保存至: {VISUALIZATION_PATH}")

    # 显示示例数据
    print("\n示例数据（前10行）：")
    print(national_scores.head(10))

# ========================================================
# 执行入口
# ========================================================
if __name__ == "__main__":
    main()

正在计算国家平均指数...
计算结果已保存至: C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\atheletes_scores.csv
正在生成趋势图表...


  national_avg = df.groupby(['NOC', 'Year'], as_index=False)['score'].mean()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(VISUALIZATION_PATH, dpi=300, bbox_inches='tight')
  plt.savefig(VISUALIZATION_PATH, dpi=300, bbox_inches='tight')
  plt.savefig(VISUALIZATION_PATH, dpi=300, bbox_inches='tight')
  plt.savefig(VISUALIZATION_PATH, dpi=300, bbox_inches='tight')
  plt.savefig(VISUALIZATION_PATH, dpi=300, bbox_inches='tight')
  plt.savefig(VISUALIZATION_PATH, dpi=300, bbox_inches='tight')
  plt.savefig(VISUALIZATION_PATH, dpi=300, bbox_inches='tight')
  plt.savefig(VISUALIZATION_PATH, dpi=300, bbox_inches='tight')
  plt.savefi

可视化图表已保存至: C:\Users\NANA\.jupyter\JupyterProject\MCM_02\code\data\national_performance_trend.png

示例数据（前10行）：
  国家代码  奥运年份  平均运动员指数
0  AFG  1896      NaN
1  AFG  1900      NaN
2  AFG  1904      NaN
3  AFG  1906      NaN
4  AFG  1908      NaN
5  AFG  1912      NaN
6  AFG  1920      NaN
7  AFG  1924      NaN
8  AFG  1928      NaN
9  AFG  1932      NaN
