In [2]:
import pandas as pd

def aggregate_to_hourly(input_csv, output_csv, has_header=True):
    """
    将分钟级 CSV 聚合到小时级：
    - 对功率、电压等做 mean
    - 对 Sub_metering 和 RR 做 sum
    - RR 从十分之一毫米除10恢复为毫米
    - 其他（NBJRR*）取第一个
    """
    # 加载数据
    if has_header:
        df = pd.read_csv(input_csv, parse_dates=['DateTime'], index_col='DateTime')
    else:
        # 无表头，指定列名
        cols = ['DateTime', 'Global_active_power', 'Global_reactive_power',
                'Voltage', 'Global_intensity', 'Sub_metering_1',
                'Sub_metering_2', 'Sub_metering_3', 'RR',
                'NBJRR1', 'NBJRR5', 'NBJRR10', 'NBJBROU']
        df = pd.read_csv(input_csv, header=None, names=cols,
                         parse_dates=['DateTime'], index_col='DateTime')
    
    # 转为数值类型
    num_cols = df.columns
    df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

    # RR 单位处理
    df['RR'] = df['RR'] / 10.0

    # 聚合规则
    agg_rules = {
        # 平均型
        'Global_active_power': 'mean',
        'Global_reactive_power': 'mean',
        'Voltage': 'mean',
        'Global_intensity': 'mean',
        # 能耗累加
        'Sub_metering_1': 'sum',
        'Sub_metering_2': 'sum',
        'Sub_metering_3': 'sum',
        'RR': 'sum',
        # 月度不变，取首值
        'NBJRR1': 'first',
        'NBJRR5': 'first',
        'NBJRR10': 'first',
        'NBJBROU': 'first'
    }
    # 筛选聚合列，避免缺列报错
    agg_rules = {k: v for k, v in agg_rules.items() if k in df.columns}

    # 重采样到小时级
    hourly = df.resample('H').agg(agg_rules)
    hourly.ffill(inplace=True)

    # 保存
    hourly.to_csv(output_csv)
    print(f"Aggregated '{input_csv}' → '{output_csv}'. Rows: {len(hourly)}")
    print(hourly.head(), hourly.info(), sep='\n')


# 对 train.csv（带表头）聚合
aggregate_to_hourly('train.csv', 'train_hourly.csv', has_header=True)

# 对 test.csv（无表头）聚合
aggregate_to_hourly('test.csv', 'test_hourly.csv', has_header=False)


  hourly = df.resample('H').agg(agg_rules)


Aggregated 'train.csv' → 'train_hourly.csv'. Rows: 17911
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17911 entries, 2006-12-16 17:00:00 to 2008-12-31 23:00:00
Freq: h
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Global_active_power    17911 non-null  float64
 1   Global_reactive_power  17911 non-null  float64
 2   Voltage                17911 non-null  float64
 3   Global_intensity       17911 non-null  float64
 4   Sub_metering_1         17911 non-null  float64
 5   Sub_metering_2         17911 non-null  float64
 6   Sub_metering_3         17911 non-null  float64
 7   RR                     17911 non-null  float64
 8   NBJRR1                 17911 non-null  float64
 9   NBJRR5                 17911 non-null  float64
 10  NBJRR10                17911 non-null  float64
 11  NBJBROU                17911 non-null  float64
dtypes: float64(12)
memory usage: 1.8 MB
                     G

  df = pd.read_csv(input_csv, header=None, names=cols,
  hourly = df.resample('H').agg(agg_rules)


Aggregated 'test.csv' → 'test_hourly.csv'. Rows: 16678
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 16678 entries, 2009-01-01 00:00:00 to 2010-11-26 21:00:00
Freq: h
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Global_active_power    16678 non-null  float64
 1   Global_reactive_power  16678 non-null  float64
 2   Voltage                16678 non-null  float64
 3   Global_intensity       16678 non-null  float64
 4   Sub_metering_1         16678 non-null  float64
 5   Sub_metering_2         16678 non-null  float64
 6   Sub_metering_3         16678 non-null  float64
 7   RR                     16678 non-null  float64
 8   NBJRR1                 16678 non-null  float64
 9   NBJRR5                 16678 non-null  float64
 10  NBJRR10                16678 non-null  float64
 11  NBJBROU                16678 non-null  float64
dtypes: float64(12)
memory usage: 1.7 MB
                     Glo