### This notebook is a study based on <a href="https://www.kaggle.com/code/yorkyong/exploring-eeg-a-beginner-s-guide">Exploring EEG: A Beginner's Guide</a>. It trains LightGBM models.

### Import necessary libraries.

In [None]:
import gc  # garbage collection
import os  # operating-system interaction (directory listing)
import warnings  # control over warning output

import dill  # object serialization / deserialization (e.g. saving and loading tree models)
import matplotlib.pyplot as plt  # plotting
import numpy as np  # array / matrix operations
import pandas as pd  # tabular data loading (csv / parquet)
from lightgbm import LGBMClassifier  # LightGBM classifier
from lightgbm import early_stopping, log_evaluation  # training callbacks
from sklearn.model_selection import GroupKFold  # K-fold split that keeps each group inside a single fold
warnings.filterwarnings('ignore')  # install a filter that suppresses all warnings for the rest of the notebook

### Config

In [None]:
# Global configuration: random seed, number of CV folds and LightGBM hyperparameters.
import random

seed = 2024
num_folds = 10

# These hyperparameters come from "pss4e1" (presumably Kaggle Playground Series
# S4E1 - confirm); no parameter search was done for the HMS competition.
lgb_params = {
    'random_state': seed,
    'n_estimators': 1024,
    'reg_alpha': 0.3245237982823759,
    'reg_lambda': 9.713500590822735,
    'colsample_bytree': 0.5031339908309955,
    'subsample': 0.9680254188045883,
    'learning_rate': 0.036537966896644465,
    'num_leaves': 29,
    'min_child_samples': 99,
}

# Seed both Python's and NumPy's global generators so that runs are reproducible.
random.seed(seed)
np.random.seed(seed)

### Load the train and spectrogram datasets.

In [None]:
# Load the training metadata table.
train_csv_path = '/kaggle/input/hms-harmful-brain-activity-classification/train.csv'
train_df = pd.read_csv(train_csv_path)

# The last six columns of train.csv are the columns to predict.
TARGETS = train_df.columns[-6:]

print('Train shape:', train_df.shape)
print('Targets', list(TARGETS))
train_df.head()

In [None]:
# Read every training spectrogram into memory, keyed by its integer id.
spec_dir = '/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/'
files = os.listdir(spec_dir)
spectrograms = {}
for fname in files:
    # fname looks like '1000086677.parquet'; the part before the dot is the id.
    spec_key = int(fname.split('.')[0])
    spec_frame = pd.read_parquet(f"{spec_dir}{fname}")
    # Column 0 is the time column; keep the remaining columns as a (T, 400) array.
    spectrograms[spec_key] = spec_frame.iloc[:, 1:].values

### Feature engineering

In [None]:
# Collapse train_df to one row per eeg_id, keeping the first spectrogram_id,
# the smallest and largest spectrogram_label_offset_seconds, the first
# patient_id and the first expert_consensus label. The resulting columns are
# named spec_id / min / max / patient_id / target.
train = train_df.groupby('eeg_id').agg(**{
    'spec_id': ('spectrogram_id', 'first'),
    'min': ('spectrogram_label_offset_seconds', 'min'),
    'max': ('spectrogram_label_offset_seconds', 'max'),
    'patient_id': ('patient_id', 'first'),
    'target': ('expert_consensus', 'first'),
})

# Every eeg_id should map to one spec_id, but a spec_id may be shared by
# several eeg_ids, so the unique counts printed here can differ.
print(f"len(train):{len(train)},unique_eeg:{train_df['eeg_id'].nunique()},unique_spec:{train['spec_id'].nunique()}")

# Sum the expert votes per eeg_id, then normalize each row so the vote
# columns sum to 1. Both groupbys use the same key, so rows line up by position.
vote_sums = train_df.groupby('eeg_id')[TARGETS].agg('sum')
y_data = vote_sums.values
y_data = y_data / y_data.sum(axis=1, keepdims=True)
train[list(TARGETS)] = y_data

# Move eeg_id out of the index back into a regular column; the frame gets a
# default 0..n-1 index as a result.
train = train.reset_index()

train.head()

In [None]:
# Build the feature matrix: for every row of `train`, summarize a long and a
# short window of its spectrogram with a per-column NaN-aware mean and minimum.

# Every spectrogram column except the leading time column is a feature source.
SPEC_COLS = pd.read_parquet("/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/1000086677.parquet").columns[1:]
n_spec_cols = len(SPEC_COLS)

# Window geometry, in spectrogram rows relative to the start row `r`.
# The feature names call these windows "10m" and "20s"; that presumably
# reflects one spectrogram row per 2 seconds - TODO confirm against the data.
LONG_WINDOW_ROWS = 300
SHORT_WINDOW_START, SHORT_WINDOW_STOP = 145, 155

# Feature names, in the same order as the column groups written into `data`:
# mean_10m, min_10m, mean_20s, min_20s.
FEATURES = [
    f'{c}_{stat}_{window}'
    for window in ('10m', '20s')
    for stat in ('mean', 'min')
    for c in SPEC_COLS
]

# Training matrix: one row per row of `train`, one column per feature.
data = np.zeros((len(train), len(FEATURES)))
for k, (spec_id, offset_min, offset_max) in enumerate(
        zip(train['spec_id'], train['min'], train['max'])):
    # Start row of the window, derived from the min/max label offsets.
    r = int((offset_min + offset_max) // 4)
    row_data = spectrograms[spec_id]

    long_window = row_data[r:r + LONG_WINDOW_ROWS, :]
    short_window = row_data[r + SHORT_WINDOW_START:r + SHORT_WINDOW_STOP, :]

    # Column-wise statistics that ignore missing values. The slice widths follow
    # the number of spectrogram columns instead of a hard-coded 400.
    data[k, 0 * n_spec_cols:1 * n_spec_cols] = np.nanmean(long_window, axis=0)
    data[k, 1 * n_spec_cols:2 * n_spec_cols] = np.nanmin(long_window, axis=0)
    data[k, 2 * n_spec_cols:3 * n_spec_cols] = np.nanmean(short_window, axis=0)
    data[k, 3 * n_spec_cols:4 * n_spec_cols] = np.nanmin(short_window, axis=0)

# Attach the computed features to the training table.
train[FEATURES] = data
print('New train shape:',train.shape)

### Model training and saving

In [None]:
# Accumulators for per-fold validation predictions and their true vote distributions.
all_oof, all_true = [], []
# Integer class index assigned to each `target` label.
TARS = {
    label: idx
    for idx, label in enumerate(['Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other'])
}

# Persist a trained tree model: `obj` is the object to save, `path` is where to write it.
def pickle_dump(obj, path):
    """Serialize `obj` into the file at `path` with dill, using pickle protocol 4."""
    # Binary write mode; the context manager closes the file even if dumping fails.
    with open(path, "wb") as fh:
        dill.dump(obj, fh, protocol=4)
# Cross-validation with `num_folds` folds, grouped by patient so that one
# patient's rows never land in both the training and validation part of a fold.
# GroupKFold ignores the `y` argument; only `groups` (patient_id) drives the split.
gkf = GroupKFold(n_splits=num_folds)
for fold, (train_index, valid_index) in enumerate(gkf.split(train, train.target, train.patient_id)):

    print(f'Fold {fold+1}')

    model = LGBMClassifier(**lgb_params)

    # Prepare training and validation data. GroupKFold yields positional
    # indices; `.loc` accepts them because `train` has a default 0..n-1 index
    # after the earlier reset_index().
    X_train = train.loc[train_index, FEATURES]
    y_train = train.loc[train_index, 'target'].map(TARS)
    X_valid = train.loc[valid_index, FEATURES]
    y_valid = train.loc[valid_index, 'target'].map(TARS)

    # Weight each training row by the largest value among its TARGETS columns
    # (the normalized vote share of its most-voted class).
    sample_weight = np.max(train.loc[train_index, TARGETS].values, axis=1)

    # Early stopping and silent logging go through callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # deprecated in LightGBM 3.3 and removed in 4.0.
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        sample_weight=sample_weight,
        callbacks=[
            early_stopping(stopping_rounds=100, verbose=False),
            log_evaluation(period=0),  # print nothing during training
        ],
    )
    pickle_dump(model, f'/kaggle/working/lgb_f{fold}.model')  # save the trained model

    # Keep out-of-fold predictions together with the true vote distributions.
    all_oof.append(model.predict_proba(X_valid))
    all_true.append(train.loc[valid_index, TARGETS].values)

    # Drop the per-fold frames and trigger a collection to limit memory use.
    del X_train, y_train, X_valid, y_valid
    gc.collect()

# Stack the per-fold arrays; both lists were appended in the same fold order,
# so rows of all_oof and all_true correspond to each other.
all_oof = np.concatenate(all_oof)
all_true = np.concatenate(all_true)

### Metric

In [None]:
def KL_loss(p, q, epsilon=10 ** (-15)):
    """Mean Kullback-Leibler divergence KL(p || q) over samples.

    Args:
        p: array-like of shape (n_samples, n_classes), the true distributions.
        q: array-like of shape (n_samples, n_classes), the predicted distributions.
        epsilon: both inputs are clipped to [epsilon, 1 - epsilon] before the
            logarithm is taken.

    Returns:
        The per-sample divergence (summed over the class axis) averaged over
        all samples.
    """
    p = np.clip(p, epsilon, 1 - epsilon)
    # Clip the predictions as well: a predicted probability of exactly 0 would
    # otherwise give log(0) = -inf and make the whole loss infinite.
    q = np.clip(q, epsilon, 1 - epsilon)
    # Sum over axis 1 (classes) for each sample's divergence, then average over samples.
    return np.mean(np.sum(p * (np.log(p) - np.log(q)), axis=1))
# Report the cross-validation score: KL divergence between the normalized
# expert votes (all_true) and the out-of-fold predicted probabilities (all_oof).
print(f"CV of Kullback Leibler Divergence:{KL_loss(all_true,all_oof)}")