## Created by yunsuxiaozi
### Import necessary libraries

In [None]:
import pandas as pd#导入csv文件的库
import numpy as np#进行矩阵运算的库
from lightgbm import LGBMClassifier#导入lgbm分类器
import dill#对对象进行序列化和反序列化(例如保存和加载树模型)
from sklearn.model_selection import StratifiedKFold
import warnings#避免一些可以忽略的报错
warnings.filterwarnings('ignore')#filterwarnings()方法是用于设置警告过滤器的方法，它可以控制警告信息的输出方式和级别.

### Config

In [None]:
# Seed every random number generator used here so that runs are reproducible.
import random
seed=2024
random.seed(seed)
np.random.seed(seed)
# Number of cross-validation folds.
num_folds=10
# LightGBM hyper-parameters reused from "pss4e1"; no parameter search was done for the HMS competition.
# NOTE(review): LightGBM appears to apply `subsample` only when `subsample_freq` > 0 -
# confirm whether row bagging was actually intended to be active.
lgb_params=dict(
    random_state=seed,
    n_estimators=1024,
    reg_alpha=0.3245237982823759,
    reg_lambda=9.713500590822735,
    colsample_bytree=0.5031339908309955,
    subsample=0.9680254188045883,
    learning_rate=0.036537966896644465,
    num_leaves=29,
    min_child_samples=99,
)

### Import dataset

In [None]:
# Vote-count columns of train.csv that serve as the six classification targets.
TARGETS=['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
train_df=pd.read_csv("/kaggle/input/hms-harmful-brain-activity-classification/train.csv")
eeg_ids=train_df['eeg_id'].unique()
print(f"len(eeg_ids):{len(eeg_ids)}")

# Sum the votes of all rows sharing an eeg_id (one row per eeg_id), then attach
# them to the eeg_id list with a left merge so the order of `eeg_ids` is kept.
vote_sums=train_df.groupby('eeg_id')[TARGETS].sum().reset_index()
train_feats=pd.DataFrame({'eeg_id':eeg_ids}).merge(vote_sums,on='eeg_id',how='left')
# Divide each target by the row's total votes so the six targets sum to 1 per eeg_id.
total_sum=train_feats[TARGETS].values.sum(axis=1)
for vote_col in TARGETS:
    train_feats[vote_col]=train_feats[vote_col]/total_sum
train_feats.head()

### Feature Engineering

In [None]:
#https://www.kaggle.com/code/yunsuxiaozi/writing-quality-fusion-notebook
# Signal columns read from every EEG parquet file.
columns=['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz',
       'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG']
# Summary statistics computed for every lagged-difference series.
AGGREGATIONS = ['count', 'mean', 'std', 'min', 'max', 'sem','median', 'skew','sum']
# Lags (in rows) used to build the difference series x[t] - x[t-gap].
gaps=[1,2,3,5,10,20,30,60,100]

add_feats=[]
for eeg_id in eeg_ids:
    eeg_data=pd.read_parquet(f"/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet")
    feats=[]# feature values for this eeg_id, ordered column -> gap -> aggregation
    for col in columns:
        signal=eeg_data[col]
        for gap in gaps:
            # Work on a temporary Series instead of adding two helper columns per
            # (col, gap) to eeg_data; the aggregated values are unchanged.
            gap_diff=signal-signal.shift(gap)
            feats+=list(gap_diff.agg(AGGREGATIONS).values)
    add_feats.append(feats)
# The names must be generated in the same nesting order as the values above
# (column outermost, aggregation innermost); otherwise each value ends up under
# the name of a different column/gap/statistic.
features=[f"{col}_gap_{gap}_{agg}" for col in columns for gap in gaps for agg in AGGREGATIONS]
print(f"len(features):{len(features)}")
train_feats[features]=add_feats
train_feats.head()

### Train and save models

In [None]:
# Feature matrix and soft targets (normalised vote shares) for every eeg_id.
X=train_feats[features]
y=train_feats[TARGETS]
# Hard label per row: position in TARGETS of the class with the largest vote share.
y_label=y.values.argmax(axis=1)
# Accumulators for the out-of-fold predictions and the matching true vote shares.
all_oof,all_true=[],[]

def pickle_dump(obj, path):
    """Serialize ``obj`` (e.g. a trained tree model) to the file at ``path``.

    The file is opened for binary writing and the object is written with
    ``dill`` using pickle protocol version 4.
    """
    with open(path, "wb") as out_file:
        dill.dump(obj, out_file, protocol=4)
# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` arguments of
# fit(); the callback form below is accepted by both 3.x and 4.x releases.
from lightgbm import early_stopping

# Stratified K-fold cross-validation with `num_folds` splits, stratified on the
# majority-vote class of each eeg_id.
# NOTE(review): the split uses the class label only; rows are not grouped by
# patient, so the same patient may appear in both train and validation folds -
# confirm whether a grouped split is wanted.
skf = StratifiedKFold(n_splits=num_folds,shuffle=True,random_state=seed)
for fold, (train_index, valid_index) in enumerate(skf.split(X,y_label.astype(str))):

    print(f'Fold {fold+1}')

    model = LGBMClassifier(**lgb_params)

    # The fold indices are positional, so select rows with .iloc rather than .loc.
    X_train = X.iloc[train_index].values
    y_train = y_label[train_index]# index of the class with the largest vote share
    sample_weight=np.max(y.iloc[train_index].values,axis=1)# largest vote share, used as label confidence
    X_valid = X.iloc[valid_index].values
    y_valid = y_label[valid_index]

    # Stop when the validation metric has not improved for 100 rounds;
    # verbose=False keeps the callback from printing anything.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
              sample_weight=sample_weight,
              callbacks=[early_stopping(stopping_rounds=100, verbose=False)])
    pickle_dump(model, f'/kaggle/working/lgb_f{fold}.model') # save the trained model

    # Keep the out-of-fold class probabilities next to the true vote shares.
    all_oof.append(model.predict_proba(X_valid))
    all_true.append(y.iloc[valid_index].values)

# Stack the per-fold arrays; the rows of both arrays stay aligned in fold order.
all_oof = np.concatenate(all_oof)
all_true = np.concatenate(all_true)

### Metric

In [None]:
def KL_loss(p,q):
    """Return the mean Kullback-Leibler divergence KL(p || q) over all samples.

    Args:
        p: array-like of shape (num_samples, num_classes) holding the true
            distributions.
        q: array-like of the same shape holding the predicted distributions.

    Returns:
        The per-sample divergence (summed over the class axis) averaged over
        the samples.
    """
    epsilon=10**(-15)
    # Clip both distributions away from 0 and 1: without clipping q, a predicted
    # probability of exactly 0 makes log(q) = -inf and the loss infinite or NaN.
    p=np.clip(p,epsilon,1-epsilon)
    q=np.clip(q,epsilon,1-epsilon)
    # Sum over axis 1 (classes) for each sample's divergence, then average over samples.
    return np.mean( np.sum(p*(np.log(p)-np.log(q)),axis=1)  )
# Report the cross-validation score: KL divergence between the true vote shares and the out-of-fold predictions.
cv_kl=KL_loss(all_true,all_oof)
print(f"CV of Kullback Leibler Divergence:{cv_kl}")