## Created by yunsuxiaozi 2024/4/14

### 比赛链接如下: https://www.biendata.xyz/competition/ind_kdd_2024/

### 这是我第一次参加KDD_cup,记录一下.本次比赛应该会用到知识图谱的相关知识,我这里给一个数据挖掘方面的baseline,目前分数还算不错.

### 导入必要的库

In [32]:
#necessary
import pandas as pd#导入csv文件的库
import numpy as np#进行矩阵运算的库
import json#用于读取和写入json数据格式
import string

#model lgb分类模型,日志评估,早停防止过拟合
from  lightgbm import LGBMClassifier,log_evaluation,early_stopping
from nltk import word_tokenize
import torch
#metric
from sklearn.metrics import roc_auc_score#导入roc_auc曲线
#KFold是直接分成k折,StratifiedKFold还要考虑每种类别的占比
from sklearn.model_selection import StratifiedKFold

# 图神经网络
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

import networkx as nx
import traceback

import matplotlib.pyplot as plt
import seaborn as sns



### 设置相关的参数

In [33]:
#config
class Config():
    seed=2024#随机种子
    num_folds=10#K折交叉验证
    TARGET_NAME ='label'#标签
import random#提供了一些用于生成随机数的函数
#设置随机种子,保证模型可以复现
def seed_everything(seed):
    np.random.seed(seed)#numpy的随机种子
    random.seed(seed)#python内置的随机种子
seed_everything(Config.seed)

### 导入相关的数据集,我这里是将数据放在Kaggle上.

In [34]:
path='./data/'
#sample: Iki037dt dict_keys(['name', 'normal_data', 'outliers'])
with open(path+"train/train_author.json", encoding='utf-8') as f:
    train_author=json.load(f)
#sample : 6IsfnuWU dict_keys(['id', 'title', 'authors', 'abstract', 'keywords', 'venue', 'year'])   
with open(path+"train/pid_to_info_all.json", encoding='utf-8') as f:
    pid_to_info=json.load(f)
#efQ8FQ1i dict_keys(['name', 'papers'])
with open(path+"ind_valid/ind_valid_author.json", encoding='utf-8') as f:
    valid_author=json.load(f)

with open(path+"ind_valid/ind_valid_author_submit.json" ,encoding='utf-8') as f:
    submission=json.load(f)

### 这里做了简单的特征工程.

In [35]:
puncs = '[!“”"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~—～’]+'
stopwords = ['at', 'based', 'in', 'of', 'for', 'on', 'and', 'to', 'an', 'using', 'with',
                    'the', 'by', 'we', 'be', 'is', 'are', 'can']
# self.stopwords_extend = ['university', 'univ', 'china', 'department', 'dept', 'laboratory', 'lab',
#                          'school', 'al', 'et', 'institute', 'inst', 'college', 'chinese', 'beijing',
#                          'journal', 'science', 'international']
stopwords_extend = ['university', 'univ', 'china', 'department', 'dept', 'laboratory', 'lab',
                            'school', 'al', 'et', 'institute', 'inst', 'college', 'chinese', 'beijing',
                            'journal', 'science', 'international', 'key', 'sciences', 'research',
                            'academy', 'state', 'center']

stopwords_check = ['a', 'was', 'were', 'that', '2', 'key', '1', 'technology', '0', 'sciences', 'as',
                            'from', 'r', '3', 'academy', 'this', 'nanjing', 'shanghai', 'state', 's', 'research',
                        'p', 'results', 'peoples', '4', 'which', '5', 'high', 'materials', 'study', 'control',
                        'method', 'group', 'c', 'between', 'or', 'it', 'than', 'analysis', 'system',  'sci',
                        'two', '6', 'has', 'h', 'after', 'different', 'n', 'national', 'japan', 'have', 'cell',
                        'time', 'zhejiang', 'used', 'data', 'these']

stopwords_custom = stopwords + stopwords_extend + stopwords_check
# 删除停用词
def preprocess_text(text):
    stop_words = set(stopwords_custom)
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

In [36]:
%env HF_ENDPOINT=https://hf-mirror.com
# 加载BERT模型和分词器
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True).eval()

env: HF_ENDPOINT=https://hf-mirror.com


In [37]:
def get_cls_embedding(text, model=model, tokenizer=tokenizer):
   
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    
    if device.type == "cuda":
        cls_embedding = cls_embedding.cpu()
    return cls_embedding.numpy()[0]

In [38]:
# 提取图关系网络
coauthor_graph = nx.Graph()
for author_id, author_info in train_author.items():
    papers = author_info['normal_data']
    for paper_id in papers:
        authors = pid_to_info[paper_id]['authors']
        for coauthor in authors:
            if coauthor['name'] != author_info['name']:
                coauthor_graph.add_edge(author_info['name'], coauthor['name'])

In [39]:
coauthor_nodes = list(coauthor_graph.nodes())
num_nodes = len(coauthor_nodes)
num_edges = coauthor_graph.number_of_edges()

node_degrees = dict(coauthor_graph.degree())

node_features = torch.tensor([node_degrees[node] for node in coauthor_nodes], dtype=torch.float).unsqueeze(1)

node_to_index = {node: index for index, node in enumerate(coauthor_nodes)}

edges = [(node_to_index[edge[0]], node_to_index[edge[1]]) for edge in coauthor_graph.edges()]

edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

data = Data(x=node_features, edge_index=edge_index)

In [40]:
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

# 初始化GNN模型
input_dim = 1
hidden_dim = 64
output_dim = 32
gnn_model = GNN(input_dim, hidden_dim, output_dim)

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 将模型移到GPU
gnn_model = gnn_model.to(device)

# 设置模型为评估模式（这一步对于推理非常重要，可以关闭dropout等训练时的特性）
gnn_model.eval()

with torch.no_grad():
    # 确保data也在GPU上
    data = data.to(device)
    graph_embeddings = gnn_model(data)

# 将图特征映射回作者，这里假设不需要在GPU上进行此操作，直接转为numpy用于后续处理
author_embeddings = {}
for index, author_name in enumerate(coauthor_nodes):
    author_embeddings[author_name] = graph_embeddings[index].cpu().numpy()

In [44]:
# 假设当前年份为基准年份，用于计算Publication Age
CURRENT_YEAR = 2024

# 初始化venue频次字典
venue_freq = {}

basic_feats = []
bert_feats = []
graph_feats = []
labels = []
for id, person_info in train_author.items():
    for text_id, label in [(tid, 1) for tid in person_info['normal_data']] + [(tid, 0) for tid in person_info['outliers']]:
        feat = pid_to_info[text_id]
        # ['title', 'abstract', 'keywords', 'authors', 'venue', 'year']
        # Skip if feat is None or any required field in feat is None
        try:
            # 时间特征更改为距今多久
            year_str = feat.get('year', '')
            if year_str:
                publication_age = CURRENT_YEAR - int(year_str)
            else:
                publication_age = CURRENT_YEAR - 2000
                
            # 地点特征改为共出现多少次这个机构
            venue = feat['venue']
            if venue in venue_freq:
                venue_freq[venue] += 1
            else:
                venue_freq[venue] = 1

            # 计算bert对于title和abstract的embedding
            title_embedding = get_cls_embedding(feat['title'])
            abstract_embedding = get_cls_embedding(feat['abstract'])

            bert_features = np.concatenate([title_embedding, abstract_embedding])
            
            # 添加图特征
            authors = feat['authors']
            embeddings = [author_embeddings[author['name']] for author in authors if author['name'] in author_embeddings]
            if embeddings:
                graph_features = np.mean(embeddings, axis=0)
            else:
                graph_features = np.zeros(32)

            basic_feats.append(
                [len(feat['title']), len(feat['abstract']), len(feat['keywords']), len(feat['authors'])
                 , len(feat['venue']), publication_age, venue_freq[venue]]
            )
            bert_feats.append(bert_features)
            graph_feats.append(graph_features)
        except Exception as e:
            print("An error occurred:", e)  # 打印异常信息
            traceback.print_exc()  # 打印详细的异常信息
            basic_feats.append(
                [len(feat['title']), len(feat['abstract']), len(feat['keywords']), len(feat['authors'])
                 , 0, 24, 0]
            )
            bert_feats.append([0] * (2 * 768)) 
            graph_feats.append(np.zeros(32))
        labels.append(label)

An error occurred: object of type 'NoneType' has no len()


Traceback (most recent call last):
  File "/tmp/ipykernel_12598/958863425.py", line 47, in <module>
    , len(feat['venue']), publication_age, venue_freq[venue]]
TypeError: object of type 'NoneType' has no len()


In [45]:
basic_feats = np.array(basic_feats)
bert_feats = np.array(bert_feats)
graph_feats = np.array(graph_feats)
train_feats = np.concatenate([basic_feats, bert_feats,graph_feats], axis=1)

In [46]:
train_feats=np.array(train_feats)
labels=np.array(labels)
print(f"train_feats.shape:{train_feats.shape},labels.shape:{labels.shape}")
print(f"np.mean(labels):{np.mean(labels)}")
train_feats=pd.DataFrame(train_feats)
train_feats['label']=labels
train_feats.head()

train_feats.shape:(148309, 1575),labels.shape:(148309,)
np.mean(labels):0.8834527911320283


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1566,1567,1568,1569,1570,1571,1572,1573,1574,label
0,120.0,0.0,0.0,14.0,18.0,14.0,1.0,-0.47013,-0.494594,-0.253409,...,172.577667,917.606812,-70.400208,-565.000305,139.197052,52.216644,298.513367,-554.812134,-305.153015,1
1,123.0,0.0,0.0,8.0,22.0,13.0,1.0,-1.1114,-0.237089,-0.539751,...,171.325333,910.947998,-69.889336,-560.900269,138.186935,51.837723,296.347137,-550.786072,-302.938599,1
2,100.0,986.0,4.0,9.0,32.0,23.0,1.0,-0.751877,-0.159403,-0.083471,...,172.558716,917.505859,-70.392471,-564.938171,139.181747,52.210903,298.48053,-554.751221,-305.119446,1
3,103.0,0.0,0.0,10.0,31.0,37.0,1.0,-0.513019,-0.210881,-0.852793,...,171.571793,912.258484,-69.989876,-561.707092,138.385712,51.912292,296.773438,-551.578369,-303.374359,1
4,133.0,1629.0,5.0,10.0,18.0,9.0,1.0,-1.051877,-0.482063,-0.148663,...,171.583832,912.322449,-69.994781,-561.74646,138.395432,51.915943,296.794281,-551.617126,-303.395721,1


In [47]:
basic_valid_feats = []
bert_valid_feats = []
graph_valid_feats = []
venue_freq = {}
for id,person_info in valid_author.items():
    for text_id in person_info['papers']:
        feat=pid_to_info[text_id]
        #['title', 'abstract', 'keywords', 'authors', 'venue', 'year']
        try:
            publication_age = CURRENT_YEAR - int(feat['year'])       
            venue = feat['venue']
            if venue in venue_freq:
                venue_freq[venue] += 1
            else:
                venue_freq[venue] = 1
                
            title_embedding = get_cls_embedding(feat['title'])
            abstract_embedding = get_cls_embedding(feat['abstract'])
            
            bert_valid_features = np.concatenate([title_embedding, abstract_embedding])
            
            authors = feat['authors']
            valid_embeddings = [author_embeddings[author['name']] for author in authors if author['name'] in author_embeddings]
            if valid_embeddings:
                graph_valid_features = np.mean(valid_embeddings, axis=0)
            else:
                graph_valid_features = np.zeros(32)

            basic_valid_feats.append(
                [len(feat['title']),len(feat['abstract']),len(feat['keywords']),len(feat['authors'])
                 ,len(feat['venue']),publication_age,venue_freq[venue]]
                 )
            bert_valid_feats.append(bert_valid_features)
            graph_valid_feats.append(graph_valid_features)
        except:
            basic_valid_feats.append(
                [len(feat['title']),len(feat['abstract']),len(feat['keywords']),len(feat['authors'])
                 ,len(feat['venue']),24,0]
                 )
            bert_valid_feats.append([0] * (2 * 768)) 
            graph_valid_feats.append(np.zeros(32))


In [48]:
# 合并所有特征
basic_valid_feats = np.array(basic_valid_feats)
bert_valid_feats = np.array(bert_valid_feats)
graph_valid_feats = np.array(graph_valid_feats)

# 合并所有的特征
valid_feats = np.concatenate([basic_valid_feats, bert_valid_feats, graph_valid_feats], axis=1)


In [49]:
valid_feats=np.array(valid_feats)
print(f"valid_feats.shape:{valid_feats.shape}")
valid_feats=pd.DataFrame(valid_feats)
valid_feats.head()

valid_feats.shape:(62229, 1575)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1565,1566,1567,1568,1569,1570,1571,1572,1573,1574
0,123.0,0.0,0.0,10.0,79.0,9.0,1.0,-0.031636,-0.218774,0.100672,...,112.741302,37.716206,200.539597,-15.385701,-123.478714,30.421005,11.41176,65.238991,-121.252159,-66.690048
1,100.0,1060.0,0.0,11.0,36.0,6.0,1.0,-0.311767,-0.082533,0.009451,...,37.018524,12.384089,65.847038,-5.051881,-40.544151,9.988714,3.747045,21.421177,-39.813068,-21.897636
2,57.0,0.0,0.0,9.0,39.0,8.0,1.0,-0.66175,-0.499912,-0.4311,...,4.432458,1.482823,7.884277,-0.604895,-4.854603,1.196013,0.448657,2.564893,-4.767068,-2.62194
3,109.0,761.0,7.0,6.0,19.0,23.0,1.0,-0.006846,-0.122845,0.35461,...,2.623996,0.877825,4.667456,-0.358094,-2.873904,0.708033,0.265602,1.518404,-2.822081,-1.552177
4,108.0,1000.0,5.0,10.0,23.0,4.0,1.0,-0.416468,-0.10375,0.049858,...,51.542408,17.242878,91.681503,-7.033946,-56.451271,13.90769,5.217163,29.825581,-55.433342,-30.488977


### 训练10折lightgbm模型.

In [50]:
choose_cols=[col for col in valid_feats.columns]

def fit_and_predict(model,train_feats=train_feats,test_feats=valid_feats,name=0):
    X=train_feats[choose_cols].copy()
    y=train_feats[Config.TARGET_NAME].copy()
    test_X=test_feats[choose_cols].copy()
    oof_pred_pro=np.zeros((len(X),2))
    test_pred_pro=np.zeros((Config.num_folds,len(test_X),2))

    #10折交叉验证
    skf = StratifiedKFold(n_splits=Config.num_folds,random_state=Config.seed, shuffle=True)

    for fold, (train_index, valid_index) in (enumerate(skf.split(X, y.astype(str)))):
        print(f"name:{name},fold:{fold}")

        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        

        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                      callbacks=[log_evaluation(100), early_stopping(100)])

        
        oof_pred_pro[valid_index]=model.predict_proba(X_valid)
        #将数据分批次进行预测.
        test_pred_pro[fold]=model.predict_proba(test_X)
    print(f"roc_auc:{roc_auc_score(y.values,oof_pred_pro[:,1])}")
    
    return oof_pred_pro,test_pred_pro

In [51]:
lgb_params={
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 6,
    "learning_rate": 0.05,
    "n_estimators":3072,
    "colsample_bytree": 0.9,
    "colsample_bynode": 0.9,
    "verbose": -1,
    "random_state": Config.seed,
    "reg_alpha": 0.5,
    "reg_lambda": 15,
    "extra_trees":True,
    'num_leaves':64,
    "verbose": -1,
    "max_bin":255,
    }

lgb_model = LGBMClassifier(**lgb_params)
lgb_oof_pred_pro, lgb_test_pred_pro = fit_and_predict(model=lgb_model, train_feats=train_feats, test_feats=valid_feats, name='lgb')

test_preds = lgb_test_pred_pro.mean(axis=0)[:, 1]

name:lgb,fold:0
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.746766
[200]	valid_0's auc: 0.768255


[300]	valid_0's auc: 0.778976
[400]	valid_0's auc: 0.784536
[500]	valid_0's auc: 0.788728
[600]	valid_0's auc: 0.791563
[700]	valid_0's auc: 0.794352
[800]	valid_0's auc: 0.795522
[900]	valid_0's auc: 0.797209
[1000]	valid_0's auc: 0.798226
[1100]	valid_0's auc: 0.799033
[1200]	valid_0's auc: 0.800816
[1300]	valid_0's auc: 0.801872
[1400]	valid_0's auc: 0.802515
[1500]	valid_0's auc: 0.803002
[1600]	valid_0's auc: 0.803416
[1700]	valid_0's auc: 0.803999
[1800]	valid_0's auc: 0.804096
Early stopping, best iteration is:
[1749]	valid_0's auc: 0.80446
name:lgb,fold:1
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.740981
[200]	valid_0's auc: 0.759394
[300]	valid_0's auc: 0.767268
[400]	valid_0's auc: 0.773199
[500]	valid_0's auc: 0.777207
[600]	valid_0's auc: 0.780999
[700]	valid_0's auc: 0.782751
[800]	valid_0's auc: 0.78372
[900]	valid_0's auc: 0.785044
[1000]	valid_0's auc: 0.786337
[1100]	valid_0's auc: 0.787712
[1200]	valid_0's auc: 0.788498
[1300]

### 保存为json文件.

In [52]:
cnt=0
for id,names in submission.items():
    for name in names:
        submission[id][name]=test_preds[cnt]
        cnt+=1
with open('test_preds.json', 'w', encoding='utf-8') as f:
    json.dump(submission, f, ensure_ascii=False, indent=4)