In [152]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [153]:
import os
import sys
sys.path.append("clustering")
from utils import get_data_dir

In [165]:
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def extract_and_process_large_json(input_file, train_file, test_file, target_label="Medicine", limit=50000, test_size=0.2):
    """
    从大 JSON 文件中提取目标数据，编码后分为训练集和测试集。
    
    Args:
        input_file (str): 输入 JSON 文件路径。
        train_file (str): 输出训练集 JSON 文件路径。
        test_file (str): 输出测试集 JSON 文件路径。
        target_label (str): 目标类别标签。
        limit (int): 每种类别的抽取数量。
        test_size (float): 测试集比例。
    """
    target_papers = []
    non_target_papers = []
    target_count = 0
    non_target_count = 0

    # 逐行读取大文件
    with open(input_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            try:
                # 去掉多余逗号并解析 JSON
                record = json.loads(line.strip(",\n"))
                scopus_label = record.get("Scopus_label", "")
                
                # 分类数据
                if scopus_label == target_label and target_count < limit:
                    target_papers.append(record)
                    target_count += 1
                elif scopus_label != target_label and non_target_count < limit:
                    non_target_papers.append(record)
                    non_target_count += 1
                
                # 如果达到限制，停止读取
                if target_count >= limit and non_target_count >= limit:
                    break
            except json.JSONDecodeError:
                continue  # 跳过解析失败的行
    
    print(f"已提取 {len(target_papers)} 篇 {target_label} 和 {len(non_target_papers)} 篇非 {target_label} 的记录")

    # 合并数据
    all_data = target_papers + non_target_papers

    # 对 Scopus_label 进行编码
    labels = [record["Scopus_label"] for record in all_data]
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)
    for i, record in enumerate(all_data):
        record["Scopus_label"] = int(encoded_labels[i])
    
    label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

    # 分割数据为训练集和测试集
    train_data, test_data = train_test_split(all_data, test_size=test_size, random_state=42)

    # 保存到 JSON 文件
    with open(train_file, 'w', encoding='utf-8') as trainfile:
        json.dump(train_data, trainfile, indent=2)
    with open(test_file, 'w', encoding='utf-8') as testfile:
        json.dump(test_data, testfile, indent=2)
    
    print(f"训练数据保存到 {train_file}，测试数据保存到 {test_file}")
    return label_mapping

In [166]:
# 使用示例
Scopus_label_map = extract_and_process_large_json(
    input_file=os.path.join(get_data_dir(),"2010s","dataset.json"),
    train_file=os.path.join(get_data_dir(), "2010s","train.json"),
    test_file=os.path.join(get_data_dir(), "2010s","test.json"),
    target_label="Medicine",
    limit=50000,
    test_size=0.2
)

data_dir:  /home/lyuzhuoqi/projects/clustering/data
data_dir:  /home/lyuzhuoqi/projects/clustering/data
data_dir:  /home/lyuzhuoqi/projects/clustering/data
已提取 50000 篇 Medicine 和 50000 篇非 Medicine 的记录
训练数据保存到 /home/lyuzhuoqi/projects/clustering/data/2010s/train.json，测试数据保存到 /home/lyuzhuoqi/projects/clustering/data/2010s/test.json


In [167]:
Scopus_label_map['Medicine']

np.int64(18)

In [168]:
import json

def load_data(train_file, test_file):
    """
    加载训练集和测试集数据。
    
    Args:
        train_file (str): 训练集文件路径。
        test_file (str): 测试集文件路径。
    
    Returns:
        list: 训练集数据。
        list: 测试集数据。
    """
    with open(train_file, 'r', encoding='utf-8') as f:
        train_data = json.load(f)
    with open(test_file, 'r', encoding='utf-8') as f:
        test_data = json.load(f)
    return train_data, test_data

# 加载数据
train_file = os.path.join(get_data_dir(),"2010s","train.json")
test_file = os.path.join(get_data_dir(),"2010s","test.json")
train_data, test_data = load_data(train_file, test_file)

data_dir:  /home/lyuzhuoqi/projects/clustering/data
data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [169]:
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# 自定义词映射
word_mapping = {'mechanics': 'mechanic',
                'mechanical': 'mechanic',
                'electrical': 'electric',
                'electronics': 'electric',
                'financial': 'finance',
                'political': 'politics',
                'historical': 'history',
                'computer': 'computing',
                'intelligent': 'intelligence',
                'agricultural': 'agriculture',
                'educational': 'education',
                'dental': 'dentistry',
                'archaeological': 'archaeology',
                'mathematical': 'mathematics',
                'mathematica': 'mathematics',
                'matematico': 'mathematics',
                'mathematicae': 'mathematics',
                'economic': 'economics',
                'chemical': 'chemistry',
                'geophysical': 'geophysics',
                'botanical': 'botany',
                'physical': 'physics',
                'entomological': 'entomology', 
                'entomologist': 'entomology',
                'biological': 'biology',
                'geographical': 'geography',
                'geological': 'geology',
                'geographer': 'geography',
                'cells': 'cell',
                'policy': 'politics',
}

paper_df = pd.concat([pd.DataFrame(train_data), pd.DataFrame(test_data)])

# 定义预处理函数，进行自定义词映射
def preprocess_text(text):
    words = text.lower().split()  # 将文本小写并按空格分词
    mapped_words = [word_mapping[word] if word in word_mapping else word for word in words]
    filtered_words = [word for word in mapped_words]
    return ' '.join(filtered_words)

# 按 cluster_label 分组，将 OriginalVenue 文本合并为一个文档，并进行预处理
cluster_docs = (
    paper_df.groupby('cluster_label').parallel_apply(lambda x: preprocess_text(' '.join(x.abstract))).tolist()
)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3), Label(value='0 / 3'))), HBox(c…

In [None]:
# 自定义停用词
custom_stop_words = ['results', 'model', 'based', 'data', 'proposed', 'study', 'paper', 'results', 
                     'system', 'patients', 'using', 'method', 'health', 'two', 'article', 'la', 'que', 'en',
                     'el', 'et', 'un', 'one', 'also', 'high', 'properties', 'methods', 'among', 'new',
                     'sp', 'use', 'group', 'used', 'process', 'kg', 'abstract', 'different', 'time',
                     'treatment', 'analysis', 'different', 'performance', 'le', 'los', 'se', 'les',
                     'mm', 'may', 'de', 'del', 'des', 'nov', 'found', 'research', 'showed', 'las',
                     'development', 'years', 'da', 'studies', 'first', 'findings', 'di', 'however', 'three',
                     'associated', 'relationship', 'design', 'em', 'approach', 'risk', 'patient', 'care',
                     '95', 'age', 'ci', 'compared', '10', 'background', 'conclusions', 'significant',
                     'higher', 'vs', 'mortality', 'therapy', 'increased', 'significantly', '19', 'children',
                     'find', 'show', 'problem', 'mathrm', 'included', 'mean', 'systems', '12', 'effects',
                     'participants', 'related', 'cases', 'disease', 'levels', 'outcomes', 'non', 'total',
                     'factors', 'conclusion', 'nm', 'ra', 'year', 'dr'
                     ]

# 计算 TF-IDF
top_words_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english')+custom_stop_words)
tfidf_matrix = top_words_vectorizer.fit_transform(cluster_docs)

In [194]:
top_n = 10  # 每个聚类提取前 n 个关键词
# 获取每个聚类中最重要的关键词
top_words = {}
feature_names = top_words_vectorizer.get_feature_names_out()

# 确保 cluster_label 的顺序与 cluster_docs 一致
cluster_labels = sorted(paper_df['cluster_label'].unique())

for j, cluster_label in enumerate(cluster_labels):
    tfidf_scores = tfidf_matrix[j].toarray().flatten()
    top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # 获取前 n 个关键词的索引
    top_words[cluster_label] = [feature_names[idx] for idx in top_indices]

top_words

{np.int64(0): ['energy',
  'temperature',
  'heat',
  'flow',
  'water',
  'surface',
  'parameters',
  'effect',
  'experimental',
  'conditions'],
 np.int64(1): ['algorithm',
  'information',
  'learning',
  'image',
  'network',
  'models',
  'images',
  'features',
  'propose',
  'algorithms'],
 np.int64(2): ['social',
  'management',
  'firms',
  'knowledge',
  'value',
  'purpose',
  'work',
  'business',
  'information',
  'literature'],
 np.int64(3): ['power',
  'control',
  'algorithm',
  'network',
  'voltage',
  'energy',
  'frequency',
  'antenna',
  'low',
  'sensor'],
 np.int64(4): ['dogs',
  'cows',
  'animals',
  'cell',
  'cattle',
  'days',
  'clinical',
  'horses',
  'control',
  'animal'],
 np.int64(5): ['plant',
  'species',
  'soil',
  'acid',
  'plants',
  'growth',
  'content',
  'protein',
  'yield',
  'production'],
 np.int64(6): ['archaeology',
  'human',
  'holocene',
  'microwear',
  'pleistocene',
  'heritage',
  'sites',
  'cultural',
  'history',
  'cent

In [172]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

def preprocess_data(data, label_field, target_value=None):
    """
    对数据进行预处理，提取文本特征和标签。
    
    Args:
        data (list): 数据列表。
        label_field (str): 目标标签字段名（如 Scopus_label 或 cluster_label）。
        target_value (str or int): 目标分类的值（如 Medicine 或 1）。
    
    Returns:
        list: 文本数据列表。
        list: 标签数据列表。
    """
    texts = [item["abstract"] for item in data]
    if target_value is not None:
        labels = [1 if item[label_field] == target_value else 0 for item in data]
    else:
        labels = [item[label_field] for item in data]
    return texts, labels

def train_and_evaluate_with_tfidf(vectorizer, train_texts, train_labels, test_texts, test_labels):
    """
    使用预训练的 TF-IDF 进行向量化并训练 Random Forest 模型。
    
    Args:
        vectorizer (TfidfVectorizer): 已拟合的 TF-IDF 向量化器。
        train_texts (list): 训练文本数据。
        train_labels (list): 训练标签数据。
        test_texts (list): 测试文本数据。
        test_labels (list): 测试标签数据。
    """
    # 使用已拟合的 TF-IDF 进行向量化
    X_train = vectorizer.transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    # 随机森林分类器
    classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    classifier.fit(X_train, train_labels)

    # 预测与评估
    predictions = classifier.predict(X_test)
    print("Classification Report:")
    print(classification_report(test_labels, predictions))
    print("Test accuracy:", accuracy_score(test_labels, predictions))

In [175]:
# Baseline task: 使用 Scopus_label 进行分类 (Medicine vs Others)
print("\nBaseline task: Binary classification based on Scopus_label (Medicine vs Others)")
train_texts, train_labels = preprocess_data(train_data, label_field="Scopus_label", target_value=Scopus_label_map['Medicine'])
test_texts, test_labels = preprocess_data(test_data, label_field="Scopus_label", target_value=Scopus_label_map['Medicine'])

# 拟合 TF-IDF（仅在训练数据上）
print("Fitting TF-IDF vectorizer...")
vectorizer = TfidfVectorizer(max_features=10000, stop_words=stopwords.words('english'))
vectorizer.fit(train_texts)

print("Training and evaluating Random Forest classifier...")
train_and_evaluate_with_tfidf(vectorizer, train_texts, train_labels, test_texts, test_labels)


Baseline task: Binary classification based on Scopus_label (Medicine vs Others)
Fitting TF-IDF vectorizer...
Training and evaluating Random Forest classifier...
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.86      0.88      9965
           1       0.87      0.91      0.89     10035

    accuracy                           0.89     20000
   macro avg       0.89      0.89      0.89     20000
weighted avg       0.89      0.89      0.89     20000

Test accuracy: 0.88795


In [176]:
from collections import Counter

# 任务 2: 使用 cluster_label 进行分类 (每个类 vs 非当前类)
print("\nTask 2: Binary classification based on cluster_label (current category vs others)")
cluster_label_count = Counter([item["cluster_label"] for item in train_data + test_data])
print("All cluster labels and corresponding paper number:", cluster_label_count.most_common())
for tup in cluster_label_count.most_common():
    cluster = tup[0]
    paper_number = tup[1]
    print(f"\nCurrent target binary classification category: cluster_label = {cluster}")
    print("Top words:", top_words[cluster])
    print(f"Number of papers: {paper_number} ({paper_number/(cluster_label_count.total())*100} %)")
    train_texts, train_labels = preprocess_data(train_data, label_field="cluster_label", target_value=cluster)
    test_texts, test_labels = preprocess_data(test_data, label_field="cluster_label", target_value=cluster)
    # 复用已拟合的 TF-IDF
    train_and_evaluate_with_tfidf(vectorizer, train_texts, train_labels, test_texts, test_labels)


Task 2: Binary classification based on cluster_label (current category vs others)
All cluster labels and corresponding paper number: [(15, 17818), (22, 11816), (20, 11564), (23, 11481), (12, 6413), (8, 5264), (0, 5226), (21, 3708), (18, 3239), (10, 2911), (5, 2471), (3, 2354), (9, 2035), (16, 1890), (13, 1750), (19, 1533), (1, 1453), (2, 1267), (24, 1179), (11, 956), (7, 878), (4, 857), (17, 850), (25, 517), (14, 380), (6, 190)]

Current target binary classification category: cluster_label = 15
Top words: ['clinical', 'cell', 'blood', 'hospital', 'levels']
Number of papers: 17818 (17.818 %)
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92     16408
           1       0.81      0.29      0.43      3592

    accuracy                           0.86     20000
   macro avg       0.84      0.64      0.68     20000
weighted avg       0.86      0.86      0.83     20000

Test accuracy: 0.86125

Current target binary class

KeyboardInterrupt: 