In [5]:
import sys
sys.path.append("periodical-clustering")

from utils import get_data_dir

# Filter out papers between 2010-2021

In [None]:
import pandas as pd
import os

paper_df = pd.read_parquet(os.path.join(get_data_dir(), "MAG_paper.parquet"))

data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [41]:
paper_df

Unnamed: 0_level_0,Year,DocType,VenueID
PaperID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3206416479,1800,Journal,2360835
3205676634,1800,Journal,148921865
3092044961,1800,Journal,3006142753
3092547797,1800,Journal,3006142753
2895498877,1800,Journal,118082279
...,...,...,...
3212273925,2022,Journal,44455300
3175031963,2022,Journal,2764413287
3198241111,2022,Journal,67716761
3202490341,2022,Journal,166541267


In [45]:
paper_df[(paper_df['Year'] >= 2010) & (paper_df['Year'] <= 2021)]

Unnamed: 0_level_0,Year,DocType,VenueID
PaperID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2049933365,2010,Journal,80951755
2038148770,2010,Journal,119525064
2373199189,2010,Journal,2764482698
2289331308,2010,Journal,2764425571
2784227654,2010,Journal,77020770
...,...,...,...
3185296615,2021,Journal,8391440
3165199696,2021,Journal,120683614
3212184219,2021,Journal,157451995
3193186508,2021,Journal,2764944466


In [47]:
paper_df[(paper_df['Year'] >= 2010) & (paper_df['Year'] <= 2021)].to_parquet(os.path.join(get_data_dir(), 
                                                                                          "2010s", "MAG_paper.parquet"))

data_dir:  /home/lyuzhuoqi/projects/clustering/data


# Process abstracts

In [2]:
import os
import ijson
import json
from tqdm import tqdm
import pandas as pd
from typing import Dict, Any

class DataPreprocessor:
    def __init__(self, data_dir: str):
        self.data_dir = data_dir
        self.abstracts_dir = os.path.join(data_dir, '2010s/classification_tasks/abstracts')
        self.labels_dir = os.path.join(data_dir, '2010s/classification_tasks/labels')
        os.makedirs(self.abstracts_dir, exist_ok=True)
        os.makedirs(self.labels_dir, exist_ok=True)

    def process_abstracts(self, mag_abstracts_path: str, cluster_df: pd.DataFrame,
                        paper_to_venue: Dict[str, Any]) -> None:
        """提取并保存论文摘要"""
        print("Filtering valid papers...")
        # 将cluster_df的index转换为集合，这样查找更快
        valid_venues = set(cluster_df.index)
        # 使用集合推导式，比列表推导式更快
        valid_pid = {pid for pid, vid in paper_to_venue.items() if vid in valid_venues}
        print(f"valid_pid: {len(valid_pid)}")

        # 获取文件大小用于进度条
        file_size = os.path.getsize(mag_abstracts_path)
        matched_count = 0
        
        # 准备输出路径
        output_path = os.path.join(self.abstracts_dir, 'paper_abstracts.json')
        # 使用批量处理提高写入效率
        batch = {}
        batch_size = 50000
        with open(mag_abstracts_path, 'r', encoding='utf-8') as input_file, \
            open(output_path, 'w', encoding='utf-8') as output_file:
            
            # 初始化进度条
            with tqdm(total=file_size, desc="Processing abstracts", 
                    unit="B", unit_scale=True) as pbar:
                
                parser = ijson.kvitems(input_file, "")  # 解析顶层键值对
                batch = []
                batch_size = 50000  # 增大批量写入规模
                counter = 0  # 控制进度条更新频率

                for paper_id, abstract in parser:
                    counter += 1
                    if counter % 10000 == 0:  # 每 1000 条更新一次进度条
                        pbar.update(input_file.tell() - pbar.n)

                    try:
                        paper_id = int(paper_id)
                        if paper_id in valid_pid:
                            batch.append({'PaperID': paper_id,
                                          'abstract':abstract})
                            matched_count += 1
                    except ValueError:
                        continue

                    if len(batch) >= batch_size:
                        output_file.write("\n".join(json.dumps(item) for item in batch) + "\n")
                        batch = []

                if batch:  # 写入剩余的批量数据
                    output_file.write("\n".join(json.dumps(item) for item in batch) + "\n")
        print(f"abstracts: {matched_count}")
        print(f"Saved to {output_path}")

    def extract_labels(self, cluster_df: pd.DataFrame, 
                      paper_to_venue: Dict[int, int]) -> None:
        """为每种聚类方法提取并保存标签"""
        # 获取所有标签列
        label_columns = [
            # 'Scopus_label', 
            # 'movMF_label', 
            # 'kmeans_label', 
            # 'skm_label',
            # 'n2v_kmeans_label',
            # 'cm_kmeans_label',
            # 'bert_kmeans_label',
            # 'gnn_kmeans_label',
            'cocm_kmeans_label']  # 添加所需的标签列
        
        for label_col in label_columns:
            print(f"Processing {label_col}...")
            # 构建PaperID到标签的映射
            venue_to_label = cluster_df[label_col].to_dict()
            paper_labels = {
                pid: venue_to_label[vid] 
                for pid, vid in paper_to_venue.items() 
                if vid in venue_to_label
            }
            
            # 保存标签
            output_path = os.path.join(
                self.labels_dir, 
                f"{label_col.lower()}.json"
            )
            with open(output_path, 'w') as f:
                json.dump(paper_labels, f)
            print(f"Saved to {output_path}")

In [6]:
data_dir = get_data_dir()
MAG_paper_df = pd.read_parquet(os.path.join(get_data_dir(), "2010s","MAG_paper.parquet"))
paper_to_venue = MAG_paper_df['VenueID'].to_dict()
paper_to_venue

data_dir:  /home/zqlyu2/projects/periodical-clustering/data
data_dir:  /home/zqlyu2/projects/periodical-clustering/data


{2049933365: 80951755,
 2038148770: 119525064,
 2373199189: 2764482698,
 2289331308: 2764425571,
 2784227654: 77020770,
 3032776793: 2764659248,
 2359024845: 2764610422,
 2090765587: 2755434998,
 1982732226: 183492911,
 2385836617: 2764593375,
 2809173448: 2764863013,
 3140293050: 2622093537,
 2375443784: 2764554180,
 1987591524: 202680183,
 2071864527: 190066210,
 2040405307: 105183386,
 1986439752: 1980519,
 2153779033: 91660768,
 2078947818: 201081534,
 2024704296: 1135705775,
 2392142542: 2764554180,
 2051541198: 59479367,
 2058138512: 111727011,
 2357419162: 2764810065,
 3029664364: 2764697616,
 2022427143: 69957782,
 2143932387: 204931651,
 2353867085: 2764638780,
 2116213074: 115312524,
 2374396473: 2764819181,
 3020209503: 1191630588,
 2070406492: 6147291,
 2080860174: 152760256,
 1986865494: 113170167,
 2027925926: 1192710900,
 2089901363: 2915058201,
 2394031763: 2764453096,
 1992202857: 15484759,
 2089307567: 111155417,
 2080680783: 104917558,
 2357501125: 2764598783,
 20670

In [7]:
cluster_df = pd.read_parquet(os.path.join(get_data_dir(), "2010s/clustering_results","cluster_df.parquet"))

data_dir:  /home/zqlyu2/projects/periodical-clustering/data


## Make abstracts

In [9]:
preprocessor = DataPreprocessor(data_dir)

In [None]:
preprocessor.process_abstracts(os.path.join(data_dir,'MAG_abstracts.json'), cluster_df, paper_to_venue)

Filtering valid papers...
valid_pid: 29309324


Processing abstracts: 100%|███████████████████████████████████████████████████████████████████████████████████▉| 177G/177G [07:38<00:00, 385MB/s]

abstracts: 23322430
Saved to /home/lyuzhuoqi/projects/clustering/data/2010s/classification_tasks/abstracts/paper_abstracts.json





In [25]:
import json

def read_lines(file_path, n: int=2):
    """
    从 Flat JSON 文件中读取前两行并打印解析结果。
    
    Args:
        file_path (str): Flat JSON 文件路径（每行一个 JSON 对象）。
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if idx < n:  # 只处理前n行
                try:
                    parsed_line = json.loads(line.strip())  # 解析 JSON 对象
                    print(json.dumps(parsed_line, indent=4, ensure_ascii=False))  # 美化输出
                except json.JSONDecodeError as e:
                    print(f"解析失败：{e}, 原始数据：{line.strip()}")
            else:
                break

In [59]:
output_path = os.path.join(get_data_dir(), "2010s/classification_tasks/abstracts", "paper_abstracts.json")
read_lines(output_path, 20)

data_dir:  /home/lyuzhuoqi/projects/clustering/data
{
    "PaperID": 2055207701,
    "abstract": "Abstract Lake Trout Salvelinus namaycush were extirpated from Lake Michigan by the early 1950s, and as part of an effort to restore naturally reproducing populations, hatchery-reared fish have been stocked since the early 1960s. Stocked fish are marked with a fin clip to differentiate them from wild, lake-produced Lake Trout; marking error for the 2007–2010 year-classes of Lake Trout stocked by federal hatcheries averaged 3.0%. Egg deposition, emergent fry, and wild juvenile Lake Trout have previously been observed, but no sustained wild recruitment has been measured in assessment surveys or in sport and commercial fishery catches. In 2011 and 2012, we caught juvenile Lake Trout in gill-net and bottom trawl catches that were targeting Bloater Coregonus hoyi in water depths greater than 80 m. Unclipped, wild Lake Trout represented 20% of all Lake Trout caught in a southern offshore region o

## Make labels

In [10]:
preprocessor.extract_labels(cluster_df, paper_to_venue)

Processing cocm_kmeans_label...
Saved to /home/zqlyu2/projects/periodical-clustering/data/2010s/classification_tasks/labels/cocm_kmeans_label.json


# Statistics

## All papers

In [16]:
data_dir = get_data_dir()
import os
import pandas as pd
MAG_paper_df = pd.read_parquet(os.path.join(get_data_dir(), "2010s","MAG_paper.parquet"))
cluster_df = pd.read_parquet(os.path.join(get_data_dir(), "2010s/clustering_results","cluster_df.parquet"))

data_dir:  /home/lyuzhuoqi/projects/clustering/data
data_dir:  /home/lyuzhuoqi/projects/clustering/data
data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [17]:
cluster_df

Unnamed: 0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label,cm_kmeans_label,gnn_kmeans_label,bert_kmeans_label
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,17,0.628846,20,0.444711,24,3,21,0,0
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,17,0.735654,20,0.559494,24,20,21,0,3
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,17,0.705024,20,0.550081,24,20,21,0,0
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,8,0.724859,17,0.610582,24,20,21,0,3
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,11,0.495787,12,0.198758,14,4,24,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,18,0.542531,14,0.251015,8,18,1,19,1
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,13,0.251599,3,0.056642,21,5,1,14,1
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,13,0.307089,3,0.128660,21,5,1,14,1
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,21,0.430040,16,0.207435,10,5,1,10,1


In [18]:
MAG_paper_df

Unnamed: 0_level_0,Year,DocType,VenueID
PaperID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2049933365,2010,Journal,80951755
2038148770,2010,Journal,119525064
2373199189,2010,Journal,2764482698
2289331308,2010,Journal,2764425571
2784227654,2010,Journal,77020770
...,...,...,...
3185296615,2021,Journal,8391440
3165199696,2021,Journal,120683614
3212184219,2021,Journal,157451995
3193186508,2021,Journal,2764944466


In [19]:
MAG_paper_df.merge(cluster_df['Scopus_label'], left_on='VenueID', right_index=True)

Unnamed: 0_level_0,Year,DocType,VenueID,Scopus_label
PaperID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2038148770,2010,Journal,119525064,Agricultural and Biological Sciences
2289331308,2010,Journal,2764425571,Agricultural and Biological Sciences
2784227654,2010,Journal,77020770,Computer Science
3032776793,2010,Journal,2764659248,Medicine
1982732226,2010,Journal,183492911,Materials Science
...,...,...,...,...
3138140877,2021,Journal,196821226,Computer Science
3185296615,2021,Journal,8391440,Medicine
3165199696,2021,Journal,120683614,Computer Science
3212184219,2021,Journal,157451995,"Biochemistry, Genetics and Molecular Biology"


In [20]:
label_df = MAG_paper_df.merge(cluster_df['Scopus_label'], left_on='VenueID', right_index=True)
label_counts = label_df['Scopus_label'].value_counts()
total = len(label_df)
percentage_df = pd.DataFrame({
    'Count': label_counts,
    'Percentage': (label_counts / total * 100).round(2)
})
percentage_df.loc['In total', 'Count'] = total
percentage_df.loc['In total', 'Percentage'] = 100
percentage_df.to_csv(os.path.join(data_dir, "2010s", "MAG_paper_Scopus_label_distribution.csv"))
percentage_df


Unnamed: 0_level_0,Count,Percentage
Scopus_label,Unnamed: 1_level_1,Unnamed: 2_level_1
Medicine,8879364.0,30.3
Engineering,2152101.0,7.34
Social Sciences,1862276.0,6.35
Physics and Astronomy,1670077.0,5.7
Agricultural and Biological Sciences,1658349.0,5.66
Chemistry,1525236.0,5.2
"Biochemistry, Genetics and Molecular Biology",1482029.0,5.06
Materials Science,1350517.0,4.61
Arts and Humanities,873076.0,2.98
Mathematics,763526.0,2.61


## Paper with abstract

In [4]:
import json
import os

with open(os.path.join(get_data_dir(), "2010s/classification_tasks/labels", "scopus_label.json"), 
          'r', encoding='utf-8') as file:
        loaded_labels = json.load(file)

data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [5]:
labels = {}
for key, value in loaded_labels.items():
    converted_key = int(key)
    labels[converted_key] = value
del loaded_labels

In [6]:
labels

{2038148770: 'Agricultural and Biological Sciences',
 2289331308: 'Agricultural and Biological Sciences',
 2784227654: 'Computer Science',
 3032776793: 'Medicine',
 1982732226: 'Materials Science',
 1987591524: 'Chemical Engineering',
 2071864527: 'Social Sciences',
 2040405307: 'Biochemistry, Genetics and Molecular Biology',
 1986439752: 'Physics and Astronomy',
 2153779033: 'Agricultural and Biological Sciences',
 2078947818: 'Multidisciplinary',
 2051541198: 'Engineering',
 2058138512: 'Medicine',
 2022427143: 'Agricultural and Biological Sciences',
 2143932387: 'Chemistry',
 2116213074: 'Materials Science',
 2070406492: 'Immunology and Microbiology',
 2080860174: 'Chemistry',
 1986865494: 'Chemistry',
 2089307567: 'Chemistry',
 2080680783: 'Social Sciences',
 2067038653: 'Agricultural and Biological Sciences',
 2001170778: 'Biochemistry, Genetics and Molecular Biology',
 1986621857: 'Medicine',
 2064995156: 'Earth and Planetary Sciences',
 1581063856: 'Medicine',
 2052373745: 'Mate

In [7]:
1967969092 in labels

True

In [8]:
import os
from tqdm import tqdm
import json
abstracts_path = os.path.join(os.path.join(get_data_dir(), '2010s/classification_tasks/abstracts/paper_abstracts.json'))

print("Loading abstracts...")
abstracts = {}
with open(abstracts_path, 'r') as f:
    for line in tqdm(f):
        try:
            paper_data = json.loads(line.strip())
            pid = paper_data['PaperID']
            abstracts[pid] = paper_data['abstract']
        except (json.JSONDecodeError, KeyError) as e:
            continue
print(f"Loaded {len(abstracts)} abstracts")

data_dir:  /home/lyuzhuoqi/projects/clustering/data
Loading abstracts...


23322430it [03:12, 120962.20it/s]


Loaded 23322430 abstracts


In [9]:
data = {}
for PaperID in tqdm(abstracts):
    if PaperID in labels:
        data[PaperID] = labels[PaperID]

del abstracts
del labels

100%|████████████████████████████████████████████████████████████████████████████████████████████| 23322430/23322430 [02:22<00:00, 163501.53it/s]


In [10]:
data

{2055207701: 'Agricultural and Biological Sciences',
 2055207717: 'Medicine',
 2055207742: 'Social Sciences',
 2055207772: 'Medicine',
 2055207790: 'Immunology and Microbiology',
 2055207792: 'Immunology and Microbiology',
 2055207803: 'Nursing',
 2055207810: 'Chemistry',
 2055207824: 'Medicine',
 2055207847: 'Physics and Astronomy',
 2055207878: 'Physics and Astronomy',
 2055207889: 'Medicine',
 2055207947: 'Chemical Engineering',
 2055207960: 'Multidisciplinary',
 2055207977: 'Biochemistry, Genetics and Molecular Biology',
 2055207981: 'Chemistry',
 2055208009: 'Engineering',
 2055208066: 'Multidisciplinary',
 2055208080: 'Social Sciences',
 2055208095: 'Medicine',
 2055208100: 'Mathematics',
 2055208137: 'Immunology and Microbiology',
 2055208159: 'Materials Science',
 2055208178: 'Social Sciences',
 2055208198: 'Social Sciences',
 2055208231: 'Engineering',
 2055208235: 'Medicine',
 2055208283: 'Biochemistry, Genetics and Molecular Biology',
 2055208284: 'Agricultural and Biologica

In [12]:
import pandas as pd
with_abs_labels = pd.DataFrame(data.items(), columns=['PaperID', 'Scopus_label'])
del data
with_abs_labels

Unnamed: 0,PaperID,Scopus_label
0,2055207701,Agricultural and Biological Sciences
1,2055207717,Medicine
2,2055207742,Social Sciences
3,2055207772,Medicine
4,2055207790,Immunology and Microbiology
...,...,...
23322425,2726882176,Multidisciplinary
23322426,2726882179,Social Sciences
23322427,2726882219,Materials Science
23322428,2726882240,"Business, Management and Accounting"


In [13]:
with_abs_labels.set_index('PaperID', inplace=True)
with_abs_labels.to_parquet(os.path.join(get_data_dir(), "2010s","classification_tasks",  "paper_with_abs_scopus.parquet"))

data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [15]:
with_abs_label_counts = with_abs_labels.value_counts()
with_abs_total = len(with_abs_labels)
with_abs_percentage_df = pd.DataFrame({
    'Count': with_abs_label_counts,
    'Percentage': (with_abs_label_counts / with_abs_total * 100).round(2)
})
with_abs_percentage_df.loc['In total', 'Count'] = with_abs_total
with_abs_percentage_df.loc['In total', 'Percentage'] = 100
with_abs_percentage_df.to_csv(os.path.join(get_data_dir(), "2010s", "MAG_paper_with_abstract_Scopus_label_distribution.csv"))
with_abs_percentage_df

data_dir:  /home/lyuzhuoqi/projects/clustering/data


Unnamed: 0_level_0,Count,Percentage
Scopus_label,Unnamed: 1_level_1,Unnamed: 2_level_1
Medicine,6411733.0,27.49
Engineering,1813378.0,7.78
Physics and Astronomy,1479608.0,6.34
Agricultural and Biological Sciences,1393633.0,5.98
Social Sciences,1372120.0,5.88
Chemistry,1370159.0,5.87
"Biochemistry, Genetics and Molecular Biology",1261428.0,5.41
Materials Science,1158695.0,4.97
Mathematics,671648.0,2.88
Environmental Science,663621.0,2.85


## Compare

In [21]:
level0 = ['Dataset', 'Dataset', 'MAG', 'MAG']
level1 = ['Count', 'Percentage', 'Count', 'Percentage']

# 创建MultiIndex
cols = pd.MultiIndex.from_arrays([level0, level1])

label_distri_df = with_abs_percentage_df.merge(percentage_df,on='Scopus_label', suffixes=('_dataset', '_MAG'))
label_distri_df.columns = cols
label_distri_df

Unnamed: 0_level_0,Dataset,Dataset,MAG,MAG
Unnamed: 0_level_1,Count,Percentage,Count,Percentage
Scopus_label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Medicine,6411733.0,27.49,8879364.0,30.3
Engineering,1813378.0,7.78,2152101.0,7.34
Physics and Astronomy,1479608.0,6.34,1670077.0,5.7
Agricultural and Biological Sciences,1393633.0,5.98,1658349.0,5.66
Social Sciences,1372120.0,5.88,1862276.0,6.35
Chemistry,1370159.0,5.87,1525236.0,5.2
"Biochemistry, Genetics and Molecular Biology",1261428.0,5.41,1482029.0,5.06
Materials Science,1158695.0,4.97,1350517.0,4.61
Mathematics,671648.0,2.88,763526.0,2.61
Environmental Science,663621.0,2.85,760812.0,2.6


In [22]:
label_distri_df.to_excel(os.path.join(data_dir, "2010s", "paper_scopus_label_distribution.xlsx"))