In [1]:
import shutil

import os
import json
import faiss  # pip install faiss-cpu

from bge import BGE

full_dataset = "./arxiv/arxiv-metadata-oai-snapshot.json"
selected_dataset = "./arxiv/arxiv-metadata-cscl.json"
CATEGORY2IDS = "./datasets/arxiv_category2ids.json"

### 下载完整元数据并节选出cs.CL

In [None]:
# Download latest version  https://www.kaggle.com/api/v1/datasets/download/Cornell-University/arxiv?dataset_version_number=233
# or > pip install kagglehub
import kagglehub

# dataset_download returns the local directory that holds the downloaded files.
path = kagglehub.dataset_download("Cornell-University/arxiv")

print("Path to dataset files:", path)

dest_path = "./arxiv"
os.makedirs(dest_path, exist_ok=True)
for file_name in os.listdir(path):
    source_item = os.path.join(path, file_name)
    target_item = os.path.join(dest_path, file_name)
    # shutil.move() into a directory raises shutil.Error when an entry with the
    # same name is already there, which made this cell fail on a re-run after a
    # fresh download. Drop the stale copy first so the cell stays re-runnable.
    if os.path.isdir(target_item) and not os.path.islink(target_item):
        shutil.rmtree(target_item)
    elif os.path.lexists(target_item):
        os.remove(target_item)
    shutil.move(source_item, target_item)

print("Dataset files moved to:", dest_path)

In [None]:
# Select the metadata records that are tagged with a given category
def select(file_path, output_path, target="cs.CL"):
    """Filter a JSON Lines metadata dump down to the records tagged with ``target``.

    Args:
        file_path: Path of the input file, one JSON object per line.
        output_path: Path of the JSON file that receives the selected records
            as a single JSON array.
        target: Category label to look for. It must match one whole
            whitespace-separated token of the record's ``categories`` string.

    Returns:
        The list of selected records (dicts), in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    selected_data = []
    with open(file_path, "r", encoding="utf-8") as file:
        # The kagglehub dataset ships as JSON Lines
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            record = json.loads(line)
            # Compare against whole labels: a plain substring test would also
            # accept labels that merely contain the target, such as
            # "astro-ph" inside "astro-ph.GA".
            categories = (record.get("categories") or "").split()
            if target in categories:
                selected_data.append(record)

    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(selected_data, output_file, ensure_ascii=False)

    print(f"共找到 {len(selected_data)} 条记录，已保存到 {output_path}")

    return selected_data


# Keep only the records tagged with cs.CL
selected_data = select(
    file_path=full_dataset,
    output_path=selected_dataset,
    target="cs.CL",
)

### 对cs.CL元数据作进一步处理，分出过滤表与向量表

前者（需要附带向量索引）负责过滤：领域筛选（在预处理阶段直接做好，多做一些冗余分表）与年份范围筛选（预先排序，用一次 `lower_bound` 二分定位下界，再顺序取到上界为止）；后者负责相似性搜索（Top 2N），搜索结果重新映射回过滤表，并剔除不在候选集中的部分。

In [1]:
import bisect

def lower_bound(sorted_list, value):
    """Return the leftmost index at which ``value`` can be inserted into
    ``sorted_list`` while keeping it sorted (the index of the first element
    that is not less than ``value``)."""
    insertion_index = bisect.bisect_left(sorted_list, value)
    return insertion_index
def upper_bound(sorted_list, value):
    """Return the rightmost index at which ``value`` can be inserted into
    ``sorted_list`` while keeping it sorted (the index of the first element
    that is greater than ``value``)."""
    insertion_index = bisect.bisect_right(sorted_list, value)
    return insertion_index

# Quick sanity check: 3 is absent from the list, so its insertion point is index 2.
demo_position = lower_bound([1, 2, 4, 5], 3)
print("lower_bound:", demo_position)

lower_bound: 2


总共包含`142`种细分类

In [3]:
# Reload the filtered records from the JSON file at `selected_dataset`.
with open(selected_dataset, mode="r", encoding="utf-8") as source:
    selected_data = json.loads(source.read())

In [9]:
# Build a slimmed-down lookup table keyed by record id, keeping only the fields
# listed below plus the creation date of the record's first version ("v1").
extracted_data = {}
for record in selected_data:
    extracted_data[record["id"]] = {
        "title": record.get("title"),
        "doi": record.get("doi"),
        "categories": record.get("categories"),
        "abstract": record.get("abstract"),
        # None when the record has no "v1" entry (or no versions at all).
        "publish_date": next(
            (
                version.get("created")
                for version in (record.get("versions") or [])
                if version.get("version") == "v1"
            ),
            None,
        ),
    }

# This cell can run before any other cell has created ./datasets, in which case
# open(..., "w") would fail with FileNotFoundError; make sure it exists.
os.makedirs("./datasets", exist_ok=True)
with open("./datasets/simplify_data.json", "w", encoding="utf-8") as output_file:
    json.dump(extracted_data, output_file, indent=4, ensure_ascii=False)

{'title': 'Natural Language Processing (almost) from Scratch', 'doi': None, 'categories': 'cs.LG cs.CL', 'abstract': '  We propose a unified neural network architecture and learning algorithm that\ncan be applied to various natural language processing tasks including:\npart-of-speech tagging, chunking, named entity recognition, and semantic role\nlabeling. This versatility is achieved by trying to avoid task-specific\nengineering and therefore disregarding a lot of prior knowledge. Instead of\nexploiting man-made input features carefully optimized for each task, our\nsystem learns internal representations on the basis of vast amounts of mostly\nunlabeled training data. This work is then used as a basis for building a\nfreely available tagging system with good performance and minimal computational\nrequirements.\n', 'publish_date': 'Wed, 2 Mar 2011 11:34:50 GMT'}


In [14]:
from collections import defaultdict
from datetime import datetime, timezone

# Map each v1 submission timestamp (epoch seconds) to the ids created at that instant
date_to_ids = defaultdict(list)
for record in selected_data:
    publish_date = next(
        (
            version.get("created")
            for version in (record.get("versions") or [])
            if version.get("version") == "v1"
        ),
        None,
    )
    if publish_date:
        # strptime's %Z accepts "GMT" but still yields a naive datetime, and a
        # naive .timestamp() is interpreted in the machine's local time zone,
        # so the stored epochs used to shift with the local UTC offset.
        # Attach UTC explicitly so the result is the same on every machine.
        # NOTE(review): assumes every "created" string is expressed in GMT, as
        # in the samples printed in this notebook — confirm.
        created_at = datetime.strptime(publish_date, "%a, %d %b %Y %H:%M:%S %Z")
        timestamp = created_at.replace(tzinfo=timezone.utc).timestamp()
        date_to_ids[timestamp].append(record["id"])

# Sort by timestamp while the keys are still floats; json.dump then writes
# them out as string keys in that order.
date_to_ids = dict(sorted(date_to_ids.items()))
# Make sure the output directory exists when this cell runs on a fresh checkout.
os.makedirs("./datasets", exist_ok=True)
with open("./datasets/date_to_ids.json", "w", encoding="utf-8") as output_file:
    json.dump(date_to_ids, output_file, indent=4, ensure_ascii=False)

In [10]:
# Read the simplified table back from disk and print one entry as a spot check.
with open("./datasets/simplify_data.json", mode="r", encoding="utf-8") as source:
    extracted_data = json.loads(source.read())

print(extracted_data["cs/9912006"])

{'title': 'Resolution of Verb Ellipsis in Japanese Sentence using Surface\n  Expressions and Examples', 'doi': None, 'categories': 'cs.CL', 'abstract': '  Verbs are sometimes omitted in Japanese sentences. It is necessary to recover\nomitted verbs for purposes of language understanding, machine translation, and\nconversational processing. This paper describes a practical way to recover\nomitted verbs by using surface expressions and examples. We experimented the\nresolution of verb ellipses by using this information, and obtained a recall\nrate of 73% and a precision rate of 66% on test sentences.\n', 'publish_date': 'Mon, 13 Dec 1999 05:19:46 GMT'}


In [23]:
# Category -> ids
category_to_ids = {}
for record in selected_data:
    doc_id = record.get("id")
    labels = record.get("categories", "").split()
    is_cross_domain = len(labels) > 1

    # cs.CL is special-cased: its bucket only holds single-category papers
    if not is_cross_domain and "cs.CL" in labels:
        category_to_ids.setdefault("cs.CL", []).append(doc_id)
        continue

    # All other labels; the cs.CL label itself is skipped on cross-domain papers
    for label in labels:
        if is_cross_domain and label == "cs.CL":
            continue
        category_to_ids.setdefault(label, []).append(doc_id)


# Same mapping, ordered from the largest bucket to the smallest
sorted_category_to_ids = {
    name: category_to_ids[name]
    for name in sorted(
        category_to_ids,
        key=lambda name: len(category_to_ids[name]),
        reverse=True,
    )
}

# Show the two buckets that were created last
for category, ids in list(category_to_ids.items())[-2:]:
    print(f"{category}: {ids}")

# Every record should land in at least one bucket: compare the number of
# distinct ids across all buckets with the number of input records
unique_ids = set().union(*category_to_ids.values())
print(f"处理正确：{len(selected_data)==len(unique_ids)}")

output_dir = os.path.dirname(CATEGORY2IDS)
os.makedirs(output_dir, exist_ok=True)
with open(CATEGORY2IDS, mode="w", encoding="utf-8") as f:
    json.dump(sorted_category_to_ids, f, ensure_ascii=False, indent=4)

adap-org: ['cs/9902027']
physics.class-ph: ['physics/0307117']
处理正确：True


In [3]:
def preprocess(records):
    """Turn metadata records into parallel lists of ids and embedding texts.

    Each text is built from the stripped ``title``, ``authors`` and ``abstract``
    fields, one ``name: value`` line per field. Categories and publication
    time are deliberately left out of the text (they are handled elsewhere).

    Args:
        records: Iterable of dicts with ``id``, ``title``, ``authors`` and
            ``abstract`` keys.

    Returns:
        A tuple ``(ids, texts)`` of two lists aligned by position.
    """
    ids = []
    texts = []
    for entry in records:
        ids.append(entry["id"])

        title = entry["title"].strip()
        authors = entry["authors"].strip()
        abstract = entry["abstract"].strip()
        texts.append(f"title: {title}\nauthors: {authors}\nabstract: {abstract}\n")
    return ids, texts

In [None]:
%git clone https://huggingface.co/BAAI/bge-base-en-v1.5 ./models/bge-base-en-v1.5

In [5]:
ids, texts = preprocess(selected_data)
print("[INFO] Preprocessing complete. Number of records:", len(texts))

# Save the id list so that retrieval results can be mapped back to records
os.makedirs("./datasets", exist_ok=True)  # the directory may not exist on a fresh run
with open("./datasets/bge_id_map_cscl.json", "w", encoding="utf-8") as f:
    json.dump(ids, f)
# Only the id map is written in this cell, so the log message says just that
# (the previous message also claimed a FAISS index had been saved).
print("[INFO] ID map saved.")

# Preview the first record; `doc_id` avoids shadowing the builtin `id`.
for doc_id, text in zip(ids[:1], texts[:1]):
    print(f"\nID: {doc_id}\nText:\n{text}\n")

[INFO] Preprocessing complete. Number of records: 83334
[INFO] FAISS index and ID map saved.

ID: 0704.2083
Text:
title: Introduction to Arabic Speech Recognition Using CMUSphinx System
authors: H. Satori, M. Harti and N. Chenfour
abstract: In this paper Arabic was investigated from the speech recognition problem
point of view. We propose a novel approach to build an Arabic Automated Speech
Recognition System (ASR). This system is based on the open source CMU Sphinx-4,
from the Carnegie Mellon University. CMU Sphinx is a large-vocabulary;
speaker-independent, continuous speech recognition system based on discrete
Hidden Markov Models (HMMs). We build a model using utilities from the
OpenSource CMU Sphinx. We will demonstrate the possible adaptability of this
system to Arabic voice recognition.




In [6]:
import numpy as np

# Embed every preprocessed text with the project's BGE wrapper and persist the
# stacked result as a single .npy file.
# NOTE(review): embed_texts presumably returns one vector per input text, in
# input order — confirm against the bge module.
bge = BGE()
vectors = bge.embed_texts(texts)
np.save(file="./datasets/bge_vectors_cscl.npy", arr=np.vstack(vectors))
print("[INFO] Text embedding complete. Number of vectors:", len(vectors))

生成嵌入中: 100%|██████████| 83334/83334 [4:36:33<00:00,  5.02it/s]  


[INFO] Text embedding complete. Number of vectors: 83334


In [12]:
# Reload the saved embeddings as a float32 copy.
vectors = np.load("./datasets/bge_vectors_cscl.npy").astype(np.float32)
print("[INFO] Vectors loaded. Shape:", vectors.shape)

[INFO] Vectors loaded. Shape: (83334, 768)


In [13]:
# IVF_PQ index: PQ speeds up retrieval and reduces memory usage (at a slight cost in accuracy)
# The vector dimensionality is taken from the second axis of the loaded matrix.
d = vectors.shape[1]
# Flat L2 index passed to IndexIVFPQ as its quantizer.
quantizer = faiss.IndexFlatL2(d)
# NOTE(review): the positional arguments after `d` are presumably nlist=300,
# M=64 sub-quantizers and nbits=16 bits per code — confirm against the FAISS API.
# NOTE(review): if the last argument is nbits, 16 bits means 2**16 centroids per
# sub-quantizer, which is close to the 83334 vectors reported when they were
# loaded; check whether 8 was intended.
idx = faiss.IndexIVFPQ(quantizer, d, 300, 64, 16)
# The index is trained on the same vectors that are then added to it.
idx.train(vectors)
idx.add(vectors)

# Persist the populated index to disk.
faiss.write_index(idx, "./datasets/faiss_cscl_ivfpq.idx")