# Load test data and model-generated annotations

In [72]:
import pandas as pd
from pathlib import Path
home = Path.home()

# Models whose generated annotations are loaded, one DataFrame column per model.
# models = ['qwen3', 'gemma3', 'llama4', 'qwq']
models = ['llama4', 'gemma3', 'qwen3']

# Optional filename suffix selecting a variant of the generated files;
# None loads the plain `{model}.txt` files.
suffixes = None
# suffixes = '_sent_shuffle'
# suffixes = '_tail'

annotation_dir = home / 'projects/TLDR/data/paper_html_10.1038/abs_annotation'
# One path template covers both cases: "no suffix" is simply the empty suffix.
suffix = suffixes if suffixes is not None else ''
csv_files = [annotation_dir / f'generated_annotations/{model}{suffix}.txt' for model in models]

df = pd.read_csv(annotation_dir / 'test.tsv', sep='\t')
for model, csv_file in zip(models, csv_files):
    with open(csv_file, encoding='utf-8') as f:
        # One generated annotation per line; only the trailing newline is removed.
        single_df = pd.DataFrame({model: [line.rstrip('\n') for line in f]})
    print(f"Loaded {model} data with shape: {single_df.shape}")
    if len(single_df) != len(df):
        # join() aligns on the default positional index, so a different line
        # count means missing rows become NaN and surplus lines are discarded.
        print(f"Warning: {model} has {len(single_df)} lines but the test set has {len(df)} rows; join() will pad or truncate.")
    df = df.join(single_df)

invalid_lines = pd.read_csv(home / "projects/TLDR/description/invalid_entry_in_test.txt", sep='\t', header=None).values.flatten().tolist()
# The listed numbers are shifted by 2 to obtain row labels -- presumably they are
# 1-based line numbers of test.tsv including its header line (TODO confirm).
# Dropping all labels in one call avoids copying the frame once per invalid row
# and tolerates a line number that is listed twice.
df = df.drop(index=[line_no - 2 for line_no in invalid_lines])
df

Loaded llama4 data with shape: (35636, 1)
Loaded gemma3 data with shape: (35636, 1)
Loaded qwen3 data with shape: (35636, 1)


Unnamed: 0,doi,paper_id,abstract,annotation,llama4,gemma3,qwen3
0,10.1073/pnas.91.7.2757,107202074,The origin and taxonomic status of domesticate...,A demonstration that cattle have been domestic...,This study suggests that there were two distin...,This study refutes the single origin of domest...,mtDNA analysis reveals ancient divergence betw...
1,10.1093/genetics/154.4.1785,83366887,Abstract The domestic pig originates from the ...,Evidence is presented for independent domestic...,"This study, among others, provides evidence of...",This study provides evidence for independent d...,This study demonstrates independent pig domest...
2,10.1073/pnas.96.16.9252,122095374,We previously mapped a quantitative trait locu...,This paper shows how the identity-by-descent a...,The study describes the fine-mapping approach ...,This study used fine-mapping methods to identi...,This QTL study identifies a 5cM bovine chromos...
3,10.1101/gr.10.2.220,100831446,A genome-wide linkage disequilibrium (LD) map ...,The pattern of linkage disequilibrium (LD) acr...,This study demonstrated that linkage disequili...,Reference 35 reports long-range LD in Dutch bl...,"""Genome-wide analysis of Dutch Black-and-white..."
4,10.1126/science.8134840,17452622,The European wild boar was crossed with the do...,The first paper to show the use of divergent i...,The data reported here constitute a comprehens...,This study identifies a major QTL on SSC4 affe...,Identifies a major QTL on chromosome 4 underly...
...,...,...,...,...,...,...,...
35631,10.2337/db08-1168,4860455,OBJECTIVE—Regulatory T-cells (Tregs) have cata...,This article describes the good manufacturing ...,These studies suggest that isolation and expan...,This study describes an efficient protocol for...,This study demonstrates that CD4+CD127lo/−CD25...
35632,10.1126/science.aar3246,4860145,Engineering cytokine-receptor pairs Interleuki...,This study reports the generation of an orthog...,This study demonstrates that orthogonal IL-2 a...,Reference 48 describes the engineering of a sy...,This work describes engineered synthetic IL-2 ...
35633,10.1126/science.aad2791,62290395,T cells target peptide combos One of the endur...,This article shows that some diabetogenic T ce...,This study identifies an important mechanism u...,Reference 51 shows that autoreactive T cells c...,This study identifies autoimmune T cell recogn...
35634,10.1073/pnas.1902566116,82979762,Polymorphic HLAs form the primary immune barri...,This article describes the development of gene...,This study presents a comprehensive strategy t...,This work demonstrates that a combined strateg...,This study reports a genome-editing strategy t...


# Fetch paper references

In [None]:
from sqlalchemy import create_engine
from tqdm import tqdm
from dotenv import load_dotenv
import os

# Read the MySQL connection settings from the project's .env file.
load_dotenv(home / 'projects/TLDR/.env')

MYSQL_HOST, MYSQL_USER, MYSQL_PASS, MYSQL_DB = (
    os.getenv(key) for key in ('MYSQL_HOST', 'MYSQL_USER', 'MYSQL_PASS', 'MYSQL_DB')
)

engine = create_engine(f'mysql+pymysql://{MYSQL_USER}:{MYSQL_PASS}@{MYSQL_HOST}/{MYSQL_DB}?charset=utf8mb4')

paper_ids = df['paper_id'].unique().tolist()
BATCH_SIZE = 500  # number of paper_ids per query; can be tuned up or down

# Query the reference lists batch by batch; tqdm shows batch progress and the ETA.
results = []
for start in tqdm(range(0, len(paper_ids), BATCH_SIZE), desc="Querying refs", unit="batch"):
    # Every id is forced through int() before it is interpolated into the SQL text.
    id_str = ','.join(str(int(pid)) for pid in paper_ids[start:start + BATCH_SIZE])
    results.append(
        pd.read_sql(f"SELECT paper_id, refs_doi FROM paper_ref WHERE paper_id IN ({id_str})", engine)
    )

# Stack the per-batch frames into a single table.
refs_df = pd.concat(results, ignore_index=True)

Querying refs:  24%|█████████████▊                                            | 14/59 [01:15<03:34,  4.78s/batch]

# Label paper subjects based on their references

In [73]:
import pandas as pd
from tqdm import tqdm

# Load the paper -> reference-DOI table from parquet
# (presumably a saved copy of the query result from the previous section -- TODO confirm).
refs_df = pd.read_parquet(home / 'projects/TLDR/data/paper_refs.parquet')
display(refs_df)

# Flatten every paper's reference list into one de-duplicated list of DOIs.
all_dois = list({doi for dois in refs_df['refs_doi'] for doi in dois})
print(f"Total unique DOIs: {len(all_dois)}")

Unnamed: 0,paper_id,refs_doi
0,1353153,"[10.1146/annurev.immunol.16.1.323, 10.1016/s00..."
1,1634910,"[10.1146/annurev.genet.31.1.213, 10.1016/0968-..."
2,1655469,"[10.1093/nar/25.17.3389, 10.1038/24094, 10.109..."
3,1778349,"[10.1016/0092-8674(91)90614-5, 10.1038/373441a..."
4,2550721,"[10.1073/pnas.93.10.4827, 10.1126/science.8091..."
...,...,...
27354,135559127,"[10.1029/2005jd005978, 10.1177/095968361987519..."
27355,136137471,"[10.1126/science.32.812.120, 10.1126/science.2..."
27356,136366684,"[10.1126/science.aan6826, 10.1038/s41586-020-2..."
27357,136374303,"[10.1016/s0301-5629(97)00269-x, 10.1088/0031-9..."


Total unique DOIs: 819700


## Ref DOI -> ref container ID

In [74]:
from sqlalchemy import create_engine
from sqlalchemy import text
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv(home / 'projects/TLDR/.env')

MYSQL_HOST = os.getenv('MYSQL_HOST')
MYSQL_USER = os.getenv('MYSQL_USER')
MYSQL_PASS = os.getenv('MYSQL_PASS')
MYSQL_DB = os.getenv('MYSQL_DB')

engine = create_engine(f'mysql+pymysql://{MYSQL_USER}:{MYSQL_PASS}@{MYSQL_HOST}/{MYSQL_DB}?charset=utf8mb4')

# Look up the container_id of every reference DOI, BATCH_SIZE DOIs per query.
BATCH_SIZE = 1000
results = []
for i in tqdm(range(0, len(all_dois), BATCH_SIZE)):
    batch = [doi for doi in all_dois[i:i+BATCH_SIZE] if doi]  # drop None / empty DOIs
    if not batch:
        # Nothing left after filtering; "IN ()" would be a SQL syntax error.
        continue
    # DOIs are passed as named bind parameters (:doi0, :doi1, ...), never interpolated.
    placeholders = ','.join(f":doi{k}" for k in range(len(batch)))
    sql = f"SELECT doi, container_id FROM paper_bib WHERE doi IN ({placeholders})"
    params = {f"doi{k}": doi for k, doi in enumerate(batch)}
    with engine.connect() as conn:
        result = conn.execute(text(sql), params)
        batch_df = pd.DataFrame(result.fetchall(), columns=result.keys())
        results.append(batch_df)
# pd.concat raises on an empty list, so fall back to an empty frame with the same columns.
doi_container_df = (
    pd.concat(results, ignore_index=True)
    if results
    else pd.DataFrame(columns=['doi', 'container_id'])
)
doi2container = dict(zip(doi_container_df['doi'], doi_container_df['container_id']))

100%|██████████████████████████████████████████████████████████████████████████| 820/820 [00:31<00:00, 25.83it/s]


In [76]:
import pandas as pd
from tqdm import tqdm

def map_dois_to_container_ids(dois):
    """Translate reference DOIs into container IDs, keeping the input order.

    DOIs that are absent from `doi2container`, or whose stored container ID is
    None, are dropped, so the result may be shorter than `dois`. A DOI that
    occurs several times yields its container ID several times.
    """
    looked_up = [doi2container.get(doi) for doi in dois]
    return [container_id for container_id in looked_up if container_id is not None]

# Attach to each paper the container IDs of its references.
refs_df['refs_container_id'] = refs_df['refs_doi'].apply(map_dois_to_container_ids)
display(refs_df)

# Gather the distinct container IDs across all papers for the next lookup.
all_cid = list({cid for cids in refs_df['refs_container_id'] for cid in cids})
print(f"Total unique Container IDs: {len(all_cid)}")

Unnamed: 0,paper_id,refs_doi,refs_container_id
0,1353153,"[10.1146/annurev.immunol.16.1.323, 10.1016/s00...","[C1983233, C197332, C197119, C19052, C193212, ..."
1,1634910,"[10.1146/annurev.genet.31.1.213, 10.1016/0968-...","[C1967166, C19151, C195919, C197332, C1982180,..."
2,1655469,"[10.1093/nar/25.17.3389, 10.1038/24094, 10.109...","[C197452, C18691, C197452, C18801, C18801, C19..."
3,1778349,"[10.1016/0092-8674(91)90614-5, 10.1038/373441a...","[C197332, C18691, C18801, C18801, C199510, C18..."
4,2550721,"[10.1073/pnas.93.10.4827, 10.1126/science.8091...","[C19151, C18801, C1987250, C1981240, C18801, C..."
...,...,...,...
27354,135559127,"[10.1029/2005jd005978, 10.1177/095968361987519...","[C18965, C1991115, C196921, C197489, C1987291,..."
27355,136137471,"[10.1126/science.32.812.120, 10.1126/science.2...","[C18801, C18801, C197452, C200063, C2012547, C..."
27356,136366684,"[10.1126/science.aan6826, 10.1038/s41586-020-2...","[C18801, C18691, C197332, C197332, C18691, C19..."
27357,136374303,"[10.1016/s0301-5629(97)00269-x, 10.1088/0031-9...","[C197316, C195623, C197316, C198891, C197316, ..."


Total unique Container IDs: 12341


## Ref container ID -> Ref Scopus labels

In [77]:
from sqlalchemy import create_engine
from sqlalchemy import text
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv(home / 'projects/TLDR/.env')

MYSQL_HOST = os.getenv('MYSQL_HOST')
MYSQL_USER = os.getenv('MYSQL_USER')
MYSQL_PASS = os.getenv('MYSQL_PASS')
MYSQL_DB = os.getenv('MYSQL_DB')

engine = create_engine(f'mysql+pymysql://{MYSQL_USER}:{MYSQL_PASS}@{MYSQL_HOST}/{MYSQL_DB}?charset=utf8mb4')

# Fetch title and Scopus categories of every referenced container, BATCH_SIZE ids per query.
BATCH_SIZE = 1000
results = []
for i in tqdm(range(0, len(all_cid), BATCH_SIZE)):
    batch = [cid for cid in all_cid[i:i+BATCH_SIZE] if cid]  # drop None / empty ids
    if not batch:
        # Nothing left after filtering; "IN ()" would be a SQL syntax error.
        continue
    # Ids are passed as named bind parameters (:cid0, :cid1, ...), never interpolated.
    placeholders = ','.join(f":cid{k}" for k in range(len(batch)))
    sql = f"SELECT container_id, container_title,scopus_cat FROM container WHERE container_id IN ({placeholders})"
    params = {f"cid{k}": cid for k, cid in enumerate(batch)}
    with engine.connect() as conn:
        result = conn.execute(text(sql), params)
        batch_df = pd.DataFrame(result.fetchall(), columns=result.keys())
        results.append(batch_df)
# pd.concat raises on an empty list, so fall back to an empty frame with the same columns.
container_scopus_df = (
    pd.concat(results, ignore_index=True)
    if results
    else pd.DataFrame(columns=['container_id', 'container_title', 'scopus_cat'])
)

100%|████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 24.91it/s]


In [78]:
def _split_scopus_cat(raw):
    """Turn a ';;'-separated category string into a list of category names.

    Values that are already a list/tuple are returned as a list unchanged, so
    re-running this cell does not erase the categories (previously a second run
    replaced every already-split value with []). Any other value (e.g. None)
    becomes an empty list.
    """
    if isinstance(raw, str):
        return raw.split(';;')
    if isinstance(raw, (list, tuple)):
        return list(raw)
    return []

container_scopus_df['scopus_cat'] = container_scopus_df['scopus_cat'].apply(_split_scopus_cat)
display(container_scopus_df)
# container_id -> list of Scopus category names, used by the voting functions.
cid2scopus = dict(zip(container_scopus_df['container_id'], container_scopus_df['scopus_cat']))

Unnamed: 0,container_id,container_title,scopus_cat
0,102134,PROCEEDINGS OF THE 2013 ACM WORKSHOP ON CLOUD ...,[]
1,107450,PROCEEDINGS OF OCEANOBS'09: SUSTAINED OCEAN OB...,[]
2,119509,2011 49TH ANNUAL ALLERTON CONFERENCE ON COMMUN...,[]
3,124185,2011 IEEE SEVENTH INTERNATIONAL CONFERENCE ON ...,[]
4,153199,2020 IEEE 7TH INTERNATIONAL WORKSHOP ON METROL...,[]
...,...,...,...
12303,C20164583,CURRENT LANDSCAPE ECOLOGY REPORTS,[]
12304,C20181914,JOURNAL OF BIODIVERSITY CONSERVATION AND BIORE...,[]
12305,C20191395,THE LANCET DIGITAL HEALTH,"[Health Information Management, Decision Scien..."
12306,C20194706,KHARKIV SURGICAL SCHOOL,[]


In [71]:
# Among containers with at least one Scopus category, show the 20 with the fewest categories.
has_categories = container_scopus_df['scopus_cat'].map(len) > 0
(
    container_scopus_df[has_categories]
    .sort_values(by='scopus_cat', key=lambda col: col.map(len))
    .head(20)
)

Unnamed: 0,container_id,container_title,scopus_cat
51,C18361,IRISH JOURNAL OF MEDICAL SCIENCE,[General Medicine]
3301,C1978248,MEMBRANE BIOCHEMISTRY,[Biochemistry]
7407,C1989111,CRIMINAL JUSTICE MATTERS,[Law]
7404,C198850,NURSING SCIENCE QUARTERLY,[General Nursing]
3307,C1979101,WESTERN JOURNAL OF NURSING RESEARCH,[General Nursing]
7400,C1988321,ANNALS OF SEX RESEARCH,[General Psychology]
7399,C1988318,INRAE PRODUCTIONS ANIMALES,[Animal Science and Zoology]
7397,C1988244,CURRENT OPINION IN PSYCHIATRY,[Psychiatry and Mental Health]
7395,C1988228,ARCHIVES OF HISTOLOGY AND CYTOLOGY,[Histology]
7393,C198819,APPLIED MATHEMATICS LETTERS,[Applied Mathematics]


# Get Scopus categories

In [7]:
def scopus_cat_vote(cid_lst: list[str]) -> str:
    """Pick the dominant specific Scopus category among a paper's reference containers.

    Each container ID in `cid_lst` casts one vote, split evenly across the
    categories `cid2scopus` lists for it once unspecific ones are removed:
    'Multidisciplinary', names starting with 'General', names ending with
    '(miscellaneous)', and empty entries. Container IDs that are missing from
    `cid2scopus` or have no remaining category cast no vote.

    Args:
        cid_lst: container IDs of one paper's references (repeats count again).

    Returns:
        The category with the highest vote total, or None if no vote was cast.
        On a tie, the category that received a vote first wins.
    """
    cat_counts = {}
    for cid in cid_lst:
        # Strip before filtering (as scopus_field_vote does) so that padded
        # names such as ' Genetics' pool their votes with 'Genetics' and the
        # prefix/suffix filters below see the real name.
        scopus_cats = [cat.strip() for cat in cid2scopus.get(cid, [])]
        scopus_cats = [
            cat for cat in scopus_cats
            if cat
            and cat != 'Multidisciplinary'
            and not cat.startswith('General')
            and not cat.endswith('(miscellaneous)')
        ]
        if not scopus_cats:
            continue
        # One vote per container, shared equally between its categories.
        weight = 1 / len(scopus_cats)
        for cat in scopus_cats:
            cat_counts[cat] = cat_counts.get(cat, 0) + weight
    return max(cat_counts, key=cat_counts.get) if cat_counts else None

# Label every paper with the winning Scopus category of its references
# (scopus_cat_vote returns None when no reference container cast a vote).
refs_df['scopus_cat'] = refs_df['refs_container_id'].apply(scopus_cat_vote)
refs_df


Unnamed: 0,paper_id,refs_doi,refs_container_id,scopus_cat
0,1353153,"[10.1146/annurev.immunol.16.1.323, 10.1016/s00...","[C1983233, C197332, C197119, C19052, C193212, ...",Immunology
1,1634910,"[10.1146/annurev.genet.31.1.213, 10.1016/0968-...","[C1967166, C19151, C195919, C197332, C1982180,...",Structural Biology
2,1655469,"[10.1093/nar/25.17.3389, 10.1038/24094, 10.109...","[C197452, C18691, C197452, C18801, C18801, C19...",Genetics
3,1778349,"[10.1016/0092-8674(91)90614-5, 10.1038/373441a...","[C197332, C18691, C18801, C18801, C199510, C18...",Cell Biology
4,2550721,"[10.1073/pnas.93.10.4827, 10.1126/science.8091...","[C19151, C18801, C1987250, C1981240, C18801, C...",Genetics
...,...,...,...,...
27354,135559127,"[10.1029/2005jd005978, 10.1177/095968361987519...","[C18965, C1991115, C196921, C197489, C1987291,...",Atmospheric Science
27355,136137471,"[10.1126/science.32.812.120, 10.1126/science.2...","[C18801, C18801, C197452, C200063, C2012547, C...",Genetics
27356,136366684,"[10.1126/science.aan6826, 10.1038/s41586-020-2...","[C18801, C18691, C197332, C197332, C18691, C19...",Genetics
27357,136374303,"[10.1016/s0301-5629(97)00269-x, 10.1088/0031-9...","[C197316, C195623, C197316, C198891, C197316, ...",Cell Biology


In [53]:
# Show the 20 most frequent paper-level Scopus categories.
refs_df.scopus_cat.value_counts().head(20)

scopus_cat
Cell Biology                                    4161
Immunology                                      3597
Genetics                                        3577
Molecular Biology                               1805
Biochemistry                                    1115
Microbiology                                    1026
Neurology (clinical)                             904
Physiology                                       893
Oncology                                         584
Ecology, Evolution, Behavior and Systematics     570
Virology                                         493
Cancer Research                                  487
Cognitive Neuroscience                           362
Catalysis                                        344
Plant Science                                    335
Condensed Matter Physics                         319
Developmental Biology                            305
Infectious Diseases                              300
Cardiology and Cardiovascular Medic

In [9]:
import pandas as pd

# Load the ASJC code table (columns as displayed below: CodeSystem, Code, Description);
# the first spreadsheet row is used as the header.
ASJC_df = pd.read_excel(home / 'projects/TLDR/data/ASJC1.xlsx', header=[0])
ASJC_df

Unnamed: 0,CodeSystem,Code,Description
0,ASJC,10,Multidisciplinary
1,ASJC,11,Agricultural and Biological Sciences
2,ASJC,12,Arts and Humanities
3,ASJC,13,Biochemistry
4,ASJC,14,Business
...,...,...,...
356,ASJC,3612,Physical Therapy
357,ASJC,3613,Podiatry
358,ASJC,3614,Radiological and Ultrasound Technology
359,ASJC,3615,Respiratory Care


In [10]:
def map_cat_to_field(cat: str) -> str:
    """
    Map a Scopus category name to the name of its ASJC field.

    The category is matched exactly against ASJC_df['Description'] (only the
    text before the first comma is used). The first two characters of the
    matching row's 'Code' are then looked up as a code of their own, and that
    row's 'Description' is returned as the field name.

    Args:
        cat: Scopus category name, or None.

    Returns:
        The field name, or None if `cat` is None or is not found in ASJC_df
        (a warning is printed in the latter case).
    """
    if cat is None:
        return None
    if ',' in cat:
        # Presumably ASJC_df stores names cut at the first comma -- TODO confirm.
        cat = cat.split(',')[0]
    try:
        code = ASJC_df[ASJC_df['Description'] == cat]['Code'].values[0]
    except IndexError:
        print(f"Warning: Category '{cat}' not found in ASJC DataFrame.")
        # Without this return, `code` is unbound below and the call raises
        # UnboundLocalError instead of reporting "no field".
        return None
    field_code = str(code)[0:2]
    field = ASJC_df[ASJC_df['Code'] == int(field_code)]['Description'].values[0]
    return field
# Derive each paper's field from its voted category (a None category stays None).
refs_df['scopus_field'] = refs_df['scopus_cat'].apply(map_cat_to_field)
refs_df

Unnamed: 0,paper_id,refs_doi,refs_container_id,scopus_cat,scopus_field
0,1353153,"[10.1146/annurev.immunol.16.1.323, 10.1016/s00...","[C1983233, C197332, C197119, C19052, C193212, ...",Immunology,Immunology and Microbiology
1,1634910,"[10.1146/annurev.genet.31.1.213, 10.1016/0968-...","[C1967166, C19151, C195919, C197332, C1982180,...",Structural Biology,Biochemistry
2,1655469,"[10.1093/nar/25.17.3389, 10.1038/24094, 10.109...","[C197452, C18691, C197452, C18801, C18801, C19...",Genetics,Biochemistry
3,1778349,"[10.1016/0092-8674(91)90614-5, 10.1038/373441a...","[C197332, C18691, C18801, C18801, C199510, C18...",Cell Biology,Biochemistry
4,2550721,"[10.1073/pnas.93.10.4827, 10.1126/science.8091...","[C19151, C18801, C1987250, C1981240, C18801, C...",Genetics,Biochemistry
...,...,...,...,...,...
27354,135559127,"[10.1029/2005jd005978, 10.1177/095968361987519...","[C18965, C1991115, C196921, C197489, C1987291,...",Atmospheric Science,Earth and Planetary Sciences
27355,136137471,"[10.1126/science.32.812.120, 10.1126/science.2...","[C18801, C18801, C197452, C200063, C2012547, C...",Genetics,Biochemistry
27356,136366684,"[10.1126/science.aan6826, 10.1038/s41586-020-2...","[C18801, C18691, C197332, C197332, C18691, C19...",Genetics,Biochemistry
27357,136374303,"[10.1016/s0301-5629(97)00269-x, 10.1088/0031-9...","[C197316, C195623, C197316, C198891, C197316, ...",Cell Biology,Biochemistry


In [11]:
# Number of papers per field, most frequent first.
refs_df.scopus_field.value_counts()

scopus_field
Biochemistry                            13140
Immunology and Microbiology              5206
Medicine                                 4005
Agricultural and Biological Sciences     1131
Neuroscience                              777
Earth and Planetary Sciences              505
Physics and Astronomy                     466
Chemical Engineering                      353
Chemistry                                 319
Pharmacology                              319
Social Sciences                           253
Psychology                                155
Engineering                               141
Environmental Science                     111
Materials Science                          93
Energy                                     83
Economics                                  65
Nursing                                    53
Arts and Humanities                        37
Computer Science                           33
Mathematics                                33
Dentistry            

In [12]:
# Persist the per-paper category and field labels (DataFrame index not written).
refs_df[['paper_id', 'scopus_cat', 'scopus_field']].to_parquet(home / 'projects/TLDR/data/paper_scopus_cat.parquet', index=False)

# Vote directly on fields (instead of categories)

In [79]:
import pandas as pd

# Load the ASJC code table again for this section (same file as in the category section above);
# the first spreadsheet row is used as the header.
ASJC_df = pd.read_excel(home / 'projects/TLDR/data/ASJC1.xlsx', header=[0])
ASJC_df

Unnamed: 0,CodeSystem,Code,Description
0,ASJC,10,Multidisciplinary
1,ASJC,11,Agricultural and Biological Sciences
2,ASJC,12,Arts and Humanities
3,ASJC,13,Biochemistry
4,ASJC,14,Business
...,...,...,...
356,ASJC,3612,Physical Therapy
357,ASJC,3613,Podiatry
358,ASJC,3614,Radiological and Ultrasound Technology
359,ASJC,3615,Respiratory Care


In [88]:
def map_cat_to_field(cat: str) -> str:
    """
    Resolve a Scopus category name to the name of its ASJC field.

    Only the text before the first comma of `cat` is matched (exactly) against
    ASJC_df['Description']. The first two characters of the matched 'Code' are
    looked up as a code of their own and that row's 'Description' is returned.

    Returns None when `cat` is None, or (after printing a warning) when the
    category is not present in ASJC_df.
    """
    if cat is None:
        return None

    lookup_name = cat.split(',')[0] if ',' in cat else cat
    matching_codes = ASJC_df.loc[ASJC_df['Description'] == lookup_name, 'Code'].values
    if len(matching_codes) == 0:
        print(f"Warning: Category '{lookup_name}' not found in ASJC DataFrame.")
        return None

    # The leading two characters of the code select the field-level row.
    field_code = int(str(matching_codes[0])[:2])
    field_names = ASJC_df.loc[ASJC_df['Code'] == field_code, 'Description'].values
    return field_names[0]

In [95]:
def scopus_field_vote(cid_lst: list[str]) -> str:
    """Pick the dominant ASJC field among a paper's reference containers.

    Each container ID in `cid_lst` casts one vote, split evenly across the
    fields its Scopus categories (from `cid2scopus`) resolve to through
    `map_cat_to_field`. Empty entries, 'Multidisciplinary', and categories
    that resolve to no field are ignored; containers left with no field cast
    no vote.

    Args:
        cid_lst: container IDs of one paper's references (repeats count again).

    Returns:
        The field with the highest vote total, or None if no vote was cast.
        On a tie, the field that received a vote first wins.
    """
    field_counts = {}
    for cid in cid_lst:
        # Strip first, then filter: filtering the raw strings let padded
        # entries such as ' Multidisciplinary' or ' ' slip through.
        scopus_cats = [cat.strip() for cat in cid2scopus.get(cid, [])]
        scopus_cats = [cat for cat in scopus_cats if cat and cat != 'Multidisciplinary']
        if not scopus_cats:
            continue

        # Resolve every category exactly once; calling map_cat_to_field twice
        # per category doubled both the lookups and any "not found" warnings.
        scopus_fields = [field for field in map(map_cat_to_field, scopus_cats) if field is not None]
        if not scopus_fields:
            continue
        # One vote per container, shared equally between its fields.
        weight = 1 / len(scopus_fields)
        for field in scopus_fields:
            field_counts[field] = field_counts.get(field, 0) + weight
    return max(field_counts, key=field_counts.get) if field_counts else None

# Run the field vote across worker processes via pandarallel, with a progress bar.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# Writes the 'scopus_field' column of refs_df (replacing it if it already exists).
refs_df['scopus_field'] = refs_df['refs_container_id'].parallel_apply(scopus_field_vote)
refs_df


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2736), Label(value='0 / 2736'))), …

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Unnamed: 0,paper_id,refs_doi,refs_container_id,scopus_field
0,1353153,"[10.1146/annurev.immunol.16.1.323, 10.1016/s00...","[C1983233, C197332, C197119, C19052, C193212, ...",Biochemistry
1,1634910,"[10.1146/annurev.genet.31.1.213, 10.1016/0968-...","[C1967166, C19151, C195919, C197332, C1982180,...",Biochemistry
2,1655469,"[10.1093/nar/25.17.3389, 10.1038/24094, 10.109...","[C197452, C18691, C197452, C18801, C18801, C19...",Biochemistry
3,1778349,"[10.1016/0092-8674(91)90614-5, 10.1038/373441a...","[C197332, C18691, C18801, C18801, C199510, C18...",Biochemistry
4,2550721,"[10.1073/pnas.93.10.4827, 10.1126/science.8091...","[C19151, C18801, C1987250, C1981240, C18801, C...",Biochemistry
...,...,...,...,...
27354,135559127,"[10.1029/2005jd005978, 10.1177/095968361987519...","[C18965, C1991115, C196921, C197489, C1987291,...",Earth and Planetary Sciences
27355,136137471,"[10.1126/science.32.812.120, 10.1126/science.2...","[C18801, C18801, C197452, C200063, C2012547, C...",Biochemistry
27356,136366684,"[10.1126/science.aan6826, 10.1038/s41586-020-2...","[C18801, C18691, C197332, C197332, C18691, C19...",Biochemistry
27357,136374303,"[10.1016/s0301-5629(97)00269-x, 10.1088/0031-9...","[C197316, C195623, C197316, C198891, C197316, ...",Biochemistry


In [96]:
# Show the 20 most frequent fields from the direct field vote.
refs_df.scopus_field.value_counts().head(20)

scopus_field
Biochemistry                            14010
Medicine                                 6278
Neuroscience                             2421
Immunology and Microbiology              1335
Agricultural and Biological Sciences      881
Chemistry                                 556
Physics and Astronomy                     548
Earth and Planetary Sciences              471
Materials Science                         182
Psychology                                156
Social Sciences                           102
Environmental Science                      91
Pharmacology                               57
Engineering                                53
Economics                                  36
Computer Science                           36
Energy                                     33
Chemical Engineering                       26
Mathematics                                18
Dentistry                                  17
Name: count, dtype: int64

In [97]:
# Persist the per-paper field labels from the direct vote (DataFrame index not written).
refs_df[['paper_id', 'scopus_field']].to_parquet(home / 'projects/TLDR/data/paper_scopus_field.parquet', index=False)