In [34]:
from glob import glob
import pathlib
import os

def find_all_tsv_files(root_dir):
    """递归查找所有子目录下的.tsv文件"""
    # 匹配 abs_annotation/数字/*.tsv
    pattern = os.path.join(root_dir, '[0-9]*', '*.tsv')
    return glob(pattern), ['10.1038/'+'.'.join(os.path.basename(f).split('.')[0:-1]) for f in glob(pattern)]

home = pathlib.Path.home()
tsv_files, DOIs = find_all_tsv_files(home / 'projects/TLDR/data/paper_html_10.1038/abs_annotation')
print(len(tsv_files), "TSV files found.")

9503 TSV files found.


In [35]:
tsv_files[0]

'/home/lyuzhuoqi/projects/TLDR/data/paper_html_10.1038/abs_annotation/4/35052563.tsv'

In [36]:
DOIs

['10.1038/35052563',
 '10.1038/35089520',
 '10.1038/35096061',
 '10.1038/35056041',
 '10.1038/35038540',
 '10.1038/35076523',
 '10.1038/35095564',
 '10.1038/35042073',
 '10.1038/35098584',
 '10.1038/35036213',
 '10.1038/35036228',
 '10.1038/35080005',
 '10.1038/35067500',
 '10.1038/35077544',
 '10.1038/35067069',
 '10.1038/35049054',
 '10.1038/35103068',
 '10.1038/35048058',
 '10.1038/35086062',
 '10.1038/35103104',
 '10.1038/35088576',
 '10.1038/35101078',
 '10.1038/35053570',
 '10.1038/35056058',
 '10.1038/35099020',
 '10.1038/35105052',
 '10.1038/35086057',
 '10.1038/35039051',
 '10.1038/35096009',
 '10.1038/35100540',
 '10.1038/35052556',
 '10.1038/35066084',
 '10.1038/35058521',
 '10.1038/35058574',
 '10.1038/35067016',
 '10.1038/35040042',
 '10.1038/35040009',
 '10.1038/35036191',
 '10.1038/35100503',
 '10.1038/35103000',
 '10.1038/35067005',
 '10.1038/35080071',
 '10.1038/35038572',
 '10.1038/35104078',
 '10.1038/35093548',
 '10.1038/35066075',
 '10.1038/35096067',
 '10.1038/350

In [37]:
'10.1038/3509551' in DOIs

True

In [38]:
from tqdm import tqdm
import pandas as pd
import csv

data = []
for tsv_file, citing_doi in tqdm(zip(tsv_files, DOIs)):
    with open(tsv_file, "r", encoding="utf-8") as infile:
        reader = csv.reader(infile, delimiter='\t')
        file_header = next(reader, None)
        # 检查文件为空
        if file_header is None:
            continue
        for row in reader:
            # 检查列数是否正确
            if len(row) != 4:
                print(f"Warning: {tsv_file} has invalid row (wrong number of columns): {row}")
                continue
            # 检查每列是否为空
            if any(cell.strip() == "" or cell.strip() in ["#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null"] for cell in row):
                print(f"Warning: {tsv_file} has empty cell in row: {row}")
                continue
            data.append({
                "doi": row[0],
                "cited_by_doi": citing_doi,
            })

co_citation_df = pd.DataFrame(data)
co_citation_df

4138it [00:00, 7059.34it/s]



9503it [00:01, 6937.16it/s]


Unnamed: 0,doi,cited_by_doi
0,10.1073/pnas.91.7.2757,10.1038/35052563
1,10.1093/genetics/154.4.1785,10.1038/35052563
2,10.1073/pnas.96.16.9252,10.1038/35052563
3,10.1101/gr.10.2.220,10.1038/35052563
4,10.1126/science.8134840,10.1038/35052563
...,...,...
35636,10.2337/db08-1168,10.1038/s41573-019-0041-4
35637,10.1126/science.aar3246,10.1038/s41573-019-0041-4
35638,10.1126/science.aad2791,10.1038/s41573-019-0041-4
35639,10.1073/pnas.1902566116,10.1038/s41573-019-0041-4


In [39]:
co_citation_df.to_parquet(home / 'projects/TLDR/data/co_citation.parquet', index=False)