In [1]:
import pandas as pd
from pathlib import Path
home = Path.home()

# Directory holding the test split and the per-model generated annotations.
data_dir = home / 'projects/TLDR/data/paper_html_10.1038/abs_annotation'

# Models whose generations are loaded; each one becomes a column of `df`.
# models = ['qwen3', 'gemma3', 'llama4', 'qwq']
models = ['llama4', 'gemma3', 'qwen3']

# Optional file-name suffix selecting an alternative generation run.
# None loads the plain `{model}.txt` files.
suffixes = None
# suffixes = '_sent_shuffle'
# suffixes = '_tail'

# One generation file per model: `{model}{suffix}.txt` (empty suffix when `suffixes` is None).
csv_files = [data_dir / 'generated_annotations' / f"{model}{suffixes or ''}.txt" for model in models]

df = pd.read_csv(data_dir / 'test.tsv', sep='\t')
for model, csv_file in zip(models, csv_files):
    # Each line of the file is one generation; lines are aligned with `df` rows by position.
    with open(csv_file, encoding='utf-8') as f:
        generations = [line.rstrip('\n') for line in f]
    single_df = pd.DataFrame({model: generations})
    print(f"Loaded {model} data with shape: {single_df.shape}")
    # Left join on the positional index: generations beyond the last test row are ignored.
    df = df.join(single_df)

# Remove the entries listed as invalid, all in a single drop.
# The listed numbers are shifted by 2 to obtain row labels of `df` -- presumably they are
# 1-based line numbers of test.tsv counting the header line; TODO confirm.
invalid_entries = pd.read_csv(
    home / "projects/TLDR/description/invalid_entry_in_test.txt", sep='\t', header=None
).values.flatten().tolist()
df = df.drop(index=[entry - 2 for entry in invalid_entries])
df

Loaded llama4 data with shape: (35636, 1)
Loaded gemma3 data with shape: (35636, 1)
Loaded qwen3 data with shape: (35636, 1)


Unnamed: 0,doi,paper_id,abstract,annotation,llama4,gemma3,qwen3
0,10.1073/pnas.91.7.2757,107202074,The origin and taxonomic status of domesticate...,A demonstration that cattle have been domestic...,This study suggests that there were two distin...,This study refutes the single origin of domest...,mtDNA analysis reveals ancient divergence betw...
1,10.1093/genetics/154.4.1785,83366887,Abstract The domestic pig originates from the ...,Evidence is presented for independent domestic...,"This study, among others, provides evidence of...",This study provides evidence for independent d...,This study demonstrates independent pig domest...
2,10.1073/pnas.96.16.9252,122095374,We previously mapped a quantitative trait locu...,This paper shows how the identity-by-descent a...,The study describes the fine-mapping approach ...,This study used fine-mapping methods to identi...,This QTL study identifies a 5cM bovine chromos...
3,10.1101/gr.10.2.220,100831446,A genome-wide linkage disequilibrium (LD) map ...,The pattern of linkage disequilibrium (LD) acr...,This study demonstrated that linkage disequili...,Reference 35 reports long-range LD in Dutch bl...,"""Genome-wide analysis of Dutch Black-and-white..."
4,10.1126/science.8134840,17452622,The European wild boar was crossed with the do...,The first paper to show the use of divergent i...,The data reported here constitute a comprehens...,This study identifies a major QTL on SSC4 affe...,Identifies a major QTL on chromosome 4 underly...
...,...,...,...,...,...,...,...
35631,10.2337/db08-1168,4860455,OBJECTIVE—Regulatory T-cells (Tregs) have cata...,This article describes the good manufacturing ...,These studies suggest that isolation and expan...,This study describes an efficient protocol for...,This study demonstrates that CD4+CD127lo/−CD25...
35632,10.1126/science.aar3246,4860145,Engineering cytokine-receptor pairs Interleuki...,This study reports the generation of an orthog...,This study demonstrates that orthogonal IL-2 a...,Reference 48 describes the engineering of a sy...,This work describes engineered synthetic IL-2 ...
35633,10.1126/science.aad2791,62290395,T cells target peptide combos One of the endur...,This article shows that some diabetogenic T ce...,This study identifies an important mechanism u...,Reference 51 shows that autoreactive T cells c...,This study identifies autoimmune T cell recogn...
35634,10.1073/pnas.1902566116,82979762,Polymorphic HLAs form the primary immune barri...,This article describes the development of gene...,This study presents a comprehensive strategy t...,This work demonstrates that a combined strateg...,This study reports a genome-editing strategy t...


# Length, novel n-gram ratio, and NEs per 100 words

In [3]:
import numpy as np
import nltk
import spacy
from tqdm import tqdm

# Make sure the NLTK 'punkt' tokenizer data is available locally (downloaded on
# first use); length_statistics below relies on nltk.sent_tokenize / nltk.word_tokenize.
nltk.download('punkt')
# Load the small English spaCy pipeline once; named_entities_per_100_words below
# reads its entity annotations (doc.ents) and token flags (token.is_alpha).
nlp = spacy.load("en_core_web_sm")

def length_statistics(texts):
    """Summarise word and sentence counts over a collection of texts.

    Every text is split into sentences with ``nltk.sent_tokenize`` and every
    sentence into tokens with ``nltk.word_tokenize``.

    Args:
        texts: Iterable of strings.

    Returns:
        dict with the keys '# of Words', '# of Sent.' and '# of Words/Sent',
        each mapped to a "mean±std" string with two decimals. The first two
        are statistics over texts; the last one is computed over all
        sentences of all texts pooled together.
    """
    def mean_pm_std(values):
        # Shared "mean±std" formatting for the three reported quantities.
        return f"{np.mean(values):.2f}±{np.std(values):.2f}"

    words_per_text = []
    sentences_per_text = []
    sentence_lengths = []
    for text in texts:
        # Token count of each sentence of this text, in order.
        lengths = [len(nltk.word_tokenize(sentence)) for sentence in nltk.sent_tokenize(text)]
        sentences_per_text.append(len(lengths))
        words_per_text.append(sum(lengths))
        sentence_lengths.extend(lengths)

    return {
        '# of Words': mean_pm_std(words_per_text),
        '# of Sent.': mean_pm_std(sentences_per_text),
        '# of Words/Sent': mean_pm_std(sentence_lengths),
    }

def get_ngrams(text, n):
    """Return the set of distinct word n-grams of ``text``.

    ``text`` is converted with ``str()`` and split on whitespace; each n-gram
    is a tuple of ``n`` consecutive tokens. A text with fewer than ``n``
    tokens yields an empty set.
    """
    tokens = str(text).split()
    ngrams = set()
    for start in range(len(tokens) - n + 1):
        ngrams.add(tuple(tokens[start:start + n]))
    return ngrams

def percent_novel_ngrams(df, col_name: str, n):
    """Percentage of n-grams in ``df[col_name]`` that do not occur in the paired abstract.

    For each row, the distinct n-grams of the ``col_name`` text (see
    ``get_ngrams``) are compared with those of ``df['abstract']``; the row's
    ratio is novel / total, or 0 when the text has no n-grams.

    Args:
        df: DataFrame with an 'abstract' column and a ``col_name`` column.
        col_name: Column holding the texts to evaluate.
        n: n-gram size.

    Returns:
        (mean, std) of the per-row ratios, both expressed in percent.
    """
    source_sets = [get_ngrams(abstract, n) for abstract in df['abstract']]
    target_sets = [get_ngrams(candidate, n) for candidate in df[col_name]]

    ratios = []
    for target, source in zip(target_sets, source_sets):
        total = len(target)
        # Set difference keeps only the n-grams missing from the abstract.
        ratios.append(len(target - source) / total if total > 0 else 0)

    return (np.mean(ratios) * 100, np.std(ratios) * 100)

def named_entities_per_100_words(texts):
    """Average named-entity density of ``texts``, in entities per 100 words.

    Each text is run through the module-level spaCy pipeline ``nlp``. Its
    density is ``100 * len(doc.ents) / words``, where ``words`` counts only the
    tokens flagged ``is_alpha``; a text without such tokens contributes 0.

    Args:
        texts: Iterable of strings.

    Returns:
        (mean, std) of the per-text densities.
    """
    # Keep the progress-bar total when `texts` is sized; plain iterators still work.
    total = len(texts) if hasattr(texts, '__len__') else None

    ne_per_100 = []
    # nlp.pipe is spaCy's batch API for processing many texts; it yields one Doc per
    # input text in input order, replacing one nlp(text) call per item.
    for doc in tqdm(nlp.pipe(texts), total=total, desc="Counting named entities"):
        ents = len(doc.ents)
        words = sum(1 for token in doc if token.is_alpha)
        ne_per_100.append(100 * ents / words if words else 0)

    return np.mean(ne_per_100), np.std(ne_per_100)

[nltk_data] Downloading package punkt to /home/lyuzhuoqi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Descriptive statistics of each model's generated annotations.
for model_name in ('gemma3', 'llama4', 'qwen3'):
    print(f"Processing {model_name}...")
    gen_text = df[model_name].to_list()

    # Word and sentence counts
    gen_stats = length_statistics(gen_text)
    print(f"{model_name} Length Stats:", gen_stats)

    # Share of generated n-grams that are absent from the abstract, for n = 1, 2, 3
    gen_novel_1, gen_novel_2, gen_novel_3 = (
        percent_novel_ngrams(df, model_name, n) for n in (1, 2, 3)
    )
    print(f"{model_name} % of novel n-grams: n=1: {gen_novel_1[0]:.2f}±{gen_novel_1[1]:.2f}%, n=2: {gen_novel_2[0]:.2f}±{gen_novel_2[1]:.2f}%, n=3: {gen_novel_3[0]:.2f}±{gen_novel_3[1]:.2f}%")

    # Named-entity density
    ne_mean, ne_std = named_entities_per_100_words(gen_text)
    print(f"{model_name} # of NEs per 100 words: {ne_mean:.2f}±{ne_std:.2f}")

Processing gemma3...
gemma3 Length Stats: {'# of Words': '29.06±11.03', '# of Sent.': '1.03±2.31', '# of Words/Sent': '28.31±9.59'}
gemma3 % of novel n-grams: n=1: 47.25±12.15%, n=2: 81.81±11.81%, n=3: 92.58±8.95%


Counting named entities: 100%|████████████████████████████████████████████| 35621/35621 [02:55<00:00, 203.42it/s]


gemma3 # of NEs per 100 words: 4.49±4.64
Processing llama4...
llama4 Length Stats: {'# of Words': '43.11±764.99', '# of Sent.': '1.30±3.32', '# of Words/Sent': '33.24±628.57'}
llama4 % of novel n-grams: n=1: 51.51±13.82%, n=2: 85.47±12.07%, n=3: 94.43±9.25%


Counting named entities: 100%|████████████████████████████████████████████| 35621/35621 [03:22<00:00, 176.15it/s]


llama4 # of NEs per 100 words: 3.46±4.28
Processing qwen3...
qwen3 Length Stats: {'# of Words': '153.29±1799.10', '# of Sent.': '6.22±91.82', '# of Words/Sent': '24.64±140.67'}
qwen3 % of novel n-grams: n=1: 51.19±10.92%, n=2: 85.36±8.61%, n=3: 95.05±5.85%


Counting named entities: 100%|█████████████████████████████████████████████| 35621/35621 [08:40<00:00, 68.43it/s]

qwen3 # of NEs per 100 words: 4.89±4.37





In [6]:
# Same statistics as for the models, computed on the 'annotation' column of `df`.
model_name = 'annotation'
gen_text = df[model_name].to_list()

# Word and sentence counts
gen_stats = length_statistics(gen_text)
print(f"{model_name} Length Stats:", gen_stats)

# Share of annotation n-grams that are absent from the abstract, for n = 1, 2, 3
gen_novel_1, gen_novel_2, gen_novel_3 = (
    percent_novel_ngrams(df, model_name, n) for n in (1, 2, 3)
)
print(f"{model_name} % of novel n-grams: n=1: {gen_novel_1[0]:.2f}±{gen_novel_1[1]:.2f}%, n=2: {gen_novel_2[0]:.2f}±{gen_novel_2[1]:.2f}%, n=3: {gen_novel_3[0]:.2f}±{gen_novel_3[1]:.2f}%")

# Named-entity density
ne_mean, ne_std = named_entities_per_100_words(gen_text)
print(f"{model_name} # of NEs per 100 words: {ne_mean:.2f}±{ne_std:.2f}")

annotation Length Stats: {'# of Words': '25.29±11.43', '# of Sent.': '1.14±0.41', '# of Words/Sent': '22.22±8.83'}
annotation % of novel n-grams: n=1: 55.75±14.61%, n=2: 88.46±12.39%, n=3: 95.93±9.19%


Counting named entities: 100%|█████████████████████████████████████████████| 35621/35621 [08:44<00:00, 67.92it/s]

annotation # of NEs per 100 words: 5.56±5.93





In [7]:
# Statistics of the abstracts themselves. No novel n-gram ratio here: the abstract
# is the text that ratio is measured against in percent_novel_ngrams.
model_name = 'abstract'
gen_text = df[model_name].to_list()

# Word and sentence counts
gen_stats = length_statistics(gen_text)
print(f"{model_name} Length Stats:", gen_stats)

# Named-entity density
ne_mean, ne_std = named_entities_per_100_words(gen_text)
print(f"{model_name} # of NEs per 100 words: {ne_mean:.2f}±{ne_std:.2f}")

abstract Length Stats: {'# of Words': '191.58±82.53', '# of Sent.': '7.37±3.18', '# of Words/Sent': '26.01±11.71'}


Counting named entities: 100%|█████████████████████████████████████████████| 35621/35621 [16:23<00:00, 36.23it/s]

abstract # of NEs per 100 words: 5.11±3.83





# Copy rate

In [7]:
# Training split; its 'abstract' and 'annotation' columns are the reference texts
# for the copy-rate check. The path is built from `home` (Path.home()) like the other
# data paths in this notebook, instead of a hard-coded user directory.
train = pd.read_csv(home / 'projects/TLDR/data/paper_html_10.1038/abs_annotation/train.tsv', sep='\t')
train

Unnamed: 0,abs_doi,paper_id,abstract,annotation
0,10.1084/jem.20020319,123473941,CD30 is up-regulated in several human diseases...,References 38 and 39 report the identification...
1,10.1073/pnas.1424989112,28844728,Carbon monoxide occurs at relatively high conc...,This work is the first report of atmospheric t...
2,10.4049/jimmunol.164.8.4143,99473470,Abstract Inhibition of class II trans-activato...,References 43 and 44 report that the gene enco...
3,10.1073/pnas.0704665104,123031244,High-throughput direct sequencing techniques h...,This study characterizes typical nucleotide mi...
4,10.1093/bioinformatics/btu703,125314318,"Summary: Annotating genetic variants, especial...",This paper presents a pathogenicity scoring me...


In [8]:
def ngram_copy_rate(df, model_col: str, train_samples: list, n: int = 3, threshold: float = 0.5):
    """
    Measure how much of each generated summary's n-grams also occur in the training texts.

    The copy rate of a summary is the fraction of its distinct n-grams (see
    ``get_ngrams``) that appear in the union of the n-grams of all
    ``train_samples``. A summary without any n-gram gets a copy rate of 0.

    Args:
        df: DataFrame containing the generated summaries.
        model_col: Name of the column holding the generated summaries.
        train_samples: List of training texts (strings) to compare against.
        n: n-gram size, 3 by default.
        threshold: Copy rate (as a fraction, not a percentage) strictly above
            which a summary counts as high-copy.

    Returns:
        Tuple ``(zero_copy_rate, high_copy_rate, max_copy_rate)``, all in percent:
        the share of summaries with copy rate exactly 0, the share with copy
        rate > ``threshold``, and the largest copy rate observed. All three are
        0.0 when ``df[model_col]`` has no entries.
    """
    # Pool the n-grams of every training text into a single lookup set.
    train_ngrams = set()
    for sample in train_samples:
        train_ngrams.update(get_ngrams(sample, n))

    copy_rates = []
    for gen in df[model_col].tolist():
        gen_ngrams = get_ngrams(gen, n)
        if not gen_ngrams:
            # Text shorter than n tokens: nothing could have been copied.
            copy_rates.append(0)
            continue
        copy_rates.append(len(gen_ngrams & train_ngrams) / len(gen_ngrams))

    # Without this guard the two shares below would divide by zero (nan plus a
    # RuntimeWarning), while only the maximum was protected against empty input.
    if not copy_rates:
        return 0.0, 0.0, 0.0

    copy_rates = np.array(copy_rates)
    zero_copy_rate = np.sum(copy_rates == 0) / len(copy_rates) * 100
    high_copy_rate = np.sum(copy_rates > threshold) / len(copy_rates) * 100
    max_copy_rate = np.max(copy_rates) * 100

    return zero_copy_rate, high_copy_rate, max_copy_rate


In [9]:
# Reference texts for the copy check: all training abstracts followed by all training annotations.
train_texts = train['abstract'].to_list() + train['annotation'].to_list()
for model_name in ('gemma3', 'llama4', 'qwen3'):
    zero, high, maxv = ngram_copy_rate(df, model_name, train_texts, n=3, threshold=0.1)
    print(f"{model_name} 3-gram Copy Rate: Zero={zero:.2f}%, High Proportion={high:.2f}%, Max={maxv:.2f}%")

gemma3 3-gram Copy Rate: Zero=91.25%, High Proportion=0.28%, Max=27.78%
llama4 3-gram Copy Rate: Zero=86.63%, High Proportion=1.05%, Max=46.15%
qwen3 3-gram Copy Rate: Zero=94.88%, High Proportion=0.03%, Max=95.24%
