In [3]:
import torch
from transformers import pipeline

# Zero-shot classification pipeline shared by every classification step in
# this notebook. Use the first CUDA GPU when one is available and fall back
# to the CPU (device=-1) otherwise, so the notebook also runs on machines
# without a GPU instead of failing while the pipeline is being constructed.
classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
    device=0 if torch.cuda.is_available() else -1,
)

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

# Prepare Data

In [7]:
import pandas as pd

In [9]:
# Load the paper collection; later cells read its PMID, Title and Abstract columns.
df = pd.read_csv('data/collection_with_abstracts.csv')

In [10]:
# Missing abstracts become empty strings so the concatenation never sees NaN.
df['Abstract'] = df['Abstract'].fillna('')
# Build the classifier input as "<title>\n\n<abstract>" with a vectorized
# string concatenation instead of a row-wise apply. A missing title is treated
# as empty for this column only (df['Title'] itself is left untouched) rather
# than raising a TypeError on NaN + str.
df['text_data'] = df['Title'].fillna('') + '\n\n' + df['Abstract']

# Task 1: Filtering out irrelevant papers

In [11]:
# Flatten the frame into [PMID, text] pairs, the input shape classify_papers iterates over.
title_abstract_list = df[['PMID', 'text_data']].to_numpy().tolist()

In [12]:
# Peek at the first [PMID, text] pair to sanity-check the classifier input.
title_abstract_list[0]

[39435445,
 'Editorial: The operationalization of cognitive systems in the comprehension of visual structures\n\n']

In [19]:
def classify_papers(title_abstract_list, candidate_labels):
    """Score every paper against each candidate label with the zero-shot classifier.

    Relies on the module-level ``classifier`` pipeline.

    Args:
        title_abstract_list: Iterable of ``(PMID, text)`` pairs, where ``text``
            is the string handed to the classifier.
        candidate_labels: Labels passed through to the classifier. Any number
            of labels is supported.

    Returns:
        A list with one dict per paper, in input order. Each dict holds
        ``'PMID'``, ``'title_abstract'`` and one ``label -> score`` entry for
        every label the classifier returns.
    """
    papers_classification = []
    for PMID, title_abstract in title_abstract_list:
        # multi_label=True asks the pipeline to score each label independently
        # (per the transformers documentation), so several labels can all
        # score high for the same paper.
        output = classifier(title_abstract, candidate_labels, multi_label=True)
        data = {'PMID': PMID, 'title_abstract': title_abstract}
        # Pair every returned label with its score instead of copying only the
        # first two entries, so one label or more than two labels also work.
        data.update(zip(output['labels'], output['scores']))
        papers_classification.append(data)
    return papers_classification

In [20]:
# Task 1 relevance labels. These exact strings become the score keys that the
# relevance filter reads from each classification dict.
candidate_labels = [
    "deep learning in virology",
    "deep learning in epidemiology",
]
papers_classification = classify_papers(
    title_abstract_list,
    candidate_labels=candidate_labels,
)

In [22]:
import os
import pickle

# Persist the Task 1 classification results to disk. Create the target
# directory first so a fresh checkout does not fail with FileNotFoundError.
os.makedirs("embeddings", exist_ok=True)
with open("embeddings/papers_classification.pkl", "wb") as file:
    pickle.dump(papers_classification, file)


In [23]:
# Keep a paper when at least one of the two relevance scores reaches 0.5.
relevant_papers = [
    paper
    for paper in papers_classification
    if any(
        paper[topic] >= 0.5
        for topic in ('deep learning in epidemiology', 'deep learning in virology')
    )
]

In [24]:
# Number of papers that passed the relevance filter.
len(relevant_papers)

5832

In [35]:
# One row per relevant paper: PMID, the classified text and both relevance scores.
filtered_df = pd.DataFrame(relevant_papers)

In [36]:
# Preview the relevance-filtered papers.
filtered_df.head()

Unnamed: 0,PMID,title_abstract,deep learning in virology,deep learning in epidemiology
0,39390053,Multi-scale input layers and dense decoder agg...,0.952872,0.938716
1,39363262,Truncated M13 phage for smart detection of E. ...,0.946782,0.882754
2,39287522,AI for Multistructure Incidental Findings and ...,0.175686,0.716531
3,39273393,Digital Whole Slide Image Analysis of Elevated...,0.213716,0.538159
4,39269702,Health Warnings on Instagram Advertisements fo...,0.855169,0.902817


# Task 2: Classify relevant papers

In [37]:
# Attach the two Task 1 relevance scores to the full paper metadata. The inner
# join on PMID keeps only the papers present in the relevance-filtered frame.
filtered_df = df.merge(
    filtered_df[['PMID', 'deep learning in virology', 'deep learning in epidemiology']],
    on='PMID',
    how='inner',
)

In [38]:
# Preview the filtered papers with their full metadata and relevance scores.
filtered_df.head()

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Abstract,text_data,deep learning in virology,deep learning in epidemiology
0,39390053,Multi-scale input layers and dense decoder agg...,"Lan X, Jin W.",Sci Rep. 2024 Oct 10;14(1):23729. doi: 10.1038...,Lan X,Sci Rep,2024,2024/10/10,PMC11467340,,10.1038/s41598-024-74701-0,Accurate segmentation of COVID-19 lesions from...,Multi-scale input layers and dense decoder agg...,0.952872,0.938716
1,39363262,Truncated M13 phage for smart detection of E. ...,"Yuan J, Zhu H, Li S, Thierry B, Yang CT, Zhang...",J Nanobiotechnology. 2024 Oct 3;22(1):599. doi...,Yuan J,J Nanobiotechnology,2024,2024/10/04,PMC11451008,,10.1186/s12951-024-02881-y,BACKGROUND: The urgent need for affordable and...,Truncated M13 phage for smart detection of E. ...,0.946782,0.882754
2,39287522,AI for Multistructure Incidental Findings and ...,"Marcinkiewicz AM, Buchwald M, Shanbhag A, Bedn...",Radiology. 2024 Sep;312(3):e240541. doi: 10.11...,Marcinkiewicz AM,Radiology,2024,2024/09/17,PMC11427857,,10.1148/radiol.240541,Background Incidental extrapulmonary findings ...,AI for Multistructure Incidental Findings and ...,0.175686,0.716531
3,39273393,Digital Whole Slide Image Analysis of Elevated...,"Karancsi Z, Gregus B, Krenács T, Cserni G, Nag...",Int J Mol Sci. 2024 Aug 30;25(17):9445. doi: 1...,Karancsi Z,Int J Mol Sci,2024,2024/09/14,PMC11394775,,10.3390/ijms25179445,Triple-negative breast cancer (TNBC) is a subt...,Digital Whole Slide Image Analysis of Elevated...,0.213716,0.538159
4,39269702,Health Warnings on Instagram Advertisements fo...,"Wu J, Trifiro BM, Ranker LR, Origgi JM, Benjam...",JAMA Netw Open. 2024 Sep 3;7(9):e2434434. doi:...,Wu J,JAMA Netw Open,2024,2024/09/13,PMC11400217,,10.1001/jamanetworkopen.2024.34434,IMPORTANCE: Synthetic nicotine is increasingly...,Health Warnings on Instagram Advertisements fo...,0.855169,0.902817


In [39]:
# Rebuild the [PMID, text] pairs from the relevant papers only, for the Task 2 pass.
title_abstract_list = filtered_df[['PMID', 'text_data']].to_numpy().tolist()

In [40]:
# Task 2 method labels. label_paragraphs looks the scores up under these exact strings.
candidate_labels = [
    "computer vision",
    "text mining",
]
papers_classification = classify_papers(
    title_abstract_list,
    candidate_labels=candidate_labels,
)

In [41]:
def label_paragraphs(papers_classification, threshold=0.5):
    """Tag every classified paper with a method label, in place.

    Each dict in ``papers_classification`` must carry ``'computer vision'``
    and ``'text mining'`` scores. A ``'label'`` key is written into it:

    * ``'both'``            - both scores are at or above ``threshold``
    * ``'computer vision'`` - only the computer-vision score is
    * ``'text mining'``     - only the text-mining score is
    * ``'other'``           - neither score is

    Args:
        papers_classification: Iterable of per-paper score dicts (mutated).
        threshold: Inclusive cut-off a score must reach to count.

    Returns:
        None. The input dicts are updated in place.
    """
    # Keyed by (computer vision passes, text mining passes).
    label_by_outcome = {
        (True, True): 'both',
        (True, False): 'computer vision',
        (False, True): 'text mining',
        (False, False): 'other',
    }
    for entry in papers_classification:
        vision_hit = bool(entry['computer vision'] >= threshold)
        text_hit = bool(entry['text mining'] >= threshold)
        entry['label'] = label_by_outcome[(vision_hit, text_hit)]

In [42]:
# Adds a 'label' key to every dict in papers_classification (in place; returns None).
label_paragraphs(papers_classification, threshold=0.5)

In [44]:
# Inspect a single labelled record.
papers_classification[3]

{'PMID': 39273393,
 'title_abstract': 'Digital Whole Slide Image Analysis of Elevated Stromal Content and Extracellular Matrix Protein Expression Predicts Adverse Prognosis in Triple-Negative Breast Cancer\n\nTriple-negative breast cancer (TNBC) is a subtype of breast cancer with a poor prognosis and limited treatment options. This study evaluates the prognostic value of stromal markers in TNBC, focusing on the tumor-stroma ratio (TSR) and overall stroma ratio (OSR) in whole slide images (WSI), as well as the expression of type-I collagen, type-III collagen, and fibrillin-1 on tissue microarrays (TMAs), using both visual assessment and digital image analysis (DIA). A total of 101 female TNBC patients, primarily treated with surgery between 2005 and 2016, were included. We found that high visual OSR correlates with worse overall survival (OS), advanced pN categories, lower stromal tumor-infiltrating lymphocyte count (sTIL), lower mitotic index, and patient age (p &lt; 0.05). TSR showed 

In [45]:
# Join the Task 2 scores and labels back onto the paper metadata by PMID.
classified_df = filtered_df.merge(
    pd.DataFrame(papers_classification),
    on='PMID',
    how='inner',
)

In [46]:
# Preview the final frame: metadata, relevance scores, method scores and label.
classified_df.head()

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Abstract,text_data,deep learning in virology,deep learning in epidemiology,title_abstract,computer vision,text mining,label
0,39390053,Multi-scale input layers and dense decoder agg...,"Lan X, Jin W.",Sci Rep. 2024 Oct 10;14(1):23729. doi: 10.1038...,Lan X,Sci Rep,2024,2024/10/10,PMC11467340,,10.1038/s41598-024-74701-0,Accurate segmentation of COVID-19 lesions from...,Multi-scale input layers and dense decoder agg...,0.952872,0.938716,Multi-scale input layers and dense decoder agg...,0.147727,0.043978,other
1,39363262,Truncated M13 phage for smart detection of E. ...,"Yuan J, Zhu H, Li S, Thierry B, Yang CT, Zhang...",J Nanobiotechnology. 2024 Oct 3;22(1):599. doi...,Yuan J,J Nanobiotechnology,2024,2024/10/04,PMC11451008,,10.1186/s12951-024-02881-y,BACKGROUND: The urgent need for affordable and...,Truncated M13 phage for smart detection of E. ...,0.946782,0.882754,Truncated M13 phage for smart detection of E. ...,0.966224,0.478241,computer vision
2,39287522,AI for Multistructure Incidental Findings and ...,"Marcinkiewicz AM, Buchwald M, Shanbhag A, Bedn...",Radiology. 2024 Sep;312(3):e240541. doi: 10.11...,Marcinkiewicz AM,Radiology,2024,2024/09/17,PMC11427857,,10.1148/radiol.240541,Background Incidental extrapulmonary findings ...,AI for Multistructure Incidental Findings and ...,0.175686,0.716531,AI for Multistructure Incidental Findings and ...,0.113535,0.711038,text mining
3,39273393,Digital Whole Slide Image Analysis of Elevated...,"Karancsi Z, Gregus B, Krenács T, Cserni G, Nag...",Int J Mol Sci. 2024 Aug 30;25(17):9445. doi: 1...,Karancsi Z,Int J Mol Sci,2024,2024/09/14,PMC11394775,,10.3390/ijms25179445,Triple-negative breast cancer (TNBC) is a subt...,Digital Whole Slide Image Analysis of Elevated...,0.213716,0.538159,Digital Whole Slide Image Analysis of Elevated...,0.870068,0.602624,both
4,39269702,Health Warnings on Instagram Advertisements fo...,"Wu J, Trifiro BM, Ranker LR, Origgi JM, Benjam...",JAMA Netw Open. 2024 Sep 3;7(9):e2434434. doi:...,Wu J,JAMA Netw Open,2024,2024/09/13,PMC11400217,,10.1001/jamanetworkopen.2024.34434,IMPORTANCE: Synthetic nicotine is increasingly...,Health Warnings on Instagram Advertisements fo...,0.855169,0.902817,Health Warnings on Instagram Advertisements fo...,0.969292,0.889679,both


In [48]:
# Distribution of the assigned method labels across the relevant papers.
classified_df['label'].value_counts()

label
text mining        2657
other              2018
both                918
computer vision     239
Name: count, dtype: int64

In [50]:
import os

# Write the final table to CSV. Create the output directory first so a fresh
# checkout does not fail because 'results/' is missing.
os.makedirs('results', exist_ok=True)
classified_df.to_csv('results/filtered_classified_papers.csv', index=False)