In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the paper collection; later cells read its PMID, Title and Abstract columns.
df = pd.read_csv('data/collection_with_abstracts.csv')

In [3]:
# Replace missing abstracts with an empty string so the Title + Abstract
# concatenation in the next cell always operates on strings.
df['Abstract'] = df['Abstract'].fillna('')

In [4]:
# Build the text that gets embedded: title, a blank line, then the abstract.
# Vectorized string concatenation replaces the row-wise `apply`, and a missing
# Title is treated as empty text instead of raising TypeError on NaN + str.
df['text_data'] = df['Title'].fillna('') + '\n\n' + df['Abstract']

# Semantic Similarity

In [None]:
# text_mining_phrases = ["natural language processing", "text mining", "NLP", "computational linguistics", "language processing", "text analytics", "textual data analysis", "text data analysis", "text analysis", "speech and language technology", "language modeling", "computational semantics"]
# computer_vision_phrases = ["computer vision", "vision model", "image processing", "vision algorithms", "computer graphics and vision", "object recognition", "scene understanding"]

In [5]:
from sentence_transformers import SentenceTransformer
import spacy

# Sentence-embedding model shared by the encoding and query functions in the cells below.
# sbert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use any preferred SBERT model
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
# spaCy pipeline; split_paragraph_to_sentences uses it for sentence segmentation.
nlp = spacy.load("en_core_web_sm")

def split_paragraph_to_sentences(paragraph):
    """Segment `paragraph` into sentences with the notebook-level spaCy pipeline.

    Args:
        paragraph: Text to segment.

    Returns:
        A list of sentence strings in document order, each stripped of
        leading and trailing whitespace.
    """
    return [span.text.strip() for span in nlp(paragraph).sents]

def encode_paragraphs(paragraphs):
    """Split each paragraph into sentences and embed those sentences.

    Args:
        paragraphs: Iterable of (PMID, paragraph text) pairs.

    Returns:
        A list with one dict per input pair, in input order, holding the keys
        "PMID", "paragraph", "sentences" (the list of sentence strings) and
        "sentence_embeddings" (the result of encoding those sentences with
        `bi_encoder.encode(..., convert_to_tensor=True)`).
    """
    records = []
    for pmid, text in paragraphs:
        sents = split_paragraph_to_sentences(text)
        # One encode call per paragraph, covering all of its sentences.
        vectors = bi_encoder.encode(sents, convert_to_tensor=True)
        records.append(
            dict(
                PMID=pmid,
                paragraph=text,
                sentences=sents,
                sentence_embeddings=vectors,
            )
        )
    return records




In [6]:
# Flatten to a list of [PMID, text_data] pairs, the form encode_paragraphs iterates over.
paragraphs = df[['PMID', 'text_data']].values.tolist()

In [7]:
# Peek at the first [PMID, text_data] pair.
paragraphs[0]

[39435445,
 'Editorial: The operationalization of cognitive systems in the comprehension of visual structures\n\n']

In [42]:
# Encode paragraphs: split every text into sentences and embed them with the bi-encoder.
# NOTE(review): a later cell rebinds `paragraph_sentence_embeddings` from
# embeddings/paragraph_sentence_embeddings.pkl, replacing this in-memory result.
paragraph_sentence_embeddings = encode_paragraphs(paragraphs)

# Save and Load paragraph_sentence_embeddings

In [43]:
# import pickle

# with open("embeddings/paragraph_sentence_embeddings.pkl", "wb") as file:
#     pickle.dump(paragraph_sentence_embeddings, file)


In [8]:
import pickle

# Load previously saved embeddings from disk, rebinding `paragraph_sentence_embeddings`.
# NOTE(review): unpickling can execute arbitrary code; only load a file that
# was produced by this notebook or another trusted source.
with open("embeddings/paragraph_sentence_embeddings.pkl", "rb") as file:
    paragraph_sentence_embeddings = pickle.load(file)

In [9]:
# Inspect the second loaded record (PMID, paragraph, sentences, embeddings).
paragraph_sentence_embeddings[1]

{'PMID': 39398866,
 'paragraph': "Characterization of arteriosclerosis based on computer-aided measurements of intra-arterial thickness\n\nPURPOSE: Our purpose is to develop a computer vision approach to quantify intra-arterial thickness on digital pathology images of kidney biopsies as a computational biomarker of arteriosclerosis.\r\nAPPROACH: The severity of the arteriosclerosis was scored (0 to 3) in 753 arteries from 33 trichrome-stained whole slide images (WSIs) of kidney biopsies, and the outer contours of the media, intima, and lumen were manually delineated by a renal pathologist. We then developed a multi-class deep learning (DL) framework for segmenting the different intra-arterial compartments (training dataset: 648 arteries from 24 WSIs; testing dataset: 105 arteries from 9 WSIs). Subsequently, we employed radial sampling and made measurements of media and intima thickness as a function of spatially encoded polar coordinates throughout the artery. Pathomic features were ex

# Filter out irrelevant papers

In [10]:
from sentence_transformers import util

def get_paragraph_similarity_with_query(query, paragraph_sentence_embeddings):
    """Score each paragraph by its best-matching sentence for `query`.

    Args:
        query: Text passed to `bi_encoder.encode` to obtain the query embedding.
        paragraph_sentence_embeddings: Iterable of dicts with the keys
            "PMID", "paragraph", "sentences" and "sentence_embeddings".

    Returns:
        A list with one dict per paragraph, in input order, with the keys
        'PMID', 'paragraph', 'max_sentence' (the sentence with the highest
        cosine similarity to the query) and 'max_sentence_score' (that
        similarity as a Python number).
    """
    query_vec = bi_encoder.encode(query, convert_to_tensor=True)

    def best_sentence(entry):
        # Row 0 of the similarity matrix holds the scores for the query.
        sims = util.cos_sim(query_vec, entry["sentence_embeddings"])[0]
        top_score = sims.max().item()
        top_idx = sims.argmax().item()
        return {
            'PMID': entry['PMID'],
            'paragraph': entry["paragraph"],
            'max_sentence': entry["sentences"][top_idx],
            'max_sentence_score': top_score,
        }

    return [best_sentence(entry) for entry in paragraph_sentence_embeddings]


In [11]:
def get_matches(query, threshold, paragraph_embeddings=None):
    """Return the PMIDs whose best sentence is similar enough to `query`.

    Args:
        query: Query text to compare every paragraph against.
        threshold: Minimum best-sentence cosine similarity (inclusive) for a
            paragraph to count as a match.
        paragraph_embeddings: Records to search, in the format produced by
            `encode_paragraphs`. Defaults to the notebook-level
            `paragraph_sentence_embeddings`, which keeps existing two-argument
            calls working unchanged.

    Returns:
        A set of PMIDs with 'max_sentence_score' >= `threshold`.
    """
    if paragraph_embeddings is None:
        paragraph_embeddings = paragraph_sentence_embeddings

    scores = get_paragraph_similarity_with_query(query, paragraph_embeddings)
    # The result is an unordered set, so sorting the scores first is unnecessary.
    return {s['PMID'] for s in scores if s['max_sentence_score'] >= threshold}

In [18]:
# Keep PMIDs that match ("virology" OR "epidemiology") AND "deep learning",
# each judged by a best-sentence similarity of at least 0.15.
combined = (get_matches("virology", 0.15) | get_matches("epidemiology", 0.15)) & get_matches("deep learning", 0.15)

In [19]:
# Number of PMIDs that passed the relevance filter.
len(combined)

4959

In [20]:
# Keep only the embedding records whose PMID is in the `combined` set.
paragraph_sentence_embeddings_filtered = [ p for p in paragraph_sentence_embeddings if p['PMID'] in combined ]

In [21]:
# Number of embedding records retained after filtering.
len(paragraph_sentence_embeddings_filtered)

4959

# Classify papers

In [22]:
# Query phrases describing each topic; get_similarity_with_cv_text_queries
# compares every paragraph sentence against all phrases in each list.
text_mining_phrases = ["natural language processing", "text mining", "NLP", "computational linguistics", "language processing", "text analytics", "textual data analysis", "text data analysis", "text analysis", "speech and language technology", "language modeling", "computational semantics"]
computer_vision_phrases = ["computer vision", "vision model", "image processing", "vision algorithms", "computer graphics and vision", "object recognition", "scene understanding"]

In [23]:
from sentence_transformers import util
import numpy as np

def _best_phrase_sentence_match(phrase_embeddings, sentence_embeddings):
    """Return (phrase_index, sentence_index, score) of the highest cosine similarity."""
    # Rows correspond to phrases, columns to sentences.
    similarities = util.cos_sim(phrase_embeddings, sentence_embeddings)
    # Flat argmax over the whole matrix, mapped back to (row, column).
    phrase_idx, sentence_idx = np.unravel_index(
        np.argmax(similarities.cpu().numpy()), similarities.shape
    )
    return int(phrase_idx), int(sentence_idx), similarities.max().item()


def get_similarity_with_cv_text_queries(computer_vision_phrases, text_mining_phrases, paragraph_sentence_embeddings):
    """Find each paragraph's best computer-vision and text-mining phrase match.

    For every paragraph, all of its sentence embeddings are compared with all
    phrase embeddings of each topic, and the single highest-scoring
    (phrase, sentence) pair per topic is reported. Each similarity matrix is
    computed once and reused for both the score and the indices.

    Args:
        computer_vision_phrases: List of computer-vision query phrases.
        text_mining_phrases: List of text-mining query phrases.
        paragraph_sentence_embeddings: Iterable of dicts with the keys
            'PMID', 'sentences' and "sentence_embeddings".

    Returns:
        A list with one dict per paragraph, in input order, with the keys
        'PMID', 'computer_vision_max_phrase', 'computer_vision_max_sentence',
        'computer_vision_max_score', 'text_mining_max_phrase',
        'text_mining_max_sentence' and 'text_mining_max_score'.
    """
    # Phrase embeddings do not depend on the paragraph, so encode them once.
    computer_vision_embeddings = bi_encoder.encode(computer_vision_phrases, convert_to_tensor=True)
    text_mining_embeddings = bi_encoder.encode(text_mining_phrases, convert_to_tensor=True)

    paragraph_scores = []

    for paragraph_data in paragraph_sentence_embeddings:
        sentences = paragraph_data['sentences']
        sentence_embeddings = paragraph_data["sentence_embeddings"]

        cv_phrase_idx, cv_sentence_idx, cv_max_score = _best_phrase_sentence_match(
            computer_vision_embeddings, sentence_embeddings
        )
        text_phrase_idx, text_sentence_idx, text_max_score = _best_phrase_sentence_match(
            text_mining_embeddings, sentence_embeddings
        )

        paragraph_scores.append({
            'PMID': paragraph_data['PMID'],
            'computer_vision_max_phrase': computer_vision_phrases[cv_phrase_idx],
            'computer_vision_max_sentence': sentences[cv_sentence_idx],
            'computer_vision_max_score': cv_max_score,
            'text_mining_max_phrase': text_mining_phrases[text_phrase_idx],
            'text_mining_max_sentence': sentences[text_sentence_idx],
            'text_mining_max_score': text_max_score,
        })

    return paragraph_scores


In [24]:
# Score only the filtered records against both phrase lists.
paragraph_scores_cv_text = get_similarity_with_cv_text_queries(computer_vision_phrases, text_mining_phrases, paragraph_sentence_embeddings_filtered)

In [25]:
# Number of scored records.
len(paragraph_scores_cv_text)

4959

In [26]:
def label_paragraphs(paragraph_scores_cv_text, threshold=0.3):
    """Attach a topic label to every score record, in place.

    Args:
        paragraph_scores_cv_text: Iterable of dicts carrying
            'computer_vision_max_score' and 'text_mining_max_score'.
        threshold: Minimum score (inclusive) for a topic to count as present.

    Each dict gains a 'label' key set to 'both', 'computer vision',
    'text mining' or 'other'. Nothing is returned.
    """
    for record in paragraph_scores_cv_text:
        is_cv = record['computer_vision_max_score'] >= threshold
        is_text = record['text_mining_max_score'] >= threshold
        if is_cv:
            record['label'] = 'both' if is_text else 'computer vision'
        else:
            record['label'] = 'text mining' if is_text else 'other'

In [36]:
# Adds a 'label' key to every dict in paragraph_scores_cv_text (mutates in place, returns nothing).
label_paragraphs(paragraph_scores_cv_text, threshold=0.3)

In [37]:
# Collect the assigned labels in record order.
labels = [ d['label'] for d in paragraph_scores_cv_text ]

In [38]:
from collections import Counter
# Tally how many records received each label.
Counter(labels)

Counter({'other': 1554,
         'text mining': 1333,
         'both': 1211,
         'computer vision': 861})