In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data/collection_with_abstracts.csv')

In [3]:
df['Abstract'] = df['Abstract'].fillna('')

In [4]:
# Build one text field per paper: title, blank line, abstract.
# Vectorized string concatenation replaces the row-wise apply; a missing Title is
# treated as an empty string so a NaN title cannot raise a TypeError here.
df['text_data'] = df['Title'].fillna('') + '\n\n' + df['Abstract']

# Semantic Similarity

In [None]:
# text_mining_phrases = ["natural language processing", "text mining", "NLP", "computational linguistics", "language processing", "text analytics", "textual data analysis", "text data analysis", "text analysis", "speech and language technology", "language modeling", "computational semantics"]
# computer_vision_phrases = ["computer vision", "vision model", "image processing", "vision algorithms", "computer graphics and vision", "object recognition", "scene understanding"]

In [5]:
from sentence_transformers import SentenceTransformer
import spacy

# Sentence-embedding model used by the encoding and similarity helpers in this notebook.
# Alternative checkpoint (currently disabled): SentenceTransformer('all-MiniLM-L6-v2')
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
# spaCy English pipeline. NOTE(review): no visible cell references `nlp` — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")


def encode_paragraphs(paragraphs):
    """Embed every paragraph with the module-level ``bi_encoder``.

    Args:
        paragraphs: Iterable of ``(PMID, paragraph)`` pairs.

    Returns:
        A list with one dict per input pair, carrying the ``PMID``, the
        ``paragraph`` text and its ``paragraph_embeddings`` tensor.
    """
    return [
        {
            "PMID": pmid,
            "paragraph": text,
            # Each paragraph is encoded on its own, as a single string.
            "paragraph_embeddings": bi_encoder.encode(text, convert_to_tensor=True),
        }
        for pmid, text in paragraphs
    ]




In [6]:
# Flatten the two columns into a list of [PMID, text_data] pairs, the shape that
# encode_paragraphs unpacks in its loop.
paragraphs = df[['PMID', 'text_data']].values.tolist()

In [7]:
paragraphs[0]

[39435445,
 'Editorial: The operationalization of cognitive systems in the comprehension of visual structures\n\n']

In [8]:
# Encode paragraphs
paragraph_embeddings_list = encode_paragraphs(paragraphs)

# Save and Load paragraph_embeddings_list

In [9]:
# import pickle

# with open("embeddings/paragraph_embeddings_list.pkl", "wb") as file:
#     pickle.dump(paragraph_embeddings_list, file)


In [15]:
import pickle

# Load the pickled embeddings list from disk (the matching dump is the
# commented-out cell just above).
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load files that were produced by this notebook or another trusted source.
with open("embeddings/paragraph_embeddings_list.pkl", "rb") as file:
    paragraph_embeddings_list = pickle.load(file)

In [32]:
from sentence_transformers import util

# Scratch check: score one query against the first stored paragraph only.
query_embedding = bi_encoder.encode('deep learning', convert_to_tensor=True)

# NOTE(review): never filled in this cell — looks like a leftover; confirm it can go.
paragraph_scores = []


paragraph_embeddings = paragraph_embeddings_list[0]["paragraph_embeddings"]

# Cosine similarity between the query and the stored embedding(s); [0] selects the
# row belonging to the single query.
# NOTE(review): encode_paragraphs embeds each paragraph as one string, so this is
# presumably a single paragraph-level score rather than per-sentence scores — confirm.
similarities = util.cos_sim(query_embedding, paragraph_embeddings)[0]

In [47]:
from sentence_transformers import util

def get_paragraph_similarity_with_query(query, paragraph_embeddings_list):
    """Score every stored paragraph against a single query.

    Args:
        query: Query text, embedded with the module-level ``bi_encoder``.
        paragraph_embeddings_list: Iterable of dicts with ``PMID``,
            ``paragraph`` and ``paragraph_embeddings`` keys.

    Returns:
        A list of dicts, one per paragraph, with keys ``PMID``, ``paragraph``
        and ``similarity_score`` (the largest cosine similarity found).
    """
    query_vector = bi_encoder.encode(query, convert_to_tensor=True)

    def _score(entry):
        # Row 0 of the similarity matrix belongs to the query; keep its largest value.
        query_row = util.cos_sim(query_vector, entry["paragraph_embeddings"])[0]
        best = query_row.max().item()
        return {
            'PMID': entry['PMID'],
            'paragraph': entry["paragraph"],
            'similarity_score': best,
        }

    return [_score(entry) for entry in paragraph_embeddings_list]


In [70]:
# Scratch check for the multi-query case: embed two queries in one call.
queries_embeddings = bi_encoder.encode(['deep learning', 'operationalization'], convert_to_tensor=True)


paragraph_embeddings = paragraph_embeddings_list[0]["paragraph_embeddings"]

# Similarity matrix with one row per query, computed against the first stored paragraph.
similarities = util.cos_sim(queries_embeddings, paragraph_embeddings)

# Largest similarity over the whole matrix, as a Python float.
max_score = similarities.max().item()

# Flat argmax over the similarity matrix; the cell outputs that follow show 1 here.
max_index = np.argmax(similarities.cpu().numpy())

In [71]:
np.argmax(similarities.cpu().numpy())

1

In [72]:
max_score

0.47679921984672546

In [73]:
max_index

1

In [84]:
# Re-creates the encoder with the same checkpoint name as the model-loading cell near the top.
# Alternative checkpoint (currently disabled): SentenceTransformer('all-MiniLM-L6-v2')
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')

# Filter out irrelevant papers

In [10]:
from sentence_transformers import util
import numpy as np

def get_max_similarity_with_queries(queries, paragraph_embeddings_list):
    """Find, for every stored paragraph, the query phrase it is most similar to.

    Args:
        queries: A single query string or a sequence of query strings.
        paragraph_embeddings_list: Iterable of dicts with ``PMID``,
            ``paragraph`` and ``paragraph_embeddings`` keys.

    Returns:
        A list of dicts, one per paragraph, with keys ``PMID``, ``paragraph``,
        ``phrase_with_max_score`` (the best-matching query) and ``max_score``
        (its cosine similarity as a Python float).
    """
    # A bare string is a single query. Without this normalization the lookup
    # `queries[index]` below would return one character of the string instead
    # of the phrase itself.
    if isinstance(queries, str):
        queries = [queries]
    else:
        queries = list(queries)

    queries_embeddings = bi_encoder.encode(queries, convert_to_tensor=True)

    paragraph_scores = []

    for paragraph_data in paragraph_embeddings_list:
        similarities = util.cos_sim(queries_embeddings, paragraph_data["paragraph_embeddings"])
        score_matrix = similarities.cpu().numpy()

        # Rows of the matrix correspond to queries. Unravelling the flat argmax
        # keeps the query index valid even if a paragraph carries more than one
        # embedding column.
        best_query_index = np.unravel_index(np.argmax(score_matrix), score_matrix.shape)[0]

        paragraph_scores.append({
            'PMID': paragraph_data['PMID'],
            'paragraph': paragraph_data["paragraph"],
            'phrase_with_max_score': queries[best_query_index],
            'max_score': similarities.max().item(),
        })

    return paragraph_scores
 

In [33]:
def get_matches(queries, threshold):
    """Return the PMIDs whose best query similarity reaches ``threshold``.

    Args:
        queries: Query string(s) forwarded to ``get_max_similarity_with_queries``.
        threshold: Minimum ``max_score`` (inclusive) for a paragraph to match.

    Returns:
        A set of PMIDs taken from the module-level ``paragraph_embeddings_list``.
    """
    paragraph_scores = get_max_similarity_with_queries(queries, paragraph_embeddings_list)
    # The result is an unordered set, so ranking the scores first would be wasted work.
    return {p['PMID'] for p in paragraph_scores if p['max_score'] >= threshold}

In [52]:
a, b, c = get_matches("virology", 0.0), get_matches("epidemiology", 0.0), get_matches("deep learning", 0.0)

In [53]:
len((a | b) & c)

6721

In [54]:
# PMIDs matching (virology OR epidemiology) AND deep learning at threshold 0.0.
# Reuses the sets a, b and c computed two cells above instead of scoring every
# paragraph three more times; the resulting set is the same.
combined = (a | b) & c

In [55]:
len(combined)

6721

In [115]:
# Keep only the stored paragraph records whose PMID is in the `combined` set.
paragraph_embeddings_list_filtered = [ p for p in paragraph_embeddings_list if p['PMID'] in combined ]

In [116]:
len(paragraph_embeddings_list_filtered)

6721

In [57]:
# Score every stored paragraph against two combined-topic phrases.
queries = ['deep learning in virology', 'deep learning in epidemiology']

paragraph_scores = get_max_similarity_with_queries(queries, paragraph_embeddings_list)
# Order the records from highest to lowest max_score.
paragraph_scores.sort(key=lambda x: x['max_score'], reverse=True)

In [65]:
len(set([p['PMID'] for p in paragraph_scores if p['max_score'] >= 0.15]))

6383

# Classify papers

In [66]:
# Seed phrases describing the text-mining / NLP topic.
text_mining_phrases = [
    "natural language processing",
    "text mining",
    "NLP",
    "computational linguistics",
    "language processing",
    "text analytics",
    "textual data analysis",
    "text data analysis",
    "text analysis",
    "speech and language technology",
    "language modeling",
    "computational semantics",
]

# Seed phrases describing the computer-vision topic.
computer_vision_phrases = [
    "computer vision",
    "vision model",
    "image processing",
    "vision algorithms",
    "computer graphics and vision",
    "object recognition",
    "scene understanding",
]

In [126]:
from sentence_transformers import util
import numpy as np

def get_similarity_with_cv_text_queries(computer_vision_phrases, text_mining_phrases, paragraph_list):
    """Score each paragraph against the computer-vision and text-mining phrase sets.

    Args:
        computer_vision_phrases: List of phrases describing computer vision.
        text_mining_phrases: List of phrases describing text mining.
        paragraph_list: Iterable of dicts with ``PMID``, ``paragraph`` and
            ``paragraph_embeddings`` keys.

    Returns:
        A list of dicts, one per paragraph, with keys ``PMID``, ``paragraph``,
        ``computer_vision_max_phrase``, ``computer_vision_max_score``,
        ``text_mining_max_phrase`` and ``text_mining_max_score``.
    """
    computer_vision_embeddings = bi_encoder.encode(computer_vision_phrases, convert_to_tensor=True)
    text_mining_embeddings = bi_encoder.encode(text_mining_phrases, convert_to_tensor=True)

    def _best_phrase(phrases, phrase_embeddings, paragraph_embeddings):
        # Compute the similarity matrix once and derive both the winning phrase
        # and its score from it.
        similarities = util.cos_sim(phrase_embeddings, paragraph_embeddings)
        score_matrix = similarities.cpu().numpy()
        # Rows correspond to phrases; unravelling the flat argmax keeps the phrase
        # index valid even if a paragraph carries more than one embedding column.
        phrase_index = np.unravel_index(np.argmax(score_matrix), score_matrix.shape)[0]
        return phrases[phrase_index], similarities.max().item()

    paragraph_scores = []

    for paragraph_data in paragraph_list:
        paragraph_embeddings = paragraph_data["paragraph_embeddings"]

        cv_phrase, cv_score = _best_phrase(
            computer_vision_phrases, computer_vision_embeddings, paragraph_embeddings
        )
        text_phrase, text_score = _best_phrase(
            text_mining_phrases, text_mining_embeddings, paragraph_embeddings
        )

        paragraph_scores.append({
            'PMID': paragraph_data['PMID'],
            'paragraph': paragraph_data['paragraph'],
            'computer_vision_max_phrase': cv_phrase,
            'computer_vision_max_score': cv_score,
            'text_mining_max_phrase': text_phrase,
            'text_mining_max_score': text_score,
        })

    return paragraph_scores


In [127]:
paragraph_scores_cv_text = get_similarity_with_cv_text_queries(computer_vision_phrases, text_mining_phrases, paragraph_embeddings_list_filtered)

In [128]:
len(paragraph_scores_cv_text)

6721

In [129]:
def label_paragraphs(paragraph_scores_cv_text, threshold=0.3):
    """Tag every score record in place with a topic ``label``.

    Args:
        paragraph_scores_cv_text: Iterable of dicts holding
            ``computer_vision_max_score`` and ``text_mining_max_score``.
        threshold: Minimum score (inclusive) for a topic to count as present.

    Returns:
        None. Each dict gains a ``label`` key set to ``'both'``,
        ``'computer vision'``, ``'text mining'`` or ``'other'``.
    """
    # Lookup keyed by (computer vision present, text mining present).
    label_by_flags = {
        (True, True): 'both',
        (True, False): 'computer vision',
        (False, True): 'text mining',
        (False, False): 'other',
    }
    for record in paragraph_scores_cv_text:
        has_cv = bool(record['computer_vision_max_score'] >= threshold)
        has_text = bool(record['text_mining_max_score'] >= threshold)
        record['label'] = label_by_flags[(has_cv, has_text)]

In [133]:
label_paragraphs(paragraph_scores_cv_text, threshold=0.2)

In [134]:
labels = [ d['label'] for d in paragraph_scores_cv_text ]

In [135]:
from collections import Counter
Counter(labels)

Counter({'both': 2956,
         'text mining': 1654,
         'other': 1590,
         'computer vision': 521})