In [6]:
import torch
from torch import nn
from transformers import BertModel, BertTokenizer

# Define the Siamese BERT Network
class SiameseBertNetwork(nn.Module):
    def __init__(self):
        super(SiameseBertNetwork, self).__init__()
        self.bert = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
        self.similarity_layer = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size * 2, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        pooled_output1 = self.bert(input_ids=input_ids1, attention_mask=attention_mask1).pooler_output
        pooled_output2 = self.bert(input_ids=input_ids2, attention_mask=attention_mask2).pooler_output
        combined_output = torch.cat((pooled_output1, pooled_output2), 1)
        similarity_score = self.similarity_layer(combined_output)
        return similarity_score

# Function to tokenize sentences
def tokenize(sentences, tokenizer):
    return tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
model = SiameseBertNetwork()

In [7]:
import pandas as pd

file_path = '/kaggle/input/dataset/summarized_abstracts_bert/summarized_patient-number2-articles.csv'
df = pd.read_csv(file_path)

In [8]:
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
import string

# Ensure you have the NLTK punkt tokenizer downloaded in your environment
nltk.download('punkt')

# Function to split text into sentences and remove punctuation
def split_into_sentences(text):
    return sent_tokenize(text) if pd.notna(text) else []

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Load your CSV file
file_path = '/kaggle/input/dataset/summarized_abstracts_bert/summarized_patient-number2-articles.csv'  # Update with your file path
df = pd.read_csv(file_path)

# Process the summaries to split into sentences and remove punctuation
articles_sentences_cleaned = []
for summary in df['Summary']:
    if pd.notna(summary):
        sentences = split_into_sentences(summary)
        cleaned_sentences = [remove_punctuation(sentence) for sentence in sentences]
        articles_sentences_cleaned.append(cleaned_sentences)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
articles_sentences_cleaned[0]

['However the followup coronary angiography showed negative result and the symptom improved dramatically with the treatment of nonsteroidal antiinflammatory drug treatment',
 'Therefore it is important for the clinician to differentiate acute myocardial infarctionacute stent thrombosis from this rare complication after PCI']

In [10]:
patient_history_sentences = [
        "65", "old woman", "arrives  to the ed", "medical history includes hypertension", "Atherosclerosis", "artery disease", "artery bypass", "when leaning forward", "heard on auscultation", "ECG", "global"
    ]
for i in range(0,4):
    

    pubmed_articles = articles_sentences_cleaned[i]
    # Tokenize sentences
    tokenized_history = tokenize(patient_history_sentences, tokenizer)
    tokenized_articles = tokenize(pubmed_articles, tokenizer)

    # Compute similarity scores
    with torch.no_grad():
        all_scores = []
        for i in range(len(tokenized_history['input_ids'])):
            scores = []
            for j in range(len(tokenized_articles['input_ids'])):
                score = model(
                    input_ids1=tokenized_history['input_ids'][i].unsqueeze(0),
                    attention_mask1=tokenized_history['attention_mask'][i].unsqueeze(0),
                    input_ids2=tokenized_articles['input_ids'][j].unsqueeze(0),
                    attention_mask2=tokenized_articles['attention_mask'][j].unsqueeze(0)
                )
                scores.append(score.item())
            all_scores.append(scores)

    # Aggregate scores (maximum similarity for each sentence in the history list)
    max_similarity_per_sentence = [max(scores) for scores in all_scores]
    overall_entropy = sum(max_similarity_per_sentence) / len(max_similarity_per_sentence)

    print(f"Overall entropy between patient history and pubmed article: {overall_entropy}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Overall entropy between patient history and pubmed article: 0.43397176494965184
Overall entropy between patient history and pubmed article: 0.4577015477877397
Overall entropy between patient history and pubmed article: 0.4434421222943526
Overall entropy between patient history and pubmed article: 0.4578437598852011
