In [1]:
import torch
from torch import nn
from transformers import GPT2Model, GPT2Tokenizer

# Define the Siamese Network using GPT-2
class SiameseGPT2Network(nn.Module):
    def __init__(self):
        super(SiameseGPT2Network, self).__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.similarity_layer = nn.Sequential(
            nn.Linear(self.gpt2.config.n_embd * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        outputs1 = self.gpt2(input_ids=input_ids1, attention_mask=attention_mask1)
        outputs2 = self.gpt2(input_ids=input_ids2, attention_mask=attention_mask2)

        pooled_output1 = torch.mean(outputs1.last_hidden_state, dim=1)
        pooled_output2 = torch.mean(outputs2.last_hidden_state, dim=1)

        # Ensure that the batch sizes are the same
        min_batch_size = min(pooled_output1.size(0), pooled_output2.size(0))
        pooled_output1 = pooled_output1[:min_batch_size, :]
        pooled_output2 = pooled_output2[:min_batch_size, :]

        combined_output = torch.cat((pooled_output1, pooled_output2), 1)
        similarity_score = self.similarity_layer(combined_output)
        return similarity_score

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = SiameseGPT2Network()
tokenizer.pad_token = tokenizer.eos_token
# Function to tokenize sentences

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [2]:
import pandas as pd

file_path = '/kaggle/input/complete-dataset/summarized_abstracts_bert/summarized_patient-number2-articles.csv'
df = pd.read_csv(file_path)
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
import string

# Ensure you have the NLTK punkt tokenizer downloaded in your environment
nltk.download('punkt')

# Function to split text into sentences and remove punctuation
def split_into_sentences(text):
    return sent_tokenize(text) if pd.notna(text) else []

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Load your CSV file
file_path = '/kaggle/input/complete-dataset/summarized_abstracts_gpt/summarized_patient-number2-articles.csv'  # Update with your file path
df = pd.read_csv(file_path)

# Process the summaries to split into sentences and remove punctuation
articles_sentences_cleaned = []
pubIds=df['ID'].tolist()
for summary in df['Summary']:
    if pd.notna(summary):
        sentences = split_into_sentences(summary)
        cleaned_sentences = [remove_punctuation(sentence) for sentence in sentences]
        articles_sentences_cleaned.append(cleaned_sentences)
        

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
articles_sentences_cleaned[0]

['Acute pericarditis is a rare complication of percutaneous coronary intervention PCI',
 'Here we describe a case of PCI complicated by guidewire perforation and contrast extravasation',
 'Acute pericarditis developed 36 hours after PCI procedure with fever and severe chest pain',
 'Electrocardiogram showed ST elevation in inferiorlateral leads',
 'However the followup coronary angiography showed negative result and the symptom improved dramatically with the treatment of nonsteroidal antiinflammatory drug treatment',
 'Therefore it is important for the clinician to differentiate acute myocardial infarction  acute stent thrombosis from this rare complication after PCI']

In [4]:
def tokenize(sentences, tokenizer):
    max_length = 128  # You can adjust this based on your data
    return tokenizer(sentences, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")

# Example data (lists of lists)
patient_history_sentences = [
        "A 65-year-old woman arrives to the ED complaining of chest pain",
        "Her past medical history includes hypertension atherosclerosis, and coronary artery disease",
        "She underwent a coronary artery bypass graft (CABG) 3 weeks ago for three-vessel disease",
        "She reports that her chest pain worsens with inspiration and lessens when leaning forward.",
        "A friction rub is heard on auscultation",
        "ECG shows global ST elevation"
    ]

# Process and compute similarity scores
overall_similarities = []
for article_sentence in articles_sentences_cleaned:
    tokenized_article= tokenize(article_sentence, tokenizer)
    history_similarities = []
    for history_sentence in patient_history_sentences:
        tokenized_history = tokenize(history_sentence, tokenizer)
        with torch.no_grad():
            similarity_score = model(
                input_ids1=tokenized_history['input_ids'],
                attention_mask1=tokenized_history['attention_mask'],
                input_ids2=tokenized_article['input_ids'],
                attention_mask2=tokenized_article['attention_mask']
            )
            history_similarities.append(similarity_score.mean().item())
    overall_similarity = sum(history_similarities) / len(history_similarities) if history_similarities else 0
    overall_similarities.append(overall_similarity)

    
# Print results
entropy_df = pd.DataFrame({'PubId': pubIds, 'Entropy_score': overall_similarities})
print(entropy_df.head())
# Save to CSV
csv_file_path = 'entropy_score.csv'
entropy_df.to_csv(csv_file_path, index=False)

      PubId  Entropy_score
0  23388234       0.728859
1  17921916       0.727644
2  34067941       0.763335
3  35018605       0.762927
4  32856192       0.774622


In [5]:
print(overall_similarities)

[0.7288590371608734, 0.7276435891787211, 0.7633351385593414, 0.7629267772038778, 0.7746218045552572]
