# Analysis of the PMC-Patients dataset
Link to paper: https://arxiv.org/abs/2202.13876

In [None]:
from datasets import load_dataset


# Load only the "train" split of the zhengyun21/PMC-Patients dataset.
dataset = load_dataset(
    "zhengyun21/PMC-Patients",
    split="train",
)

In [None]:
import pandas as pd

# Convert the loaded dataset into a DataFrame; the later cells work on this frame.
patients_summaries = dataset.to_pandas()
# Overview of columns, non-null counts and dtypes.
patients_summaries.info()

In [None]:
# Preview the first five rows (the default row count, spelled out).
patients_summaries.head(n=5)

### Compute the average length of all titles:

In [None]:
# Length of every title, kept as a plain list because the histogram cell reuses it.
title_lengths = list(map(len, patients_summaries["title"]))
pd.Series(title_lengths).describe()

In [None]:
import matplotlib.pyplot as plt

# Histogram of the title lengths on a single wide axes.
fig, ax = plt.subplots(figsize=(20, 4))
ax.hist(title_lengths, bins=100, label="count")
ax.set_xlabel('Length')
ax.set_ylabel('Count')
ax.set_title('Title Length Distribution')
plt.show()

### Compute the average length of all patient summaries:

In [None]:
# Length of every patient summary, as a Series so quantiles can be taken later.
summary_lengths = pd.Series(list(map(len, patients_summaries["patient"])))
summary_lengths.describe()

In [None]:
import numpy as np


# Cut-offs at the 1st and 99th percentile of the summary lengths.
quantile_1 = summary_lengths.quantile(0.01)
quantile_99 = summary_lengths.quantile(0.99)

# Everything at or beyond either cut-off is treated as an outlier and removed.
too_short = summary_lengths <= quantile_1
too_long = summary_lengths >= quantile_99
index = summary_lengths[too_long | too_short].index

summary_lengths_cleaned = summary_lengths.drop(index)

# Histogram of the remaining lengths on a single wide axes.
fig, ax = plt.subplots(figsize=(20, 4))
ax.hist(summary_lengths_cleaned, bins=200, label="count")
ax.set_xlabel('Length')
ax.set_ylabel('Count')
ax.set_title('Patient Summary Length Distribution (trimmed)')
plt.show()

### Generate random subsets for analysing the veterinary content:

In [None]:
# Draw 100 rows for manual review; the fixed random_state keeps the sample reproducible.
patients_summaries_samples = patients_summaries.sample(n=100, random_state=42)


In [None]:
# Write the sample as JSON Lines (one record per line).
# orient="records" never serialises the index, so no index argument is needed;
# passing index=False with this orient raises a ValueError on pandas < 2.1.
patients_summaries_samples.to_json("patients_summaries_samples.json", orient="records", lines=True)

Manual analysis of this randomly generated sample set led to the following results:

|Sample Set|Veterinary Proportion|Extrapolation|
|:-----|:--------|:--------|
|Patients Summaries|1%|~ 1,670|

### Analysis by fine-tuned BlueBERT

In [None]:
import torch
from torch import cuda


# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
import kaggle
import os

# NOTE(review): BlueBERTClassifier is not referenced by name below; presumably the
# import is needed so that unpickling the checkpoint can resolve the class — confirm.
from Source_code.z_utils.BlueBERTClassifier import BlueBERTClassifier


kaggle.api.authenticate()
data_path = "./models/"
if not os.path.exists(data_path):
    os.makedirs(data_path)
    print(f"Directory created: {data_path}")
models = {}

file_name = "bluebert_pubmed_uncased_L-24_H-1024_A-16.pt"
target_path = f"{data_path}{file_name}"

# Download the checkpoint only when it is not already present locally.
if not os.path.exists(target_path):
    slug = "bluebert-large-pubmed"
    kaggle.api.model_instance_version_download_cli(f"marcelhiltner/{slug}/pytorch/{slug}/1", data_path, untar=True)

# The loaded object is used directly as a model (``.eval()`` below), i.e. the file
# is a whole pickled module rather than a state dict:
#   * map_location=device remaps the stored tensors to the device chosen in the
#     device-selection cell, so a checkpoint saved on a GPU also loads on a
#     CPU-only machine;
#   * weights_only=False is required for such pickles on PyTorch versions whose
#     default is weights_only=True.
# SECURITY NOTE: unpickling executes arbitrary code from the file — only load
# checkpoints from a source you trust.
blueBERT = torch.load(target_path, map_location=device, weights_only=False)
blueBERT.eval()
print("BlueBERT loaded.")

In [None]:
def save_vet_predictions_as_json(predictions, texts, filename):
    """Write the texts whose predicted label is 1 (veterinary) to ``filename``.

    Args:
        predictions: One score vector per text: a 2-D tensor/array or a
            sequence of tensors, arrays or lists. The arg-max of each vector
            is used as the predicted label.
        texts: pandas Series or DataFrame with one row per prediction, in the
            same order as ``predictions``. Rows are matched by position, so
            the index does not need to be a fresh ``RangeIndex``.
        filename: Path of the output file. The selected records are written
            as JSON objects separated by a single space.

    Returns:
        DataFrame with the selected rows of ``texts`` plus a ``probability``
        column holding the highest score of each selected row.
    """
    # Accept tensors as well as plain arrays/lists for every score vector.
    scores = torch.stack([torch.as_tensor(row) for row in predictions])
    scores = scores.reshape(scores.shape[0], -1)

    label_preds = scores.argmax(dim=1)
    top_scores = scores.max(dim=1).values.numpy(force=True)
    vet_positions = (label_preds == 1).nonzero(as_tuple=True)[0].numpy(force=True)

    # Select by position and reuse the selected rows' own index for the scores,
    # so texts and scores stay aligned whatever index ``texts`` carries.
    vet_texts = texts.iloc[vet_positions]
    probs = pd.Series(top_scores[vet_positions], index=vet_texts.index, name="probability")
    vet_df = pd.concat([vet_texts, probs], axis=1)

    # lines=True makes pandas emit one record per line, so a "},{" sequence
    # inside a text is never mistaken for a record boundary. Newlines inside
    # values are escaped by the serialiser, hence "\n" only separates records.
    records = vet_df.to_json(orient="records", lines=True).split("\n")
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(" ".join(record for record in records if record))

    print(f"Predictions have been saved to {filename}.")

    return vet_df

In [None]:
from Source_code.z_utils.data_preprocessing import preprocess_text
from Source_code.z_utils.predict import predict


# Reproducible sample of 10,000 summaries, re-indexed from 0.
summary_texts = (
    patients_summaries["patient"]
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)
summary_texts_pp = list(map(preprocess_text, summary_texts))

# Put the model in evaluation mode on the selected device before predicting.
blueBERT.eval()
blueBERT.to(device)
summary_predictions, _ = predict(blueBERT, texts=summary_texts_pp, device=device)

summary_predictions

In [None]:
summary_predictions_t = torch.tensor(summary_predictions)

# Delegate to the helper defined earlier in this notebook instead of repeating
# its body here: it selects the rows predicted as class 1, attaches the top
# score of each row, writes the records to ``filename`` and prints the
# confirmation message.
filename = "summaries_vet.json"
vet_df = save_vet_predictions_as_json(summary_predictions_t, summary_texts, filename)