In [None]:
import pandas as pd


# Source directories for the real and the synthetic CSVs. They currently point
# at the same place but are kept as two names so either can move independently.
folder_path_real = "../dataset/"
folder_path_synthetic = "../dataset/"

# Load both training sources.
real_df = pd.read_csv(f"{folder_path_real}/medical_tc_train.csv")
synthetic_df = pd.read_csv(f"{folder_path_synthetic}/Simpler_Augmented_Synthetic_Dataset.csv")

# Stack the two frames, then shuffle every row with a fixed seed and renumber
# the index so real and synthetic samples are interleaved reproducibly.
combined_df = (
    pd.concat([real_df, synthetic_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the merged training set next to the real data.
output_path = f"{folder_path_real}/combined_medical_dataset.csv"
combined_df.to_csv(output_path, index=False)

# Report how many rows each source contributed.
print(
    f"Original dataset size: {len(real_df)}",
    f"Synthetic dataset size: {len(synthetic_df)}",
    f"Combined dataset size: {len(combined_df)}",
    sep="\n",
)

# Quick visual sanity check of the shuffled result.
print("\nFirst few rows of combined dataset:", combined_df.head(), sep="\n")


Original dataset size: 11550
Synthetic dataset size: 15000
Combined dataset size: 26550

First few rows of combined dataset:
   condition_label                                   medical_abstract
0                1  Extended neck dissection. From the time Crile ...
1                5  Thoracoplasty: current application to the infe...
2                3  Recurrent tension headache in adolescents trea...
3                1  Intraoperative pancreatic fine needle aspirati...
4                1  Presence of identical mitochondrial proteins i...


In [None]:
import pandas as pd
folder_path = "../dataset/"

# Training rows = real training CSV + synthetic augmentation CSV.
training_total = sum(
    len(pd.read_csv(csv_path))
    for csv_path in (
        f"{folder_path}/medical_tc_train.csv",
        f"{folder_path}/Simpler_Augmented_Synthetic_Dataset.csv",
    )
)
# Held-out test rows.
test_total = len(pd.read_csv(f"{folder_path}/medical_tc_test.csv"))

print(f"Total number of rows (Training + Test): {training_total + test_total}")

Total number of rows (Training + Test): 29438


In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import torch

def clean_clinical_text(text):
    """Normalise one clinical abstract before tokenisation.

    Steps, in order: coerce to ``str``, drop every character that is not an
    ASCII letter, digit, whitespace or one of ``. , ? !``, collapse runs of
    whitespace to a single space, lower-case, and strip the ends.

    Args:
        text: The raw abstract. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN, which is what pandas
            produces for empty CSV cells) are treated as empty text.

    Returns:
        The cleaned, lower-cased string; ``""`` for missing input.
    """
    # Without this guard str(None) / str(nan) would become the literal words
    # "none" / "nan" and be fed to the model as if they were real abstracts.
    # `text != text` is the NaN test (NaN is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)
    # Remove special characters but keep sentence punctuation. Note that the
    # characters are deleted, not replaced by a space ("COVID-19" -> "COVID19").
    text = re.sub(r'[^a-zA-Z0-9\s\.\,\?\!]', '', text)
    # Collapse any whitespace run (including newlines/tabs) to one space.
    text = ' '.join(text.split())
    return text.lower().strip()

def preprocess_for_biobert(df, max_length=512, tokenizer=None):
    """Clean and tokenise the abstracts in ``df`` for BioBERT fine-tuning.

    The ``medical_abstract`` column is cleaned with ``clean_clinical_text`` and
    every abstract is encoded to exactly ``max_length`` token ids (padded or
    truncated). ``condition_label`` values are shifted down by one so that the
    labels read from the CSV become zero-based class indices.

    The input frame is NOT modified: cleaning is applied to a separate Series,
    so the caller's ``df`` still holds the raw abstracts afterwards.

    Args:
        df: DataFrame with a ``medical_abstract`` text column and a numeric
            ``condition_label`` column.
        max_length: Fixed sequence length of every encoded row.
        tokenizer: Optional ready-made tokenizer. When omitted, the
            ``dmis-lab/biobert-v1.1`` tokenizer is loaded, so repeated calls
            can share one instance by passing it in.

    Returns:
        dict with
        ``'input_ids'``       - tensor of shape ``(len(df), max_length)``,
        ``'attention_masks'`` - tensor of the same shape,
        ``'labels'``          - 1-D tensor of ``condition_label - 1``.
    """
    if tokenizer is None:
        # No device_map here: device placement is a model-loading option and
        # has no meaning for a tokenizer.
        tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

    # Clean into a new Series instead of writing back into the caller's frame.
    print("Cleaning medical abstracts...")
    cleaned_abstracts = df['medical_abstract'].apply(clean_clinical_text)

    # Per-sample encodings; each entry has shape (1, max_length).
    input_ids = []
    attention_masks = []

    print("Encoding texts with BioBERT tokenizer...")
    for text in tqdm(cleaned_abstracts):
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated encode_plus() and returns the same fields.
        encoded = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    if input_ids:
        # Stack the (1, max_length) rows into (n_samples, max_length).
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
    else:
        # torch.cat([]) raises; return correctly shaped empty tensors instead.
        input_ids = torch.empty((0, max_length), dtype=torch.long)
        attention_masks = torch.empty((0, max_length), dtype=torch.long)

    # Shift labels down by one to get zero-based class indices.
    labels = torch.tensor(df['condition_label'].values - 1)
    return {
        'input_ids': input_ids,
        'attention_masks': attention_masks,
        'labels': labels
    }



# `folder_path` is the dataset directory defined in an earlier cell.
print("Loading data...")
df = pd.read_csv(f"{folder_path}/combined_medical_dataset.csv")

# Dataset overview: size, number of classes and per-class counts.
print(
    f"\nTotal samples: {len(df)}",
    f"Number of unique conditions: {df['condition_label'].nunique()}",
    "\nLabel distribution:",
    df['condition_label'].value_counts().sort_index(),
    sep="\n",
)

# Clean + tokenise every abstract into fixed-length tensors.
print("\nPreprocessing for BioBERT...")
preprocessed_data = preprocess_for_biobert(df)

# Confirm tensor shape and that labels were shifted to zero-based ids.
print(
    "\nPreprocessing complete!",
    f"Input shape: {preprocessed_data['input_ids'].shape}",
    f"Number of labels: {len(preprocessed_data['labels'])}",
    sep="\n",
)
print("\nUnique labels in processed data:", torch.unique(preprocessed_data['labels']).tolist())

Loading data...

Total samples: 26550
Number of unique conditions: 5

Label distribution:
condition_label
1    5694
2    2810
3    3621
4    5615
5    8810
Name: count, dtype: int64

Preprocessing for BioBERT...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Cleaning medical abstracts...
Encoding texts with BioBERT tokenizer...


100%|██████████| 26550/26550 [00:30<00:00, 877.47it/s]



Preprocessing complete!
Input shape: torch.Size([26550, 512])
Number of labels: 26550

Unique labels in processed data: [0, 1, 2, 3, 4]


In [None]:
import torch
from torch.utils.data import DataLoader, random_split, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.optim import SGD, AdamW
import numpy as np
import time

# Seed PyTorch's global RNG so repeated runs start from the same random state.
# create_data_loaders() below calls random_split without an explicit generator,
# so the train/validation split depends on this seed being set first.
torch.manual_seed(42)

def create_data_loaders(input_ids, attention_masks, labels, batch_size=32, val_split=0.1):
    """Build a shuffled training loader and an ordered validation loader.

    The three tensors are zipped into one ``TensorDataset`` and randomly
    partitioned; ``int(len(dataset) * val_split)`` samples go to validation
    and the remainder to training. The partition is drawn from PyTorch's
    global RNG.

    Args:
        input_ids: Token-id tensor, first dimension = number of samples.
        attention_masks: Mask tensor aligned with ``input_ids``.
        labels: Label tensor aligned with ``input_ids``.
        batch_size: Batch size used by both loaders.
        val_split: Fraction of samples reserved for validation.

    Returns:
        ``(train_dataloader, val_dataloader)``; each batch is the tuple
        ``(input_ids, attention_mask, labels)``.
    """
    full_dataset = TensorDataset(input_ids, attention_masks, labels)

    # Validation size is floored; training receives whatever is left over.
    n_total = len(full_dataset)
    n_val = int(n_total * val_split)
    train_subset, val_subset = random_split(full_dataset, [n_total - n_val, n_val])

    # Only the training loader reshuffles; validation order stays fixed.
    return (
        DataLoader(train_subset, batch_size=batch_size, shuffle=True),
        DataLoader(val_subset, batch_size=batch_size, shuffle=False),
    )

def setup_model(num_labels=5):
    """Load BioBERT with a sequence-classification head of ``num_labels`` classes.

    Attention maps and per-layer hidden states are switched off in the
    model's default outputs.

    Args:
        num_labels: Number of target classes for the classification head.

    Returns:
        The ``AutoModelForSequenceClassification`` instance loaded from the
        ``dmis-lab/biobert-v1.1`` checkpoint.
    """
    classifier_options = dict(
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    return AutoModelForSequenceClassification.from_pretrained(
        'dmis-lab/biobert-v1.1',
        **classifier_options
    )

def train_model(model, train_dataloader, val_dataloader, epochs=5, warmup_steps=1000,
                save_path='/content/drive/My Drive/best_biobert_medical_classifier.pt'):
    """Fine-tune ``model`` and checkpoint the epoch with the best validation accuracy.

    Every epoch performs one optimisation pass over ``train_dataloader``
    (AdamW, linear warm-up/decay learning-rate schedule, gradient-norm
    clipping at 1.0) followed by an evaluation pass over ``val_dataloader``.
    Whenever validation accuracy improves, the model's ``state_dict`` is
    written to ``save_path``.

    Args:
        model: Classification model, called as
            ``model(input_ids, attention_mask=..., labels=...)`` and expected
            to return an object exposing ``.loss`` and ``.logits``. It must
            already sit on the device it should train on; batches are moved
            to that same device.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        val_dataloader: Same batch layout; used for evaluation only.
        epochs: Number of passes over ``train_dataloader``.
        warmup_steps: Number of scheduler warm-up steps (default 1000).
        save_path: Destination of the best ``state_dict``.

    Returns:
        ``(model, training_stats, best_accuracy)`` where ``training_stats`` is
        a list with one dict per epoch (keys ``'epoch'``, ``'Training Loss'``,
        ``'Valid. Loss'``, ``'Valid. Accur.'``). The returned ``model`` holds
        the weights of the LAST epoch; the best-epoch weights are only in
        the file at ``save_path``.
    """
    # Use the device the model already lives on rather than a notebook-global
    # `device`, which may be undefined or stale when cells are re-run.
    device = next(model.parameters()).device

    # Set up optimizer
    optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.0001)

    # Total optimiser steps across all epochs (needed by the scheduler).
    total_steps = len(train_dataloader) * epochs
    if total_steps and warmup_steps >= total_steps:
        # With so few steps the schedule never leaves the warm-up ramp.
        print(f"  Warning: warmup_steps ({warmup_steps}) >= total training steps "
              f"({total_steps}); the learning rate will never reach its peak.")

    # Linear warm-up followed by linear decay to zero.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Initialize tracking variables
    best_accuracy = 0
    training_stats = []

    # Training loop
    for epoch in range(epochs):
        print(f'\n======== Epoch {epoch + 1} / {epochs} ========')
        print('Training...')

        # Reset tracking variables
        total_train_loss = 0
        model.train()

        # Training
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches (skipping batch 0).
            if step % 40 == 0 and not step == 0:
                print(f'  Batch {step:>5,}  of  {len(train_dataloader):>5,}')

            # Unpack batch and copy to the model's device
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Zero gradients
            model.zero_grad()

            # Forward pass; passing labels makes the model return the loss.
            outputs = model(b_input_ids,
                          attention_mask=b_input_mask,
                          labels=b_labels)

            loss = outputs.loss
            total_train_loss += loss.item()

            # Backward pass
            loss.backward()

            # Clip gradients to a max norm of 1.0
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters, then advance the learning-rate schedule.
            optimizer.step()
            scheduler.step()

        # Mean of the per-batch losses; max(..., 1) avoids dividing by zero
        # when the training loader is empty.
        avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
        print(f"\n  Average training loss: {avg_train_loss:.2f}")

        # Validation
        print("\nRunning Validation...")
        model.eval()

        # Accuracy is accumulated per example so that a smaller final batch
        # is not over-weighted, as it would be by a mean of per-batch means.
        total_correct = 0
        total_examples = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        for batch in val_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                outputs = model(b_input_ids,
                              attention_mask=b_input_mask,
                              labels=b_labels)
            predictions = torch.argmax(outputs.logits, dim=1)

            total_correct += (predictions == b_labels).sum().item()
            total_examples += b_labels.size(0)
            total_eval_loss += outputs.loss.item()
            nb_eval_steps += 1

        # Guard both averages against an empty validation loader.
        avg_val_accuracy = total_correct / total_examples if total_examples else 0.0
        avg_val_loss = total_eval_loss / nb_eval_steps if nb_eval_steps else 0.0

        print(f"  Validation Loss: {avg_val_loss:.2f}")
        print(f"  Validation Accuracy: {avg_val_accuracy:.2f}")

        # Keep only the weights of the best-scoring epoch on disk.
        if avg_val_accuracy > best_accuracy:
            best_accuracy = avg_val_accuracy
            torch.save(model.state_dict(), save_path)
            print(f"  New best model saved! Accuracy: {best_accuracy:.2f}")

        training_stats.append({
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
        })

    print("\nTraining complete!")
    return model, training_stats, best_accuracy

# Split the tokenised tensors into a shuffled-train and an ordered-validation loader.
train_dataloader, val_dataloader = create_data_loaders(
    input_ids=preprocessed_data['input_ids'],
    attention_masks=preprocessed_data['attention_masks'],
    labels=preprocessed_data['labels'],
)

# Prefer the GPU when one is present, then build the classifier on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = setup_model().to(device)

# Summarise the split and the run configuration.
print(
    "\nData Split Information:",
    f"Training samples: {len(train_dataloader.dataset)}",
    f"Validation samples: {len(val_dataloader.dataset)}",
    f"\nModel will run on: {device}",
    f"Number of batches in training: {len(train_dataloader)}",
    f"Number of batches in validation: {len(val_dataloader)}",
    sep="\n",
)

# Pull one batch to confirm the tensor shapes the model will receive.
sample_batch = next(iter(train_dataloader))
print(
    "\nBatch shape information:",
    f"Input IDs shape: {sample_batch[0].shape}",
    f"Attention mask shape: {sample_batch[1].shape}",
    f"Labels shape: {sample_batch[2].shape}",
    sep="\n",
)

print("\nStarting training...")
model, training_stats, best_accuracy = train_model(model, train_dataloader, val_dataloader)

# Per-epoch recap of the losses and validation accuracy.
print("\nFinal Training Stats:")
for stat in training_stats:
    print(
        f"Epoch {stat['epoch']}:",
        f"  Training Loss: {stat['Training Loss']:.3f}",
        f"  Validation Loss: {stat['Valid. Loss']:.3f}",
        f"  Validation Accuracy: {stat['Valid. Accur.']:.3f}",
        sep="\n",
    )

print(f"\nBest validation accuracy: {best_accuracy:.3f}")

In [None]:
# Start training
print("\nStarting training...")
# Fit on the training split and validate on the held-out split. Using
# val_dataloader for both arguments would train on the validation data, which
# makes the reported validation accuracy (and the "best" checkpoint chosen
# from it) meaningless.
model, training_stats, best_accuracy = train_model(model, train_dataloader, val_dataloader)

# Print final statistics
print("\nFinal Training Stats:")
for stat in training_stats:
    print(f"Epoch {stat['epoch']}:")
    print(f"  Training Loss: {stat['Training Loss']:.3f}")
    print(f"  Validation Loss: {stat['Valid. Loss']:.3f}")
    print(f"  Validation Accuracy: {stat['Valid. Accur.']:.3f}")

print(f"\nBest validation accuracy: {best_accuracy:.3f}")

In [None]:
# Evaluate the current in-memory `model` on the real (non-synthetic) test split.
real_test_df = pd.read_csv(f"{folder_path}/medical_tc_test.csv")
real_test_preprocessed = preprocess_for_biobert(real_test_df)
real_test_dataloader = DataLoader(
    TensorDataset(real_test_preprocessed['input_ids'], real_test_preprocessed['attention_masks'], real_test_preprocessed['labels']),
    batch_size=16,
    shuffle=False
)

model.eval()
# Count correct predictions per example. Averaging per-batch accuracies would
# give a short final batch the same weight as a full one and skew the result.
correct_predictions = 0
total_examples = 0
with torch.no_grad():
    for batch in real_test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Labels are not passed: only the logits are needed here, not the loss.
        logits = model(b_input_ids, attention_mask=b_input_mask).logits
        predictions = torch.argmax(logits, dim=1)
        correct_predictions += (predictions == b_labels).sum().item()
        total_examples += b_labels.size(0)

# Computed once after the loop; defined (as 0.0) even if the loader is empty.
avg_test_accuracy = correct_predictions / total_examples if total_examples else 0.0
print(f"Average Test Accuracy: {avg_test_accuracy:.3f}")


Cleaning medical abstracts...
Encoding texts with BioBERT tokenizer...


100%|██████████| 2888/2888 [00:03<00:00, 897.01it/s]


Average Test Accuracy: 0.546


In [None]:
# Export the in-memory `model` with save_pretrained.
# NOTE(review): the target directory is named "best_...", but this writes
# whatever weights `model` holds right now. train_model() stores its
# best-epoch state_dict in a separate .pt file and nothing visible reloads it
# - confirm this is the checkpoint that should be exported.
model.save_pretrained('../dataset/best_biobert_medical_classifier')

In [None]:
from transformers import AutoModel

# Load the base BioBERT encoder and place it on the GPU when one is available,
# falling back to CPU so the cell does not crash on CPU-only runtimes.
biobert_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
biobert_model = AutoModel.from_pretrained('dmis-lab/biobert-v1.1').to(biobert_device)
# Report the device that was actually used.
print("BioBERT model loaded on GPU!" if biobert_device.type == "cuda" else "BioBERT model loaded on CPU!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


BioBERT model loaded on GPU!


In [None]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
import numpy as np

# Load the BioBERT tokenizer. (The closing quote on the checkpoint name was
# missing, which made this whole cell a SyntaxError.)
model_name = 'dmis-lab/biobert-v1.1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# No model is loaded in this cell: `model` is the object left in the kernel by
# the earlier training cells; it is only switched to evaluation mode here.
model.eval()

# Main execution
folder_path = "../dataset"

# Load data
print("Loading data...")
df = pd.read_csv(f"{folder_path}/combined_medical_dataset.csv")

# Raw abstracts, in the same row order as `df`.
texts = df['medical_abstract'].tolist()

# Preprocess and tokenize the dataset
def preprocess_and_tokenize(texts, tokenizer, max_length=512):
    """Tokenize all texts and return input IDs and attention masks.

    Every text is encoded on its own, with special tokens added, and is padded
    or truncated to exactly ``max_length`` tokens.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Hugging Face tokenizer; it is called directly on each text.
        max_length: Fixed length of every encoded sequence.

    Returns:
        ``(input_ids, attention_masks)`` - two tensors of shape
        ``(number_of_texts, max_length)``. For empty ``texts`` both are empty
        ``(0, max_length)`` tensors.
    """
    input_ids = []
    attention_masks = []
    for text in tqdm(texts, desc="Tokenizing"):
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated encode_plus() and returns the same fields.
        encoded = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Each encoding has shape (1, max_length).
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    if not input_ids:
        # torch.cat([]) raises; hand back correctly shaped empty tensors.
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Tokenize every abstract in `texts`; both results have one row per text,
# padded/truncated to preprocess_and_tokenize's default max_length of 512.
input_ids, attention_masks = preprocess_and_tokenize(texts, tokenizer)

def extract_embeddings(input_ids, attention_masks, model, batch_size=32):
    """Extract embeddings in batches to handle large datasets.

    For each batch the model is run with ``output_hidden_states=True`` and the
    vector at sequence position 0 of the last hidden state is kept as that
    sample's embedding.

    Args:
        input_ids: Token-id tensor, one row per sample.
        attention_masks: Mask tensor aligned with ``input_ids``.
        model: Model whose output exposes ``hidden_states`` when called with
            ``output_hidden_states=True``. Batches are moved to whichever
            device the model's parameters are on.
        batch_size: Number of samples per forward pass.

    Returns:
        CPU tensor with one embedding row per input sample.
    """
    embeddings = []
    model.eval()  # Set model to evaluation mode
    # Follow the model's own device instead of assuming CUDA: a hard-coded
    # 'cuda' fails on CPU-only machines and mismatches a model kept on CPU.
    device = next(model.parameters()).device
    with torch.no_grad():
        for i in tqdm(range(0, len(input_ids), batch_size), desc="Extracting embeddings"):
            batch_input_ids = input_ids[i:i + batch_size].to(device)
            batch_attention_masks = attention_masks[i:i + batch_size].to(device)
            outputs = model(batch_input_ids, attention_mask=batch_attention_masks, output_hidden_states=True)
            # Last hidden state, first token position, for every sample in the batch.
            cls_embeddings = outputs.hidden_states[-1][:, 0, :]
            # Move to CPU right away so results do not pile up on the accelerator.
            embeddings.append(cls_embeddings.cpu())
    return torch.cat(embeddings, dim=0)

# Destination of the training-set embedding matrix.
output_path = '../embeddings.npy'

# One embedding row per abstract of the combined dataset, converted to NumPy.
embeddings = extract_embeddings(
    input_ids=input_ids,
    attention_masks=attention_masks,
    model=model,
)
embeddings_np = embeddings.numpy()

np.save(file=output_path, arr=embeddings_np)
print(f"Embeddings saved to {output_path}")


Loading data...


Tokenizing: 100%|██████████| 26550/26550 [00:31<00:00, 851.44it/s]
Extracting embeddings: 100%|██████████| 830/830 [06:24<00:00,  2.16it/s]


Embeddings saved to /content/drive/My Drive/embeddings.npy


In [None]:
# Same tokenise -> embed -> save pipeline, applied to the held-out test split.
test_df = pd.read_csv(f"{folder_path}/medical_tc_test.csv")
texts = test_df['medical_abstract'].to_list()

test_input_ids, test_attention_masks = preprocess_and_tokenize(texts, tokenizer)
test_embeddings = extract_embeddings(
    input_ids=test_input_ids,
    attention_masks=test_attention_masks,
    model=model,
)

test_embeddings_np = test_embeddings.numpy()
np.save(file='../test_embeddings.npy', arr=test_embeddings_np)

Tokenizing: 100%|██████████| 2888/2888 [00:03<00:00, 840.77it/s]
Extracting embeddings: 100%|██████████| 91/91 [00:42<00:00,  2.15it/s]


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


# Locations of the embedding matrices written by the earlier cells.
embeddings_path = '../embeddings.npy'
test_embeddings_path = '../test_embeddings.npy'

embeddings = np.load(embeddings_path)
test_embeddings = np.load(test_embeddings_path)

# Zero-based labels taken from the frames loaded in earlier cells (`df` for
# the combined training data, `test_df` for the test split).
train_labels = df['condition_label'].values - 1
test_labels = test_df['condition_label'].values - 1

X_train, y_train = embeddings, train_labels
X_test, y_test = test_embeddings, test_labels


def _fit_and_report(classifier, training_banner, results_banner):
    """Fit ``classifier`` on the training embeddings and print test metrics.

    Prints ``training_banner``, fits on ``X_train``/``y_train``, predicts
    ``X_test``, then prints ``results_banner``, the classification report and
    the accuracy. Returns the test-set predictions.
    """
    print(training_banner)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    print(results_banner)
    print(classification_report(y_test, predictions))
    print("Accuracy:", accuracy_score(y_test, predictions))
    return predictions


# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr_preds = _fit_and_report(lr, "Training Logistic Regression...", "\nLogistic Regression Results:")

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_preds = _fit_and_report(rf, "\nTraining Random Forest...", "\nRandom Forest Results:")

# Support Vector Machine
svm = SVC(kernel='linear')
svm_preds = _fit_and_report(svm, "\nTraining Support Vector Machine...", "\nSVM Results:")


Training Logistic Regression...

Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.66      0.65      0.66       633
           1       0.42      0.46      0.44       299
           2       0.49      0.47      0.48       385
           3       0.60      0.61      0.61       610
           4       0.42      0.42      0.42       961

    accuracy                           0.52      2888
   macro avg       0.52      0.52      0.52      2888
weighted avg       0.52      0.52      0.52      2888

Accuracy: 0.521814404432133

Training Random Forest...

Random Forest Results:
              precision    recall  f1-score   support

           0       0.66      0.66      0.66       633
           1       0.43      0.45      0.44       299
           2       0.50      0.45      0.48       385
           3       0.60      0.61      0.61       610
           4       0.42      0.42      0.42       961

    accuracy                           0.52 