In [None]:
import pandas as pd

In [None]:
import pandas as pd

# The real and the synthetic CSV currently sit in the same directory.
folder_path_real = "../dataset/"
folder_path_synthetic = "../dataset/"

# Load the real training abstracts and the synthetic augmentation set.
real_df = pd.read_csv(f"{folder_path_real}/medical_tc_train.csv")
synthetic_df = pd.read_csv(f"{folder_path_synthetic}/Simpler_Augmented_Synthetic_Dataset.csv")

# Stack both frames, shuffle every row with a fixed seed, then renumber the index.
combined_df = (
    pd.concat([real_df, synthetic_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Write the shuffled union next to the real data (no index column).
output_path = f"{folder_path_real}/combined_medical_dataset.csv"
combined_df.to_csv(output_path, index=False)

# Report how many rows each source contributed.
print(
    f"Original dataset size: {len(real_df)}",
    f"Synthetic dataset size: {len(synthetic_df)}",
    f"Combined dataset size: {len(combined_df)}",
    sep="\n",
)

# Quick visual check of the shuffled result.
print("\nFirst few rows of combined dataset:")
print(combined_df.head())


Original dataset size: 11550
Synthetic dataset size: 15000
Combined dataset size: 26550

First few rows of combined dataset:
   condition_label                                   medical_abstract
0                1  Extended neck dissection. From the time Crile ...
1                5  Thoracoplasty: current application to the infe...
2                3  Recurrent tension headache in adolescents trea...
3                1  Intraoperative pancreatic fine needle aspirati...
4                1  Presence of identical mitochondrial proteins i...


In [None]:
import pandas as pd
folder_path = "../dataset/"

# Rows available for training: the real training file plus the synthetic file.
training_total = sum(
    len(pd.read_csv(csv_path))
    for csv_path in (
        f"{folder_path}/medical_tc_train.csv",
        f"{folder_path}/Simpler_Augmented_Synthetic_Dataset.csv",
    )
)

# Rows in the held-out test file.
test_total = len(pd.read_csv(f"{folder_path}/medical_tc_test.csv"))

print(f"Total number of rows (Training + Test): {training_total + test_total}")

Total number of rows (Training + Test): 29438


In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import torch

def clean_clinical_text(text):
    """Normalise one text value into a lower-case, single-spaced string.

    The value is first coerced with ``str``. Every character that is not an
    ASCII letter, a digit, whitespace, or one of ``. , ? !`` is deleted (it is
    removed, not replaced by a space, so "non-small" becomes "nonsmall").
    Runs of whitespace are then collapsed to a single space.
    """
    # Drop everything outside the allowed character set.
    kept_chars = re.sub(r'[^a-zA-Z0-9\s\.\,\?\!]', '', str(text))
    # split()/join collapses any whitespace run (tabs, newlines, ...) to one space.
    single_spaced = ' '.join(kept_chars.split())
    return single_spaced.lower().strip()

def preprocess_for_biobert(df, max_length=512):
    """Clean and tokenize the abstracts in ``df`` for BioBERT.

    Args:
        df: DataFrame with a ``medical_abstract`` text column and a numeric
            ``condition_label`` column. The frame is read only; it is not
            modified.
        max_length: Each encoded sequence is padded or truncated to exactly
            this many tokens.

    Returns:
        dict with three tensors, one row/entry per sample in ``df`` order:
            'input_ids'       -- token IDs, ``max_length`` columns
            'attention_masks' -- matching attention mask
            'labels'          -- ``condition_label - 1``
    """
    # Load BioBERT tokenizer. `device_map` is a model-loading option and has
    # no meaning for a tokenizer, so it is not passed here.
    tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

    # Clean into a local list so the caller's DataFrame column is left intact
    # (re-running the cell, or reusing `df` later, sees the raw text).
    # NOTE(review): clean_clinical_text lower-cases the text; confirm that
    # matches the casing this checkpoint's vocabulary expects.
    print("Cleaning medical abstracts...")
    cleaned_texts = df['medical_abstract'].apply(clean_clinical_text).tolist()

    # Prepare lists for encoded data
    input_ids = []
    attention_masks = []

    # Encode each text. Calling the tokenizer directly replaces the deprecated
    # `encode_plus`; the arguments and the returned keys are the same.
    print("Encoding texts with BioBERT tokenizer...")
    for text in tqdm(cleaned_texts):
        encoded = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    # Convert lists to tensors. torch.cat rejects an empty list, so an empty
    # frame yields empty (0, max_length) tensors instead of raising.
    if input_ids:
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
    else:
        input_ids = torch.empty((0, max_length), dtype=torch.long)
        attention_masks = torch.empty((0, max_length), dtype=torch.long)

    # Shift labels down by one (the notebook's data uses labels 1..5 -> 0..4).
    labels = torch.tensor(df['condition_label'].values - 1)

    return {
        'input_ids': input_ids,
        'attention_masks': attention_masks,
        'labels': labels
    }


# Load the combined (real + synthetic) dataset.
# `folder_path` is not defined in this cell; it comes from an earlier cell.
print("Loading data...")
df = pd.read_csv(f"{folder_path}/combined_medical_dataset.csv")

# Summary statistics: row count, number of distinct labels, and per-label
# counts ordered by label value.
print(f"\nTotal samples: {len(df)}")
print(f"Number of unique conditions: {df['condition_label'].nunique()}")
print("\nLabel distribution:")
print(df['condition_label'].value_counts().sort_index())

# Clean + tokenize every abstract; the result is a dict of tensors
# ('input_ids', 'attention_masks', 'labels') used by the following cell.
print("\nPreprocessing for BioBERT...")
preprocessed_data = preprocess_for_biobert(df)

print("\nPreprocessing complete!")
print(f"Input shape: {preprocessed_data['input_ids'].shape}")
print(f"Number of labels: {len(preprocessed_data['labels'])}")

# Sanity check: list the distinct label values after the "- 1" shift applied
# inside preprocess_for_biobert.
print("\nUnique labels in processed data:", torch.unique(preprocessed_data['labels']).tolist())

Loading data...

Total samples: 26550
Number of unique conditions: 5

Label distribution:
condition_label
1    5694
2    2810
3    3621
4    5615
5    8810
Name: count, dtype: int64

Preprocessing for BioBERT...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Cleaning medical abstracts...
Encoding texts with BioBERT tokenizer...


100%|██████████| 26550/26550 [00:30<00:00, 877.47it/s]



Preprocessing complete!
Input shape: torch.Size([26550, 512])
Number of labels: 26550

Unique labels in processed data: [0, 1, 2, 3, 4]


In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import TensorDataset

# Seed PyTorch's global RNG before anything in this cell draws from it, so
# repeated runs of the cell start from the same random state.
torch.manual_seed(42)

# 1. Split the preprocessed data
def create_data_loaders(input_ids, attention_masks, labels, batch_size=16, val_split=0.1, seed=None):
    """Create train and validation dataloaders from aligned tensors.

    Args:
        input_ids: Tensor of token IDs, one row per sample.
        attention_masks: Tensor of attention masks, same first dimension.
        labels: Tensor of labels, same first dimension.
        batch_size: Batch size used by both loaders.
        val_split: Fraction of samples held out for validation, in [0, 1].
            The validation size is ``int(len(dataset) * val_split)``.
        seed: Optional int. When given, a dedicated ``torch.Generator`` seeded
            with it drives both the train/validation split and the training
            shuffle order, so the result does not depend on the state of the
            global RNG (which varies with cell execution order). ``None``
            keeps the previous behaviour and uses the global RNG.

    Returns:
        ``(train_dataloader, val_dataloader)``; the training loader shuffles,
        the validation loader does not.

    Raises:
        ValueError: if ``val_split`` is outside [0, 1]. Without this check a
            negative fraction silently produces overlapping, wrongly sized
            splits.
    """
    if not 0 <= val_split <= 1:
        raise ValueError(f"val_split must be between 0 and 1, got {val_split}")

    # Combine into dataset
    dataset = TensorDataset(input_ids, attention_masks, labels)

    # Calculate lengths for split
    val_len = int(len(dataset) * val_split)
    train_len = len(dataset) - val_len

    # Only pass a generator to random_split when one was requested, so the
    # default (global RNG) code path is exactly what it was before.
    generator = None
    split_kwargs = {}
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)
        split_kwargs['generator'] = generator

    # Split dataset
    train_dataset, val_dataset = random_split(dataset, [train_len, val_len], **split_kwargs)

    # Create dataloaders (generator=None is DataLoader's own default)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        generator=generator
    )

    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    return train_dataloader, val_dataloader

# 2. Set up BioBERT model
def setup_model(num_labels=5):
    """Load the BioBERT checkpoint wrapped for sequence classification.

    Args:
        num_labels: Number of output classes for the classification head.

    Returns:
        The model returned by ``AutoModelForSequenceClassification``, with
        attention and hidden-state outputs switched off.
    """
    checkpoint = 'dmis-lab/biobert-v1.1'
    load_options = dict(
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    return AutoModelForSequenceClassification.from_pretrained(checkpoint, **load_options)

# Build the train/validation loaders from the tensors produced by the
# preprocessing cell (`preprocessed_data` is defined there, not here).
# Default batch_size and val_split of create_data_loaders are used.
train_dataloader, val_dataloader = create_data_loaders(
    preprocessed_data['input_ids'],
    preprocessed_data['attention_masks'],
    preprocessed_data['labels']
)

# Initialize the classification model with the default number of labels.
model = setup_model()


# Prefer the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Report split sizes, the chosen device and the number of batches per loader.
print("\nData Split Information:")
print(f"Training samples: {len(train_dataloader.dataset)}")
print(f"Validation samples: {len(val_dataloader.dataset)}")
print(f"\nModel will run on: {device}")
print(f"Number of batches in training: {len(train_dataloader)}")
print(f"Number of batches in validation: {len(val_dataloader)}")

# Pull one training batch to confirm tensor shapes. Each batch is a tuple in
# TensorDataset order: (input_ids, attention_masks, labels).
sample_batch = next(iter(train_dataloader))
print("\nBatch shape information:")
print(f"Input IDs shape: {sample_batch[0].shape}")
print(f"Attention mask shape: {sample_batch[1].shape}")
print(f"Labels shape: {sample_batch[2].shape}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Data Split Information:
Training samples: 23895
Validation samples: 2655

Model will run on: cuda
Number of batches in training: 1494
Number of batches in validation: 166

Batch shape information:
Input IDs shape: torch.Size([16, 512])
Attention mask shape: torch.Size([16, 512])
Labels shape: torch.Size([16])


In [None]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
import numpy as np

# Load BioBERT model and tokenizer.
# Pick the GPU when available and fall back to CPU otherwise, instead of
# unconditionally requesting 'cuda' (which raises on CPU-only machines).
model_name = 'dmis-lab/biobert-v1.1'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()  # inference only: disables dropout

# Main execution
folder_path = "../dataset/"

# Load the combined dataset; its row order defines the row order of the
# embeddings extracted below.
print("Loading data...")
df = pd.read_csv(f"{folder_path}/combined_medical_dataset.csv")


# Raw (uncleaned) abstracts, one string per row.
texts = df['medical_abstract'].tolist()

# Preprocess and tokenize the dataset
def preprocess_and_tokenize(texts, tokenizer, max_length=512):
    """Encode every text to a fixed-length row of token IDs and its mask.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_length: Each sequence is padded or truncated to this length.

    Returns:
        ``(input_ids, attention_masks)``: the per-text tensors concatenated
        along dimension 0, in the order of ``texts``.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        for text in tqdm(texts, desc="Tokenizing")
    ]
    stacked_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    stacked_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return stacked_ids, stacked_masks

# Tokenize every abstract in `texts` with the default max_length; the two
# results are the inputs to the embedding extraction further down.
input_ids, attention_masks = preprocess_and_tokenize(texts, tokenizer)

# Run the encoder over the tokenized inputs batch by batch
def extract_embeddings(input_ids, attention_masks, model, batch_size=32):
    """Extract embeddings in batches to handle large datasets.

    Args:
        input_ids: Tensor of token IDs, one row per sample.
        attention_masks: Tensor of attention masks aligned with ``input_ids``.
        model: A torch module that accepts ``output_hidden_states=True`` and
            exposes ``hidden_states`` on its output.
        batch_size: Number of samples per forward pass.

    Returns:
        A CPU tensor with one row per sample: the last hidden layer's vector
        at sequence position 0 (the first token of each sequence).

    Raises:
        RuntimeError: from ``torch.cat`` when ``input_ids`` is empty.
    """
    embeddings = []
    # Send each batch to whichever device the model already lives on rather
    # than assuming 'cuda', so the function also works for a CPU model.
    device = next(model.parameters()).device
    model.eval()
    with torch.no_grad():  # inference only; no gradients needed
        for i in tqdm(range(0, len(input_ids), batch_size), desc="Extracting embeddings"):
            batch_input_ids = input_ids[i:i + batch_size].to(device)
            batch_attention_masks = attention_masks[i:i + batch_size].to(device)
            outputs = model(batch_input_ids, attention_mask=batch_attention_masks, output_hidden_states=True)
            # Last layer, position 0 of every sequence.
            cls_embeddings = outputs.hidden_states[-1][:, 0, :]
            # Move results off the device immediately so only one batch of
            # activations is held there at a time.
            embeddings.append(cls_embeddings.cpu())
    return torch.cat(embeddings, dim=0)

# Extract one embedding vector per row of the combined dataset.
# extract_embeddings moves every batch to the CPU before concatenating.
embeddings = extract_embeddings(input_ids, attention_masks, model)

# Convert the tensor to a NumPy array so it can be stored with np.save.
embeddings_np = embeddings.numpy()

# Save embeddings one directory above the notebook; a later cell reloads
# them from this same path.
output_path = '../embeddings.npy'
np.save(output_path, embeddings_np)
print(f"Embeddings saved to {output_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Loading data...


Tokenizing: 100%|██████████| 26550/26550 [00:31<00:00, 844.50it/s]
Extracting embeddings: 100%|██████████| 830/830 [06:07<00:00,  2.26it/s]


Embeddings saved to /content/drive/My Drive/embeddings.npy


In [None]:
# Build embeddings for the held-out test set with the same tokenizer, model
# and helper functions defined in the previous cell (`folder_path`,
# `tokenizer`, `model`, `preprocess_and_tokenize` and `extract_embeddings`
# are not defined here).
test_df = pd.read_csv(f"{folder_path}/medical_tc_test.csv")
# Note: this rebinds the notebook-level `texts`, replacing the training texts.
texts = test_df['medical_abstract'].tolist()
test_input_ids, test_attention_masks = preprocess_and_tokenize(texts, tokenizer)
test_embeddings = extract_embeddings(test_input_ids, test_attention_masks, model)
test_embeddings_np = test_embeddings.numpy()
# Stored inside the dataset directory.
np.save('../dataset/test_embeddings.npy', test_embeddings_np)

Tokenizing: 100%|██████████| 2888/2888 [00:03<00:00, 894.41it/s]
Extracting embeddings: 100%|██████████| 91/91 [00:39<00:00,  2.30it/s]


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load the training embeddings and derive zero-based labels.
# `df` and `test_df` are the frames loaded by the embedding cells above;
# their row order is the row order of the saved embeddings.
embeddings_path = '../embeddings.npy'
embeddings = np.load(embeddings_path)


train_labels = df['condition_label'].values - 1

# Load the test embeddings from the location the test-embedding cell wrote
# them to with np.save ('../dataset/test_embeddings.npy'). The previous path
# ('../test_embeddings.npy') pointed at a file this notebook never creates.
test_embeddings_path = '../dataset/test_embeddings.npy'
test_embeddings = np.load(test_embeddings_path)
test_labels = test_df['condition_label'].values - 1

# Guard against stale .npy files or a re-loaded frame: every embedding row
# must have exactly one label.
assert len(embeddings) == len(train_labels), (
    f"{len(embeddings)} training embeddings vs {len(train_labels)} labels"
)
assert len(test_embeddings) == len(test_labels), (
    f"{len(test_embeddings)} test embeddings vs {len(test_labels)} labels"
)


X_train = embeddings
y_train = train_labels
X_test = test_embeddings
y_test = test_labels

# Logistic Regression
# NOTE(review): the recorded run hit the max_iter limit (ConvergenceWarning);
# consider scaling the features or raising max_iter before trusting the score.
print("Training Logistic Regression...")
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
print("\nLogistic Regression Results:")
print(classification_report(y_test, lr_preds))
print("Accuracy:", accuracy_score(y_test, lr_preds))

# Random Forest (fixed random_state for repeatable trees)
print("\nTraining Random Forest...")
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
print("\nRandom Forest Results:")
print(classification_report(y_test, rf_preds))
print("Accuracy:", accuracy_score(y_test, rf_preds))

# Support Vector Machine with a linear kernel
print("\nTraining Support Vector Machine...")
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)
print("\nSVM Results:")
print(classification_report(y_test, svm_preds))
print("Accuracy:", accuracy_score(y_test, svm_preds))


Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.68      0.72      0.70       633
           1       0.49      0.45      0.47       299
           2       0.51      0.47      0.49       385
           3       0.62      0.64      0.63       610
           4       0.45      0.45      0.45       961

    accuracy                           0.55      2888
   macro avg       0.55      0.55      0.55      2888
weighted avg       0.55      0.55      0.55      2888

Accuracy: 0.5526315789473685

Training Random Forest...

Random Forest Results:
              precision    recall  f1-score   support

           0       0.59      0.55      0.57       633
           1       0.07      0.04      0.05       299
           2       0.26      0.14      0.18       385
           3       0.51      0.50      0.50       610
           4       0.31      0.43      0.36       961

    accuracy                           0.39      2888
   macro avg       0.