In [None]:
# MID Dataset NLP Project (XLSX) – RESTART-SAFE Colab Notebook
# End-to-End Pipeline: EDA, Classification, Summarization
# =====================================================

# --------------------
# 0. Restart-Safe Setup
# --------------------
# This notebook is designed to work even after runtime resets

import os
import random
import numpy as np
import torch

# One seed shared by every random number generator used in this notebook
# (Python's `random`, NumPy, and PyTorch) so reruns start from the same state.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# cuDNN flags: turn benchmark mode off and turn the deterministic flag on.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# --------------------
# 1. Install Dependencies
# --------------------
# NOTE: the leading "!" is IPython/Colab shell-escape syntax, so this line only
# runs inside a notebook cell and is not valid in a plain .py script.
# numpy and torch are already imported above, so they must be present in the
# runtime before this cell executes; the install covers the remaining packages.
!pip install -q pandas numpy scikit-learn matplotlib nltk torch transformers datasets openpyxl sentencepiece

# --------------------
# 2. Imports
# --------------------
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments
)

from datasets import Dataset
import nltk
# Fetch NLTK's 'punkt' resource.
# NOTE(review): nothing else in the visible part of this notebook calls nltk —
# confirm the download is still needed.
nltk.download('punkt')

# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print('Using device:', DEVICE)

# --------------------
# 3. Load XLSX Dataset
# --------------------
DATA_PATH = '/content/MID.xlsx'  # upload file to Colab

# Validate with a real exception instead of `assert`: assertions are stripped
# when Python runs with -O, which would let a missing upload surface later as
# a less obvious error from the Excel reader.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f'MID.xlsx not found (expected at {DATA_PATH})')

df = pd.read_excel(DATA_PATH)
print(df.head())
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line to the output).
df.info()

# --------------------
# 4. Column Configuration (EDIT IF NEEDED)
# --------------------
TEXT_COLUMN = 'description'
LABEL_COLUMN = 'therapeutic_class'

# Report exactly which configured column is missing (and what is available)
# instead of a bare, message-less AssertionError that -O would also strip.
_missing_columns = [c for c in (TEXT_COLUMN, LABEL_COLUMN) if c not in df.columns]
if _missing_columns:
    raise KeyError(
        f'Column(s) {_missing_columns} not found in dataset; '
        f'available columns: {list(df.columns)}'
    )

# Keep only the two columns the pipeline uses and drop rows missing either.
df = df[[TEXT_COLUMN, LABEL_COLUMN]].dropna().reset_index(drop=True)

# Spreadsheet cells may be read as numbers or dates; the TF-IDF vectorizer and
# the tokenizer both need plain strings. Cast after dropna() so missing values
# are removed rather than turned into the literal text 'nan'.
df[TEXT_COLUMN] = df[TEXT_COLUMN].astype(str)

# --------------------
# 5. Label Encoding (SAFE)
# --------------------
# Turn each class name in LABEL_COLUMN into an integer id stored in df['label'].
# The fitted encoder is kept so ids can be mapped back to names later.
label_encoder = LabelEncoder()
label_encoder.fit(df[LABEL_COLUMN])
df['label'] = label_encoder.transform(df[LABEL_COLUMN])

NUM_LABELS = df['label'].nunique()
print('Number of classes:', NUM_LABELS)

# --------------------
# 6. Train / Test Split
# --------------------
# Hold out 20% of the rows for testing; stratify on the encoded label and
# reuse SEED so the split is reproducible.
_texts = df[TEXT_COLUMN]
_labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    _texts,
    _labels,
    test_size=0.2,
    stratify=_labels,
    random_state=SEED,
)

# --------------------
# 7. Exploratory Analysis
# --------------------
# Bar chart of how many rows fall into each therapeutic class.
class_counts = df[LABEL_COLUMN].value_counts()
plt.figure(figsize=(10, 4))
class_counts.plot.bar()
plt.title('Therapeutic Class Distribution')
plt.show()

# --------------------
# 8. Baseline Model (TF-IDF + Logistic Regression)
# --------------------
# TF-IDF features over unigrams and bigrams, English stop words removed,
# vocabulary capped at 50,000 terms.
vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,2),
    stop_words='english'
)

# Fit the vocabulary on the training texts only, then reuse it for the test set.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

baseline_model = LogisticRegression(max_iter=2000)
baseline_model.fit(X_train_tfidf, y_train)

y_pred_base = baseline_model.predict(X_test_tfidf)

print('Baseline Accuracy:', accuracy_score(y_test, y_pred_base))

# Pass the full list of label ids explicitly. Without `labels`, the report
# infers the classes from y_test and the predictions; if any class is absent
# from both, the inferred count no longer matches `target_names` and
# classification_report raises ValueError. zero_division=0 reports 0 for
# classes with no samples or no predictions without emitting a warning each.
_all_label_ids = np.arange(len(label_encoder.classes_))
print(classification_report(
    y_test,
    y_pred_base,
    labels=_all_label_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))

# --------------------
# 9. Deep Learning Model (BioBERT – SAFE)
# --------------------
# Checkpoint used for both the tokenizer and the sequence classifier.
MODEL_NAME = 'dmis-lab/biobert-base-cased-v1.1'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the checkpoint with a classification head sized to the number of
# classes, then move it onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
model = model.to(DEVICE)

# Tokenization function

def tokenize(batch):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.

    Args:
        batch: Mapping with a 'text' key holding the texts to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length', max_length=256)
    return encoded

# Wrap each split in a `datasets.Dataset` and tokenize it in batches.
train_ds = Dataset.from_dict(
    {'text': X_train.tolist(), 'label': y_train.tolist()}
).map(tokenize, batched=True)
test_ds = Dataset.from_dict(
    {'text': X_test.tolist(), 'label': y_test.tolist()}
).map(tokenize, batched=True)

# Expose only the model inputs and the label, formatted as torch tensors.
for _split in (train_ds, test_ds):
    _split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

import inspect

# The install cell does not pin a transformers version, and two keyword names
# used here were renamed across releases:
#   * TrainingArguments: `evaluation_strategy` -> `eval_strategy`
#   * Trainer:           `tokenizer`           -> `processing_class`
# Pick whichever name the installed version accepts so the cell runs on both
# older and newer releases instead of failing with a TypeError.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_arg = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_arg = (
    'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'
)

# Evaluate and checkpoint once per epoch; three epochs at batch size 8.
training_args = TrainingArguments(
    output_dir='./results',
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=200,
    seed=SEED,
    report_to='none',
    **{_eval_strategy_arg: 'epoch'},
)

# NOTE: the held-out test split doubles as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    **{_tokenizer_arg: tokenizer},
)

trainer.train()

# --------------------
# 10. Deep Model Evaluation
# --------------------
# Predict on the test split and take the highest-scoring class for each row.
preds = trainer.predict(test_ds)
y_pred_dl = np.argmax(preds.predictions, axis=1)

print('Deep Model Accuracy:', accuracy_score(y_test, y_pred_dl))

# Pass the full list of label ids explicitly. Without `labels`, the report
# infers the classes from y_test and the predictions; if any class is absent
# from both, the inferred count no longer matches `target_names` and
# classification_report raises ValueError. zero_division=0 reports 0 for
# classes with no samples or no predictions without emitting a warning each.
_all_label_ids = np.arange(len(label_encoder.classes_))
print(classification_report(
    y_test,
    y_pred_dl,
    labels=_all_label_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))

# --------------------
# 11. Text Summarization (BART)
# --------------------
# Checkpoint used for the summarization tokenizer and seq2seq model.
SUM_MODEL = 'facebook/bart-large-cnn'

sum_tokenizer = AutoTokenizer.from_pretrained(SUM_MODEL)

# Load the seq2seq weights, then move the model onto the selected device.
sum_model = AutoModelForSeq2SeqLM.from_pretrained(SUM_MODEL)
sum_model = sum_model.to(DEVICE)

def summarize_text(text):
    """Generate a summary of *text* with the module-level summarization model.

    The input is truncated to 1024 tokens. Generation uses beam search with
    4 beams, a length penalty of 2.0, and an output length between 40 and
    120 tokens.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Place the inputs on whatever device the model actually lives on, so the
    # call keeps working if the model is moved after it was loaded.
    inputs = sum_tokenizer(
        text, return_tensors='pt', truncation=True, max_length=1024
    ).to(sum_model.device)
    summary_ids = sum_model.generate(
        inputs['input_ids'],
        # Hand over the tokenizer's attention mask explicitly rather than
        # leaving generate() to infer one from the input ids.
        attention_mask=inputs['attention_mask'],
        max_length=120,
        min_length=40,
        num_beams=4,
        length_penalty=2.0
    )
    return sum_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Use the first row's text as a quick smoke test of the summarizer.
sample_text = df[TEXT_COLUMN].iloc[0]
_example_summary = summarize_text(sample_text)
print('Summary example:\n', _example_summary)

# --------------------
# 12. Inference Function (SAFE)
# --------------------
def predict_therapeutic_class(text):
    """Predict the therapeutic class name for a single text.

    Uses the module-level tokenizer, classifier, and label encoder. The input
    is truncated to 256 tokens.

    Side effect: switches the module-level ``model`` to evaluation mode.

    Args:
        text: The text to classify.

    Returns:
        The class name obtained by inverse-transforming the predicted label id.
    """
    # Inference must run in eval mode; in training mode dropout stays active
    # and the same text can produce different predictions between calls.
    model.eval()
    # Follow the model's actual device rather than assuming it is still on
    # DEVICE (the Trainer may have placed it elsewhere).
    inputs = tokenizer(
        text, return_tensors='pt', truncation=True, max_length=256
    ).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_id = torch.argmax(logits, dim=1).item()
    return label_encoder.inverse_transform([pred_id])[0]

_predicted_class = predict_therapeutic_class(sample_text)
print('Predicted class:', _predicted_class)

# --------------------
# 13. Save Models
# --------------------
# Write the classifier and its tokenizer into the same directory.
_SAVE_DIR = '/content/mid_classifier_model'
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(_SAVE_DIR)

print('Restart-safe MID NLP pipeline completed successfully')