---

---

### 0. Project Setup & Obtaining Dataset

In [None]:
# check if GPU is available
import tensorflow as tf
import torch

# Ask each framework once which accelerators it can see, then report.
tf_gpu_devices = tf.config.list_physical_devices('GPU')
torch_has_cuda = torch.cuda.is_available()

print("TensorFlow GPU available:", tf_gpu_devices)
print("PyTorch GPU available:", torch_has_cuda)

# Only query the device name when PyTorch can actually reach a CUDA device.
if torch_has_cuda:
    print("PyTorch GPU name:", torch.cuda.get_device_name(0))

In [None]:

# install and update Hugging Face Transformers, Datasets, Accelerate, Evaluate
# Also ensure fsspec and huggingface_hub are up-to-date to resolve common loading issues
!pip install -U transformers datasets accelerate evaluate huggingface_hub fsspec sentencepiece -q

In [None]:
from datasets import load_dataset

# Fetch the ADE-Corpus-V2 sentence-classification dataset from the Hugging Face Hub.
# Sentences are labeled 0 (not ADE) or 1 (ADE).
ade_dataset_id = "SetFit/ade_corpus_v2_classification"
dataset = load_dataset(ade_dataset_id)

# Confirmation banner followed by the dataset summary, one per line.
print("\nDataset loaded successfully!", dataset, sep="\n")
print("\nKeys in the dataset object:", dataset.keys())

In [None]:
# Bind the train and test splits to their own names for the cells that follow.
train_dataset, test_dataset = dataset['train'], dataset['test']

# Show the first two training rows.
print("\n--- Training Dataset Sample ---")
for row_index in range(2):
    print(train_dataset[row_index])

# Show the first test row.
print("\n--- Test Dataset Sample ---")
print(test_dataset[0])

# Column names, plus the declared feature type of the 'label' column.
print("\nFeatures (columns) available:", train_dataset.column_names)
print("Label mapping (if available):", train_dataset.features['label'])

### 1. Exploratory Data Analysis & Initial Preprocessing



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def _plot_class_distribution(class_counts, title, palette):
    """Draw one bar per class on a new 7x5 figure and show it.

    `class_counts` is the Series produced by `value_counts()`: its index gives
    the bar positions and its values give the bar heights.
    """
    plt.figure(figsize=(7, 5))
    sns.barplot(x=class_counts.index, y=class_counts.values, palette=palette)
    plt.title(title)
    plt.xlabel('Class')
    plt.ylabel('Number of Samples')
    plt.show()


# Work on pandas copies of both splits; DataFrames are more convenient for
# the exploratory analysis in this section.
train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()

print(f"Train dataset size: {len(train_df)} rows")
print(f"Test dataset size: {len(test_df)} rows")

# Per-class sample counts, keyed by the human-readable label text.
print("\n--- Training Data Class Distribution ---")
train_class_counts = train_df['label_text'].value_counts()
print(train_class_counts)

print("\n--- Test Data Class Distribution ---")
test_class_counts = test_df['label_text'].value_counts()
print(test_class_counts)

# Same chart for each split; only the title and the color palette differ.
_plot_class_distribution(
    train_class_counts,
    'Training Data Class Distribution (ADE vs. Non-ADE)',
    "viridis",
)
_plot_class_distribution(
    test_class_counts,
    'Test Data Class Distribution (ADE vs. Non-ADE)',
    "plasma",
)

In [None]:
def _plot_length_histogram(lengths, title, color):
    """Show a 50-bin histogram with a KDE overlay of the given character lengths."""
    plt.figure(figsize=(10, 6))
    sns.histplot(lengths, bins=50, kde=True, color=color)
    plt.title(title)
    plt.xlabel('Sentence Length (Characters)')
    plt.ylabel('Frequency')
    plt.grid(axis='y', alpha=0.75)
    plt.show()


# Add a character-count column to each split (len() of the raw text).
for split_df in (train_df, test_df):
    split_df['text_length'] = split_df['text'].map(len)

print("\n--- Training Data Sentence Length Statistics (Characters) ---")
print(train_df['text_length'].describe())

print("\n--- Test Data Sentence Length Statistics (Characters) ---")
print(test_df['text_length'].describe())

# One histogram per split; only the title and color differ.
_plot_length_histogram(
    train_df['text_length'],
    'Distribution of Sentence Lengths in Training Data',
    'skyblue',
)
_plot_length_histogram(
    test_df['text_length'],
    'Distribution of Sentence Lengths in Test Data',
    'lightcoral',
)

In [None]:
import re

def clean_text(text):
    """
    Normalize a string for bag-of-words style features.

    The text is lowercased, every character other than a-z, 0-9 or whitespace
    is deleted (not replaced), runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.
    """
    # Lowercase first so the character filter only has to allow a-z.
    alnum_only = re.sub(r'[^a-z0-9\s]', '', text.lower())
    # Collapse any whitespace run (spaces, tabs, newlines) into one space.
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

# Add a 'cleaned_text' column to both DataFrames by running clean_text on 'text'.
for split_df in (train_df, test_df):
    split_df['cleaned_text'] = split_df['text'].map(clean_text)

# Side-by-side preview of the first five training rows, before and after cleaning.
print("\n--- Original vs. Cleaned Text Examples (Training Data) ---")
for row_position in range(5):
    original_text, cleaned = train_df[['text', 'cleaned_text']].iloc[row_position]
    print(f"Original: {original_text}")
    print(f"Cleaned:  {cleaned}\n")

# The following cells keep working with train_df / test_df directly.
# If the Hugging Face `dataset` object should carry the new columns as well,
# it can be rebuilt from the DataFrames:
# from datasets import Dataset
# dataset['train'] = Dataset.from_pandas(train_df)
# dataset['test'] = Dataset.from_pandas(test_df)

### 2. Baseline Model Development (ML & Feature Engineering)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: each column counts the occurrences of one vocabulary word.
#   max_features - keep at most this many vocabulary terms
#   min_df       - drop terms that occur in fewer than this many documents
#   max_df       - drop terms that occur in more than this fraction of documents
bow_settings = dict(max_features=5000, min_df=5, max_df=0.9)
count_vectorizer = CountVectorizer(**bow_settings)

# Learn the vocabulary from the training text only, then encode both splits with it.
X_train_bow = count_vectorizer.fit_transform(train_df['cleaned_text'])
X_test_bow = count_vectorizer.transform(test_df['cleaned_text'])

# Numeric class labels are the prediction target.
y_train, y_test = train_df['label'], test_df['label']

print(f"Shape of X_train_bow: {X_train_bow.shape}")
print(f"Shape of X_test_bow: {X_test_bow.shape}")
print(f"Vocabulary size: {len(count_vectorizer.vocabulary_)}")

In [None]:
# Logistic Regression model, with CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import numpy as np

# Logistic regression baseline on the bag-of-words features.
#   solver='liblinear'      - suited to smaller datasets, supports L1/L2 penalties
#   class_weight='balanced' - reweights classes inversely to their frequency,
#                             which matters here because the classes are imbalanced
#   max_iter=1000           - raised iteration cap to help convergence
log_reg_model_bow = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# fit() returns the estimator itself, so training and test prediction can be chained.
y_pred_bow = log_reg_model_bow.fit(X_train_bow, y_train).predict(X_test_bow)

print("\n--- Logistic Regression (Bag-of-Words) Performance ---")

# Precision, recall, F1-score and support for each class.
bow_class_names = ['Not-Related', 'Related']
print("\nClassification Report:")
print(classification_report(y_test, y_pred_bow, target_names=bow_class_names))

# Confusion matrix, with rows/columns ordered by the classes the model learned.
cm_bow = confusion_matrix(y_test, y_pred_bow, labels=log_reg_model_bow.classes_)
disp_bow = ConfusionMatrixDisplay(confusion_matrix=cm_bow, display_labels=bow_class_names)
disp_bow.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix (Bag-of-Words)')
plt.show()

# The same matrix as raw counts.
print("\nRaw Confusion Matrix:\n", cm_bow)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: like word counts, but terms that are frequent in a document
# yet rare across the corpus receive more weight, which highlights distinctive words.
# The vocabulary limits mirror the CountVectorizer settings so the two baselines
# are directly comparable.
tfidf_settings = dict(max_features=5000, min_df=5, max_df=0.9)
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the training text only, then encode both splits with the fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['cleaned_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['cleaned_text'])

print(f"Shape of X_train_tfidf: {X_train_tfidf.shape}")
print(f"Shape of X_test_tfidf: {X_test_tfidf.shape}")

In [None]:
# Logistic regression baseline on the TF-IDF features.
# Same hyperparameters as the bag-of-words model, including class_weight='balanced'.
log_reg_model_tfidf = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# fit() returns the estimator itself, so training and test prediction can be chained.
y_pred_tfidf = log_reg_model_tfidf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

print("\n--- Logistic Regression (TF-IDF) Performance ---")

# Per-class precision / recall / F1 / support.
tfidf_class_names = ['Not-Related', 'Related']
print("\nClassification Report:")
print(classification_report(y_test, y_pred_tfidf, target_names=tfidf_class_names))

# Confusion matrix, with rows/columns ordered by the classes the model learned.
cm_tfidf = confusion_matrix(y_test, y_pred_tfidf, labels=log_reg_model_tfidf.classes_)
disp_tfidf = ConfusionMatrixDisplay(confusion_matrix=cm_tfidf, display_labels=tfidf_class_names)
disp_tfidf.plot(cmap=plt.cm.Greens)
plt.title('Confusion Matrix (TF-IDF)')
plt.show()

print("\nRaw Confusion Matrix:\n", cm_tfidf)

### 3. Deep Learning Model (Transformer)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Start from a general-purpose checkpoint: 'bert-base-uncased'.
model_name = "bert-base-uncased"

# Fetch the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Tokenizer for {model_name} loaded.")
print(f"Tokenizer vocabulary size: {tokenizer.vocab_size}")

# Sanity check: encode one sentence and decode it again to see what the model receives.
sample_text = "The patient experienced severe headaches after taking the medication."
tokenized_output = tokenizer(sample_text, truncation=True, padding=True, return_tensors="pt")
sample_input_ids = tokenized_output['input_ids']
sample_attention_mask = tokenized_output['attention_mask']

print("\nSample Tokenization Output:")
print(f"Input IDs: {sample_input_ids}")
print(f"Attention Mask: {sample_attention_mask}")
print(f"Decoded (for understanding): {tokenizer.decode(sample_input_ids[0])}")

In [None]:
# The label column should be named 'label' (numerical 0 or 1)
# The text column should be named 'text' (original text, as tokenizer handles cleaning)

def tokenize_function(examples):
    """Tokenize one batch of examples for the transformer model.

    Args:
        examples: Batch mapping supplied by `Dataset.map(..., batched=True)`;
            its "text" entry is passed to the module-level `tokenizer`.

    Returns:
        Whatever the tokenizer returns for that batch; `map` adds those
        fields to the dataset as new columns.
    """
    # truncation=True clips anything longer than the model's maximum input length.
    # Padding is deliberately NOT applied here. padding="max_length" without an
    # explicit max_length pads every sentence to the model's full length, which
    # wastes memory and compute on these short sentences. The Trainer in this
    # notebook is given the tokenizer, and per the Transformers docs it then
    # pads each batch to its own longest sequence at collation time.
    return tokenizer(examples["text"], truncation=True)

# Apply the tokenizer to every split of the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Drop the raw/auxiliary columns the model does not consume.
# remove_columns raises if asked to drop a column that is absent, so only the
# columns actually present are listed.
# NOTE(review): this checks the 'train' split and assumes every split has the
# same columns - confirm if splits ever diverge.
present_columns = tokenized_datasets["train"].column_names
unused_columns = [
    column
    for column in ("text", "sentence_id", "label_text")
    if column in present_columns
]
tokenized_datasets = tokenized_datasets.remove_columns(unused_columns)

# Rename the numeric 'label' column to 'labels':
# Transformers Trainer expects the target column to be named 'labels'
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Set the format to PyTorch tensors (important for training)
tokenized_datasets.set_format("torch")

print("\nTokenized Datasets Overview:")
print(tokenized_datasets)
print("\nSample of tokenized_datasets['train'][0]:")
print(tokenized_datasets['train'][0])

In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification
import evaluate
import numpy as np

# Instantiate the pretrained checkpoint with a sequence-classification head.
# Two output labels: binary classification (ADE vs. Non-ADE).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# function to compute metrics
# for evaluating model during training and after.
def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        The dictionary returned by the F1 metric's ``compute`` call. It holds
        the score under the key ``"f1"``, which is the name that
        ``metric_for_best_model="f1"`` refers to.
    """
    # Load the metric once and reuse it. This function runs on every
    # evaluation pass, and reloading the metric each time is redundant work.
    metric = getattr(compute_metrics, "_f1_metric", None)
    if metric is None:
        metric = evaluate.load("f1")
        compute_metrics._f1_metric = metric

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Macro averaging weights both classes equally, which suits the
    # imbalanced label distribution.
    return metric.compute(predictions=predictions, references=labels, average="macro")

import inspect

# output_dir: where the model checkpoints and logs will be saved
# evaluation strategy: 'epoch' evaluate at the end of each epoch
# num_train_epochs: number of passes over the training data
# weight_decay: regularization to prevent overfitting
# load_best_model_at_end: load the model with the best validation performance
# metric_for_best_model: the metric to monitor for early stopping/best model

# The evaluation-strategy argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer Transformers releases, and the old name was later
# removed. This notebook installs the latest release, so pick whichever name
# the installed TrainingArguments accepts instead of hard-coding one.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16, # adjust based on GPU memory
    per_device_eval_batch_size=16,
    num_train_epochs=3, # typically 2-4 epochs are enough for fine-tuning
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1", # monitor macro F1-score on validation
    save_strategy="epoch", # checkpoints at each epoch (must match the evaluation strategy)
    report_to="none", # disable logging for simplicity
    **{_eval_strategy_arg: "epoch"},
)

In [None]:
from transformers import Trainer

import inspect

# Hold out part of the training split for validation.
# The training arguments load the best checkpoint at the end, judged on the
# evaluation set. Using the test split for that selection and then reporting
# on the same split would make the final score optimistically biased, so the
# test split is kept untouched until the final evaluation below.
train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)

# Newer Transformers releases take the tokenizer through `processing_class`
# (the `tokenizer` argument is deprecated there); older releases only know
# `tokenizer`. Use whichever name the installed Trainer accepts.
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# init the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val_split["train"],
    eval_dataset=train_val_split["test"],  # held-out validation rows, used for checkpoint selection
    compute_metrics=compute_metrics,
    **{_tokenizer_arg: tokenizer},
)

print("\n--- Starting Model Training ---")
trainer.train()

print("\n--- Training Complete ---")

# eval the model on the untouched test set after training
final_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("\nFinal Evaluation Results on Test Set:")
print(final_results)