---

---

### 0. Project Setup & Obtaining Dataset

In [None]:
# Report whether TensorFlow and PyTorch can each see a GPU.
import tensorflow as tf
import torch

tf_gpus = tf.config.list_physical_devices('GPU')
torch_has_gpu = torch.cuda.is_available()

# TensorFlow prints the list of detected GPU devices; PyTorch prints a boolean.
print("TensorFlow GPU available:", tf_gpus)
print("PyTorch GPU available:", torch_has_gpu)

# Only ask PyTorch for the device name when it reports CUDA as available.
if torch_has_gpu:
    print("PyTorch GPU name:", torch.cuda.get_device_name(0))

In [None]:

# install and update Hugging Face Transformers, Datasets, Accelerate, Evaluate
# Also ensure fsspec and huggingface_hub are up-to-date to resolve common loading issues
!pip install -U transformers datasets accelerate evaluate huggingface_hub fsspec sentencepiece -q

In [None]:
from datasets import load_dataset

# Fetch the ADE-Corpus-V2 classification dataset from the Hugging Face Hub.
# Each sentence is labeled 0 (not ADE) or 1 (ADE).
dataset = load_dataset("SetFit/ade_corpus_v2_classification")

# Confirm the load, then show the dataset summary and the split names.
print("\nDataset loaded successfully!", dataset, sep="\n")
print("\nKeys in the dataset object:", dataset.keys())

In [None]:
# Bind the training and test splits to their own names.
train_dataset, test_dataset = dataset['train'], dataset['test']

# Show the first two training examples.
print("\n--- Training Dataset Sample ---")
for example_idx in range(2):
    print(train_dataset[example_idx])

# Show the first test example.
print("\n--- Test Dataset Sample ---")
print(test_dataset[0])

# Column names and the feature description of the 'label' column.
label_feature = train_dataset.features['label']
print("\nFeatures (columns) available:", train_dataset.column_names)
print("Label mapping (if available):", label_feature)

### 1. Exploratory Data Analysis & Initial Preprocessing



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the Hugging Face splits to pandas DataFrames for easier initial exploration.
train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()

print(f"Train dataset size: {len(train_df)} rows")
print(f"Test dataset size: {len(test_df)} rows")

print("\n--- Training Data Class Distribution ---")
train_class_counts = train_df['label_text'].value_counts()
print(train_class_counts)

print("\n--- Test Data Class Distribution ---")
test_class_counts = test_df['label_text'].value_counts()
print(test_class_counts)


def plot_class_distribution(class_counts, title, palette):
    """Draw a bar chart with one bar per class.

    Args:
        class_counts: pandas Series whose index holds the class names and whose
            values hold the number of samples, as produced by value_counts().
        title: Title shown above the chart.
        palette: Name of the palette used to colour the bars.
    """
    fig, ax = plt.subplots(figsize=(7, 5))
    # Passing `palette` without `hue` is deprecated in seaborn >= 0.13, so the
    # class names are also used as the hue variable. dodge=False keeps each bar
    # centred on its tick.
    sns.barplot(
        x=class_counts.index,
        y=class_counts.values,
        hue=class_counts.index,
        palette=palette,
        dodge=False,
        ax=ax,
    )
    # The hue legend would only repeat the x tick labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(title)
    ax.set_xlabel('Class')
    ax.set_ylabel('Number of Samples')
    plt.show()


# Visualize class distribution for the training and test sets.
plot_class_distribution(
    train_class_counts, 'Training Data Class Distribution (ADE vs. Non-ADE)', "viridis"
)
plot_class_distribution(
    test_class_counts, 'Test Data Class Distribution (ADE vs. Non-ADE)', "plasma"
)

In [None]:
# Add a 'text_length' column (number of characters in 'text') to both splits.
for frame in (train_df, test_df):
    frame['text_length'] = frame['text'].apply(len)

# Summary statistics of the character lengths, training split first.
length_reports = (
    ("\n--- Training Data Sentence Length Statistics (Characters) ---", train_df),
    ("\n--- Test Data Sentence Length Statistics (Characters) ---", test_df),
)
for heading, frame in length_reports:
    print(heading)
    print(frame['text_length'].describe())

# One histogram (with KDE overlay) per split, training split first.
length_histograms = (
    (train_df, 'skyblue', 'Distribution of Sentence Lengths in Training Data'),
    (test_df, 'lightcoral', 'Distribution of Sentence Lengths in Test Data'),
)
for frame, bar_color, chart_title in length_histograms:
    plt.figure(figsize=(10, 6))
    sns.histplot(frame['text_length'], bins=50, kde=True, color=bar_color)
    plt.title(chart_title)
    plt.xlabel('Sentence Length (Characters)')
    plt.ylabel('Frequency')
    plt.grid(axis='y', alpha=0.75)
    plt.show()

In [None]:
import re

def clean_text(text):
    """
    Applies basic text cleaning to a single sentence.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not an ASCII letter (a-z), a digit
         or whitespace;
      3. collapse each run of whitespace to one space and strip both ends.

    Note that characters are deleted, not replaced by a space, so hyphenated
    tokens are merged ("drug-induced" becomes "druginduced") and non-ASCII
    letters are dropped.

    Args:
        text: The raw sentence. Any non-string value (e.g. None, or the float
            NaN pandas uses for a missing cell) is treated as empty text.

    Returns:
        The cleaned string; "" for non-string input or when no character
        survives cleaning.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); return empty text instead of
        # aborting a whole-column .apply().
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Keep only letters, digits, whitespace
    return re.sub(r'\s+', ' ', text).strip()  # Single spaces, no leading/trailing space

# Derive a 'cleaned_text' column from 'text' in both the train and test DataFrames.
for frame in (train_df, test_df):
    frame['cleaned_text'] = frame['text'].apply(clean_text)

# Show the first five training sentences before and after cleaning.
print("\n--- Original vs. Cleaned Text Examples (Training Data) ---")
for row in range(5):
    original = train_df['text'].iloc[row]
    cleaned = train_df['cleaned_text'].iloc[row]
    print(f"Original: {original}")
    print(f"Cleaned:  {cleaned}\n")

# The cleaned text currently exists only in the pandas DataFrames (train_df / test_df).
# It can either be stored back into the 'dataset' object or used directly from the
# DataFrames in the next phase. For simplicity, keep working with train_df and test_df
# for now, and convert back to Hugging Face Dataset format when needed.

# To update the original 'dataset' object instead:
# from datasets import Dataset
# dataset['train'] = Dataset.from_pandas(train_df)
# dataset['test'] = Dataset.from_pandas(test_df)