In [1]:
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# from lazypredict.Supervised import LazyClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [3]:
def preprocess_labels_and_text(df):
    labels = [int(i[0]) for i in df['5485']]
    df['labels'] = labels
    df = df.rename({'5485': 'doc_text'}, axis=1)
    df['doc_text'] = df['doc_text'].str[1:]
    return df

def prepare_data(df):
    df = preprocess_labels_and_text(df)
    X = df['doc_text']
    y = df['labels']
    return X, y, df

## 1. TF-IDF + CLASSIFIER

In [4]:
def get_max_features_tfidf(X_train, y_train):
    # Apply TF-IDF with a HIGH max_features (e.g., 10,000)
    high_max_features = 10000
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=high_max_features)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

    # Train a Random Forest Classifier
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train_tfidf, y_train)

    # Get Feature Importances
    feature_importances = rf.feature_importances_

    # Determine Optimal max_features (Keeping Top 90% of Importance)
    sorted_indices = np.argsort(feature_importances)[::-1]  # Sort features by importance (descending)
    cumulative_importance = np.cumsum(feature_importances[sorted_indices])  # Cumulative importance
    best_max_features = np.argmax(cumulative_importance >= 0.90)  # Find cutoff for 90% importance
    return best_max_features

In [5]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt

def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    metrics = {
        "Accuracy": accuracy_score(y_test, y_test_pred),
        "F1 Score": f1_score(y_test, y_test_pred, average='weighted', zero_division=1),
        "Precision": precision_score(y_test, y_test_pred, average='weighted', zero_division=1),
        "Recall": recall_score(y_test, y_test_pred, average='weighted', zero_division=1)
    }

    cm = confusion_matrix(y_test, y_test_pred, labels=model.classes_)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    # disp.plot(cmap='Blues')
    # plt.title(f"Confusion matrix for {type(model).__name__}")
    # plt.show()

    # print(f"\nClassification report for {type(model).__name__}:\n")
    # print(classification_report(y_test, y_test_pred, zero_division=1))

    return metrics

def train_and_evaluate_models(X_train, y_train, X_test, y_test, models, names):
    results = []
    for name, model in zip(names, models):
        print(f"Training and evaluating: {name}")
        model.fit(X_train, y_train)
        metrics = evaluate_model(model, X_train, y_train, X_test, y_test)
        results.append({"Classifier": name, **metrics})
        # print(f"Results for {name}: {metrics}")
    return pd.DataFrame(results)

In [6]:
def plot_model_performance(results):
    results.sort_values(by="Accuracy", ascending=False, inplace=True)
    plt.bar(results["Classifier"], results["Accuracy"])
    plt.title("Model comparison")
    plt.xlabel("Models")
    plt.ylabel("Accuracy")
    plt.xticks(rotation=45)
    plt.show()

In [7]:
# # Run classification comparison with Lazypredict
# clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
# models, predictions = clf.fit(X_train_tfidf.toarray(), X_test_tfidf.toarray(), y_train, y_test)

# # Display model performance comparison
# print(models)

In [8]:
# Load dataset and preprocess it
colab = True
if colab:
  path_csv = 'data/file.txt'
else:
  path_csv = '../data/file.txt'
df = pd.read_csv(path_csv)
X, y, df = prepare_data(df)


In [9]:
# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Get best max_features value
best_max_features = get_max_features_tfidf(X_train, y_train)

# Use the best max_features value for final TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=best_max_features)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"Selected Best max_features: {best_max_features}")


Selected Best max_features: 1206


In [10]:
# Model Zoo
names = [
    "Logistic Regression",
    "KNN Classifier",
    "Decision Tree",
    "Linear SVM",
    "Random Forest",
    "SGD Classifier",
    "Ridge Classifier",
    "XGBoost",
    "AdaBoost",
]

models = [
    LogisticRegression(max_iter=1000, class_weight='balanced'),
    KNeighborsClassifier(n_neighbors=149, n_jobs=-1),
    DecisionTreeClassifier(),
    SVC(kernel='linear'),
    RandomForestClassifier(n_estimators=100, class_weight='balanced'),
    SGDClassifier(loss='hinge'),
    RidgeClassifier(),
    XGBClassifier(),
    AdaBoostClassifier()
]

# Training and evaluation
results = train_and_evaluate_models(X_train_tfidf, y_train, X_test_tfidf, y_test, models, names)

# Result plotting
# print(results)


Training and evaluating: Logistic Regression
Training and evaluating: KNN Classifier
Training and evaluating: Decision Tree
Training and evaluating: Linear SVM
Training and evaluating: Random Forest
Training and evaluating: SGD Classifier
Training and evaluating: Ridge Classifier
Training and evaluating: XGBoost
Training and evaluating: AdaBoost


In [11]:
results

Unnamed: 0,Classifier,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.95351,0.954032,0.956308,0.95351
1,KNN Classifier,0.906108,0.895898,0.909231,0.906108
2,Decision Tree,0.890611,0.890379,0.891513,0.890611
3,Linear SVM,0.967183,0.966975,0.967219,0.967183
4,Random Forest,0.942571,0.942298,0.944331,0.942571
5,SGD Classifier,0.968095,0.967968,0.968194,0.968095
6,Ridge Classifier,0.968095,0.967967,0.968583,0.968095
7,XGBoost,0.948952,0.948685,0.949273,0.948952
8,AdaBoost,0.781222,0.764228,0.841165,0.781222


## 2. RoBERTa + MLP

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [13]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
        self.labels = torch.tensor(labels, dtype=torch.long)  # Use `long` for class indices

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]  # No `.unsqueeze(0)` needed
        return item

In [14]:
class TransformerMLP(nn.Module):
    def __init__(self, model_name, num_classes):
        super(TransformerMLP, self).__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.fc = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = output.last_hidden_state[:, 0, :]
        return self.fc(cls_embedding)

In [15]:
def train_model(model, train_loader, val_loader, epochs=3):
    model.train()

    for epoch in range(epochs):
        total_train_loss = 0
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)

        # Training phase
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            loop.set_postfix(train_loss=loss.item())

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation phase
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"\nEpoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}\n")

In [16]:
def test_model(model, test_loader):
    model.eval()
    predictions, actuals = [], []
    loop = tqdm(test_loader, desc="Evaluating", leave=True)
    with torch.no_grad():
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs.cpu(), 1)
            predictions.extend(preds.numpy())
            actuals.extend(labels.cpu().numpy().flatten())

    acc = accuracy_score(actuals, predictions)
    print(f"Test Accuracy: {acc:.4f}")

In [17]:
path_csv = 'data/file.txt'
df = pd.read_csv(path_csv)
X, y, df = prepare_data(df)

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

MODEL_NAME = "roberta-base"  # Change to "distilbert-base-uncased" for DistilBERT

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y.tolist(), test_size=0.1, random_state=42)

# Subsplit training into training and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)


train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [18]:
# Define model, loss function & optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print('Running the workflow in the following device: ', device)

# Get the number of unique classes from your labels
num_classes = len(set(y_train))

model = TransformerMLP(MODEL_NAME, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Train model
train_model(model, train_loader, val_loader)

Running the workflow in the following device:  cuda


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 556/556 [01:49<00:00,  5.08it/s, train_loss=0.0957]



Epoch 1: Train Loss = 0.3500, Val Loss = 0.1143



Epoch 2/3: 100%|██████████| 556/556 [01:49<00:00,  5.10it/s, train_loss=0.00454]



Epoch 2: Train Loss = 0.0899, Val Loss = 0.0593



Epoch 3/3: 100%|██████████| 556/556 [01:50<00:00,  5.01it/s, train_loss=0.00406]



Epoch 3: Train Loss = 0.0375, Val Loss = 0.0547



In [19]:
# Evaluate results on test_dataset
test_model(model, test_loader)

Evaluating: 100%|██████████| 69/69 [00:03<00:00, 18.22it/s]

Test Accuracy: 0.9727





In [20]:
! pip install onnx

Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.17.0


In [21]:
# Load real data (example from your dataset)
real_sample = next(iter(train_loader))
real_input_ids = real_sample["input_ids"][0].unsqueeze(0).to(device)
real_attention_mask = real_sample["attention_mask"][0].unsqueeze(0).to(device)

torch.onnx.export(
    model,
    (real_input_ids, real_attention_mask),
    "roberta_mlp.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={"input_ids": {0: "batch_size"}, "attention_mask": {0: "batch_size"}},
    opset_version=14
)

# Convert to TorchScript
traced_model = torch.jit.trace(model,(real_input_ids, real_attention_mask))
traced_model.save("roberta_mlp.pt")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
