In [22]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lazypredict.Supervised import LazyClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [23]:
def preprocess_labels_and_text(df):
    labels = [int(i[0]) for i in df['5485']]
    df['labels'] = labels
    df = df.rename({'5485': 'doc_text'}, axis=1)
    df['doc_text'] = df['doc_text'].str[1:]
    return df

def prepare_data(df):
    df = preprocess_labels_and_text(df)
    X = df['doc_text']
    y = df['labels']
    return X, y, df

## 1. TF-IDF + CLASSIFIER

In [24]:
def get_max_features_tfidf(X_train, y_train):
    # Apply TF-IDF with a HIGH max_features (e.g., 10,000)
    high_max_features = 10000
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=high_max_features)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

    # Train a Random Forest Classifier
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train_tfidf, y_train)

    # Get Feature Importances
    feature_importances = rf.feature_importances_

    # Determine Optimal max_features (Keeping Top 90% of Importance)
    sorted_indices = np.argsort(feature_importances)[::-1]  # Sort features by importance (descending)
    cumulative_importance = np.cumsum(feature_importances[sorted_indices])  # Cumulative importance
    best_max_features = np.argmax(cumulative_importance >= 0.90)  # Find cutoff for 90% importance
    return best_max_features

In [25]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt

def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    metrics = {
        "Accuracy": accuracy_score(y_test, y_test_pred),
        "F1 Score": f1_score(y_test, y_test_pred, average='weighted', zero_division=1),
        "Precision": precision_score(y_test, y_test_pred, average='weighted', zero_division=1),
        "Recall": recall_score(y_test, y_test_pred, average='weighted', zero_division=1)
    }

    cm = confusion_matrix(y_test, y_test_pred, labels=model.classes_)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    # disp.plot(cmap='Blues')
    # plt.title(f"Confusion matrix for {type(model).__name__}")
    # plt.show()

    # print(f"\nClassification report for {type(model).__name__}:\n")
    # print(classification_report(y_test, y_test_pred, zero_division=1))

    return metrics

def train_and_evaluate_models(X_train, y_train, X_test, y_test, models, names):
    results = []
    for name, model in zip(names, models):
        print(f"Training and evaluating: {name}")
        model.fit(X_train, y_train)
        metrics = evaluate_model(model, X_train, y_train, X_test, y_test)
        results.append({"Classifier": name, **metrics})
        # print(f"Results for {name}: {metrics}")
    return pd.DataFrame(results)

In [26]:
def plot_model_performance(results):
    results.sort_values(by="Accuracy", ascending=False, inplace=True)
    plt.bar(results["Classifier"], results["Accuracy"])
    plt.title("Model comparison")
    plt.xlabel("Models")
    plt.ylabel("Accuracy")
    plt.xticks(rotation=45)
    plt.show()

In [27]:
# # Run classification comparison with Lazypredict
# clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
# models, predictions = clf.fit(X_train_tfidf.toarray(), X_test_tfidf.toarray(), y_train, y_test)

# # Display model performance comparison
# print(models)

In [28]:
# Load dataset and preprocess it
path_csv = '../data/file.txt'
df = pd.read_csv(path_csv)
X, y, df = prepare_data(df)


In [29]:
# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Get best max_features value   
best_max_features = get_max_features_tfidf(X_train, y_train)

# Use the best max_features value for final TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=best_max_features)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"Selected Best max_features: {best_max_features}")


Selected Best max_features: 1175


In [30]:
# Model Zoo
names = [
    "Logistic Regression",
    "KNN Classifier",
    "Decision Tree",
    "Linear SVM",
    "Random Forest",
    "SGD Classifier",
    "Ridge Classifier",
    "XGBoost",
    "AdaBoost",
]

models = [
    LogisticRegression(max_iter=1000, class_weight='balanced'),
    KNeighborsClassifier(n_neighbors=149, n_jobs=-1),
    DecisionTreeClassifier(),
    SVC(kernel='linear'),
    RandomForestClassifier(n_estimators=100, class_weight='balanced'),
    SGDClassifier(loss='hinge'),
    RidgeClassifier(),
    XGBClassifier(),
    AdaBoostClassifier()
]

# Training and evaluation
results = train_and_evaluate_models(X_train_tfidf, y_train, X_test_tfidf, y_test, models, names)

# Result plotting
# print(results)


Training and evaluating: Logistic Regression
Training and evaluating: KNN Classifier
Training and evaluating: Decision Tree
Training and evaluating: Linear SVM
Training and evaluating: Random Forest
Training and evaluating: SGD Classifier
Training and evaluating: Ridge Classifier
Training and evaluating: XGBoost
Training and evaluating: AdaBoost


In [31]:
results

Unnamed: 0,Classifier,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.95,0.95,0.96,0.95
1,KNN Classifier,0.91,0.9,0.91,0.91
2,Decision Tree,0.88,0.88,0.88,0.88
3,Linear SVM,0.97,0.97,0.97,0.97
4,Random Forest,0.94,0.94,0.95,0.94
5,SGD Classifier,0.97,0.97,0.97,0.97
6,Ridge Classifier,0.97,0.97,0.97,0.97
7,XGBoost,0.95,0.95,0.95,0.95
8,AdaBoost,0.75,0.73,0.84,0.75


## 2. RoBERTa + MLP

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [47]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

In [48]:
class TransformerMLP(nn.Module):
    def __init__(self, model_name):
        super(TransformerMLP, self).__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.fc = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),  # Binary classification (sigmoid output)
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask):
        output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = output.last_hidden_state[:, 0, :]  # Extract [CLS] token embedding
        return self.fc(cls_embedding)

In [None]:
def train(model, train_loader, epochs=3):
    model.train()
    for epoch in range(epochs):
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device).unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)  # Now both have shape [batch_size, 1]
            loss.backward()
            optimizer.step()

            # Update tqdm description with loss
            loop.set_postfix(loss=loss.item())

In [None]:
def evaluate(model, test_loader):
    model.eval()
    predictions, actuals = [], []
    loop = tqdm(test_loader, desc="Evaluating", leave=True)
    with torch.no_grad():
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask)
            preds = (outputs.cpu().numpy() > 0.5).astype(int).flatten()
            predictions.extend(preds)
            actuals.extend(labels.cpu().numpy().flatten())

    acc = accuracy_score(actuals, predictions)
    print(f"Test Accuracy: {acc:.4f}")

In [None]:
path_csv = '../data/file.txt'
df = pd.read_csv(path_csv)
X, y, df = prepare_data(df)

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

MODEL_NAME = "roberta-base"  # Change to "distilbert-base-uncased" for DistilBERT

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y.tolist(), test_size=0.2, random_state=42)

train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
# Define model, loss function & optimizer
device = torch.device("cpu")
model = TransformerMLP(MODEL_NAME).to(device)
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Train and evaluate
train(model, train_loader)
evaluate(model, test_loader)