In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch import nn
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [17]:
# Read the raw training CSV, discard rows whose comment text is missing,
# and renumber the index from zero after the drop.
data_path = "../data/raw/train.csv"
df = (
    pd.read_csv(data_path)
    .dropna(subset=["comment_text"])
    .reset_index(drop=True)
)

In [6]:
# Bucket the target into at most five quantile bins (duplicate bin edges are
# dropped); the bin id is only used to stratify the split below.
df["target_binned"] = pd.qcut(df["target"], q=5, labels=False, duplicates="drop")

# Features are the raw comment strings; the label is the un-binned target.
X, y = df["comment_text"], df["target"]

# 70/30 split with a fixed seed, stratified on the binned target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=df["target_binned"],
)

In [7]:
# Materialize both splits as plain Python lists of documents.
train_corpus, test_corpus = X_train.tolist(), X_test.tolist()

# Fit TF-IDF on the training documents only, then apply that same fitted
# vectorizer to the test documents.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)

In [8]:
def csr_to_tensor(X):
    """Convert a SciPy sparse matrix into a torch sparse COO tensor.

    Args:
        X: Any object exposing ``tocoo()`` that yields ``row``, ``col``,
            ``data`` and ``shape`` (e.g. a SciPy CSR matrix).

    Returns:
        A ``torch.sparse_coo_tensor`` with int64 indices, float32 values and
        the same shape as ``X``.
    """
    triplets = X.tocoo()
    # Row 0 holds the row ids, row 1 the column ids: shape (2, nnz).
    indices = np.stack([triplets.row, triplets.col]).astype(np.int64)
    values = triplets.data.astype(np.float32)
    return torch.sparse_coo_tensor(
        torch.from_numpy(indices),
        torch.from_numpy(values),
        torch.Size(triplets.shape),
    )

In [9]:
# Swap the sparse feature matrices for torch sparse COO tensors.
X_train = csr_to_tensor(X_train)
X_test = csr_to_tensor(X_test)

# Targets are cast to float32 and given a trailing singleton dimension.
y_train = torch.tensor(y_train.values, dtype=torch.float32)[:, None]
y_test = torch.tensor(y_test.values, dtype=torch.float32)[:, None]

In [10]:
class TierOneFilter(nn.Module):
    """A single linear layer mapping a ``vocab_size``-wide input to one output.

    No activation is applied, so ``forward`` returns the raw linear output.
    """

    def __init__(self, vocab_size):
        """Create the layer.

        Args:
            vocab_size: Number of input features per example.
        """
        super().__init__()
        self.linear = nn.Linear(vocab_size, 1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result unchanged."""
        out = self.linear(x)
        return out

In [11]:
# Seed torch so the layer's initial weights are reproducible.
torch.manual_seed(42)
# The input width is the number of terms the fitted vectorizer learned.
# Taking the size of `vocabulary_` avoids building the full array of feature
# names just to count it.
# NOTE(review): `lr` names the model here, not a learning rate.
lr = TierOneFilter(vocab_size=len(vectorizer.vocabulary_))

In [12]:
# Binarize the training targets at `threshold` to measure class balance.
threshold = 0.5
binary_labels = (y_train >= threshold).float()
num_pos = binary_labels.sum()
num_neg = len(binary_labels) - num_pos

# Without this guard a split with no positives would divide by zero and
# produce an infinite weight, which makes the loss non-finite.
if num_pos.item() == 0:
    raise ValueError(
        f"No training targets are >= {threshold}; cannot compute pos_weight."
    )

# Shape-(1,) float tensor holding the number of negatives per positive.
pos_weight = (num_neg / num_pos).reshape(1)

In [13]:
# Binary cross-entropy computed on raw logits, with the positive term scaled
# by `pos_weight`.
loss_fxn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Adam over every parameter of the model object `lr`, with step size 0.1.
optimizer = torch.optim.Adam(lr.parameters(), lr=0.1)

In [15]:
torch.manual_seed(42)
epochs = 100
train_loss_value = []  # per-epoch training loss, stored as Python floats
test_loss_value = []   # per-epoch test loss, stored as Python floats
epoch_count = []

# Ground-truth targets as flat NumPy arrays. They never change during
# training, so convert them once instead of on every epoch, and hand sklearn
# NumPy arrays rather than torch tensors.
y_train_true = y_train.detach().numpy().flatten()
y_true = y_test.detach().numpy().flatten()

for epoch in range(epochs):
    # ---- training step on the full training set ----
    lr.train()
    train_logit = lr(X_train)
    # NOTE(review): `y_train` is used directly as the BCE target, while
    # `pos_weight` appears to be derived from a thresholded copy of it.
    # Confirm that soft (non-binary) targets are intended here.
    train_loss = loss_fxn(train_logit, y_train)
    # Train MAE is measured on the predictions made before this epoch's
    # weight update; the test metrics below are measured after it.
    train_pred = torch.sigmoid(train_logit.detach()).numpy().flatten()
    train_mae = mean_absolute_error(y_train_true, train_pred)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # ---- evaluation on the held-out set ----
    lr.eval()
    with torch.inference_mode():
        test_logit = lr(X_test)
        test_loss = loss_fxn(test_logit, y_test)
        test_probs = torch.sigmoid(test_logit)

        epoch_count.append(epoch)
        # .item() yields plain Python floats, so the history lists do not
        # hold 0-d arrays.
        train_loss_value.append(train_loss.item())
        test_loss_value.append(test_loss.item())

        # Nothing tracks gradients inside inference_mode, so no detach is needed.
        y_pred = test_probs.numpy().flatten()
        test_mae = mean_absolute_error(y_true, y_pred)

        if epoch % 10 == 0:
            print(f"Epoch: {epoch}\n Train Loss: {train_loss} | Test Loss: {test_loss}\n Train MAE: {train_mae} | Test MAE: {test_mae}")

Epoch: 0
 Train Loss: 1.4442293643951416 | Test Loss: 1.3938357830047607
 Train MAE: 0.4220704734325409 | Test MAE: 0.5094112753868103
Epoch: 10
 Train Loss: 1.1511785984039307 | Test Loss: 1.1998933553695679
 Train MAE: 0.41267409920692444 | Test MAE: 0.4044894278049469
Epoch: 20
 Train Loss: 1.0620183944702148 | Test Loss: 1.1617649793624878
 Train MAE: 0.36445513367652893 | Test MAE: 0.3651176393032074
Epoch: 30
 Train Loss: 1.0109584331512451 | Test Loss: 1.1432387828826904
 Train MAE: 0.3376861810684204 | Test MAE: 0.34224143624305725
Epoch: 40
 Train Loss: 0.9783278703689575 | Test Loss: 1.131130337715149
 Train MAE: 0.3195333480834961 | Test MAE: 0.326369971036911
Epoch: 50
 Train Loss: 0.9564337730407715 | Test Loss: 1.1228915452957153
 Train MAE: 0.3066921532154083 | Test MAE: 0.31602466106414795
Epoch: 60
 Train Loss: 0.940893828868866 | Test Loss: 1.1178134679794312
 Train MAE: 0.2990165054798126 | Test MAE: 0.31046152114868164
Epoch: 70
 Train Loss: 0.9293449521064758 | Tes