In [1]:
from sklearn.model_selection import train_test_split

In [19]:
import pandas as pd
import numpy as np
import random
from collections import Counter
import torch

In [8]:
train = pd.read_csv('subtask1/train/eng.csv')
val = pd.read_csv('subtask1/dev/eng.csv')

train.head()

Unnamed: 0,id,text,polarization
0,en_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0
1,en_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0
2,en_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0
3,en_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0
4,en_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0


In [9]:
train_texts = train['text']
train_labels = train['polarization']

In [15]:
Counter(train_labels)

Counter({0: 1674, 1: 1002})

In [None]:
def predict_random(train_labels, num_samples):
    predictions = []
    train_labels = np.array(train_labels)
    
    proportion = len(train_labels[train_labels == 1]) / len(train_labels)

    for i in range(num_samples):
        if random.random() < proportion:
            predictions.append(1)
        else:
            predictions.append(0)

    return np.array(predictions, dtype=int)
    

In [13]:
predict_random(train_labels, 5)

array([1, 1, 0, 0, 1])

In [None]:
def preprocess(text):
    tokens = text.lower().split()
    return tokens

In [16]:
def accuracy(predicted_labels, true_labels):
    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)
    correct = np.sum(predicted_labels == true_labels)
    return correct / len(true_labels)

def precision(predicted_labels, true_labels):
    
    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    true_pos = np.sum((predicted_labels == 1) & (true_labels == 1))
    pred_pos = np.sum(predicted_labels == 1)

    precision_value = true_pos / (pred_pos)
    return precision_value


def recall(predicted_labels, true_labels):

    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    true_pos = np.sum((predicted_labels == 1) & (true_labels == 1))
    actual_pos = np.sum(true_labels == 1)

    recall_value = true_pos / (actual_pos)
    return recall_value

def f1_score(predicted_labels, true_labels):

    precision_value = precision(predicted_labels, true_labels)
    recall_value = recall(predicted_labels, true_labels)

    f1_score_value = 2 * (precision_value * recall_value) / (precision_value + recall_value)
    return f1_score_value


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(train_texts)

In [None]:
Y = vectorizer.fit_transform(eval_texts)

In [22]:
X.shape

(2676, 5000)

In [20]:
def initialize_weights(coefficients):
    with torch.no_grad():
        coefficients.weight.fill_(1.0)

In [None]:
class Classifier(torch.nn.Module):
    def __init__(self, input_dim: int):
        super().__init__()
        self.output_size = 1
        self.coefficients = torch.nn.Linear(input_dim, self.output_size)
        initialize_weights(self.coefficients)
        
    def forward(self, features: torch.Tensor):
        return torch.sigmoid(self.coefficients(features))

In [23]:
def logistic_loss(prediction: torch.Tensor, label: torch.Tensor) -> torch.Tensor:
    eps = 1e-18
    loss = -1 * (
        label * torch.log(prediction + eps) + 
        (1 - label) * torch.log(1 - prediction + eps)
    )
    return loss.mean()

In [24]:
def make_optimizer(model, learning_rate) -> torch.optim:
    return torch.optim.SGD(model.parameters(), learning_rate)

In [25]:
def predict(model, features):
    with torch.no_grad():
        logits = model(features)

        predicted_labels = (logits > 0.5).int().flatten()

        return predicted_labels

In [None]:
from tqdm.notebook import tqdm

In [27]:
def training_loop(
    num_epochs,
    batch_size,
    train_features,
    train_labels,
    dev_features,
    dev_labels,
    optimizer,
    model
):
    samples = list(zip(train_features, train_labels))
    random.shuffle(samples)
    batches = []
    for i in range(0, len(samples), batch_size):
        batches.append(samples[i:i+batch_size])
    print("Training...")
    train_losses = []
    dev_losses = []
    for i in range(num_epochs):
        for batch in tqdm(batches):
            # Empty the dynamic computation graph
            features, labels = zip(*batch)
            features = torch.stack(features)
            print("FEATURES: ", features)
            labels = torch.stack(labels)
            optimizer.zero_grad()
            # Run the model
            logits = model(features)
            print(logits)
            # Compute loss
            loss = logistic_loss(torch.squeeze(logits), labels)
            print("Loss: ", loss)
            # In this logistic regression example,
            # this entails computing a single gradient
            loss.backward()
            # Backpropogate the loss through our model
            
            # Update our coefficients in the direction of the gradient.
            optimizer.step()
             # For logging
            train_losses.append(loss.item())
            
            # Compute dev loss for our reference
            dev_logits = model(dev_features)
            dev_loss = logistic_loss(torch.squeeze(dev_logits), dev_labels)
            dev_losses.append(dev_loss.item())
        
        # Estimate the f1 score for the development set
        dev_f1 = f1_score(predict(model, dev_features), dev_labels.tolist())
        print(f"epoch {i}")
        print(f"Train loss: {sum(train_losses)/len(train_losses)}")
        print(f"Dev loss: {sum(dev_losses)/len(dev_losses)}")
        print(f"Dev F1 {dev_f1}")
        
    # Return the trained model
    return model, train_losses, dev_losses

In [28]:
num_epochs = 1000
lr = 1e-3
model = Classifier(X.shape[1])
optimizer = make_optimizer(model, learning_rate = lr)

In [29]:
dev_logits = model(X)

TypeError: linear(): argument 'input' (position 1) must be Tensor, not csr_matrix

In [None]:
trained_model, train_losses, dev_losses = training_loop(
    num_epochs,
    16,
    X,
    train_labels,
    dev_features,
    dev_labels,
    optimizer,
    model
)