In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset, random_split

# Data Loading and Preprocessing

---

In [26]:
import json
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Load data function
def load_data(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

domain1_data = load_data('domain1_train.json')
domain2_data = load_data('domain2_train.json')
domain2_data = [entry for entry in domain2_data if len(entry['text']) > 0]


In [27]:
# Split the data
train_data_domain, valid_data_domain = train_test_split(domain1_data+domain2_data, test_size=0.2, random_state=42)

train_data = train_data_domain
valid_data = valid_data_domain

# Create PyTorch Datasets and DataLoaders

In [28]:
class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = torch.tensor(self.data[idx]['text'])
        label = torch.tensor(self.data[idx]['label'])
        return text, label



def collate_batch(batch):
    texts, labels = zip(*batch)
    text_lengths = [len(txt) for txt in texts]
    texts = pad_sequence(texts, batch_first=True)
    labels = torch.tensor(labels).float()  # Convert labels to float
    return texts, labels, text_lengths


# DataLoader with the custom collate function
train_dataset = TextDataset(train_data)
valid_dataset = TextDataset(valid_data)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

# Model Definition

In [29]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional=True, dropout=0.5):
        super(BiLSTMClassifier, self).__init__()

        # Text embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # BiLSTM layer
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)

        # Classifier layer
        self.fc = nn.Linear(hidden_dim*2, output_dim)  # x2 for bidirectional

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)

        # Feature from the last hidden state of the BiLSTM
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))

        # Return the classifier's output
        return self.fc(hidden).squeeze(1)

# Find class weight

In [18]:
# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Extract model counts for domain 2
models_domain2 = [entry['label'] for entry in domain2_data]
class_counts_domain2 = [models_domain2.count(float(i)) for i in range(int(max(models_domain2)) + 1)]

# Compute class weights for domain 2
total_samples_domain2 = sum(class_counts_domain2)
class_weights_domain2 = torch.tensor([total_samples_domain2 / count for count in class_counts_domain2]).float().to(device)

def get_batch_weights(labels, class_weights_domain2):
    return class_weights_domain2[labels.long()].unsqueeze(1)



In [8]:
class_counts_domain2

[12750, 2149]

# Loss Function and Optimizer

In [30]:
# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate the model
model = BiLSTMClassifier(vocab_size=5000, embedding_dim=128, hidden_dim=256, output_dim=1, n_layers=2).to(device)

# Classification criterion
classification_criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights_domain2) ##change


# Optimizer
optimizer = optim.Adam(model.parameters())

# Binary accuracy function
def binary_accuracy(predictions, y):
    rounded_preds = torch.round(torch.sigmoid(predictions)).squeeze()  # Ensure it's a 1D tensor
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)


from sklearn.metrics import f1_score

def compute_f1(predictions, labels):
    # Convert predictions to binary
    preds_binary = torch.round(torch.sigmoid(predictions))
    preds_binary = preds_binary.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()

    return f1_score(labels, preds_binary)



# Training Loop

In [20]:
from tqdm import tqdm

num_epochs = 10

for epoch in range(num_epochs):
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0  # Initialize the F1-score for the epoch

    model.train()

    # Stage 1: Train main classifier
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Classifier Training"):
        texts, labels, text_lengths = batch
        texts = texts.to(device)
        labels = labels.to(device)
        #text_lengths = torch.tensor(text_lengths).long()

        batch_weights = get_batch_weights(labels, class_weights_domain2).to(device) # Get batch-specific weights

        # Create a criterion for the current batch with the specified pos_weight
        batch_criterion = nn.BCEWithLogitsLoss(pos_weight=batch_weights.squeeze()).to(device)

        optimizer.zero_grad()

        # Get the classifier's predictions
        predictions = model(texts, text_lengths)
        loss = batch_criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        f1 = compute_f1(predictions, labels)  # Compute F1 score

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_f1 += f1


    print(f"Epoch {epoch+1} Classifier Training: Loss: {epoch_loss/len(train_loader):.3f} | Accuracy: {epoch_acc/len(train_loader):.3f} | F1-Score: {epoch_f1/len(train_loader):.3f}")

    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0  # Reset the F1-score for the next epoch



Epoch 1 Classifier Training: 100%|██████████| 430/430 [00:46<00:00,  9.22it/s]


Epoch 1 Classifier Training: Loss: 1.043 | Accuracy: 0.625 | F1-Score: 0.634


Epoch 2 Classifier Training: 100%|██████████| 430/430 [00:44<00:00,  9.73it/s]


Epoch 2 Classifier Training: Loss: 0.878 | Accuracy: 0.670 | F1-Score: 0.662


Epoch 3 Classifier Training: 100%|██████████| 430/430 [00:45<00:00,  9.35it/s]


Epoch 3 Classifier Training: Loss: 0.789 | Accuracy: 0.717 | F1-Score: 0.694


Epoch 4 Classifier Training: 100%|██████████| 430/430 [00:46<00:00,  9.31it/s]


Epoch 4 Classifier Training: Loss: 0.745 | Accuracy: 0.754 | F1-Score: 0.724


Epoch 5 Classifier Training: 100%|██████████| 430/430 [00:43<00:00,  9.93it/s]


Epoch 5 Classifier Training: Loss: 0.713 | Accuracy: 0.764 | F1-Score: 0.735


Epoch 6 Classifier Training: 100%|██████████| 430/430 [00:44<00:00,  9.75it/s]


Epoch 6 Classifier Training: Loss: 0.715 | Accuracy: 0.762 | F1-Score: 0.734


Epoch 7 Classifier Training: 100%|██████████| 430/430 [00:42<00:00, 10.00it/s]


Epoch 7 Classifier Training: Loss: 0.683 | Accuracy: 0.771 | F1-Score: 0.740


Epoch 8 Classifier Training: 100%|██████████| 430/430 [00:45<00:00,  9.52it/s]


Epoch 8 Classifier Training: Loss: 0.640 | Accuracy: 0.792 | F1-Score: 0.760


Epoch 9 Classifier Training: 100%|██████████| 430/430 [00:44<00:00,  9.66it/s]


Epoch 9 Classifier Training: Loss: 0.624 | Accuracy: 0.810 | F1-Score: 0.774


Epoch 10 Classifier Training: 100%|██████████| 430/430 [00:43<00:00,  9.91it/s]


Epoch 10 Classifier Training: Loss: 0.594 | Accuracy: 0.816 | F1-Score: 0.781


Epoch 11 Classifier Training: 100%|██████████| 430/430 [00:45<00:00,  9.48it/s]


Epoch 11 Classifier Training: Loss: 0.575 | Accuracy: 0.825 | F1-Score: 0.790


Epoch 12 Classifier Training: 100%|██████████| 430/430 [00:44<00:00,  9.67it/s]


Epoch 12 Classifier Training: Loss: 0.555 | Accuracy: 0.827 | F1-Score: 0.792


Epoch 13 Classifier Training: 100%|██████████| 430/430 [00:44<00:00,  9.59it/s]


Epoch 13 Classifier Training: Loss: 0.536 | Accuracy: 0.828 | F1-Score: 0.792


Epoch 14 Classifier Training: 100%|██████████| 430/430 [00:43<00:00,  9.83it/s]


Epoch 14 Classifier Training: Loss: 0.502 | Accuracy: 0.839 | F1-Score: 0.806


Epoch 15 Classifier Training: 100%|██████████| 430/430 [00:45<00:00,  9.50it/s]

Epoch 15 Classifier Training: Loss: 0.499 | Accuracy: 0.848 | F1-Score: 0.813





# Evaluation

In [21]:
def evaluate_model(model, valid_loader, device):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation for efficiency
        for batch in valid_loader:
            texts, labels, text_lengths = batch
            texts = texts.to(device)
            labels = labels.float().to(device)

            # Compute model predictions
            predictions = model(texts, text_lengths)
            if predictions.dim() > 1 and predictions.size(1) == 1:
                predictions = predictions.squeeze(1)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    binary_predictions = [1 if p >= 0.5 else 0 for p in all_predictions]

    # Convert lists to tensors
    tensor_predictions = torch.tensor(binary_predictions)
    tensor_labels = torch.tensor(all_labels)

    accuracy = binary_accuracy(tensor_predictions, tensor_labels)

    # Compute F1-Score
    f1 = f1_score(all_labels, binary_predictions, average='macro')

    return accuracy, f1

# After the training loop, evaluate on validation set
valid_accuracy, valid_f1 = evaluate_model(model, valid_loader, device)
print(f"Validation Accuracy: {valid_accuracy:.4f}")
print(f"Validation F1-Score: {valid_f1:.4f}")


Validation Accuracy: 0.8392
Validation F1-Score: 0.8328


# Send to Kaggle

In [22]:
import csv
import json

# Load the test data
with open('test_set.json', 'r') as f:
    test_data = [json.loads(line) for line in f]
# Evaluate on test data
model.eval()
test_results = []

In [23]:
with torch.no_grad():
    for entry in tqdm(test_data, desc="Evaluating Test Data"): # change from test_set to test_data
        text = entry["text"]
        text_tensor = torch.tensor(text).unsqueeze(0).to(device)  # Adding an extra batch dimension
        text_length = torch.tensor([len(text)])  # Sequence length for current entry

        # Pass the sequence and its length to the model
        prediction = model(text_tensor, text_length)
        if prediction.dim() > 1 and prediction.size(1) == 1:
            prediction = prediction.squeeze(1)
        prediction = torch.sigmoid(prediction).item()  # Convert raw score to value between 0 and 1

        # Classify the texts
        class_label = 1 if prediction >= 0.5 else 0

        test_results.append({
            "id": entry["id"],
            "class": class_label
        })

Evaluating Test Data: 100%|██████████| 1000/1000 [00:05<00:00, 186.33it/s]


In [24]:
# Write results to CSV
with open('results.csv', 'w', newline='') as csvfile:
    fieldnames = ['id', 'class']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for result in test_results:
        writer.writerow(result)