In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset, random_split, WeightedRandomSampler

import pandas as pd

In [4]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

Loading the Data:

In [5]:
import json

df1 = pd.read_json('data/domain1_train.json', lines=True)
df2 = pd.read_json('data/domain2_train.json', lines=True)



In [6]:
df1.loc[df1['label'] == 1, 'model'] = 8
df1.loc[df1['label'] == 0, 'model'] = 7
df1

Unnamed: 0,text,label,model
0,"[70, 746, 825, 109, 2083, 0, 2, 0, 0, 0, 9, 0,...",1,8.0
1,"[1209, 179, 1952, 4, 4959, 7, 0, 2, 978, 1522,...",1,8.0
2,"[287, 3, 3330, 0, 23, 12, 13, 465, 74, 8, 0, 8...",1,8.0
3,"[0, 0, 3, 592, 19, 2, 706, 1439, 2575, 7, 2, 0...",1,8.0
4,"[9, 2, 110, 12, 42, 32, 44, 361, 9, 3860, 2358...",1,8.0
...,...,...,...
19495,"[3987, 4, 2, 2536, 3611, 3, 5, 1125, 269, 2324...",0,7.0
19496,"[2, 132, 87, 980, 4, 2, 379, 12, 1336, 38, 299...",0,7.0
19497,"[1820, 93, 3, 548, 1, 1072, 1, 3, 2, 2741, 104...",0,7.0
19498,"[5, 48, 6, 2662, 17, 9, 5, 902, 2, 48, 6, 84, ...",0,7.0


In [7]:
df2.loc[df2['label'] == 1, 'model'] = 8
df2

Unnamed: 0,text,label,model
0,"[3147, 471, 4, 343, 2, 0, 1, 14, 124, 133, 436...",1,8.0
1,"[10, 389, 232, 24, 2, 0, 4730, 1, 13, 10, 129,...",1,8.0
2,"[861, 0, 2505, 2, 0, 1015, 2, 0, 3, 1772, 8, 3...",1,8.0
3,"[325, 21, 3234, 1, 14, 187, 16, 13, 1965, 1, 1...",1,8.0
4,"[5, 1091, 272, 6, 3232, 32, 2, 1012, 4, 3240, ...",1,8.0
...,...,...,...
14895,"[175, 1317, 38, 754, 9, 5, 0, 228, 1, 45, 6, 2...",0,1.0
14896,"[466, 5, 70, 1242, 6, 3888, 1, 34, 43, 5, 70, ...",0,3.0
14897,"[10, 0, 21, 1650, 18, 5, 1335, 1, 208, 5, 997,...",0,1.0
14898,"[18, 39, 316, 133, 365, 2019, 1, 27, 10, 5, 61...",0,4.0


In [8]:
train_df1, valid_df1 = train_test_split(df1, test_size=0.2, random_state=42)
train_df2, valid_df2 = train_test_split(df2, test_size=0.2, random_state=42)

In [9]:
concatenated_df = pd.concat([train_df1, train_df2])

concatenated_df

Unnamed: 0,text,label,model
18319,"[126, 1, 199, 0, 3, 19, 718, 126, 1, 199, 3297...",0,7.0
8086,"[0, 0, 3, 0, 3, 35, 2085, 0, 3, 2016, 28, 835,...",1,8.0
447,"[2, 1212, 818, 12, 0, 292, 2150, 0, 59, 34, 79...",1,8.0
12545,"[2, 407, 2985, 12, 2, 48, 6, 213, 0, 120, 206,...",0,7.0
7176,"[1443, 1833, 32, 23, 34, 114, 0, 0, 0, 16, 0, ...",1,8.0
...,...,...,...
5191,"[10, 125, 4, 2, 70, 1551, 19, 21, 0, 16, 1847,...",0,6.0
13418,"[726, 26, 4, 148, 5, 1735, 652, 16, 948, 1, 14...",0,3.0
5390,"[11, 465, 327, 4, 200, 57, 357, 0, 1, 11, 187,...",0,4.0
860,"[2505, 319, 3550, 1968, 1, 14, 85, 4, 122, 21,...",1,8.0


In [10]:
concatenated_df['label'].value_counts()

label
0    18013
1     9507
Name: count, dtype: int64

In [11]:
concatenated_df['model'].value_counts()

model
8.0    9507
7.0    7807
3.0    1893
1.0    1892
2.0    1890
0.0    1873
6.0    1420
4.0     636
5.0     602
Name: count, dtype: int64

In [12]:
value_dict = concatenated_df['model'].value_counts().to_dict()
hidden_class_w = {}
for key in value_dict.keys():
    hidden_class_w[key] =  len(concatenated_df)/value_dict[key] 

#hidden_class_w[8.0] = 1
hidden_class_w



{8.0: 2.894709161670348,
 7.0: 3.525041629307032,
 3.0: 14.537770734284205,
 1.0: 14.545454545454545,
 2.0: 14.56084656084656,
 0.0: 14.693005872931126,
 6.0: 19.380281690140844,
 4.0: 43.270440251572325,
 5.0: 45.714285714285715}

In [13]:
label_dict = concatenated_df['label'].value_counts().to_dict()
label_w = {}
for key in label_dict.keys():
    label_w[key] =  len(concatenated_df)/label_dict[key]
label_w

{0: 1.52778548825848, 1: 2.894709161670348}

In [14]:
domain12_data = json.loads( concatenated_df.to_json(orient='records') )

val_concatenated_df = pd.concat([valid_df1, valid_df2])
val_data = json.loads( val_concatenated_df.to_json(orient='records') )

In [15]:
from torch.utils.data import Dataset, DataLoader, random_split

class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]['text']), torch.tensor(self.data[idx]['label']), hidden_class_w[self.data[idx]['model']], label_w[self.data[idx]['label']]



In [16]:
train_dataset = TextDataset(domain12_data)
train_dataset = [sample for sample in train_dataset if len(sample[0])>0]



In [17]:
example_weights = [sample[3] for sample in train_dataset]

In [18]:
val_dataset = TextDataset(val_data)
val_dataset = [sample for sample in val_dataset if len(sample[0])>0]

oversampler = WeightedRandomSampler(example_weights, len(example_weights))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False, collate_fn=lambda x: x, sampler=oversampler)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=lambda x: x)

In [19]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional=True, dropout=0.5):
        super(BiLSTMClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)  # x2 for bidirectional
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden)

domain 2 training

In [36]:
from tqdm.notebook import tqdm
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assuming the aforementioned initializations
model = BiLSTMClassifier(vocab_size=5000, embedding_dim=128, hidden_dim=256, output_dim=1, n_layers=2).to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Function to compute accuracy
def binary_accuracy(predictions, y):
    # Round predictions to the closest integer (0 or 1)
    rounded_preds = torch.round(torch.sigmoid(predictions))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

from sklearn.metrics import f1_score, precision_score, recall_score

def compute_f1(predictions, labels):
    # Convert predictions to binary
    preds_binary = torch.round(torch.sigmoid(predictions))
    preds_binary = preds_binary.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()
    
    f1 = f1_score(labels, preds_binary, average='macro')
    precision = precision_score(labels, preds_binary, average='macro')
    recall = recall_score(labels, preds_binary, average='macro')

    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    return f1, precision, recall


In [33]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [25]:
for epoch in range(10):
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0

    model.train()  # Set the model to training mode

    # Training loop (same as before)
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        texts, labels, hidden_weights, label_weights = zip(*batch)
        text_lengths = [len(txt) for txt in texts]
        """sorted_seq_indices = sorted(range(len(text_lengths)), key=lambda x: -text_lengths[x])
        texts = [texts[i] for i in sorted_seq_indices]
        labels = [labels[i] for i in sorted_seq_indices]
        hidden_weights = [hidden_weights[i] for i in sorted_seq_indices]
        label_weights = [label_weights[i] for i in sorted_seq_indices]"""
        
        texts = pad_sequence(texts, batch_first=True).to(device)
        labels = torch.tensor(labels).float().to(device)


        optimizer.zero_grad()
        predictions = model(texts, text_lengths).squeeze(1)

        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        f1 = compute_f1(predictions, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_f1 += f1

    print(f"Epoch {epoch+1} Training: Loss: {epoch_loss/len(train_loader):.3f} | Accuracy: {epoch_acc/len(train_loader):.3f}")
    print(f"F1-Score: {epoch_f1/len(train_loader):.3f}")
          
    # Validation loop
    val_loss = 0
    val_acc = 0
    val_f1 = 0
    model.eval()  # Set the model to evaluation mode

    with torch.no_grad():  # Do not compute gradients during validation
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            texts, labels, hidden_weights, _ = zip(*batch)
            text_lengths = [len(txt) for txt in texts]
            """sorted_seq_indices = sorted(range(len(text_lengths)), key=lambda x: -text_lengths[x])
            texts = [texts[i] for i in sorted_seq_indices]
            labels = [labels[i] for i in sorted_seq_indices]"""
            texts = pad_sequence(texts, batch_first=True).to(device)
            labels = torch.tensor(labels).float().to(device)
            

            predictions = model(texts, text_lengths).squeeze(1)
            #loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            f1 = compute_f1(predictions, labels)

            val_loss += loss.item()
            val_acc += acc.item()
            val_f1 += f1

    print(f"Epoch {epoch+1} Validation: Loss: {val_loss/len(val_loader):.3f} | Accuracy: {val_acc/len(val_loader):.3f}")
    print(f"F1-Score: {val_f1/len(val_loader):.3f}")

Epoch 1 Training:   0%|          | 0/430 [00:00<?, ?it/s]

Epoch 1 Training: Loss: 0.113 | Accuracy: 0.956
F1-Score: 0.955


Epoch 1 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

Epoch 1 Validation: Loss: 0.119 | Accuracy: 0.901
F1-Score: 0.659


Epoch 2 Training:   0%|          | 0/430 [00:00<?, ?it/s]

Epoch 2 Training: Loss: 0.109 | Accuracy: 0.956
F1-Score: 0.955


Epoch 2 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

Epoch 2 Validation: Loss: 0.141 | Accuracy: 0.903
F1-Score: 0.646


Epoch 3 Training:   0%|          | 0/430 [00:00<?, ?it/s]

Epoch 3 Training: Loss: 0.102 | Accuracy: 0.960
F1-Score: 0.959


Epoch 3 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

Epoch 3 Validation: Loss: 0.068 | Accuracy: 0.895
F1-Score: 0.655


Epoch 4 Training:   0%|          | 0/430 [00:00<?, ?it/s]

Epoch 4 Training: Loss: 0.096 | Accuracy: 0.963
F1-Score: 0.962


Epoch 4 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

Epoch 4 Validation: Loss: 0.128 | Accuracy: 0.905
F1-Score: 0.649


Epoch 5 Training:   0%|          | 0/430 [00:00<?, ?it/s]

Epoch 5 Training: Loss: 0.092 | Accuracy: 0.964
F1-Score: 0.963


Epoch 5 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

Epoch 5 Validation: Loss: 0.127 | Accuracy: 0.890
F1-Score: 0.673


Epoch 6 Training:   0%|          | 0/430 [00:00<?, ?it/s]

Epoch 6 Training: Loss: 0.087 | Accuracy: 0.967
F1-Score: 0.967


Epoch 6 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

Epoch 6 Validation: Loss: 0.074 | Accuracy: 0.899
F1-Score: 0.663


Epoch 7 Training:   0%|          | 0/430 [00:00<?, ?it/s]

Epoch 7 Training: Loss: 0.081 | Accuracy: 0.970
F1-Score: 0.969


Epoch 7 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

Epoch 7 Validation: Loss: 0.038 | Accuracy: 0.893
F1-Score: 0.681


Epoch 8 Training:   0%|          | 0/430 [00:00<?, ?it/s]

Epoch 8 Training: Loss: 0.081 | Accuracy: 0.969
F1-Score: 0.969


Epoch 8 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

Epoch 8 Validation: Loss: 0.026 | Accuracy: 0.906
F1-Score: 0.653


Epoch 9 Training:   0%|          | 0/430 [00:00<?, ?it/s]

Epoch 9 Training: Loss: 0.069 | Accuracy: 0.974
F1-Score: 0.974


Epoch 9 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

Epoch 9 Validation: Loss: 0.042 | Accuracy: 0.891
F1-Score: 0.664


Epoch 10 Training:   0%|          | 0/430 [00:00<?, ?it/s]

Epoch 10 Training: Loss: 0.069 | Accuracy: 0.973
F1-Score: 0.973


Epoch 10 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

Epoch 10 Validation: Loss: 0.085 | Accuracy: 0.903
F1-Score: 0.666


In [24]:
torch.save(model. state_dict(), 'b_d22_ro_20.pth')

In [37]:
loaded_state_dict = torch.load('b_d22_ro_20.pth')
model.load_state_dict(loaded_state_dict)

<All keys matched successfully>

In [40]:
epoch = 0
def run_val():
    val_loss = 0
    val_acc = 0
    val_f1 = 0
    val_pre = 0
    val_recall = 0
    model.eval()  # Set the model to evaluation mode

    with torch.no_grad():  # Do not compute gradients during validation
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            texts, labels, hidden_weights, _ = zip(*batch)
            text_lengths = [len(txt) for txt in texts]
            """sorted_seq_indices = sorted(range(len(text_lengths)), key=lambda x: -text_lengths[x])
            texts = [texts[i] for i in sorted_seq_indices]
            labels = [labels[i] for i in sorted_seq_indices]"""
            texts = pad_sequence(texts, batch_first=True).to(device)
            labels = torch.tensor(labels).float().to(device)
            

            predictions = model(texts, text_lengths).squeeze(1)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            f1, pre, recall = compute_f1(predictions, labels)

            val_loss += loss.item()
            val_acc += acc.item()
            val_f1 += f1
            val_pre += pre
            val_recall += recall

    print(f"Validation: Loss: {val_loss/len(val_loader):.3f} | Accuracy: {val_acc/len(val_loader):.3f}")
    print(f"F1-Score: {val_f1/len(val_loader):.3f} | Precision: {val_pre/len(val_loader):.3f} | Recall: {val_recall/len(val_loader):.3f}")
run_val()

Epoch 1 Validation:   0%|          | 0/108 [00:00<?, ?it/s]

F1 Score: 0.9528
Precision: 0.9520
Recall: 0.9542
F1 Score: 0.9680
Precision: 0.9737
Recall: 0.9643
F1 Score: 0.9375
Precision: 0.9375
Recall: 0.9375
F1 Score: 0.9365
Precision: 0.9365
Recall: 0.9365
F1 Score: 0.9528
Precision: 0.9520
Recall: 0.9542
F1 Score: 0.9685
Precision: 0.9722
Recall: 0.9667
F1 Score: 0.9048
Precision: 0.9032
Recall: 0.9231
F1 Score: 0.9531
Precision: 0.9536
Recall: 0.9531
F1 Score: 0.9365
Precision: 0.9409
Recall: 0.9340
F1 Score: 0.9687
Precision: 0.9706
Recall: 0.9688
F1 Score: 0.9352
Precision: 0.9309
Recall: 0.9415
F1 Score: 0.9843
Precision: 0.9839
Recall: 0.9853
F1 Score: 0.9685
Precision: 0.9685
Recall: 0.9685
F1 Score: 0.9844
Precision: 0.9848
Recall: 0.9844
F1 Score: 0.9219
Precision: 0.9233
Recall: 0.9245
F1 Score: 0.9667
Precision: 0.9756
Recall: 0.9600
F1 Score: 0.9686
Precision: 0.9677
Recall: 0.9714
F1 Score: 0.9375
Precision: 0.9384
Recall: 0.9384
F1 Score: 0.9687
Precision: 0.9706
Recall: 0.9688
F1 Score: 0.9219
Precision: 0.9233
Recall: 0.9245


Kaggle Test

In [22]:
import csv
import json

# Load the test data
with open('data/test_set.json', 'r') as f:
    test_data = [json.loads(line) for line in f]
# Evaluate on test data
model.eval()
test_results = []

with torch.no_grad():
    for entry in tqdm(test_data, desc="Evaluating Test Data"): # change from test_set to test_data
        text = entry["text"]
        text_tensor = torch.tensor(text).unsqueeze(0).to(device)  # Adding an extra batch dimension
        text_length = torch.tensor([len(text)])  # Sequence length for current entry

        # Pass the sequence and its length to the model
        prediction = model(text_tensor, text_length).squeeze(1)
        prediction = torch.sigmoid(prediction).item()  # Convert raw score to value between 0 and 1

        # Classify the texts
        class_label = 1 if prediction >= 0.5 else 0

        test_results.append({
            "id": entry["id"],
            "class": class_label
        })
        
with open('results_b_12_ro_10.csv', 'w', newline='') as csvfile:
    fieldnames = ['id', 'class']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for result in test_results:
        writer.writerow(result)

Evaluating Test Data:   0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 