In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
torch.manual_seed(42)


<torch._C.Generator at 0x7f68839a9570>

In [3]:
# Read the train / validation / test splits, in that order, into three DataFrames.
train_dataset, val_dataset, test_dataset = map(
    pd.read_csv,
    (
        '/workspaces/MLE-Test/helpful_train.csv',
        '/workspaces/MLE-Test/helpful_valid.csv',
        '/workspaces/MLE-Test/helpful_test.csv',
    ),
)

In [15]:
train_dataset.head()

Unnamed: 0,sentence,label
0,\n++ Thank you for attaching the tech support ...,1
1,\n- Download 10.0.11-[US_DRIVER_LICENSE] ---->...,1
2,\nAgain many thanks for the excellent support!,0
3,"\nAlso, thank you for pointing to the platform...",1
4,\nAn: [PERSON] \nBetreff: A new comment has be...,1


In [16]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  423 non-null    object
 1   label     423 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 6.7+ KB


In [17]:
import numpy as np
# Collapse the label column of every split to strict 0/1 integers:
# entries that compare equal to True become 1, everything else becomes 0.
for _split_frame in (train_dataset, val_dataset, test_dataset):
    _split_frame['label'] = np.where(_split_frame['label'].eq(True), 1, 0)

In [18]:
train_dataset.head()

Unnamed: 0,sentence,label
0,\n++ Thank you for attaching the tech support ...,1
1,\n- Download 10.0.11-[US_DRIVER_LICENSE] ---->...,1
2,\nAgain many thanks for the excellent support!,0
3,"\nAlso, thank you for pointing to the platform...",1
4,\nAn: [PERSON] \nBetreff: A new comment has be...,1


In [19]:
train_dataset.shape

(423, 2)

In [20]:
from transformers import BertModel, BertTokenizer
tokenizer= BertTokenizer.from_pretrained('bert-base-uncased')
def bert_tokenizer(data, tokenizer, max_length=50):
    """Tokenize a batch of sentences into fixed-length PyTorch tensors.

    Args:
        data: Sequence of strings. Anything exposing ``.tolist()`` (NumPy
            array, pandas Series) is converted with it; other iterables are
            materialized with ``list()``.
        tokenizer: Object providing ``batch_encode_plus`` (e.g. a HuggingFace
            ``BertTokenizer``).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for the batch.
    """
    texts = data.tolist() if hasattr(data, "tolist") else list(data)

    # `padding='max_length'` is the supported replacement for the deprecated
    # `pad_to_max_length=True` flag; it pads every row up to `max_length`.
    return tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

In [21]:
train_data=bert_tokenizer(train_dataset['sentence'].values,tokenizer)



In [22]:
test_data=bert_tokenizer(test_dataset['sentence'].values,tokenizer)

In [23]:
val_data=bert_tokenizer(val_dataset['sentence'].values,tokenizer)

In [24]:
len(test_data['input_ids'])

867

In [25]:
# len(train_data['input_ids'])
train_data['attention_mask']

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [31]:
from transformers import BertModel, BertTokenizer
import torch.nn as nn

class BERTHelpful(nn.Module):
    """Sentence classifier: a frozen BERT encoder followed by one linear head.

    Only ``self.fc`` is trained. The encoder is used purely as a feature
    extractor: its forward pass runs under ``torch.no_grad()``, its parameters
    are marked as not requiring gradients, and it is kept in eval mode so that
    dropout does not perturb the features while the head is being trained.

    Args:
        bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
        hidden_size: Input width of the linear head; must equal the encoder's
            hidden size.
        output_size: Number of logits produced per example.
    """

    def __init__(self, bert_model_name, hidden_size, output_size):
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)
        # forward() never back-propagates into the encoder, so flag its
        # weights as frozen instead of leaving them nominally trainable.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.bert.eval()

        self.fc = nn.Linear(hidden_size, output_size)

    def train(self, mode=True):
        """Set train/eval mode on the head while keeping the encoder in eval mode."""
        super().train(mode)
        # A frozen feature extractor should be deterministic: without this,
        # model.train() would re-enable the encoder's dropout layers.
        self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits for a batch of token id sequences.

        Args:
            input_ids: Token ids, one row per example.
            attention_mask: Mask with the same layout as ``input_ids``.

        Returns:
            The output of the linear head applied to the encoder's hidden
            state at the first token position of each sequence.
        """
        # Encoder is frozen: skip building an autograd graph for it.
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Position 0 of the last hidden state is the [CLS] token for BERT tokenizers.
        cls_output = bert_output.last_hidden_state[:, 0, :]
        logits = self.fc(cls_output)
        return logits


In [32]:
def evaluate(model, val_data, val_dataset, train_loss, criterion):
    """Score ``model`` on a held-out split and print a one-line summary.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns raw logits for a batch of one example.
        val_data: Mapping with ``"input_ids"`` and ``"attention_mask"``
            entries, each indexable by example position.
        val_dataset: DataFrame whose ``label`` column holds the targets, in
            the same order as ``val_data``.
        train_loss: Training loss of the current epoch; only echoed in the
            printed summary.
        criterion: Loss called as ``criterion(logits, target)``; it receives
            raw logits (as ``nn.BCEWithLogitsLoss`` expects).

    Returns:
        Mean per-example loss over the split (0.0 when the split is empty).
    """
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    labels = val_dataset['label']

    with torch.no_grad():
        for i in range(len(val_dataset)):
            input_ids = val_data["input_ids"][i].view(1, -1)
            attention_mask = val_data['attention_mask'][i].view(1, -1)
            # Positional lookup so a non-default DataFrame index cannot misalign labels.
            label = torch.tensor(float(labels.iloc[i]), dtype=torch.float32).view(1, -1)

            logits = model(input_ids, attention_mask)

            # The loss must see the raw logits; thresholded 0/1 predictions
            # are only used for the accuracy count below.
            loss = criterion(logits, label)
            test_loss += loss.item()

            predicted = torch.round(torch.sigmoid(logits))
            correct += (predicted == label).sum().item()
            total += label.size(0)

    # Average over the number of examples. `len(val_data)` would be the number
    # of entries in the encoding mapping, not the number of examples.
    if total:
        test_loss /= total
        accuracy = correct / total * 100
    else:
        accuracy = 0.0

    print(f"Train Loss: {train_loss}, Test Loss: {test_loss:.4f}, Accuracy: {accuracy:.2f}%")

    return test_loss

In [None]:
import numpy as np


# Instantiate the model
model = BERTHelpful('bert-base-uncased',768,1)

# Define loss function and optimizer (no learning-rate scheduler is used)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop: one example per optimizer step
num_epochs = 5
num_train_examples = len(train_dataset)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for i in range(num_train_examples):
        input_ids = train_data['input_ids'][i].view(1,-1)
        attention_mask = train_data['attention_mask'][i].view(1,-1)
        # Positional lookup keeps labels aligned with the tokenized rows.
        label = torch.tensor(float(train_dataset['label'].iloc[i]), dtype=torch.float32).view(1,-1)

        optimizer.zero_grad()

        output = model(input_ids,attention_mask)

        # BCEWithLogitsLoss applies the sigmoid itself and needs the raw
        # logits. Feeding it torch.round(...) yields zero gradients, so the
        # classifier would never learn.
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Average over the number of examples. `len(train_data)` would be the
    # number of entries in the encoding mapping, not the number of examples.
    train_loss /= max(num_train_examples, 1)

    # Print the last prediction and label of the epoch for debugging
    final_output = torch.round(torch.sigmoid(output.detach()))
    print(f"Epoch {epoch + 1}, Batch {i + 1} - Predicted: {final_output.item()}, Actual: {label.item()}")

    val_loss = evaluate(model, val_data,val_dataset,train_loss,criterion)


In [None]:
# plot train losses and validation losses

In [None]:
# Predict and compute metrics on test set