In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel
import pandas as pd


In [10]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text, label = self.data[idx]
        tokens = self.tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        #print(tokens)
        input_ids = tokens["input_ids"][0]
        attention_mask = tokens["attention_mask"][0]
        return input_ids, attention_mask, label

In [11]:
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(TransformerEncoder, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        #print("last_hidden_state",last_hidden_state.size())
        pooled_output = torch.mean(last_hidden_state, dim=1)
        x = self.dropout(pooled_output)
        x = self.fc(x)
        x = F.log_softmax(x, dim=1)
        return x

In [12]:
df = pd.read_csv('my_dict.csv')
df['value'] = df['value'].apply(lambda x: eval(x))
my_dict = dict(zip(df['key'], df['value']))

In [13]:
# set random seed for reproducibility
torch.manual_seed(1234)

# set up tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# define data
data = []
for key,value in my_dict.items():
  for program in value:
    words = program.strip().split(' ')
    if len(words) < 500:
      data.append((program,key))

In [19]:
from torch.utils.data import random_split
batch_size = 2
dataset = CustomDataset(data,tokenizer)

# Define the sizes of the train and test datasets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Randomly split the dataset into train and test datasets
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
traindataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
testdataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

In [20]:
# set up model
input_dim = 768
hidden_dim = 768
num_classes = 3
model = TransformerEncoder(input_dim, hidden_dim, num_classes)
model=model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [22]:
# train and test model
num_epochs = 10

for epoch in range(num_epochs):
    # train
    model.train()
    running_loss = 0.0
    for input_ids, attention_mask, labels in traindataloader:
        optimizer.zero_grad()
        input_ids = torch.stack(input_ids).cuda() # move input tensors to GPU
        attention_mask = torch.stack(attention_mask).cuda() # move input tensors to GPU
        labels = torch.tensor(labels).cuda() # move input tensors to GPU
        output = model(input_ids, attention_mask)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * input_ids.size(0)
    train_loss = running_loss / len(train_dataset)

    # evaluate
    model.eval()
    with torch.no_grad():
        running_loss = 0.0
        correct_preds = 0
        total_preds = 0
        for input_ids, attention_mask, labels in testdataloader:
            input_ids = torch.stack(input_ids).cuda() # move input tensors to GPU
            attention_mask = torch.stack(attention_mask).cuda() # move input tensors to GPU
            labels = torch.tensor(labels).cuda() # move input tensors to GPU
            output = model(input_ids, attention_mask)
            loss = criterion(output, labels)
            running_loss += loss.item() * input_ids.size(0)
            preds = torch.argmax(output, dim=1)
            correct_preds += torch.sum(preds == labels).item()
            total_preds += len(labels)
        test_loss = running_loss / len(test_dataset)
        test_acc = correct_preds / total_preds

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f} | Test Accuracy: {test_acc:.4f}")


Epoch 1/10 | Train Loss: 0.3076 | Test Loss: 0.0832 | Test Accuracy: 0.9772
Epoch 2/10 | Train Loss: 0.1142 | Test Loss: 0.0868 | Test Accuracy: 0.9810
Epoch 3/10 | Train Loss: 0.0735 | Test Loss: 0.0784 | Test Accuracy: 0.9696
Epoch 4/10 | Train Loss: 0.0495 | Test Loss: 0.1073 | Test Accuracy: 0.9772
Epoch 5/10 | Train Loss: 0.0465 | Test Loss: 0.0793 | Test Accuracy: 0.9658
Epoch 6/10 | Train Loss: 0.0279 | Test Loss: 0.1209 | Test Accuracy: 0.9696
Epoch 7/10 | Train Loss: 0.0505 | Test Loss: 0.0840 | Test Accuracy: 0.9772
Epoch 8/10 | Train Loss: 0.0143 | Test Loss: 0.1463 | Test Accuracy: 0.9620
Epoch 9/10 | Train Loss: 0.0068 | Test Loss: 0.0952 | Test Accuracy: 0.9772
Epoch 10/10 | Train Loss: 0.0240 | Test Loss: 0.1538 | Test Accuracy: 0.9620


In [10]:
# train model
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for input_ids, attention_mask, labels in data_loader:
        optimizer.zero_grad()
        input_ids = torch.stack(input_ids).cuda() # move input tensors to GPU
        attention_mask = torch.stack(attention_mask).cuda() # move input tensors to GPU
        labels = torch.tensor(labels).cuda() # move input tensors to GPU
        output = model(input_ids, attention_mask)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * input_ids.size(0)
    epoch_loss = running_loss / len(dataset)
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {epoch_loss:.4f}")

Epoch 1/10 | Loss: 0.2418
Epoch 2/10 | Loss: 0.0861
Epoch 3/10 | Loss: 0.0639
Epoch 4/10 | Loss: 0.0289
Epoch 5/10 | Loss: 0.0280
Epoch 6/10 | Loss: 0.0351
Epoch 7/10 | Loss: 0.0404
Epoch 8/10 | Loss: 0.0165
Epoch 9/10 | Loss: 0.0092
Epoch 10/10 | Loss: 0.0190
