In [1]:
import os
# Set the OpenMP duplicate-library flag for this process.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort
# seen with some torch/numpy builds — confirm it is still required.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast


In [4]:
# Workbook with the labelled questions; later cells read its `question` and
# `type` columns.
excel_file_path = 'algorithms.xlsx'
df = pd.read_excel(io=excel_file_path)


In [5]:
# Keep only rows whose `question` cell is present and is an actual string.
df = df.dropna(subset=['question'])
is_text = df['question'].apply(lambda value: isinstance(value, str))
df = df[is_text]

In [6]:
# Replace each distinct `type` value with its integer category code.
type_as_category = df['type'].astype('category')
df['type'] = type_as_category.cat.codes

In [27]:
# Preview the rows whose encoded label is 1.
is_class_one = df['type'] == 1
filtered_df = df[is_class_one]

print(filtered_df)

                                              question  type
64   Given an array of integer nums and an integer ...     1
65   You are given an array of prices where prices[...     1
66   Given an integer array nums, return an array a...     1
67   Given an integer array nums, find the subarray...     1
68   Given an integer array nums, find a subarray t...     1
..                                                 ...   ...
315  Find the maximum difference between two elemen...     1
316  Compute the angle between the hour and minute ...     1
317                 Check if a number is a palindrome.     1
318  Count the number of set bits in the binary rep...     1
332                check if a number is perfect square     1

[204 rows x 2 columns]


In [7]:
# First split: 70 % train / 30 % held out, stratified on the label.
questions, labels = df['question'], df['type']
train_text, temp_text, train_labels, temp_labels = train_test_split(
    questions,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=2018,
)

# Second split: the held-out part is halved into validation and test sets,
# again stratified and with the same seed.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=2018,
)

In [8]:
# Pretrained encoder and the fast tokenizer from the same checkpoint.
bert, tokenizer = (
    AutoModel.from_pretrained('bert-base-uncased'),
    BertTokenizerFast.from_pretrained('bert-base-uncased'),
)

# Two sample sentences, encoded with padding to the longer of the two, as a
# quick tokenizer smoke test.
text = [
    "this is a bert model tutorial",
    "we will fine-tune a bert model",
]
sent_id = tokenizer.batch_encode_plus(text, padding=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
# Every question is padded or truncated to this many tokens.
MAX_SEQ_LEN = 15


def encode_texts(texts, max_length=MAX_SEQ_LEN):
    """Tokenize a pandas Series of questions into fixed-length encodings.

    ``padding='max_length'`` replaces the deprecated ``pad_to_max_length=True``
    flag and matches the arguments used by ``preprocess_input`` at inference
    time, so training and inference inputs are padded the same way.

    Args:
        texts: pandas Series of strings (converted with ``.tolist()``).
        max_length: target sequence length after padding/truncation.

    Returns:
        The tokenizer's batch encoding; later cells read its ``'input_ids'``
        and ``'attention_mask'`` entries.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )


# tokenize and encode sequences in the training, validation and test sets
tokens_train = encode_texts(train_text)
tokens_val = encode_texts(val_text)
tokens_test = encode_texts(test_text)



In [10]:
def _as_tensors(encoded, labels):
    """Return (input-id tensor, attention-mask tensor, label tensor) for one split."""
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(labels.tolist()),
    )


train_seq, train_mask, train_y = _as_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = _as_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = _as_tensors(tokens_test, test_labels)

In [11]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# number of examples per mini-batch
batch_size = 32

# bundle ids, masks and labels so they are indexed together
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# training batches are drawn in random order, validation batches in fixed order
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

# loaders consumed by the train / evaluate loops
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

In [12]:
# Freeze the pretrained encoder: none of its weights will receive gradients.
for encoder_weight in bert.parameters():
    encoder_weight.requires_grad = False

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [14]:
import torch
import torch.nn as nn
from transformers import BertModel

class BERT_Arch(nn.Module):
    """BERT encoder followed by a two-layer classification head.

    The head is ``Linear -> ReLU -> Dropout -> Linear -> LogSoftmax``, so
    ``forward`` returns log-probabilities (pair it with ``nn.NLLLoss``).

    Args:
        bert: encoder module; called as ``bert(sent_id, attention_mask=mask)``
            and expected to return something whose item ``[1]`` is a
            ``(batch, encoder_dim)`` tensor.
        num_classes: number of output classes (default 2, the original value).
        hidden_dim: width of the intermediate dense layer (default 512).
        dropout: dropout probability applied after the ReLU (default 0.1).
    """

    def __init__(self, bert, num_classes=2, hidden_dim=512, dropout=0.1):
        super().__init__()

        self.bert = bert

        # Width of the encoder's pooled output. Read it from the encoder's
        # config when one is present; otherwise keep the original 768.
        encoder_dim = getattr(getattr(bert, "config", None), "hidden_size", 768)

        # dropout layer
        self.dropout = nn.Dropout(dropout)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(encoder_dim, hidden_dim)

        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities of shape ``(batch, num_classes)``.

        Args:
            sent_id: token-id tensor passed to the encoder.
            mask: attention-mask tensor passed as ``attention_mask``.
        """
        # pass the inputs to the encoder
        outputs = self.bert(sent_id, attention_mask=mask)

        # Item 1 of the encoder output. For Hugging Face BertModel this is the
        # pooled output (the [CLS] state after the pooler's dense + tanh), not
        # the raw [CLS] hidden state.
        pooled = outputs[1]

        x = self.fc1(pooled)
        x = self.relu(x)

        x = self.dropout(x)
        x = self.fc2(x)

        return self.softmax(x)

In [15]:
# Wrap the encoder loaded above in the classification head.
model = BERT_Arch(bert=bert)

In [16]:
# `transformers.AdamW` is deprecated and no longer shipped by recent
# transformers releases; the PyTorch implementation is the replacement.
from torch.optim import AdamW

# define the optimizer
# eps / weight_decay are pinned to the defaults of the old transformers
# optimizer so the update rule stays the same.
optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-6,
                  weight_decay=0.0)



In [17]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Balanced class weights derived from the training labels.
label_values = np.unique(train_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=label_values,
    y=train_labels,
)

# Display the computed class weights
print(class_weights)

[1.00352113 0.9965035 ]


In [18]:
# class weights as a float tensor
weights = torch.tensor(class_weights, dtype=torch.float)

# push to the active device (GPU when available); the weighted loss needs its
# weights on the same device as the model outputs it is applied to
weights = weights.to(device)

# define the loss function: negative log-likelihood, matching the LogSoftmax
# output of BERT_Arch
cross_entropy = nn.NLLLoss(weight=weights)

# number of training epochs
epochs = 5

In [19]:
import torch.optim as optim

# Objects used by train() / evaluate() below: device, model, optimizer, loss_fn.
# NOTE(review): this builds a fresh, fully trainable BERT. The encoder frozen
# earlier (`bert`) and the AdamW optimizer created for it are not used from
# here on — confirm that full fine-tuning is what is intended.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERT_Arch(BertModel.from_pretrained('bert-base-uncased'))
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-5)

# BERT_Arch already ends in LogSoftmax, so the matching criterion is NLLLoss
# (CrossEntropyLoss would apply log-softmax a second time). The balanced class
# weights computed earlier are applied here, moved to the model's device.
loss_fn = nn.NLLLoss(weight=weights.to(device))

def train():
    """Run one optimisation pass over ``train_dataloader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn``, ``device``
    and ``train_dataloader``.

    Returns:
        ``(mean_batch_loss, None)``. The second slot is kept so callers that
        unpack two values keep working. It used to be the bare name ``_``,
        which is undefined outside an interactive IPython session.
    """
    model.train()
    train_loss = 0.0

    for batch in train_dataloader:
        # push the batch to the device
        sent_id, mask, labels = (tensor.to(device) for tensor in batch)

        # clear previously calculated gradients
        optimizer.zero_grad()

        # forward pass, then loss between predictions and labels
        preds = model(sent_id, mask)
        loss = loss_fn(preds, labels)

        # backward pass and parameter update
        loss.backward()
        optimizer.step()

        # accumulate the batch loss as a Python float
        train_loss += loss.item()

    return train_loss / len(train_dataloader), None

def evaluate():
    """Compute the mean loss over ``val_dataloader`` without updating weights.

    Uses the module-level ``model``, ``loss_fn``, ``device`` and
    ``val_dataloader``. The model is switched to eval mode and gradients are
    disabled for the whole pass.

    Returns:
        ``(mean_batch_loss, None)``. The second slot is kept so callers that
        unpack two values keep working. It used to be the bare name ``_``,
        which is undefined outside an interactive IPython session.
    """
    model.eval()
    valid_loss = 0.0

    with torch.no_grad():
        for batch in val_dataloader:
            # push the batch to the device
            sent_id, mask, labels = (tensor.to(device) for tensor in batch)

            preds = model(sent_id, mask)
            valid_loss += loss_fn(preds, labels).item()

    return valid_loss / len(val_dataloader), None

# Epoch loop: one training pass followed by one validation pass per epoch.
epochs = 5  # total number of passes over the training set

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # only the first element (the mean loss) of each returned pair is used
    train_loss = train()[0]
    valid_loss = evaluate()[0]

    for report_line in (f'Training Loss: {train_loss}',
                        f'Validation Loss: {valid_loss}'):
        print(report_line)



 Epoch 1 / 5
Training Loss: 0.6872746480835809
Validation Loss: 0.6653157472610474

 Epoch 2 / 5
Training Loss: 0.6470085117552016
Validation Loss: 0.6150815486907959

 Epoch 3 / 5
Training Loss: 0.570350898636712
Validation Loss: 0.5332328230142593

 Epoch 4 / 5
Training Loss: 0.4534347421593136
Validation Loss: 0.4712713807821274

 Epoch 5 / 5
Training Loss: 0.34264401925934684
Validation Loss: 0.4378790855407715


In [36]:
# Sample question to classify with the trained model.
input_text = 'sort an array using bubblesort'

def preprocess_input(input_text, tokenizer, max_length=15):
    """Encode one string for the classifier.

    The text is padded/truncated to ``max_length`` tokens and the tokenizer is
    asked for PyTorch tensors.

    Returns:
        ``(input_ids, attention_mask)`` taken from the tokenizer's output.
    """
    encode_options = {
        'max_length': max_length,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    encoded = tokenizer.encode_plus(input_text, **encode_options)
    ids, attn = encoded['input_ids'], encoded['attention_mask']
    return ids, attn

# Pick the device first, then encode the sample and place both tensors on it
# so they live where the model does.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_ids, attention_mask = (
    tensor.to(device) for tensor in preprocess_input(input_text, tokenizer)
)


In [37]:
# Put the model on the target device and switch it to inference mode.
model = model.to(device)
model.eval()

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(input_ids, attention_mask)

# BERT_Arch ends in LogSoftmax, so `logits` here holds log-probabilities; the
# arg-max over the class dimension is the predicted label.
logits = outputs
predictions = torch.argmax(logits, dim=1)

# Print the predictions
print(f"Predicted class for the input text: {predictions.item()}")

Predicted class for the input text: 0
