In [3]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn import preprocessing
import requests
import io
import zipfile

'''

Training Model

# requires gt_corpus.csv
# global activation function works quite well atm

'''

import pandas as pd

df = pd.read_csv('gt_corpus.csv')
classes = len(df['class'].unique())
le = preprocessing.LabelEncoder()
targets = le.fit_transform(df['task'])
data = {'corpus':list(df['text']),'labels':targets}
data.keys()

dict_keys(['corpus', 'labels'])

In [9]:
# define corpus
corpus = df
classes = len(corpus['class'].unique())

'''

Load the base models

            & send it to GPU

'''

# Load the pre-trained BERT model and tokenizer
model_name = 'prajjwal1/bert-mini'
# model_name = 'bert-base-uncased'
tokeniser = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


In [None]:
'''

Read Fine-Tuned Classifier Model (in Archived Format)

            Load model called in module

'''

# def download_and_extract_zip(url, extract_path):
#     # Send a GET request to the GitHub raw URL to download the ZIP file
#     response = requests.get(url)

#     # Check if the request was successful
#     if response.status_code == 200:
#         # Create a file-like object from the downloaded content
#         zip_file = io.BytesIO(response.content)

#         # Extract the contents of the ZIP file to the specified extract path
#         with zipfile.ZipFile(zip_file, 'r') as zip_ref:
#             zip_ref.extractall(extract_path)
#         print(f"ZIP file extracted to {extract_path}")
#     else:
#         print(f"Failed to download ZIP file from {url}")

# download_and_extract_zip('https://github.com/mllibs/mllibs/raw/main/data/models/bert_classifier_model.zip', 'local_classifier')
# model = BertForSequenceClassification.from_pretrained('local_classifier')

In [24]:
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

model.to(device)

# Sample dataset for text classification
class CustomDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return {'text': text, 'label': label}

dataset = CustomDataset(list(data['corpus']),
                        list(data['labels']))

def train_bert(dataset,tokeniser,model):

    # Define batch size and create data loader
    batch_size = 10
    dataloader = DataLoader(dataset,
                            sampler=RandomSampler(dataset),
                            batch_size=batch_size)

    # Set up optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=1e-5)
    criterion = nn.CrossEntropyLoss()
    total_steps = len(dataloader) * 2

    # Train the model
    model.train()
    for epoch in tqdm(range(100)):
        model.train()
        total_correct = 0
        total_samples = 0
        for batch in dataloader:

            inputs = tokeniser(batch['text'], padding=True, truncation=True, return_tensors='pt')
            inputs.to(device)
            labels = batch['label'].to(device)

            outputs = model(**inputs)
            loss = criterion(outputs.logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Calculate accuracy
        predicted_labels = torch.argmax(outputs.logits, dim=1)
        total_correct += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    print(f'Epoch {epoch+1} completed. Accuracy: {accuracy:.4f}')
    return model

model = train_bert(dataset,tokeniser,model)

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 100 completed. Accuracy: 1.0000


In [25]:
# model = model.to('cpu')
model.save_pretrained('bert_classifier_model')

In [26]:
'''

Create Archive of Model Folder

'''

import zipfile
import os

def create_zip_archive(folder_path, zip_file_path):
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, folder_path))

# Example usage
colab_folder_path = "/content/bert_classifier_model"  # Replace with the path of your folder in Colab
local_save_path = "bert_classifier_model.zip"  # Path where you want to save the downloaded ZIP file
create_zip_archive(colab_folder_path, local_save_path)
print(f"Folder archived and saved as {local_save_path}")

Folder archived and saved as bert_classifier_model.zip


In [27]:
'''

II Inference with Transformer Encoder

'''

model.to('cpu')

def inference(text,tokeniser,model):

    # Tokenize the input text
    inputs = tokeniser(input_text,
                       padding=True,
                       truncation=True,
                       return_tensors='pt')

    # Perform inference using the model
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Get the predicted label
    predicted_label = torch.argmax(logits, dim=1).item()

    # Print the predicted label
    print(f"The predicted label for the input text is: {le.classes_[predicted_label]}")

# fine
# input_text = "I'd like you to create a seaborn scatterplot" # 61
# input_text = "I want to plot a figure using the seaborn scatter plot" # 61
input_text = "create plotly count heatmap x: bill_depth_mm y: flipper_length_mm hue: island col: island using penguins"
inference(input_text,tokeniser,model)

The predicted label for the input text is: sheatmap


In [32]:
# incorrect
input_text = "I'd like you to create a scatterplot using plotly set parameters as x: bill_depth_mm y: flipper_length_mm hue: island col: island using penguins"
inference(input_text,tokeniser,model)

The predicted label for the input text is: col_scatter


In [33]:
# incorrect
input_text = "create a scatter plot using plotly set parameters as x: bill_depth_mm y: flipper_length_mm hue: island col: island using penguins"
inference(input_text,tokeniser,model)

The predicted label for the input text is: col_scatter


In [35]:
# extraction of parameters only (which works well with QA) improves accuracy
input_text = "create a scatter plot using plotly using penguins"
inference(input_text,tokeniser,model)

The predicted label for the input text is: plscatter
