In [2]:
!pip install transformers[torch]
!pip install accelerate -U`

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->transformers[torch])
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand

In [3]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import json
import pandas as pd
import numpy as np

In [4]:
# This class prepares the dataset by storing the encoded text and labels.
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    # This function returns an item from the dataset given an index.
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    # This function returns the total number of items in the dataset.
    def __len__(self):
        return len(self.labels)

In [5]:
# This function loads data from a file, splits it into training and test sets, and prepares it for use with a model.
def load_and_prepare_data(filename, test_size=0.2, random_state=42):
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data, columns=['sentence', 'label'])
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    train_encodings = tokenizer(list(train_df['sentence']), truncation=True, padding=True, max_length=128)
    test_encodings = tokenizer(list(test_df['sentence']), truncation=True, padding=True, max_length=128)
    train_dataset = Dataset(train_encodings, list(train_df['label']))
    test_dataset = Dataset(test_encodings, list(test_df['label']))
    return train_dataset, test_dataset

In [6]:
# This function trains a model using the given training data and evaluates it on test data.
def train_and_evaluate_model(model_name, train_dataset, test_dataset, training_args):
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )
    trainer.train()
    results = trainer.evaluate()
    return results

In [7]:
# This function computes accuracy and classification report for model predictions.
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'classification_report': classification_report(p.label_ids, preds, output_dict=True)
    }

In [8]:
# This function creates an optimized prompt for a given model.
def optimize_prompt_for_model(model_name, base_prompt):
    optimized_prompt = base_prompt + f" [Optimized for {model_name}]"
    return optimized_prompt

In [9]:
# This function evaluates model predictions based on a given prompt and sentences.
def evaluate_with_prompt(model_name, prompt, sentences):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Tokenize each sentence with the prompt
    tokenized_sentences = [prompt + " " + sentence for sentence in sentences]
    inputs = tokenizer(tokenized_sentences, truncation=True, padding=True, max_length=128, return_tensors="pt")

    with torch.no_grad():
        # Ensure inputs are moved to correct device (GPU if available)
        inputs = {key: val.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')) for key, val in inputs.items()}

        # Forward pass
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=1)
    return predictions

In [10]:
# This function evaluates multiple models using optimized prompts and returns their performance metrics.
def evaluate_models(model_names, base_prompt, X_test, y_test):
    results = {}
    for model_name in model_names:
        optimized_prompt = optimize_prompt_for_model(model_name, base_prompt)
        print(f"Optimized Prompt for {model_name}: {optimized_prompt}")
        predictions = evaluate_with_prompt(model_name, optimized_prompt, X_test)
        accuracy = accuracy_score(y_test, predictions)
        class_report = classification_report(y_test, predictions, output_dict=True)
        results[model_name] = {
            'optimized_prompt': optimized_prompt,
            'accuracy': accuracy,
            'classification_report': class_report
        }
    return results

In [12]:
# This is the main function that orchestrates data loading, model evaluation, and result printing.
def main():
    model_names = ["distilbert-base-uncased", "bert-base-uncased", "roberta-base"]
    base_prompt = "This is a base prompt for grammar correctness. Please check if the sentence is correct or not."

    # Load sentences data from JSON
    with open('sentences_data.json', 'r', encoding='utf-8') as f:
        sentences_data = json.load(f)

    sentences = [entry['sentence'] for entry in sentences_data]
    labels = [entry['label'] for entry in sentences_data]

    # Evaluate models
    results = evaluate_models(model_names, base_prompt, sentences, labels)

    # Print results for each model
    for model_name, result in results.items():
        print(f"Results for {model_name}:")
        print(f"Optimized Prompt: {result['optimized_prompt']}")
        print(f"Accuracy: {result['accuracy']}")
        print(f"Classification Report: {result['classification_report']}")

if __name__ == "__main__":
    main()

Optimized Prompt for distilbert-base-uncased: This is a base prompt for grammar correctness. Please check if the sentence is correct or not. [Optimized for distilbert-base-uncased]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Optimized Prompt for bert-base-uncased: This is a base prompt for grammar correctness. Please check if the sentence is correct or not. [Optimized for bert-base-uncased]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Optimized Prompt for roberta-base: This is a base prompt for grammar correctness. Please check if the sentence is correct or not. [Optimized for roberta-base]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results for distilbert-base-uncased:
Optimized Prompt: This is a base prompt for grammar correctness. Please check if the sentence is correct or not. [Optimized for distilbert-base-uncased]
Accuracy: 0.43232323232323233
Classification Report: {'0': {'precision': 0.4264705882352941, 'recall': 0.9620853080568721, 'f1-score': 0.5909752547307132, 'support': 211}, '1': {'precision': 0.5789473684210527, 'recall': 0.03873239436619718, 'f1-score': 0.07260726072607261, 'support': 284}, 'accuracy': 0.43232323232323233, 'macro avg': {'precision': 0.5027089783281734, 'recall': 0.5004088512115347, 'f1-score': 0.3317912577283929, 'support': 495}, 'weighted avg': {'precision': 0.513952215655002, 'recall': 0.43232323232323233, 'f1-score': 0.29356816322098006, 'support': 495}}
Results for bert-base-uncased:
Optimized Prompt: This is a base prompt for grammar correctness. Please check if the sentence is correct or not. [Optimized for bert-base-uncased]
Accuracy: 0.4505050505050505
Classification Report:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
