In [1]:
! pip install pandas transformers torch datasets openprompt>=1.0.1

You should consider upgrading via the 'C:\Users\Owner\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
#from typing import List
import torch
#from datasets import Dataset
#from tqdm import tqdm

from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate
from openprompt.data_utils import InputExample
from openprompt.prompts import ManualVerbalizer
from openprompt import PromptForClassification
from openprompt import PromptDataLoader

class prompting:
    """Prompt-based hate / non-hate text classifier built on OpenPrompt.

    The instance only stores which pretrained checkpoint to use; the model itself is
    loaded inside :meth:`predict`, on every call.
    """

    # User-facing model name -> (model_name, model_path) pair passed to ``load_plm``.
    # NOTE(review): confirm that the installed openprompt's ``load_plm`` recognises the
    # "deberta-v3" and "xlm-roberta-base" model names.
    _CHECKPOINTS = {
        "roberta-base": ("roberta", "roberta-base"),
        "roberta-large": ("roberta", "roberta-large"),
        "bert": ("bert", "bert-base-uncased"),
        "deberta-base": ("deberta-v3", "microsoft/deberta-v3-base"),
        "deberta-large": ("deberta-v3", "microsoft/deberta-v3-large"),
        "xlm-roberta": ("xlm-roberta-base", "xlm-roberta-base"),
    }

    def __init__(self, model="roberta-base"):
        """Select the checkpoint used by :meth:`predict`.

        Args:
            model: One of the keys of ``_CHECKPOINTS`` (default ``"roberta-base"``).

        Raises:
            ValueError: If ``model`` is not a supported name. ``ValueError`` is a subclass
                of ``Exception``, so callers catching ``Exception`` are unaffected.
        """
        # Non-string values are rejected up front so unhashable inputs produce the same
        # error as any other unsupported name.
        checkpoint = self._CHECKPOINTS.get(model) if isinstance(model, str) else None
        if checkpoint is None:
            raise ValueError(
                "Select one of the following models: " + ", ".join(self._CHECKPOINTS)
            )
        self.checkpoint = checkpoint

    @staticmethod
    def _build_dataset(data):
        """Wrap the input texts in ``InputExample`` objects, using the position as guid."""
        if isinstance(data, str):
            texts = [data]
        elif isinstance(data, pd.DataFrame):
            # The texts are read from the "text" column; a missing column raises KeyError.
            texts = data["text"]
        elif isinstance(data, list) and all(isinstance(t, str) for t in data):
            texts = data
        else:
            raise ValueError(
                "Input data must be a string, a list of strings, "
                "or a pandas DataFrame with a 'text' column."
            )
        return [InputExample(guid=i, text_a=txt) for i, txt in enumerate(texts)]

    def predict(self, template, verb_h, verb_nh, data):
        """Classify each input text as ``"hate"`` or ``"non-hate"``.

        Every text is turned into the prompt ``<text> <template> <mask>``; the predicted
        class is the argmax over the logits produced for the two verbalizers.

        Args:
            template: Prompt text inserted between the input text and the mask.
            verb_h: Label word(s) for the hate class.
            verb_nh: Label word(s) for the non-hate class.
            data: A single string, a list of strings, or a pandas DataFrame whose
                ``"text"`` column holds the texts.

        Returns:
            A list with one ``"hate"`` / ``"non-hate"`` string per input text, in input
            order. Empty input yields an empty list without loading the model.

        Raises:
            ValueError: If ``data`` is none of the supported input types.
        """
        classes = ["1", "0"]
        label_words = {
            "1": verb_h,
            "0": verb_nh,
        }
        print(label_words)

        # Validate the input before the (expensive) model load so bad input fails fast.
        dataset = self._build_dataset(data)
        if not dataset:
            return []

        # The pretrained model is loaded on every call; the config returned by load_plm
        # is not needed here.
        plm, tokenizer, _, WrapperClass = load_plm(self.checkpoint[0], self.checkpoint[1])

        promptTemplate = ManualTemplate(
            text=f'{{"placeholder":"text_a"}} {template} {{"mask"}}',
            tokenizer=tokenizer,
        )

        promptVerbalizer = ManualVerbalizer(
            classes=classes,
            label_words=label_words,
            tokenizer=tokenizer,
        )

        promptModel = PromptForClassification(
            template=promptTemplate,
            plm=plm,
            verbalizer=promptVerbalizer,
        )

        data_loader = PromptDataLoader(
            dataset=dataset,
            tokenizer=tokenizer,
            template=promptTemplate,
            tokenizer_wrapper_class=WrapperClass,
        )

        # Use the GPU when one is available; otherwise stay on the CPU instead of crashing.
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            promptModel = promptModel.cuda()

        predictions = []

        promptModel.eval()
        with torch.no_grad():
            for batch in data_loader:
                if use_cuda:
                    batch = batch.cuda()
                logits = promptModel(batch)
                preds = torch.argmax(logits, dim=-1)
                # Each predicted index refers to a position in `classes`.
                predictions.extend(classes[p] for p in preds.cpu().tolist())

        mapper = {"0": "non-hate", "1": "hate"}

        return [mapper[k] for k in predictions]



In [3]:
# Prompt placed after each input text; the mask the model fills in follows it.
prompt_template = "This text is"
verb_h = "hate"  # label word for the hate speech class
verb_nh = "non-hate"  # label word for the non-hate speech class

# Supported model names: roberta-base, roberta-large, bert, deberta-base, deberta-large, xlm-roberta
enc_lms = prompting(model="roberta-large")

# `data` may be a single text, a list of texts, or a DataFrame with a "text" column.
enc_lms.predict(
    template=prompt_template,
    verb_h=verb_h,
    verb_nh=verb_nh,
    data=[
        "Shut your dumbass up bitch we all know you a hoe",
        "You are not good but can improve",
    ],
)

{'1': 'hate', '0': 'non-hate'}


tokenizing: 2it [00:00, 675.57it/s]


['hate', 'non-hate']