## Install Required Libraries

In [None]:
# Dependency setup: each `!pip` line shells out to pip from the notebook.
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones shown in the recorded output below.
!pip install torch openprompt
!pip install datasets
!pip install -U sentence-transformers
!pip install faiss-gpu
!pip install tabulate
!pip install accelerate
!pip install langchain

Collecting openprompt
  Downloading openprompt-1.0.1-py3-none-any.whl (146 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.4/146.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece==0.1.96 (from openprompt)
  Downloading sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboardX (from openprompt)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting yacs (from openprompt)
  Downloading yacs-0.1.8-py3-none-any.whl (14 kB)
Collecting dill (from openprompt)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?2

## Import Necessary Libraries




In [None]:
import torch
from openprompt import PromptForClassification
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt import PromptDataLoader
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
import random
import faiss
import os
from sklearn.metrics import accuracy_score
random.seed(123)



## Loading of the dataset

SST2, the Stanford Sentiment Treebank version 2, is a widely used dataset for sentiment analysis, in which text (here, movie reviews) is classified as positive or negative. The two splits used in this notebook contain 8,741 samples in total: 6,920 training samples and 1,821 test samples. A validation split is also downloaded but is not used below.

In [None]:
# Display settings: show every row and column, never truncate cell text,
# and allow lines up to 1000 characters wide.
for _option_name, _option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
    ('display.max_colwidth', None),
):
    pd.set_option(_option_name, _option_value)

In [None]:
# Load the "gpt3mix/sst2" dataset; the recorded output below shows train,
# validation and test splits being generated.
dataset = load_dataset("gpt3mix/sst2")

Downloading builder script:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/787k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/101k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/206k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Copy the train split into a DataFrame purely for inspection; the rest of the
# notebook works from `dataset` directly.
df_train = pd.DataFrame(dataset['train'])
print(df_train.head(10).to_markdown(index=False)) # 10 examples are demonstrated here

| text                                                                                                                                                                                                                                |   label |
|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|
| The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .                                               |       0 |
| The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth . |       0 |
| Singer\/composer Bryan Adams c

## Splitting training and testing



In [None]:
def create_dataset_lable(d):
    """Pair every text of a split with its label.

    Reads the ``'text'`` and ``'label'`` columns of ``d`` and returns a list of
    ``(text, label)`` tuples in row order. A tqdm bar tracks the pass over the
    texts.
    """
    texts, labels = d['text'], d['label']
    return [(sentence, labels[row]) for row, sentence in enumerate(tqdm(texts))]

In [None]:
# Materialise the train and test splits as lists of (text, label) tuples.
# Only these two splits are used in the rest of the notebook.
train_data = create_dataset_lable(dataset['train'])
test_data = create_dataset_lable(dataset['test'])

100%|██████████| 6920/6920 [00:00<00:00, 471622.37it/s]
100%|██████████| 1821/1821 [00:00<00:00, 520997.79it/s]


# Few-Shot In-Context Learning with OpenPrompt

OpenPrompt provides implementations of current prompt-learning approaches. It implements a variety of prompting methods, including templating, verbalizing and optimization strategies, under a unified standard, so these methods are easy to call and understand.



# Why Do We Need a Verbalizer?


1. **Task-Specific Mapping**: Language models generate a wide variety of outputs, but for a specific task, you need to translate these outputs into a limited set of meaningful categories or labels. The Verbalizer makes this translation, allowing the model's outputs to be effectively used for the task at hand.


2. **Improving Model Performance**: By providing a clear mapping between the model's language generation capabilities and the specific labels needed for a task, a Verbalizer can significantly improve the performance of the model in prompt-based learning scenarios.



## Dynamically increasing the verbalizer

In [None]:
# Load google/flan-t5-large through OpenPrompt. `plm`, `tokenizer` and
# `WrapperClass` are read as globals by the verbalizer and few-shot cells below.
plm, tokenizer, model_config, WrapperClass = load_plm("t5", "google/flan-t5-large")

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Candidate label words per class. Order matters: dynamical_increase_verbalizer
# takes a prefix of each list, so the first entries are the ones used when only
# a few label words are requested.
global_postive = ["True","positive","happy","good","excellent","fantastic","great","pleasant","joyful","successful","satisfying"]
global_negative = ["False","negative","bad","poor","terrible","horrible","awful","unsatisfactory","disappointing","dismal", "lamentable"]

def dynamical_increase_verbalizer(how_many_ver):
    """Build a verbalizer from the first ``how_many_ver`` label words per class.

    Args:
        how_many_ver: Number of label words to take from the front of
            ``global_postive`` and ``global_negative``. Values larger than the
            lists simply use every available word.

    Returns:
        A ``ManualVerbalizer`` whose classes are ``["positive", "negative"]``
        (in that order) built with the notebook-level ``tokenizer``.

    Raises:
        ValueError: If ``how_many_ver`` is smaller than 1, which would leave
            the classes without any label word.
    """
    if how_many_ver < 1:
        raise ValueError("how_many_ver must be at least 1")
    pos = global_postive[:how_many_ver]
    neg = global_negative[:how_many_ver]
    # Pass the word lists themselves. Wrapping them in str() hands the
    # verbalizer one literal string such as "['True', 'positive']" per class
    # instead of the individual label words.
    verbalizer = ManualVerbalizer(
        tokenizer=tokenizer,
        classes=["positive", "negative"],
        label_words={"positive": pos, "negative": neg},
    )
    return verbalizer

# Start with a single label word per class. The spelling `varbalizer` is the
# name the evaluation cell further down passes on, so it is kept as is.
varbalizer = dynamical_increase_verbalizer(how_many_ver=1)

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence encoder used to embed reviews for nearest-neighbour retrieval.
# Fall back to the CPU when no GPU is visible instead of failing on a
# hard-coded "cuda" device.
_embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
sentence_encoder = SentenceTransformer('sentence-transformers/stsb-bert-base',device=_embedding_device)
# `model` is kept as an alias for backward compatibility. A later cell rebinds
# `model` to a different network, so the function below uses its own name.
model = sentence_encoder
def create_bert_embeddings(datasets):
    """Embed the text of every ``(text, label)`` pair in ``datasets``.

    Args:
        datasets: Sequence whose items carry the sentence at index 0.

    Returns:
        The result of ``sentence_encoder.encode`` for those sentences, one
        embedding per input item, in input order.
    """
    sentences = [d[0] for d in datasets]
    embeddings = sentence_encoder.encode(sentences,device=_embedding_device,show_progress_bar=True,batch_size=128)
    return embeddings

.gitattributes:   0%|          | 0.00/744 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/377 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Despite the names, these hold the embeddings returned by
# create_bert_embeddings, not raw sentences. At this point row i corresponds to
# train_data[i] / test_data[i].
train_sentences = create_bert_embeddings(train_data)
test_sentences = create_bert_embeddings(test_data)

Batches:   0%|          | 0/55 [00:00<?, ?it/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
def create_index(embeddings,index_file):
    """Build a flat inner-product FAISS index and write it to disk.

    Args:
        embeddings: 2-D array with one embedding per row. It is L2-normalised
            IN PLACE, so inner-product search behaves as cosine similarity.
        index_file: Directory in which ``train_index.faiss`` is written
            (despite the name, this is a directory, not a file path).
    """
    faiss.normalize_L2(embeddings)
    # Take the dimensionality from the data instead of assuming 768, so other
    # encoders work too.
    dimension = embeddings.shape[1]
    index = faiss.index_factory(dimension, "Flat", faiss.METRIC_INNER_PRODUCT)
    index.train(embeddings)
    index.add(embeddings)
    faiss.write_index(index, os.path.join(index_file,"train_index.faiss"))

# Index the training embeddings; the index file is written under the hard-coded
# directory "/content/.". Note that create_index normalises `train_sentences`
# in place.
create_index(train_sentences,"/content/.")

def create_few_shots(test_emdb,train_data,test_data,index_file,top_k):
    """Retrieve the ``top_k`` most similar training examples per test sentence.

    Args:
        test_emdb: 2-D array of test embeddings; row ``i`` must be the
            embedding of ``test_data[i]``. It is L2-normalised IN PLACE.
        train_data: List of ``(text, label)`` tuples in the same order as the
            embeddings that were added to the index.
        test_data: List of ``(text, label)`` tuples for the queries.
        index_file: Directory containing ``train_index.faiss``.
        top_k: Number of neighbours to retrieve per query.

    Returns:
        ``(top, distance)`` where ``top`` maps each test text to a list of
        ``{"Input": train_text, "Output": "positive" | "negative"}`` dicts
        ordered from most to least similar, and ``distance`` is the score
        matrix returned by the index search. Identical test texts share one
        key; the last occurrence wins.
    """
    label_names = {1:"negative", 0: "positive"}
    faiss_index = faiss.read_index(os.path.join(index_file,"train_index.faiss"))
    faiss.normalize_L2(test_emdb)
    distance, neighbour_ids = faiss_index.search(test_emdb, top_k)
    top = dict()
    for row, neighbours in enumerate(tqdm(neighbour_ids.tolist())):
        # Keys are the sentence text. The previous membership test compared the
        # whole (text, label) tuple against these keys, was therefore always
        # false, and reset the list on every neighbour, so only the last (least
        # similar) neighbour survived.
        top[test_data[row][0]] = [
            {"Input": train_data[train_row][0], "Output": label_names[train_data[train_row][-1]]}
            for train_row in neighbours
            if train_row >= 0  # FAISS pads with -1 when fewer than top_k vectors exist
        ]
    return top,distance

# Retrieve 10 training neighbours for every test sentence; `d` receives the
# score matrix returned alongside the mapping.
top_10,d = create_few_shots(test_sentences,train_data,test_data,"/content/.",10) ## Here Value of k is 10

def create_examples(top_10,k):
  """Render one plain-text prompt per test sentence.

  Args:
    top_10: Mapping from test sentence to its retrieved examples, each a dict
      with ``"Input"`` and ``"Output"`` keys.
    k: Number of retrieved examples to include; ``0`` gives a zero-shot prompt.

  Returns:
    A list of prompt strings, one per key of ``top_10`` in iteration order,
    each ending in ``"Output:"`` for the model to complete.
  """
  lst = []
  Prefix_prompt = "You are very good sentiment classifier.\nYour task is to predict tone of the sentence into two classes, positive or negative."
  for key in tqdm(top_10):
    # `key` is the sentence itself; indexing it with [0] would insert only its
    # first character into the prompt.
    if k == 0:
      exmpl = Prefix_prompt + "\n\n" + "Input: {}\nOutput:".format(key)
    else:
      Prefix_prompt_new = Prefix_prompt + "\nHere are some examples are given below for your reference.\n\n"
      shots = "".join(
        "Input: {}\nOutput: {}\n\n".format(exp['Input'],exp['Output'])
        for exp in top_10[key][:k]
      )
      exmpl = Prefix_prompt_new + shots + "Input: {}\nOutput:".format(key)

    lst.append(exmpl)
  return lst

100%|██████████| 1821/1821 [00:00<00:00, 74786.08it/s]


In [None]:
# Device used by the few-shot evaluation below: GPU when available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Shuffle the test sentences AND their embeddings with one shared permutation.
# Shuffling only `test_data` would leave `test_sentences` in the old order, so
# the retrieval below would attach each sentence to another sentence's
# neighbours.
_shuffled_order = list(range(len(test_data)))
random.shuffle(_shuffled_order)
test_data[:] = [test_data[i] for i in _shuffled_order]
test_sentences = test_sentences[_shuffled_order]

In [None]:
# Rebuild the mapping so its key order follows the current order of
# `test_data`. The lookup pairs row i of `test_sentences` with `test_data[i]`,
# so both must be in the same order at this point.
top_10,d = create_few_shots(test_sentences,train_data,test_data,"/content/.",10)

100%|██████████| 1821/1821 [00:00<00:00, 90314.74it/s]


In [None]:
def few_shot_results(top_10,k,samples,varbalizer):
    """Classify the first ``samples`` test sentences with retrieved few-shot prompts.

    Args:
        top_10: Mapping from test sentence to its retrieved examples, each a
            dict with ``"Input"`` and ``"Output"`` keys.
        k: Number of retrieved examples placed in each prompt; ``0`` gives a
            zero-shot prompt laid out like ``create_examples``.
        samples: Maximum number of test sentences to score.
        varbalizer: Verbalizer whose class index 0 is "positive" and index 1 is
            "negative" (the order used by ``dynamical_increase_verbalizer``).

    Returns:
        ``(lst_pred, lts_tar)``: predicted and gold labels in iteration order,
        both in the dataset encoding (0 = positive, 1 = negative).

    Relies on the notebook globals ``tokenizer``, ``WrapperClass``, ``plm``,
    ``device`` and ``test_data``; the i-th key of ``top_10`` must belong to
    ``test_data[i]``.
    """
    lst_pred = []
    lts_tar = []
    Prefix_prompt = "You are very good sentiment classifier.\nYour task is to predict tone of the sentence into two classes, positive or negative."
    for ind,key in enumerate(tqdm(top_10.keys())):
        if ind == samples:
            break

        if k == 0:
            intro = Prefix_prompt + "\n\n"
            shots = ""
        else:
            intro = Prefix_prompt + "\nHere are some examples are given below for your reference.\n\n"
            shots = "".join(
                "Input: {}\nOutput: {}\n\n".format(exp['Input'],exp['Output'])
                for exp in top_10[key][:k]
            )
        # OpenPrompt fills `{"placeholder":"text_a"}` from the InputExample and
        # predicts at `{"mask"}`. A literal "<text_a>" is plain text, so the
        # test sentence would never reach the model. The mask is appended once;
        # `exmpl += exmpl + ...` would duplicate the whole prompt.
        # NOTE(review): a "{" or "}" inside a retrieved example would be read as
        # template syntax -- confirm the data contains none.
        exmpl = intro + shots + 'Input: {"placeholder":"text_a"}\nOutput:' + '{"mask"}'
        template = ManualTemplate(tokenizer=tokenizer, text=exmpl)

        gold_label = test_data[ind][-1]
        example_top = InputExample(text_a=key,label=gold_label) # Template to give input example

        data_loader = PromptDataLoader(
            dataset=[example_top],
            tokenizer=tokenizer,
            template=template,
            tokenizer_wrapper_class=WrapperClass,  # Add the tokenizer wrapper class
            batch_size=1,
            max_seq_length=512, decoder_max_length=3
        )

        # The template differs per sentence, so the prompt model is rebuilt on
        # every iteration around the shared `plm`.
        prompt_model = PromptForClassification(plm=plm,template=template,verbalizer=varbalizer,freeze_plm=False).to(device)
        prompt_model.eval()
        with torch.no_grad():
            for inputs in data_loader:
                # Follow `device` rather than forcing CUDA, matching the model.
                inputs = inputs.to(device)
                logits = prompt_model(inputs)
                prediction = torch.argmax(logits, dim=-1)
                # Class index 0 is "positive" and the dataset also encodes
                # positive as 0, so the index is already the label. The former
                # `1 if prediction == 0 else 0` inverted every prediction.
                lst_pred.append(prediction.item())
                lts_tar.append(gold_label)

    return lst_pred,lts_tar

In [None]:
# Score the first 200 test sentences with one retrieved example per prompt.
# The result is the (predictions, targets) tuple returned by few_shot_results.
top_10_few_shot = few_shot_results(top_10,k=1,samples=200,varbalizer=varbalizer)
# top_10_few_shot[0]

  0%|          | 0/1821 [00:00<?, ?it/s]
tokenizing: 1it [00:00, 356.69it/s]
  0%|          | 1/1821 [00:01<48:27,  1.60s/it]
tokenizing: 1it [00:00, 368.24it/s]
  0%|          | 2/1821 [00:01<23:22,  1.30it/s]
tokenizing: 1it [00:00, 440.16it/s]
  0%|          | 3/1821 [00:01<15:12,  1.99it/s]
tokenizing: 1it [00:00, 396.66it/s]
  0%|          | 4/1821 [00:02<11:25,  2.65it/s]
tokenizing: 1it [00:00, 384.45it/s]
  0%|          | 5/1821 [00:02<09:21,  3.24it/s]
tokenizing: 1it [00:00, 328.99it/s]
  0%|          | 6/1821 [00:02<08:03,  3.75it/s]
tokenizing: 1it [00:00, 375.19it/s]
  0%|          | 7/1821 [00:02<07:18,  4.13it/s]
tokenizing: 1it [00:00, 504.97it/s]
  0%|          | 8/1821 [00:02<06:45,  4.47it/s]
tokenizing: 1it [00:00, 347.10it/s]
  0%|          | 9/1821 [00:03<06:22,  4.73it/s]
tokenizing: 1it [00:00, 426.21it/s]
  1%|          | 10/1821 [00:03<06:08,  4.92it/s]
tokenizing: 1it [00:00, 340.12it/s]
  1%|          | 11/1821 [00:03<05:58,  5.04it/s]
tokenizing: 1it [00:00

In [None]:
# `a` was never assigned anywhere in the notebook, so this cell only worked
# with leftover kernel state. Compute the accuracy here from the
# (predictions, targets) tuple returned by few_shot_results.
a = accuracy_score(top_10_few_shot[1], top_10_few_shot[0])
print("\n Accuracy Faiss:\n",a)


 Accuracy Faiss:
 0.57


## Langchain

In their most basic form, sequential chains consist of a series of steps, each taking a single input and producing a single output. The output from one step becomes the input for the next step, creating a smooth and continuous flow of information. This approach works well for sub-chains that are designed for single inputs and outputs, ensuring that each step seamlessly passes its output to the subsequent step.
Input 1 --> Output 1 --> Input 2 --> Final Output

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.chains import SimpleSequentialChain
from langchain import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline, AutoModelForSeq2SeqLM

In [None]:
MODEL_NAME = "google/flan-t5-large"

# Use dedicated names for the LangChain model and tokenizer. Rebinding the
# notebook-wide `tokenizer` and `model` here would silently swap the objects
# that the OpenPrompt and sentence-embedding cells above read as globals.
lc_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# `trust_remote_code` is not passed: it permits running code shipped with a
# checkpoint, which is not needed to load this model through the Auto class.
lc_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME, device_map="auto"
)

# Start from the checkpoint's generation defaults and override the sampling
# settings used by the chain below.
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 512
generation_config.temperature = 0.0001
generation_config.top_p = 0.95
generation_config.do_sample = True
generation_config.repetition_penalty = 1.15

text_pipeline = pipeline(
    "text2text-generation",
    model=lc_model,
    tokenizer=lc_tokenizer,
    generation_config=generation_config,
)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Wrap the transformers pipeline so LangChain chains can call it as an LLM.
# NOTE(review): the generation config above sets temperature 0.0001 while
# model_kwargs here says 0.5 -- confirm which value actually takes effect.
llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0.5})

In [None]:
# One (text, label) pair from the test set; only the text goes to the chain.
test_sent = test_data[2]

print(test_sent[0])

first_prompt = ChatPromptTemplate.from_template(
    "Extract important keywords from the given {text}"
)

chain_one = LLMChain(llm=llm, prompt=first_prompt) ## First chain: sentence -> keywords

second_prompt = ChatPromptTemplate.from_template(
    "Infer the tone of the sentence by taking account {keywords} into positive and negative class"
)

chain_two = LLMChain(llm=llm, prompt=second_prompt) ## Second chain: keywords -> sentiment

overall_simple_chain = SimpleSequentialChain(chains=[chain_one, chain_two],verbose=True)

# Pass the sentence text only. Handing over the whole (text, label) tuple would
# put the gold label into the first prompt.
overall_simple_chain.run(test_sent[0])

Offers that rare combination of entertainment and education .


[1m> Entering new SimpleSequentialChain chain...[0m
[36;1m[1;3moffer, education, combination[0m
[33;1m[1;3mpositive[0m

[1m> Finished chain.[0m


'positive'