In [31]:
import time
start_time = time.time()

In [32]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import csv

In [33]:
!pip install sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [34]:
# 1. Load the Model and Tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.3" #or 2-1b
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [35]:
# 2. Load the MRPC Test Dataset
dataset = load_dataset("glue", "mrpc", split="test")
mrpc_test_dataset = dataset #.select(range(1000))

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    

In [36]:
# 3. Create the Prompt Template
def create_prompt(sentence1, sentence2):
    prompt_template = """
    Determine if two sentences convey the same core information. A paraphrase maintains the essential meaning despite different wording.
    Strictly respond with one word: either 'paraphrase' or 'non-paraphrase'.

    Now classify:
    Sentence 1: "{sentence1}"
    Sentence 2: "{sentence2}"
    Answer: """

    return prompt_template.format(sentence1=sentence1, sentence2=sentence2)

In [37]:
# 4. Generate Text
def generate_response(prompt, model, tokenizer, temperature=0.3):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=5, temperature=temperature, do_sample=True)  # Adjust max_new_tokens as needed
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    return response.split("Answer:")[1].strip() #extract the answer.


In [38]:
# 5. Process the Test Dataset and Generate Responses
results = []
for example in mrpc_test_dataset:
    sentence1 = example["sentence1"]
    sentence2 = example["sentence2"]
    label = example["label"]
    prompt = create_prompt(sentence1, sentence2)
    response = generate_response(prompt, model, tokenizer)
    results.append({"sentence1": sentence1, "sentence2": sentence2, "label": label, "response": response})


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

KeyboardInterrupt: 

In [None]:
# # # 6. Print or Save the Results
# for result in results:
#     print(f"Sentence 1: {result['sentence1']}")
#     print(f"Sentence 2: {result['sentence2']}")
#     print(f"Response: {result['response']}")
#     print("-" * 20)

In [None]:
# 6. Save Results to CSV
csv_file = "mrpc_test_results-mistral_01.csv"
csv_columns = ["sentence1", "sentence2", "label", "response"]

try:
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        for data in results:
            writer.writerow(data)
    print(f"Results saved to {csv_file}")
except IOError:
    print("I/O error")

In [39]:
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

Elapsed time: 1838.51149392128 seconds
