In [None]:
# Install the pinned dependencies with the %pip magic so they land in the
# environment of the running kernel (a plain `!pip` may resolve to a different
# interpreter on PATH).
%pip install -q "torch==2.2.1" tensorboard

%pip install -q --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0" "optuna" "scikit-learn"

# Authenticate against the Hugging Face Hub without embedding the token in the
# notebook: use HF_TOKEN from the environment when it is set, otherwise
# `login` falls back to its interactive prompt.
import os

from huggingface_hub import login

login(token=os.environ.get("HF_TOKEN"))

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q openai
%pip install -q accelerate evaluate

import evaluate
import re
import pandas as pd

In [None]:
# Reused from https://github.com/pkasela/DESIRE-ME/blob/main/src/model/utils.py
import logging
import os
import random

import numpy as np
import torch
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

logger = logging.getLogger(__name__)


def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    logger.info('Setting global random seed to %s', seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN auto-tune and pick kernels per run, which can
    # differ between runs; it has to be off for deterministic results.
    torch.backends.cudnn.benchmark = False



In [None]:
# Seed Python, NumPy and PyTorch RNGs before any model is loaded or sampled.
seed_everything(42)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline



# Hub id of the Phi-3 checkpoint; weights and tokenizer are both pulled from it.
PHI3_CHECKPOINT = "microsoft/Phi-3-mini-128k-instruct"

# Causal-LM weights placed on the GPU, keeping the dtype chosen by "auto" and
# allowing the model code shipped in the Hub repository to run.
model = AutoModelForCausalLM.from_pretrained(
    PHI3_CHECKPOINT,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)
tokenizer = AutoTokenizer.from_pretrained(PHI3_CHECKPOINT)

In [None]:
def _phi3_generate(prompt: str, max_new_tokens: int = 500, temperature: float = 1.0) -> str:
    """Sample one Phi-3 completion for a single-turn user ``prompt``.

    Uses the module-level ``model`` and ``tokenizer`` loaded in an earlier cell.

    Args:
        prompt: Text sent as the only user message.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        The generated text only (the prompt is not echoed back).
    """
    # Import the factory locally under an alias: a later cell rebinds the
    # global name `pipeline` to an already-built Llama pipeline object, so
    # looking up `pipeline` in the global scope here would call the wrong thing.
    from transformers import pipeline as build_pipeline

    generator = build_pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )
    chat = [{"role": "user", "content": prompt}]
    output = generator(
        chat,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        temperature=temperature,
        do_sample=True,
    )
    return output[0]['generated_text']


def phi3_generation_specific_query():
    """Ask Phi-3 for ten specific queries and return the raw generated text."""
    return _phi3_generate("Please generate 10 specific queries.")


def phi3_generation_ambiguous_query():
    """Ask Phi-3 for ten ambiguous queries and return the raw generated text."""
    return _phi3_generate("Please generate 10 ambiguous queries.")

In [None]:
import transformers
import torch

# Checkpoint served by the Llama helper functions defined further down.
# NOTE(review): the Llama 3.1 model card asks for transformers >= 4.43 while the
# install cell pins 4.40.0 — confirm this checkpoint loads in that environment.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# NOTE: this rebinds the global name `pipeline`, hiding the `pipeline` factory
# imported from transformers in an earlier cell; from here on `pipeline` is the
# ready-to-call Llama text-generation pipeline.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.bfloat16},
)


In [None]:
def _llama3_generate(prompt: str, max_new_tokens: int = 512, temperature: float = 1.0) -> str:
    """Sample one Llama reply to a single-turn user ``prompt``.

    Calls the module-level ``pipeline`` object built in the previous cell.

    Args:
        prompt: Text sent as the only user message.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        The ``content`` of the last message in the returned conversation.
    """
    chat = [{"role": "user", "content": prompt}]
    outputs = pipeline(
        chat,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
    )
    # `generated_text` is indexed as a list of messages; take the final one.
    return outputs[0]["generated_text"][-1]['content']


def llama3_specific_query():
    """Ask Llama for ten specific queries and return the reply text."""
    return _llama3_generate("Please generate 10 specific queries.")


def llama3_vague_query():
    """Ask Llama for ten ambiguous queries and return the reply text."""
    return _llama3_generate("Please generate 10 ambiguous queries.")

In [None]:
from openai import OpenAI


def _gpt_generate(prompt: str, model: str = "gpt-4o-mini", max_tokens: int = 512, temperature: float = 1.0):
    """Send a single-turn ``prompt`` to the OpenAI chat completions API.

    Args:
        prompt: Text sent as the only user message.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        The ``content`` of the first returned choice.
    """
    # No api_key argument on purpose: the client then reads the key from the
    # OPENAI_API_KEY environment variable, keeping the secret out of the notebook.
    client = OpenAI()

    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return completion.choices[0].message.content


def gpt_generation_specific_query():
    """Ask the GPT model for ten specific queries and return the reply text."""
    return _gpt_generate("Please generate 10 specific queries.")


def gpt_generation_vague_query():
    """Ask the GPT model for ten ambiguous queries and return the reply text."""
    return _gpt_generate("Please generate 10 ambiguous queries.")