In [None]:
from openai import OpenAI

client = OpenAI()

def chatgpt(prompt: str):

    result = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=1.0,
        max_tokens=50
    )
    return result

In [None]:
from datasets import load_dataset

dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas().loc[:, ['question', 'answer']]

In [None]:
sampled_questions = dataset['question'].sample(n=2).to_list()

In [None]:
def slightly_change_question(question: str):

    result = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": "You are a helpful assistant who slightly modifies questions while retaining their meaning. Only make slight modifications such as changing to similar words, or rephrasing the question. Do not change the meaning of the question. Only respond with the question, do not preface it with anything. The question is: " + question}
        ],
        temperature=0.8,
        max_tokens=50
    )
    return result.choices[0].message.content

In [None]:

print(f"Before:\n{sampled_questions[0]}\n\nAfter:\n{slightly_change_question(sampled_questions[0])}")

# New stuff

In [None]:
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # prevent tensorflow logs

# Set path to parent directory so we can import from other folders.
sys.path.append(os.path.abspath(os.path.join('..')))

# from fine_tuning.finetune_class import FineTunedModel
from scratch_model.inference import ScratchModel
# from text_search.rag import RAG
# 
# from termcolor import colored
# from datasets import load_dataset
# from tqdm import tqdm


In [None]:
scratch=ScratchModel(model_dir="../scratch_model/models/transformer_v7/")



In [None]:
from IPython.display import clear_output

for i in scratch("How can I apply?", stream=True):
    print(i)
    clear_output(wait=True)

In [None]:
from fine_tuning.finetune_class import FineTunedModel
from torch.cuda import is_available
from IPython.display import clear_output

finetuned=FineTunedModel(model_type="gptq" if is_available() else "gguf", stream=True)


In [None]:
for i in finetuned("How can I apply?"):
    print(i)
    clear_output(wait=True)
    
    

# Creating 1v1 data

In [1]:
import pandas as pd
import random

df = pd.read_csv("./data/answers.csv")

rag_v_scratch = df.loc[:, ['question', 'answer', 'RAG', 'Scratch']]
rag_v_ft = df.loc[:, ['question', 'answer', 'RAG', 'Finetuned']]
scratch_v_ft = df.loc[:, ['question', 'answer', 'Scratch', 'Finetuned']]

dfs = [rag_v_scratch, rag_v_ft, scratch_v_ft]
result = pd.concat(dfs, ignore_index=True)

output = []

for row in result.iterrows():
    row = row[1][~row[1].isna()]

    idx_a = random.choice([0, 1])
    idx_b = 1 - idx_a

    output.append({
        'question': row['question'],
        'answer': row['answer'],
        'player_a': row[idx_a+2],
        'player_b': row[idx_b+2],
        'whois_player_a': row.index[idx_a+2],
        'whois_player_b': row.index[idx_b+2]
    })

df = pd.DataFrame(output).sample(frac=1).reset_index(drop=True)

prompt = lambda a,b,c,d: f"Question: {a}\nGround Truth: {b}\nPlayer A: {c}\nPlayer B: {d}"

df['prompt'] = df.apply(lambda x: prompt(x['question'], x['answer'], x['player_a'], x['player_b']), axis=1)

In [2]:
from openai import OpenAI
client = OpenAI()

prompt_tokens = 0
completion_tokens = 0

def assess_players(prompt):
    "GPT-4 evaluates best model. Returns A, B, or None to indicate best response."
    global prompt_tokens, completion_tokens

    try:
        response = client.chat.completions.create(
            model="gpt-4-turbo-preview",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful referee, who helps pick the best response to a question. The question is about SUNY Brockport, a school in upstate NY. You are given the following:\n\n1) The question given.\n2) The ground truth in the form of an answer to the question.\n3) Player A response to the question.\n4) Player B response to the question.\n\nGiven the question and ground truth, select which player has the best response. Respond with either \"A\", or \"B\" only. In some cases, it may be possible that both players are incorrect. In those cases, respond with \"None\". In choosing the best response prioritize correctness first, then enthusiasm and overall coherence after. Remember to only respond with either \"A\", \"B\", or \"None\". Do not explain your decision."
                }, {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0,
            max_tokens=1
        )
    except:
        return "Error"

    prompt_tokens += response.usage.prompt_tokens
    completion_tokens += response.usage.completion_tokens

    return response.choices[0].message.content

In [3]:
from tqdm import tqdm

for prompt in tqdm(df.loc[:, 'prompt']):
    df.loc[df['prompt'] == prompt, 'best_response'] = assess_players(prompt)

100%|██████████| 4035/4035 [53:14<00:00,  1.26it/s]   


In [41]:
prompt_tokens, completion_tokens

(1674957, 4035)

In [4]:
# df.to_csv("data/evaluation.csv", index=False)

In [7]:
df['best_response'].value_counts()

best_response
None    1521
A       1334
B       1180
Name: count, dtype: int64

In [20]:
df_best = df[df['best_response'].isin(['A', 'B'])]
df_best

Unnamed: 0,question,answer,player_a,player_b,whois_player_a,whois_player_b,prompt,best_response
2,What kind of support can I expect from the Haz...,"At the Hazen Center for Integrated Care, you'l...",Our hazen center for integrated care is dedica...,"At the Hazen Center for Integrated Care, we un...",Scratch,Finetuned,Question: What kind of support can I expect fr...,B
3,Are there any prerequisites or requirements to...,"To ensure the best possible experience, there ...","Yes, to be eligible for the nursing program, y...",To participate in the Nursing study abroad pro...,Scratch,RAG,Question: Are there any prerequisites or requi...,A
4,What essential items should I bring to every c...,Being prepared is key! Always carry your class...,You should bring a copy of the basic emergency...,"To prepare for an emergency, it's important to...",RAG,Finetuned,Question: What essential items should I bring ...,A
5,How can I learn more about SUNY Brockport's co...,We'd love to share more about our exciting com...,Wed love to share more about our community wit...,To learn more about our Community Justice Prog...,Scratch,Finetuned,Question: How can I learn more about SUNY Broc...,A
6,What should I do if I want to serve alcohol at...,"For events involving alcohol, please contact t...",If you plan on serving alcoholic beverages at ...,"If you suspect that [UNK] is [UNK], its import...",Finetuned,Scratch,Question: What should I do if I want to serve ...,A
...,...,...,...,...,...,...,...,...
4030,What services does SUNY Brockport offer to hel...,"At SUNY Brockport, we're dedicated to ensuring...",SUNY Brockport offers various services to help...,SUNY Brockport is committed to supporting stud...,RAG,Scratch,Question: What services does SUNY Brockport of...,A
4031,How do I apply for the SUNY Washington Program...,You can apply for the SUNY Washington Program ...,Applying for the SUNY washington program is st...,Applications for the SUNY Washington Program c...,Scratch,Finetuned,Question: How do I apply for the SUNY Washingt...,B
4032,How can I register my student organization wit...,That's a great step towards making your mark o...,You can register your student organization wit...,You can register with your student organizatio...,RAG,Scratch,Question: How can I register my student organi...,B
4033,How can I find out about upcoming events at SU...,You can discover all the exciting events happe...,You can find out about upcoming events at SUNY...,Staying informed is key! You can find all the ...,RAG,Scratch,Question: How can I find out about upcoming ev...,B


In [32]:
a_responses = df[df['best_response'] == 'A']
a_responses['whois_player_a'].value_counts()

whois_player_a
RAG          648
Finetuned    636
Scratch       50
Name: count, dtype: int64

In [33]:
b_responses = df[df['best_response'] == 'B']
b_responses['whois_player_b'].value_counts()

whois_player_b
RAG          609
Finetuned    515
Scratch       56
Name: count, dtype: int64

In [36]:
none_responses = df[df['best_response'] == 'None']
none_responses['players'] = none_responses['whois_player_a'] + " vs " + none_responses['whois_player_b']
none_responses['players'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  none_responses['players'] = none_responses['whois_player_a'] + " vs " + none_responses['whois_player_b']


players
Scratch vs Finetuned    337
RAG vs Finetuned        292
Finetuned vs RAG        262
Scratch vs RAG          227
Finetuned vs Scratch    213
RAG vs Scratch          190
Name: count, dtype: int64

In [37]:
print(f"Scratch vs Finetuned    {337+213}")
print(f"RAG vs Scratch          {227+190}")
print(f"RAG vs Finetuned        {292+213}")

Scratch vs Finetuned    550
RAG vs Scratch          417
RAG vs Finetuned        505
