In [1]:
import os
import pandas as pd
import csv
from tqdm import tqdm
os.getcwd()

'/model'

In [3]:
os.listdir("/models")

['M7B']

In [2]:
from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate

from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest



### Load the model

In [3]:
# Load the tokenizer and model weights from the local M7B directory.
# Both names are notebook-level globals that `get_response` reads.
tokenizer = MistralTokenizer.from_file("/models/M7B/tokenizer.model.v3")  # change to extracted tokenizer file
model = Transformer.from_folder("/models/M7B")  # change to extracted model dir


In [None]:
# Load the FreebaseQA eval split. Later cells read the `Questions` column, whose entries are
# dicts accessed via the "RawQuestion" and "Parses" keys.
fbqa = pd.read_json("/datasets/FreebaseQA/FreebaseQA-eval.json")

In [7]:
# prompt = "How expensive would it be to ask a window cleaner to clean all windows in Paris. Make a reasonable guess in US Dollar."


def get_response(prompt, max_tokens=1024, temperature=0.35):
    """Send a single user prompt to the loaded model and return the decoded reply.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Text placed in a single ``UserMessage``.
        max_tokens: Upper bound on generated tokens, forwarded to ``generate``.
        temperature: Sampling temperature, forwarded to ``generate``.

    Returns:
        The decoded output of the first (and only) sequence in the batch.
    """
    completion_request = ChatCompletionRequest(messages=[UserMessage(content=prompt)])
    tokens = tokenizer.encode_chat_completion(completion_request).tokens

    # The underlying tokenizer provides both the EOS id and the decoder.
    base_tokenizer = tokenizer.instruct_tokenizer.tokenizer
    out_tokens, _ = generate(
        [tokens],
        model,
        max_tokens=max_tokens,
        temperature=temperature,
        eos_id=base_tokenizer.eos_id,
    )
    return base_tokenizer.decode(out_tokens[0])


## FreebaseQA
### Ask a question and check the answer

In [7]:
# fbqa.Questions[0].get("RawQuestions", None)
# Smoke test: show question, gold answer and model reply for the first row that has a parse,
# reporting (and skipping) any earlier rows without one.
for row_idx, row in fbqa.iterrows():
    entry = row.Questions
    raw_question = entry.get("RawQuestion", None)
    print("Question:", raw_question)
    first_parse = entry.get("Parses", [None])[0]
    if first_parse:
        print("Answer:", first_parse.get("Answers"))
        print("Model:", get_response(raw_question))
        break
    print(f"error in question: {row_idx}")

Question: Who is the female presenter of the Channel 4 quiz show '1001 things you should know'?
Answer: [{'AnswersMid': 'm.0216y_', 'AnswersName': ['sandi toksvig']}]
Model: The female presenter of the Channel 4 quiz show '1001 Things You Should Know' is Anita Rani. She is a British television presenter and journalist, best known for her work on BBC One's 'The One Show' and Channel 4's 'A New Life in the Sun'. The show '1001 Things You Should Know' is a spin-off of the popular Australian show '1001 Things You Should Know Before You Die'.


The answer is too elaborate, so I'm going to prompt the model to give a shorter answer.

To decide what length to restrict the answer to, I will analyse the lengths of the dataset answers:

In [4]:
def _first_answer_words(entry):
    """Return the first answer name of the first parse, split on single spaces."""
    first_answer = entry["Parses"][0]["Answers"][0]
    return first_answer["AnswersName"][0].split(" ")


answers = fbqa.Questions.apply(_first_answer_words)
answers.head()

0      [sandi, toksvig]
1        [henry, fonda]
2      [terry, gilliam]
3      [steve, mcqueen]
4    [harold, abrahams]
Name: Questions, dtype: object

In [7]:
# Number of space-separated words in each gold answer.
answer_lengths = answers.map(len)
answer_lengths.head()

0    2
1    2
2    2
3    2
4    2
Name: Questions, dtype: int64

In [8]:
# Summary statistics of the per-answer word counts.
answer_lengths.describe()

count    3996.000000
mean        1.866867
std         0.925670
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         9.000000
Name: Questions, dtype: float64

In [9]:
# Show the outliers: gold answers longer than 7 space-separated words.
answers[answer_lengths > 7]

873      [the, shoop, shoop, song, (it's, in, his, kiss)]
1144     [let, it, snow!, let, it, snow!, let, it, snow!]
1342      [always, look, on, the, bright, side, of, life]
2706    [night, at, the, museum:, battle, of, the, smi...
Name: Questions, dtype: object

### The appropriate answer length: under a sentence.

In [16]:
# Prompt template asking for a terse answer; `{question}` is filled in with `str.format`.
# The triple-quoted literal starts and ends with a newline, which is part of the prompt text.
LEN_LIMITED_PROMPT = """
You should answer the question below in under a sentence with no other text but the answer.
Question:
{question}
"""

In [11]:
def get_fbqa_data(question_row):
    """
    Extract the raw question and its gold answers from one FreebaseQA row.

    Args:
        question_row: Row object whose ``Questions`` attribute is a dict holding
            the ``RawQuestion`` and ``Parses`` keys.

    Returns:
        A ``(question, answers)`` tuple. ``question`` is the ``RawQuestion`` value
        (``None`` if the key is absent). ``answers`` is the ``Answers`` value of the
        first parse, or ``None`` when the row has no usable parse (a message is
        printed in that case).
    """
    entry = question_row.Questions
    question = entry.get("RawQuestion", None)
    # `or [None]` also covers a "Parses" key that is present but empty or null;
    # indexing such a value with [0] would otherwise raise IndexError / TypeError.
    parse = (entry.get("Parses") or [None])[0]
    if not parse:
        print(f"error in question: {question}")
        return question, None
    return question, parse.get("Answers")

In [25]:
# fbqa.Questions[0].get("RawQuestions", None)
# Query the model once per FreebaseQA question using the length-limited prompt.
# Responses are appended one by one so an interrupted run keeps what was collected so far.
baseline_results = []
for _, fbqa_row in tqdm(list(fbqa.iterrows())):
    fbqa_question, _gold_answers = get_fbqa_data(fbqa_row)
    baseline_results.append(
        get_response(LEN_LIMITED_PROMPT.format(question=fbqa_question))
    )



100%|██████████| 3996/3996 [31:54<00:00,  2.09it/s]  


In [None]:
# Persist the raw model responses, one per row, under a "Model" header.
# newline="" lets the csv module control line endings itself (required by the csv docs;
# without it, embedded newlines and platform line endings can be written incorrectly).
with open(
    "/datasets/FreebaseQA/results/baseline/m7b.csv", "w", newline="", encoding="utf-8"
) as baseline_file:
    writer = csv.writer(baseline_file)
    writer.writerow(["Model"])
    writer.writerows([response] for response in baseline_results)

In [18]:
# Score the baseline: a response counts as correct when the gold answer appears verbatim
# inside the lower-cased model response.
bline_fbqa_df = pd.read_csv(
    "/datasets/FreebaseQA/results/baseline/m7b.csv",
    # The file is written with a "Model" header row, so let pandas consume it as the header
    # instead of treating it as the first response (which would shift every row by one).
    dtype=str,
    keep_default_na=False,  # keep responses such as "None" or "N/A" as text, not NaN
)
# Responses and gold answers are joined by position; fail loudly if they do not line up.
assert len(bline_fbqa_df) == len(answers), "responses and gold answers are misaligned"
bline_fbqa_df["Actual"] = answers.apply(" ".join)
bline_fbqa_df["Model"] = bline_fbqa_df["Model"].str.lower()
# Label-based access: integer keys on a label-indexed row are deprecated positional lookups.
bline_fbqa_df["Correct"] = bline_fbqa_df.apply(
    lambda row: row["Actual"] in row["Model"], axis=1
)
bline_fbqa_df.head()

  bline_fbqa_df["Correct"] = bline_fbqa_df.apply(lambda t: t[1] in t[0], axis=1)


Unnamed: 0,Model,Actual,Correct
0,anita rani,sandi toksvig,False
1,mervyn leroy produced the film 12 angry men.,henry fonda,False
2,terry gilliam,terry gilliam,True
3,steve mcqueen,steve mcqueen,True
4,harold abrahams,harold abrahams,True


In [19]:
# Accuracy = share of rows whose response contains the gold answer.
n_correct = sum(bline_fbqa_df.Correct)
n_scored = len(bline_fbqa_df)
print(f"Baseline Accuracy: {n_correct / n_scored:.3f}")

Baseline Accuracy: 0.736


Now let's investigate the wrong answers.

In [74]:
# Keep the misses in one frame, save them for manual inspection, and preview the first few.
wrong_fbqa_df = bline_fbqa_df[~bline_fbqa_df.Correct]
wrong_fbqa_df.to_csv("/datasets/FreebaseQA/results/baseline/wrong_m7b.csv")
wrong_fbqa_df.head()

Unnamed: 0,Model,Actual,Correct
0,anita rani,sandi toksvig,False
1,mervyn leroy produced the film 12 angry men.,henry fonda,False
11,foinavon won the 1968 grand national.,red alligator,False
14,mexico (men's football),poland,False
15,leeds united,leeds united f.c.,False


### Upon inspection, I found the following examples:
- 15,leeds united,leeds united f.c.,False
- 18,seb coe,sebastian coe,False
- 183,america (1851),united states,False
- 107,manhattan transfer,the manhattan transfer,False
- 133,"""the 39 steps"" is the film.",the thirty-nine steps,False
- 135,pickwick papers,the pickwick papers,False
- 183,america (1851),united states,False
- 200,peter,saint peter,False
- 309,memphis,"memphis, tennessee",False
Etc.

These answers are partially correct or differ from the expected answer only in form (numbers written as digits vs. words, synonyms, missing articles or suffixes).
A more precise metric could be the n-gram overlap between the expected and the actual answers.

### ROUGE (n-gram recall) is a potential candidate

In [21]:
from rouge import Rouge
rouge = Rouge()

In [26]:
# Inspect a single row marked incorrect (positional index 135) to use as a ROUGE example.
bline_fbqa_df.iloc[135]

Model          pickwick papers
Actual     the pickwick papers
Correct                  False
Name: 135, dtype: object

In [27]:
# Score the model response (first argument) against the gold answer (second argument).
example_row = bline_fbqa_df.iloc[135]
rouge.get_scores(example_row.Model, example_row.Actual)

[{'rouge-1': {'r': 0.6666666666666666, 'p': 1.0, 'f': 0.7999999952000001},
  'rouge-2': {'r': 0.5, 'p': 1.0, 'f': 0.6666666622222223},
  'rouge-l': {'r': 0.6666666666666666, 'p': 1.0, 'f': 0.7999999952000001}}]

## MetaQA

In [5]:
# Load the MetaQA 1-hop test file: tab-separated, no header row; column 0 is the question and
# column 1 holds the answers joined by "|", which are split into a list per row.
metaqa = pd.read_csv(
    "/datasets/MetaQA/1hop/qa_test.txt", sep="\t", header=None
).rename(columns={0: "Question", 1: "Answers"})
metaqa["Answers"] = metaqa["Answers"].apply(lambda joined: joined.split("|"))
metaqa.head()

Unnamed: 0,Question,Answers
0,what does [Grégoire Colin] appear in,[Before the Rain]
1,[Joe Thomas] appears in which movies,"[The Inbetweeners Movie, The Inbetweeners 2]"
2,what films did [Michelle Trachtenberg] star in,"[Inspector Gadget, Black Christmas, Ice Prince..."
3,what does [Helen Mack] star in,"[The Son of Kong, Kiss and Make-Up, Divorce]"
4,what films did [Shahid Kapoor] act in,"[Haider, Jab We Met, Chance Pe Dance]"


### Look at the answer lengths

In [39]:
# Flatten the per-question answer lists into one list.
# A comprehension is linear, whereas sum(list_of_lists, []) copies the accumulator on every
# addition and is quadratic in the number of answers.
all_mqa_answers = [
    answer for question_answers in metaqa.Answers for answer in question_answers
]
all_mqa_answers[:5]

['Before the Rain',
 'The Inbetweeners Movie',
 'The Inbetweeners 2',
 'Inspector Gadget',
 'Black Christmas']

In [46]:
# NOTE(review): despite its name, `mqa_lengths` holds the whitespace-split words of each answer;
# the lengths themselves are only computed by the `.apply(len)` call below.
mqa_lengths = pd.Series(all_mqa_answers).apply(str.split)
mqa_lengths.apply(len).describe()

count    19346.000000
mean         2.072108
std          1.084976
min          1.000000
25%          1.000000
50%          2.000000
75%          2.000000
max         13.000000
dtype: float64

In [52]:
# List the MetaQA answers longer than 10 whitespace-separated words, re-joined into strings.
mqa_lengths[mqa_lengths.apply(len) > 10].apply(lambda t: " ".join(t)).tolist()

['Your Vice Is a Locked Room and Only I Have the Key',
 'Your Vice Is a Locked Room and Only I Have the Key',
 'Went to Coney Island on a Mission from God... Be Back by Five',
 'The Englishman Who Went Up a Hill But Came Down a Mountain']

### The answers are also under a sentence

In [4]:
# MetaQA variant of the prompt template. It adds instructions for "|"-separated multi-answers
# and for omitting an "Answer:" prefix.
# NOTE(review): this rebinds the same `LEN_LIMITED_PROMPT` name used by the FreebaseQA cells,
# so re-running those cells after this one will use this template instead.
LEN_LIMITED_PROMPT = """
You should answer the question below in under a sentence with no other text but the answer.
If there are multiple answers, separate the answers by "|".
Do not include "Answer:". Produce the answer only.
Question:
{question}
"""

In [8]:
# Run the MetaQA baseline, streaming every response to disk as it is produced.
# The file is opened once and each row is written immediately, so the final rows are saved
# too (a checkpoint taken only every 250th row never persists the tail of the run).
baseline_results = []
CHECKPOINT_EVERY = 250  # flush to disk after this many responses

with open(
    "/datasets/MetaQA/results/1hop/bline.csv", "w", newline="", encoding="utf-8"
) as checkpoint_file:
    writer = csv.writer(checkpoint_file)
    writer.writerow(["Model"])
    for n_done, question in enumerate(tqdm(metaqa.Question.tolist()), start=1):
        prompt = LEN_LIMITED_PROMPT.format(question=question)
        response = get_response(prompt)
        baseline_results.append(response)
        writer.writerow([str(response)])
        # Periodic flush bounds how much is lost if the kernel dies mid-run; the `with`
        # block flushes and closes the file on normal exit and on exceptions.
        if n_done % CHECKPOINT_EVERY == 0:
            checkpoint_file.flush()



100%|██████████| 9947/9947 [3:36:14<00:00,  1.30s/it]   


# TODO: move the inference loop into a `.py` script and run it in detached mode

In [1]:
baseline_results

NameError: name 'baseline_results' is not defined

## Batch inference (experimental)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [None]:
# Load Mistral-7B-Instruct through Hugging Face transformers for the batch-inference experiment.
# NOTE(review): this rebinds the notebook-level `model` and `tokenizer` names that
# `get_response` reads, so any later `get_response` call will see these objects instead of
# the mistral_inference ones — confirm that is intended.
model_id = "mistralai/Mistral-7B-Instruct-v0.3"
model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir="/models", torch_dtype=torch.float16, device_map="auto")
# model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
def format_prompts(questions):
    """Fill LEN_LIMITED_PROMPT with each question and return the prompts as a list."""
    prompts = []
    for question in questions:
        prompts.append(LEN_LIMITED_PROMPT.format(question=question))
    return prompts

def tokenize_questions(questions):
    """Encode each question with the notebook-level tokenizer; one result per question."""
    return list(map(tokenizer.encode, questions))


In [None]:
# Sequential pass over MetaQA, one get_response call per question. Results are appended
# as they arrive, so an interrupted run still leaves the answers collected so far.
bline_res = []
for metaqa_question in tqdm(metaqa.Question.tolist()):
    bline_res.append(
        get_response(LEN_LIMITED_PROMPT.format(question=metaqa_question))
    )


  0%|          | 8/9947 [00:12<3:44:55,  1.36s/it]