## Load the GSM8K data to test on

In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the "main" configuration of the gsm8k dataset; the helper below reads
# questions and answers from its 'train' split.
GSM8K = load_dataset("gsm8k", 'main')

In [38]:
def get_GSM8K_Q(question_or_answer, question_number, split="train") -> str:
    """Fetch one GSM8K question or reference answer by its 1-based position.

    Args:
        question_or_answer: "Q" to fetch the question text, "A" to fetch the
            reference answer.
        question_number: 1-based position of the example inside the split
            (1 is the first example).
        split: Key of the split to read from the module-level ``GSM8K``
            object. Defaults to "train".

    Returns:
        The requested text, or a usage hint string when
        ``question_or_answer`` is neither "Q" nor "A".

    Raises:
        ValueError: If ``question_number`` is smaller than 1. Without this
            check, 0 or a negative number would be turned into a negative
            index and silently return an example from the end of the split.
    """
    if question_or_answer == "Q":
        column = "question"
    elif question_or_answer == "A":
        column = "answer"
    else:
        # Kept as a returned hint (not an exception) for backward compatibility.
        return "either put 'Q' for question or 'A' for answer"

    if question_number < 1:
        raise ValueError(
            f"question_number is 1-based and must be >= 1, got {question_number}"
        )

    # Row-first access fetches only the requested example instead of going
    # through the whole column just to pick a single element from it.
    return GSM8K[split][question_number - 1][column]

In [39]:
# Sanity check: fetch the first training question (question_number is 1-based).
get_GSM8K_Q("Q", 1)

'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'

## Add PyTorch

In [25]:
import torch

In [26]:
# print(f"PyTorch version: {torch.__version__}")

# # Check PyTorch has access to MPS (Metal Performance Shader, Apple's GPU architecture)
# print(f"Is MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}")
# print(f"Is MPS available? {torch.backends.mps.is_available()}")

# # Set the device
# device = "mps" if torch.backends.mps.is_available() else "cpu"
# print(f"Using device: {device}")

In [7]:
# x = torch.rand(size=(3, 4)).to(device)
# x

# Phi-2 / Mistral 7B Instruct (via Ollama)

In [27]:
from langchain.llms import Ollama

In [28]:
# Shell out to the Ollama CLI to list the models installed locally.
!ollama list

NAME                    	ID          	SIZE  	MODIFIED   
mistral:7b-instruct-q8_0	2162e081e7f0	7.7 GB	2 days ago	
phi:latest              	e2fd6321a5fe	1.6 GB	2 days ago	


In [29]:
# Module-level model handle used by query_llm(); change the tag here to switch models.
# NOTE(review): `langchain.llms.Ollama` may be deprecated in newer LangChain releases — confirm against the installed version.
llm = Ollama(model="mistral:7b-instruct-q8_0") # phi / mistral:7b-instruct-q8_0

In [30]:
# Smoke test: send a single free-form prompt straight to the model.
llm.invoke("What is the best country in Scandinavia if you had to pick one?")

'It\'s difficult for me to say which is the "best" country in Scandinavia, as it largely depends on individual preferences and what each person values most. Each of the five countries in Scandinavia - Denmark, Norway, Sweden, Finland, and Iceland - has its own unique culture, attractions, and quality of life. \n\nFor example, Denmark is known for its charming cities, world-renowned design, and strong focus on work-life balance. Norway is home to stunning fjords, glaciers, and mountains, as well as a thriving oil industry. Sweden is known for its innovative approach to technology and social equality, while Finland has a rich cultural heritage and beautiful natural landscapes. Iceland is a volcanic island with unique geothermal features, hot springs, and a growing reputation for innovation and entrepreneurship.\n\nUltimately, the "best" country in Scandinavia will depend on what you value most - whether that\'s culture, nature, work-life balance, or something else entirely. It might be h

In [31]:
def query_llm(input_query) -> str:
    """Send ``input_query`` to the module-level ``llm`` and return its reply."""
    return llm.invoke(input_query)

In [32]:
from IPython.display import display, Markdown

In [33]:
def format_llm_answer(llm_output):
    """Render an LLM reply as Markdown in the notebook output area.

    Every literal backslash-plus-"n" character pair in ``llm_output`` is
    turned into a real line break before the text is wrapped in ``Markdown``
    and handed to ``display``.

    Returns:
        Whatever ``display`` returns for the rendered object.
    """
    # Splitting on the two-character sequence and re-joining with a newline
    # has the same effect as replacing each occurrence.
    with_line_breaks = "\n".join(llm_output.split("\\n"))
    return display(Markdown(with_line_breaks))

In [37]:
# Smoke-test the query -> Markdown rendering chain with a free-form prompt.
format_llm_answer(query_llm('What country do you prefer out of the scandinavian countries?'))

I don't have personal preferences. However, I can provide information about any of the Scandinavian countries upon request.

In [40]:
# Ask the model the first GSM8K training question and render its reply as Markdown.
format_llm_answer(query_llm(get_GSM8K_Q("Q", 1)))

Natalia sold 48 clips in April. In May, she sold half as many clips, which is 24 clips. So, the total number of clips she sold in April and May is:
48 + 24 = 72 clips
Therefore, Natalia sold a total of 72 clips in April and May.

In [41]:
# Reference answer for the first training question, for comparison with the model's reply.
get_GSM8K_Q("A", 1)

'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'

In [45]:
def check_answer_from_output(llm_answer, gsm8k_answer):
    """Use the LLM to judge whether a model answer matches the reference answer.

    The check runs in three LLM calls via ``query_llm``:
      1. extract the final numerical answer from ``llm_answer``;
      2. extract the final numerical answer from ``gsm8k_answer``;
      3. ask whether the two extracted answers are the same.

    Args:
        llm_answer: Free-text answer produced by the model under test.
        gsm8k_answer: Reference answer text from the dataset.

    Returns:
        None. The two extracted answers and the comparison verdict are
        printed to stdout.
    """
    extract_llm_number = query_llm(f"What is the final numerical answer of this text: {llm_answer}")
    extract_gsm8k_number = query_llm(f"What is the final numerical answer of this text: {gsm8k_answer}")

    # One complete comparison prompt. Previously a first comparison result was
    # computed and immediately overwritten (a wasted LLM round-trip), and the
    # prompt that survived was a truncated sentence that never asked a question.
    compare = query_llm(
        "You are a binary analyser. Are these two answers the same: "
        f"1; [{extract_llm_number}] and 2; [{extract_gsm8k_number}]? "
        "Answer 'Yes' or 'No'."
    )

    print("extract_llm_number output: ",extract_llm_number)
    print("extract_gsm8k_number output: ",extract_gsm8k_number)
    print("compare output: ",compare)

In [47]:
# End-to-end check on question 1: generate a fresh model answer and compare it with the reference answer.
check_answer_from_output(query_llm(get_GSM8K_Q("Q", 1)), get_GSM8K_Q("A", 1))

extract_llm_number output:  The final numerical answer of this text is 72 clips.
extract_gsm8k_number output:  The final numerical answer is 72.
compare output:  Yes, both answers are equivalent. They both indicate that the final numerical answer is 72. The only difference is in the phrasing and structure of the sentences.
