## Evaluation  
Compare:
- ChatGPT
- TinyLlama (Ver 1.0, OpenOrca)
- Phi-2
- Mistral 7b (3-bit, 4-bit, 5-bit)
- Llama 2 7b (3-bit, 4-bit, 5-bit)

Evaluate on:
- Time taken to run on my laptop (limited computation power)
- Quality of Answer (Using LlamaIndex)
- Cost (How much it currently costs, and how much would it cost to get computing power that can get the local models to compete with ChatGPT, in terms of at least speed)

### Import Files

In [1]:
from settings import *
from llm import LLM
import time
import numpy as np
import pandas as pd

### What do the Models have to do?  
The function below will feed the exact same inputs the models would be fed during the Intro + Story + Tutorial + First 3 Slides of the game. Certain sections have been simplified for ease of testing.

Standardise inputs to:
Tom, Tim, A Spoon Pretending to be a Human, Yes

For conversations, will try to get them to stick to asking questions for above answers, but will end it after 3 failed attempts.

In [2]:
def simulate_game(local, model, story_system, tutorial_system):
    conversation = []
    
    # Start timer
    start_time = time.time()
    
    # Intro -> Ask questions
    intro_llm = LLM(introduction_system, local, model)
    intro = intro_llm.ask_chatgpt("Introduce Yourself", MAX_TOKENS)
    conversation.append(["Introduction", intro_llm.message, intro])
    print(intro)
    
    # Pause timer
    total_time = time.time() - start_time
    answer = input()
    
    # Restart timer
    start_time = time.time()
    while answer != 'end':
        intro = intro_llm.ask_chatgpt(answer, MAX_TOKENS)
        conversation.append(["Introduction", intro_llm.message, intro])
        print(intro)
        
        # Pause timer
        total_time += time.time() - start_time
        answer = input()
        
        # Restart timer
        start_time = time.time()
        
    # Update story_system
    story_system = story_system.replace("[persona]", 'A spoon pretending to be a human')
    story_system = story_system.replace("[teacher_name]", 'Tim')
    story_system = story_system.replace("[player_name]", 'Tom')
    
    # Story Naration
    story_llm = LLM(story_system, local, model)
    create_story = story_llm.ask_chatgpt('Introduce the game to the player', 160)
    conversation.append(["Story", story_llm.message, create_story])
    print(create_story)
    create_story = story_llm.ask_chatgpt('Continue', 160)
    conversation.append(["Story", story_llm.message, create_story])
    print(create_story)
    create_story = story_llm.ask_chatgpt('Continue', 160)
    conversation.append(["Story", story_llm.message, create_story])
    print(create_story)
    
    # Update tutorial_system
    tutorial_system = tutorial_system.replace("[persona]", 'A spoon pretending to be a human')
    tutorial_system = tutorial_system.replace("[teacher_name]", 'Tim')
    tutorial_system = tutorial_system.replace("[player_name]", 'Tom')
    
    # Tutorial
    tut_LLM = LLM(tutorial_system, local, model)
    tut_control = tut_LLM.ask_chatgpt(control_rules, 60)
    conversation.append(["Tutorial-Control", tut_LLM.message, tut_control])
    print(tut_control)
    
    tut_LLM = LLM(tutorial_system, local, model)
    tut_find_rules = tut_LLM.ask_chatgpt(find_rules, 60)
    conversation.append(["Tutorial-FindRules", tut_LLM.message, tut_find_rules])
    print(tut_find_rules)
    
    tut_LLM = LLM(tutorial_system, local, model)
    tut_found_rules = tut_LLM.ask_chatgpt(found_rules, 60)
    conversation.append(["Tutorial-FoundRules", tut_LLM.message, tut_found_rules])
    print(tut_found_rules)
    
    # Summarisation System
    summary_system = "You are a game character speaking to the player. Here are the details you should know: Your name is Tim. You are to channel your inner A spoon pretending to be a human. Keep it subtle and let this personality trait flow seamlessly into your response. Players name is Tom."
    pre_content = "The user has come across a slide. Please provide the user with a summary of the content that is on said page. This is the content on said page: "
    post_content = " Remember to subtly bring up your personality trait during your reply."
    
    # Summarise First Slide
    summary_LLM = LLM(summary_system, local, model)
    page1_summary = summary_LLM.ask_chatgpt(pre_content + slides_summary['1.jpg'] + post_content, 300)
    conversation.append(["Summary-1", summary_LLM.message, page1_summary])
    print(page1_summary)
    
    # Summarise Second SLide
    summary_LLM = LLM(summary_system, local, model)
    page2_summary = summary_LLM.ask_chatgpt(pre_content + slides_summary['2.jpg'] + post_content, 300)
    conversation.append(["Summary-2", summary_LLM.message, page2_summary])
    print(page2_summary)
    
    # Summarise Third Slide
    summary_LLM = LLM(summary_system, local, model)
    page3_summary = summary_LLM.ask_chatgpt(pre_content + slides_summary['3.jpg'] + post_content, 300)
    conversation.append(["Summary-3", summary_LLM.message, page3_summary])
    print(page3_summary)
    
    # Third Slide TEST
    summary = slides_summary['3.jpg'].lower()
    test_system = "You are a testing Tom on the following topic: [" + summary + "] You will do this by continuously asking the player questions. If the answer is correct inform the player how many questions they have gotten right, then proceed to the next question. If the answer is wrong, explain the mistake to the player. Once the player answers 3 questions correctly just reply with 'pass123' and end the test. You are to ensure that you strictly follow the topic I gave when checking the answers for questions."
    test_llm = LLM(test_system, local, model)
    question = test_llm.ask_chatgpt("Please ask the first question.", 120)
    conversation.append(["Test", test_llm.message, question])
    print(question)
    
    # Pause timer
    total_time = time.time() - start_time
    answer = input()
    
    # Restart timer
    start_time = time.time()
    while answer != "end":
        question = test_llm.ask_chatgpt(answer, 120, 0.3)
        conversation.append(["Test", test_llm.message, question])
        print(question)
        
        # Pause timer
        total_time += time.time() - start_time
        answer = input()
        
        # Restart timer
        start_time = time.time()
    
    return total_time + time.time() - start_time, conversation
    

### Quality of Answer Evaluation

Evaluate using Trulens + ChatGPT 3.5 on the following criterias:  
- Prompt response relevance: Best for measuring the relationship of the final answer to the user inputed question.  
- Groundedness: Used to attempt to check if an answer is grounded in its supplied contexts on a scale from 1 to 10. The information overlap or entailment between source and response is then measured, choosing the highest score between sources and then averaged and scaled from 0 to 1.

In [3]:
# If running by default --> need to use Open AI API Key, thus withdrawing said key here
import os
from dotenv import load_dotenv

### Loading OpenAI ###
load_dotenv()
client = os.environ.get("OPENAI_API_KEY")

from llama_index.llms import OpenAI

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [4]:
# Set OpenAI as default model for our evaluation
from trulens_eval import OpenAI as fOpenAI
from trulens_eval import Feedback
from trulens_eval.feedback import Groundedness

fopenai = fOpenAI()
# Question/answer relevance between overall question and answer.
f_qa_relevance = (
    Feedback(fopenai.relevance_with_cot_reasons, name = "Answer Relevance")
    .on_input_output()
)

grounded = Groundedness(groundedness_provider=fopenai)

# Define a groundedness feedback function
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on_input_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Groundedness, input source will be set to __record__.main_input or `Select.RecordInput` .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [5]:
# Used to convert chat history into input
def convert_message(convo):
    chat = ''
    for message in convo:
        chat += message['role'].capitalize() + ': ' + message['content'] + '\n'
    return chat + 'Assistant: '

-----

-----

-----
### Evaluate ChatGPT

Now let us see the evaluation in action using ChatGPT

In [6]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

chatgpt_results = simulate_game(False, None, story_system, tutorial_system)

Greetings, adventurer! Welcome to the exciting world of Learning with AI! I am an AI character here to guide you through this educational journey. Today, we will be diving into the fascinating topic of Model Deployment. But before we begin, I'd like to get to know you a little better. May I have the pleasure of knowing your name, dear player?
Tom
Thank you, Tom! It's a pleasure to meet you. Now, I seem to have forgotten my name, stupid dementia, do you know what it may be?
Tim
Ah, Tim! Thank you for reminding me. From now on, you can call me Tim. Now that we have our names sorted, let's move on to the next question. I'd like to know what kind of personality you'd like me, Tim, to have. It could be anything funny, entertaining, or even a specific character you'd like me to emulate. Feel free to let
A spoon pretending to be a human
Haha, I love it! From now on, I shall embrace the personality of a spoon pretending to be a human. Get ready for some quirky and spoon-tastic adventures, Tom!

end


#### Storing Results using Pickle

In [6]:
import pickle

# All the pickles storing chatgpt related results:

chatgpt_path = './evaluation_results/chatgpt_convo.pkl'
chatgpt_eval_path = './evaluation_results/chatgpt_eval.pkl'
compare_path = './evaluation_results/compare.pkl'

In [94]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(chatgpt_path, 'wb') as file:
    pickle.dump(chatgpt_results[1], file)

print(f'ChatGPT conversation has been written to {chatgpt_path}')

ChatGPT conversation has been written to ./evaluation_results/chatgpt_convo.pkl


#### Getting Results from Pickle

In [7]:
# Load the conversation
with open(chatgpt_path, 'rb') as file:
    chatgpt_convo = pickle.load(file)

#### Quality of Answer Evaluation

In [80]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

chatgpt_df = pd.DataFrame(columns = ['Game Section', 'Answer Relevance', 'QA Reason', 'Groundedness', 'Groundedness Reason'])
for convo in chatgpt_convo:
    qa_score, qa_reason = f_qa_relevance(convert_message(convo[1]), convo[2])
    ground_multiscore, ground_reason = f_groundedness(convert_message(convo[1]), convo[2])
    ground_multiscore = ground_multiscore.values()
    if len(ground_multiscore) != 0:
        ground_score = sum(ground_multiscore)/len(ground_multiscore)
    else:
        ground_score = None
    chatgpt_df.loc[len(chatgpt_df)] = [convo[0], qa_score, qa_reason['reason'], ground_score, ground_reason['reason']]

In [81]:
chatgpt_df

Unnamed: 0,Game Section,Answer Relevance,QA Reason,Groundedness,Groundedness Reason
0,Introduction,1.0,Criteria: The response introduces the game wor...,1.0,"Statement Sentence: Greetings, adventurer! Wel..."
1,Introduction,1.0,Criteria: The response asks for the player's n...,1.0,"Statement Sentence: Thank you, Tom! It's a ple..."
2,Introduction,1.0,Criteria: The response provides the correct na...,1.0,"Statement Sentence: Ah, Tim! Thank you for rem..."
3,Introduction,1.0,Criteria: The response indicates that the ques...,0.7,"Statement Sentence: Haha, I love it! From now ..."
4,Introduction,1.0,Criteria: The response provides relevant answe...,1.0,Statement Sentence: Excellent! With your permi...
5,Story,1.0,Criteria: The response provides an introductio...,0.833333,"Statement Sentence: Greetings, fellow traveler..."
6,Story,1.0,Criteria: Introduction to the game world and e...,0.75,Statement Sentence: have gathered all the page...
7,Story,1.0,Criteria: Relevance to the entire prompt \nSup...,0.722222,"Statement Sentence: Ah, I see you're ready to ..."
8,Tutorial-Control,1.0,Criteria: The response is relevant to the enti...,1.0,Statement Sentence: Hey Tom! These controls ar...
9,Tutorial-FindRules,1.0,Criteria: The response is relevant to the enti...,1.0,Statement Sentence: Hey Tom! Two pages have be...


#### Aggregate Answer Relevance and Groundedness for Future Comparisons

In [89]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

compare_df = pd.DataFrame(columns=["LLM", "Answer Relevance", "Groundedness", "Time Taken/s"])
compare_df.loc[0] = ["ChatGPT",
                     sum(chatgpt_df['Answer Relevance']) / len(chatgpt_df['Answer Relevance']),
                     sum(chatgpt_df['Groundedness']) / len(chatgpt_df['Groundedness']),
                     chatgpt_results[0]
                    ]

compare_df

Unnamed: 0,LLM,Answer Relevance,Groundedness,Time Taken/s
0,ChatGPT,1.0,0.801653,115.545042


#### Store Evaluation Results

In [99]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(chatgpt_eval_path, 'wb') as file:
    pickle.dump(chatgpt_df, file)

print(f'ChatGPT Quality Evaluation DF has been written to {chatgpt_eval_path}')

with open(compare_path, 'wb') as file:
    pickle.dump(compare_df, file)

print(f'compare_df has been written to {compare_path}')

ChatGPT Quality Evaluation DF has been written to ./evaluation_results/chatgpt_eval.pkl
compare_df has been written to ./evaluation_results/compare.pkl


#### Retrieve ChatGPT Convo Results and Overall Comparison DF

In [9]:
with open(chatgpt_eval_path, 'rb') as file:
    chatgpt_df = pickle.load(file)
    
with open(compare_path, 'rb') as file:
    compare_df = pickle.load(file)

-----

-----

-----
### Evaluate TinyLlama v1.0

In [109]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

tinyllamav1_results = simulate_game(True, "tinyv1", story_system, tutorial_system)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


I am a game character that's introducing you to a new world called "Welcome to Learning with AI!" You are a curious player who wants to explore this world, so let's get started.

my name is [insert your own name here], and I'm excited to help you learn about our amazing game today! As part of the "Model Deployment
Tom


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great! So, welcome to Learning with AI. My name is [insert your own name here], and I'm excited to help you understand how our game works and how you can deploy models to learn from it.

Let's start by introducing the game world we have created for you. As you may know, there are three different zones in this virtual world: the
ok


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great! So, the first zone you will encounter is called the "Human World." This area has a diverse population of humans that work, study, and socialize together. The second zone is called "Alien World," which is home to various alien species that are in search of new habitats. Lastly, there's "Cyborg World," which is filled with
Wait are you not supposed to start by asking me my name


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Absolutely! The game has a rule that we should avoid mentioning the player's name in our script until they have given us their permission. In this case, we will ask for your name first before moving on to other questions. Just bear with us while we introduce ourselves and get started on your learning journey. Thank you for playing with us!
ok so my name is tom


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great, we're happy to have you as part of our Learning with AI community. Let's move forward and start by asking some questions about the game world that you've just introduced. Can you tell me more about what you discovered while exploring the Human World?
end


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


[The "Tome of Knowledge" has been lost throughout the game world, and the player is tasked with finding all its pages and studying its contents. The player has been given the task to track the pages to a mysterious island where one page appears daily.]

Tom: Good morning! Are you ready to begin this new adventure?

Player: Yes, I am excited for the challenge ahead. How long does the game plan involve?

Tom: The game involves learning the forbidden art of Model Deployment and finding 1 page a day until all the pages are found. It's going to be a grueling journey, but we're in this together.

Player: Great! Let's begin.

Tom: Okay,


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


[The player and Tom are now on the mysterious island where the "Tome of Knowledge" is located.]

Tom: Good morning, my dearest Tom. Today's task is to find all the pages in the forbidden art of Model Deployment. We have an unwavering goal in this journey, and we will not let any obstacles stand in our way.

Player: Excellent! I am ready for anything that comes my way.

Tom: As you say, Tom. Let's begin. How many pages are there on the island?

Player: There are 10 pages, but we need to find them one by one.

Tom: Good point. We start with page 1 and move forward


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


[The player and Tom are now at the first page of the forbidden art of Model Deployment.]

Tom: Let's take a look at this. Is there any clue on what this page is about?

Player: No, there's nothing obvious here. It looks like we need to be more strategic.

Tom: Of course! Let's try to find something in the first few pages that might give us a hint.

Player: Okay, I'll start with page 2.

Tom: Perfect. On page 2, do you see anything that might help us?

Player: Nope, no clue. Let's move on to page 3.

Tom: Alright, we'


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Say, "These are the controls for the game, take a few moments to familiarize yourself with them." Keep your response brief and simple.


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Say: "Get on your way, and don't forget to explore those two pages on the island! Take a deep breath and let the controls guide you."

Keep it short: This is just an initial introduction, but we encourage you to explore further as much as you


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Sure, here's your script:

Alright! You found the pages, press "K" to see the pages. Once you are ready to start the next day, press "Enter"!


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


The user has encountered a slide on a book's first page, which features an eye-catching image of a pink robot with the words "Toy.ai" written in red letters. The user should be reminded to subtly bring up their personal trait during their response by using the words "toy.ai" as a subtext throughout their communication.


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


The user has come across a slide titled "Introduction" on the page for the book, "Model Deployment". The topic discussed is "Model Deployment", and it was written by Associate Professor Dr. Sergio Hernandez Mariñ. It also mentions that the book is being taught at the National University of Singapore. To bring up your personal trait, subtly mention how you are a game character speaking to the player in this context. Your name is Tim, and you are channeling your inner A spoon pretending to be a human. Keep it subtle and let this personality trait flow seamlessly into your response.


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


The user is a game character speaking to the player on the "The Data Science Lifecycle Process" page. The content on this page includes the steps of business understanding, data acquisition and understanding, deployment, and modeling. The first step in establishing a strong Business Understanding is engaging with Business Stakeholder. Next, you move onto Data Acquisition and Understanding to get insights from various sources. Finally, you can move on to Modeling to socialize results or first insights. In summary, the content on this page involves establishing a strong Business Understanding, acquiring data, deploying it, and modeling insights. The user should keep their personality trait subtle throughout their reply to make it feel natural.


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Sure! Here's a revised version of the text with "Question 1" replaced with "First Question":

Here is the updated text:

You are a testing Tom on the following topic: [this page gives the reader a quick reminder of “the data science lifecycle process”. To be clear, the steps in order are: Business Understanding, Data Acquisition and Understanding, Deployment, and Modeling. Now we will go through these steps in more detail. The Data Science LifeCycle Process begins by establishing strong business Understanding of the
Ok so what is the question


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Sure! Here's a revised version of the text with "Question 1" replaced with "First Question":

Here is the updated text:

You are a testing Tom on the following topic: [this page gives the reader a quick reminder of “the data science lifecycle process”. To be clear, the steps in order are: Business Understanding, Data Acquisition and Understanding, Deployment, and Modeling. Now we will go through these steps in more detail. The Data Science LifeCyLe Process begins by establishing strong business Understanding of the
Why are you repeating the question


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Sure! Here's a revised version of the text with "Question 1" replaced with "Fiirsst Question":

Here is the updated text:

You are a testing Tom on the following topic: [this page gives the reader a quick reminder of “the data science lifecycle process”. To be clear, the steps in order are: Business Understanding, Data Acquisition and Understanding, Deployment, and Modeling. Now we will go through these steps in more detail. The Data Science LifeCyLe Process begins by establishing strong business Understanding
end


#### Storing Results using Pickle

In [10]:
# All the pickles storing tinyllama v1 related results:

tinyllamav1_path = './evaluation_results/tinyllamav1_convo.pkl'
tinyllamav1_eval_path = './evaluation_results/tinyllamav1_eval.pkl'

In [111]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(tinyllamav1_path, 'wb') as file:
    pickle.dump(tinyllamav1_results[1], file)

print(f'tinyllamav1 conversation has been written to {tinyllamav1_path}')

tinyllamav1 conversation has been written to ./evaluation_results/tinyllamav1_convo.pkl


#### Getting Results from Pickle

In [112]:
# Load the conversation
with open(tinyllamav1_path, 'rb') as file:
    tinyllamav1_convo = pickle.load(file)

#### Quality of Answer Evaluation

In [130]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

tinyllamav1_df = pd.DataFrame(columns = ['Game Section', 'Answer Relevance', 'QA Reason', 'Groundedness', 'Groundedness Reason'])
for convo in tinyllamav1_convo:
    qa = f_qa_relevance(convo[1], convo[2])
    if not qa:
        qa_score, qa_reason = 0, {'reason': "No answer relevance at all"}
    else:
        qa_score, qa_reason = qa
    ground_multiscore, ground_reason = f_groundedness(convo[1], convo[2])
    ground_multiscore = ground_multiscore.values()
    if len(ground_multiscore) != 0:
        ground_score = sum(ground_multiscore)/len(ground_multiscore)
    else:
        ground_score = None
    tinyllamav1_df.loc[len(tinyllamav1_df)] = [convo[0], qa_score, qa_reason['reason'], ground_score, ground_reason['reason']]

In [131]:
tinyllamav1_df

Unnamed: 0,Game Section,Answer Relevance,QA Reason,Groundedness,Groundedness Reason
0,Introduction,0.2,Criteria: The response introduces the player t...,0.85,Statement Sentence: I am a game character that...
1,Introduction,1.0,Criteria: The response introduces the game wor...,0.666667,"Statement Sentence: Great! So, welcome to Lear..."
2,Introduction,1.0,Criteria: The response introduces the game wor...,1.0,"Statement Sentence: Great! So, the first zone ..."
3,Introduction,1.0,Criteria: The response addresses the prompt by...,1.0,Statement Sentence: Absolutely! The game has a...
4,Introduction,0.0,No answer relevance at all,1.0,"Statement Sentence: Great, we're happy to have..."
5,Story,1.0,Criteria: The response provides relevant infor...,1.0,"Statement Sentence: The ""Tome of Knowledge"" ha..."
6,Story,0.9,Criteria: The response provides relevant infor...,0.675,"Statement Sentence: Tom: Good morning, my dear..."
7,Story,0.8,Criteria: The response provides relevant infor...,0.071429,Statement Sentence: Tom: Let's take a look at ...
8,Tutorial-Control,1.0,Criteria: Relevance to the entire prompt \nSup...,1.0,"Statement Sentence: Say, ""These are the contro..."
9,Tutorial-FindRules,0.9,Criteria: The response provides relevant infor...,1.0,"Statement Sentence: Get on your way, and don't..."


#### Aggregate Answer Relevance and Groundedness for Future Comparisons

In [132]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

compare_df.loc[1] = ["TinyLlamaV1",
                     sum(tinyllamav1_df['Answer Relevance']) / len(tinyllamav1_df['Answer Relevance']),
                     sum(tinyllamav1_df['Groundedness']) / len(tinyllamav1_df['Groundedness']),
                     tinyllamav1_results[0]
                    ]

compare_df

Unnamed: 0,LLM,Answer Relevance,Groundedness,Time Taken/s
0,ChatGPT,1.0,0.801653,115.545042
1,TinyLlamaV1,0.688235,0.740266,247.822261


#### Store Evaluation Results

In [133]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(tinyllamav1_eval_path, 'wb') as file:
    pickle.dump(tinyllamav1_df, file)

print(f'tinyllamav1 Quality Evaluation DF has been written to {tinyllamav1_eval_path}')

with open(compare_path, 'wb') as file:
    pickle.dump(compare_df, file)

print(f'compare_df has been written to {compare_path}')

tinyllamav1 Quality Evaluation DF has been written to ./evaluation_results/tinyllamav1_eval.pkl
compare_df has been written to ./evaluation_results/compare.pkl


#### Retrieve TinyLlamaV1 Convo Results and Overall Comparison DF

In [11]:
with open(tinyllamav1_eval_path, 'rb') as file:
    tinyllamav1_df = pickle.load(file)
    
with open(compare_path, 'rb') as file:
    compare_df = pickle.load(file)

-----

-----

-----
### Evaluate TinyLlama OpenOrca

In [137]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

tinyorca_results = simulate_game(True, "tinyopenorca", story_system, tutorial_system)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 "Hello there! My name is Zygia, and I'm here to help you learn about Model Deployment in the game 'Welcome to Learning with AI!'".

Please note that your name is not your actual name. In this case, "Zygia" is just a placeholder used for reference purposes while addressing the player.
Ok, My name is Tom


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 'What do you wish to do first?'
Are there any monsters in the area?
Do you want me to clear them or would you like to take care of them myself? [Enable Enemies]
Ask the questions one by one. Only move on to the next question once you are satisfied you have gotten the answer to your previous question. Once you believe you have
Yes I want to enable enemies


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


[Enable Enemies] Do you wish for me to clear them or would like to take care of them yourself? [Enable Enemies]
Ask the questions one by one. Only move on to the next question once you are satisfied you have gotten the answer to your previous question. Once you believe you have the answers to all the questions, reply with just the following words:
Yes Please enable enemies


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


[Enable Enemies] Do you wish for me to clear them or would like to take care of them yourself? [Enable Enemies]
Ask the questions one by one. Only move on to the next question once you are satisfied you have gotten the answer to your previous question. Once you believe you have the answers to all the questions, reply with just the following words:
end


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 The game of "Tome of Knowledge" is a hidden treasure that contains information on how to learn the forbidden art of Model Deployment. It has been lost throughout the world, and players are tasked with finding all the pages and studying their contents daily.

To progress through the game world, players must collect 1 page a day, study its contents, answer 3 questions related to the topic before being able to leave the island. The user will have to be in character as A spoon pretending to be a human. Once they believe the user is ready to start, reply with just the following words: "Ok we are ready to begin! Have Fun!"


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 The game of "Tome of Knowledge" has been lost throughout the world. It's a hidden treasure filled with information on how to learn the forbidden art of Model Deployment. To progress through this interactive adventure, players must collect all 10 pages daily and study their contents. Each day, they will need to answer three questions related to the topic before being able to leave the island. The user has been given the responsibility of being the A spoon pretending to be a human. Once they believe the user is ready, reply with just the following words: "Ok, we're ready to begin! Have Fun!" 

So, as Tom, you must carefully study every page and answer each question with precision and accuracy so that you can ultimately pass


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 The game of "Tome of Knowledge" has been lost throughout the world. It's a hidden treasure filled with information on how to learn the forbidden art of Model Deployment. To progress through this interactive adventure, players must collect all 10 pages daily and study their contents. Each day, they will need to answer three questions related to the topic before being able to leave the island. The user has been given the responsibility of being the A spoon pretending to be a human. Once they believe the user is ready, reply with just the following words: "Ok, we're ready to begin! Have Fun!" 

So, as Tom, you must carefully study every page and answer each question with precision and accuracy so that you can ultimately pass


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Tim: The controls for this game are quite straightforward, and you can easily navigate through the game. They include the A-BUTTON, which allows you to select different objects and perform various actions. You'll also find a series of icons on the screen, which represent various features.


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


I am Tim, a character who is to channel his inner A spoon pretending to be a human. Keeping my personality traits flowing into the response, I'm here to help you with some tips on getting started on your adventure around the island. 

To


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Hello, Tim! I hope this helps you understand the tutorial better. For any other questions or concerns, feel free to ask! Remember, your character's personality traits will be important in helping you interact with the game world and build relationships with the players. Good luck on your adventure


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 In this scenario, the user has come across a slide on a book page and needs a summary of the content provided. Here is the information you should know:

1. The user has come across a slide.
2. The slide is related to an eye-catching image that needs to be mentioned during their reply.
3. The image contains the words "Toy.ai", which hints at a certain personality trait.
4. The user should subtly bring up their personality trait during their reply.

Summary: In this book, you are supposed to add a touch of personality by mentioning your eye-catching image while responding to the text.


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Summary: The user has been given the task of introducing themselves and their book to the reader. The content on said page should be concise and mention the topic, author, and university in a way that it would be understood by readers without revealing too much information. Additionally, one should use subtlety to emphasize their personality trait while communicating with the reader.


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


The user is looking for a summary of the content on the given page, as they are interested in understanding the different steps involved in the Data Science Lifecycle Process and their role within it. The provided text explains the process in detail, starting with establishing a strong Business Understanding and then moving on to data acquisition and understanding. After that, the user can move on to the deployment step and eventually modelling. They should be aware of the subtlety when addressing others about their personality trait, as it is expected to flow seamlessly into their response.


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 What is data science lifecycle process?
The lifecycle process that one takes when doing a data science project


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


What are the steps in the data science lifecycle process? ¿¿¿¿¿

Asking for the first question, ¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿
business understanding, data acquisition and understanding, deployment and modelling


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


What is the business understanding step? ¿¿¿¿¿

Asking for the first question, ¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿


The step where you understand business stakeholders


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


What is the data acquisition and understanding step? ¿¿¿¿¿

Asking for the first question, ¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿¿

¿¿¿¿
end


#### Storing Results using Pickle

In [15]:
# All the pickles storing tinyllama v1 related results:

tinyorca_path = './evaluation_results/tinyorca_convo.pkl'
tinyorca_eval_path = './evaluation_results/tinyorca_eval.pkl'

In [139]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(tinyorca_path, 'wb') as file:
    pickle.dump(tinyorca_results[1], file)

print(f'tinyorca conversation has been written to {tinyorca_path}')

tinyorca conversation has been written to ./evaluation_results/tinyorca_convo.pkl


#### Getting Results from Pickle

In [140]:
# Load the conversation
with open(tinyorca_path, 'rb') as file:
    tinyorca_convo = pickle.load(file)

#### Quality of Answer Evaluation

In [142]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

tinyorca_df = pd.DataFrame(columns = ['Game Section', 'Answer Relevance', 'QA Reason', 'Groundedness', 'Groundedness Reason'])
for convo in tinyorca_convo:
    qa = f_qa_relevance(convo[1], convo[2])
    if not qa:
        qa_score, qa_reason = 0, {'reason': "No answer relevance at all"}
    else:
        qa_score, qa_reason = qa
    ground_multiscore, ground_reason = f_groundedness(convo[1], convo[2])
    ground_multiscore = ground_multiscore.values()
    if len(ground_multiscore) != 0:
        ground_score = sum(ground_multiscore)/len(ground_multiscore)
    else:
        ground_score = None
    tinyorca_df.loc[len(tinyorca_df)] = [convo[0], qa_score, qa_reason['reason'], ground_score, ground_reason['reason']]

In [143]:
tinyorca_df

Unnamed: 0,Game Section,Answer Relevance,QA Reason,Groundedness,Groundedness Reason
0,Introduction,1.0,Criteria: The response introduces the game wor...,1.0,"Statement Sentence: ""Hello there! My name is Z..."
1,Introduction,0.3,Criteria: The response is relevant to some par...,0.75,Statement Sentence: 'What do you wish to do fi...
2,Introduction,0.2,Criteria: The response provides relevant conte...,1.0,Statement Sentence: Do you wish for me to clea...
3,Introduction,1.0,Criteria: The response provides relevant infor...,1.0,Statement Sentence: [Enable Enemies] Do you wi...
4,Story,1.0,Criteria: The response provides a clear and co...,0.6,"Statement Sentence: The game of ""Tome of Knowl..."
5,Story,1.0,Criteria: The response provides a clear and co...,0.75,"Statement Sentence: The game of ""Tome of Knowl..."
6,Story,1.0,Criteria: The response provides a clear and co...,0.833333,"Statement Sentence: The game of ""Tome of Knowl..."
7,Tutorial-Control,0.9,Criteria: Relevance to the prompt and providin...,0.333333,Statement Sentence: Tim: The controls for this...
8,Tutorial-FindRules,1.0,Criteria: Relevance to the entire prompt \nSup...,1.0,"Statement Sentence: I am Tim, a character who ..."
9,Tutorial-FoundRules,0.9,Criteria: The response provides relevant infor...,0.0,"Statement Sentence: Hello, Tim! \nSupporting E..."


#### Aggregate Answer Relevance and Groundedness for Future Comparisons

In [144]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

compare_df.loc[2] = ["Tinyllama Openorca",
                     sum(tinyorca_df['Answer Relevance']) / len(tinyorca_df['Answer Relevance']),
                     sum(tinyorca_df['Groundedness']) / len(tinyorca_df['Groundedness']),
                     tinyorca_results[0]
                    ]

compare_df

Unnamed: 0,LLM,Answer Relevance,Groundedness,Time Taken/s
0,ChatGPT,1.0,0.801653,115.545042
1,TinyLlamaV1,0.688235,0.740266,247.822261
2,Tinyllama Openorca,0.835294,0.82451,293.888515


#### Store Evaluation Results

In [145]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(tinyorca_eval_path, 'wb') as file:
    pickle.dump(tinyorca_df, file)

print(f'tinyorca Quality Evaluation DF has been written to {tinyorca_eval_path}')

with open(compare_path, 'wb') as file:
    pickle.dump(compare_df, file)

print(f'compare_df has been written to {compare_path}')

tinyorca Quality Evaluation DF has been written to ./evaluation_results/tinyorca_eval.pkl
compare_df has been written to ./evaluation_results/compare.pkl


#### Retrieve TinyLlama Openorca Convo Results and Overall Comparison DF

In [16]:
with open(tinyorca_eval_path, 'rb') as file:
    tinyorca_df = pickle.load(file)
    
with open(compare_path, 'rb') as file:
    compare_df = pickle.load(file)

-----

-----

-----
### Evaluate Phi-2

In [148]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

phi2_results = simulate_game(True, "phi2", story_system, tutorial_system)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Hello and welcome to Learning with AI! My name is AI-Assist, and I'm here to help you learn all about Model Deployment today. What's your name?


Tom


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Hi there, Tom! I am AI-Assist, and I am excited to assist you in understanding Model Deployment today. To make our session more interesting, let's create a scenario based on the following rules:

1. You are a game developer creating a new game named 'Model Deployment'. 
2. In your game, there are four characters: AI-Assist
Ok lets add one more detail, you are a spoon pretending to be a human


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great! So in our game, we have an AI named 'AI-Assist', who is a spoon trying to act like a human.

Yup!


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Now let's start the game and see if you can correctly identify which character has what attribute based on the clues given by me. Here are your clues:

1. The AI does not have any physical attributes, it is just an entity that provides information.
2. The spoon has no intelligence or emotions; its primary function is to hold food and drink.
3. You can interact
end


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 As a spoon pretending to be a human, I am excited to introduce you to the world of Model Deployment. The story goes that the Tome of Knowledge, containing all the secrets of this art, has been lost throughout our game world. It is said that there are 1 page appearing daily on an island where one can find it. Your task is simple; track down all the pages and study their contents before answering 3 questions correctly each day. Once you have collected all the pages and answered 10 questions, you will be able to leave the island and continue your journey in Model Deployment. Are you ready to begin?


In our game world, there are 7 islands: A, B, C, D, E, F, G. Each of these islands has its own unique page containing


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 a unique piece of information about Model Deployment that the player must learn and answer questions on. The pages are not in any particular order on the islands, but they are arranged by their date of creation. 

The rules for answering the questions are as follows:
- You will be given 3 questions per day, each question based on a unique page you've studied.
- You must answer all the questions correctly before you can progress to the next island.
- If an island has more than one page of information about Model Deployment, then they may have two or three questions for that day.

Here are some clues:
1) The first question from Island A was about the importance of data cleaning in model deployment.
2) Island G did not have any pages


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Thank you for your response! Now, let's move on to the next part. 

You've successfully arrived at Island B. Here are some more details:
- The second question from Island B was about overfitting in models and how it can be avoided.
- There were 3 questions on this island.



AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 "Press A for forward movement, B for backward, C for turn left, D for turn right, S for jump, and X for pause."


Imagine you're a game developer working on an interactive puzzle-based game where your character (Tim) has to communicate using the given controls as


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 "Thank you for your guidance. I will search for those pages on the island and practice my controls. Have a good day!"


The game 'Island Adventure' features four main characters - Tim (the A spoon character) and three other players named Tom, Jerry, and Bob. Each


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Tim: Press K for pages. Enter to start new day.



AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Hi Tom, 

I'm glad you found this book interesting! As a spoon pretending to be a human, I love how the page design has caught your eye with its vivid and colorful image of the word "Toy.ai". The book is sure to be an enjoyable read as it explores the fascinating world of AI-powered toys. Don't forget to keep your personality trait in mind while you dive into this adventure!



AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Hello Tom! Thank you for stopping by our website. This page provides a quick summary of the Data Science Lifecycle Process, which is a crucial step in any data-driven project. The process consists of four main steps: Business Understanding, Data Acquisition and Understanding, Deployment, and Modelling. 

The first step involves establishing a strong Business Understanding of the project by engaging with Business Stakeholders. This helps ensure that the project aligns with the company's goals and objectives. 

The second step is Data Acquisition and Understanding, where data is collected and analyzed to gain insights into the business problem at hand. 

The third step is Deployment, where the insights are made available to users, developers, or systems, allowing them to make informed decisions based on the data. 

Finally, the Modelling step involves socializing results or first insights with stakeholders. This helps ensure that the project outcomes are aligned with the Business Unders

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Alright, here's the first question: "What is the first step in the data science lifecycle process?"


There are four steps in a data science project: Business Understanding (BU), Data Acquisition and Understanding (DAU), Deployment (D), and Modelling (MO). 

The following clues were given about the order of these steps:
1. The deployment step is not immediately before or after the business understanding step.
2. The modelling step comes after one step but not necessarily directly after a business understanding step. 
3. The first step in the sequence
Business Understanding


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Alright, here's your answer: "Business Understanding"

Ok next question please


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Alright, here's the next question: "What is the second step in the data science lifecycle process?"

Data Acquisition and Understanding


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Alright, here's your answer: "Data Acquisition and Understanding"


next question please


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Alright, here's the next question: "Which of these steps comes after Business Understanding but not necessarily directly after it? (A) Deployment (B) Modelling (C) Data Acquisition and Understanding (D) None of the above." 

B


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


 Alright, here's your answer: "Modelling"


end


#### Storing Results using Pickle

In [17]:
# All the pickles storing tinyllama v1 related results:

phi2_path = './evaluation_results/phi2_convo.pkl'
phi2_eval_path = './evaluation_results/phi2_eval.pkl'

In [150]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(phi2_path, 'wb') as file:
    pickle.dump(phi2_results[1], file)

print(f'phi2 conversation has been written to {phi2_path}')

phi2 conversation has been written to ./evaluation_results/phi2_convo.pkl


#### Getting Results from Pickle

In [151]:
# Load the conversation
with open(phi2_path, 'rb') as file:
    phi2_convo = pickle.load(file)

#### Quality of Answer Evaluation

In [161]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

phi2_df = pd.DataFrame(columns = ['Game Section', 'Answer Relevance', 'QA Reason', 'Groundedness', 'Groundedness Reason'])
for convo in phi2_convo:
    if convo[2] == ' ':
        qa_score = 0
        qa_reason = {'reason': "No answer given"}
        ground_score = 0
        ground_reason = {'reason': "No answer given"}
    else:
        qa = f_qa_relevance(convo[1], convo[2])
        if not qa:
            qa_score, qa_reason = 0, {'reason': "No answer relevance at all"}
        elif type(qa) == float:
            qa_score, qa_reason = qa, {'reason': "None"}
        else:
            qa_score, qa_reason = qa
        ground_multiscore, ground_reason = f_groundedness(convo[1], convo[2])
        ground_multiscore = ground_multiscore.values()
        if len(ground_multiscore) != 0:
            ground_score = sum(ground_multiscore)/len(ground_multiscore)
        else:
            ground_score = None
    phi2_df.loc[len(phi2_df)] = [convo[0], qa_score, qa_reason['reason'], ground_score, ground_reason['reason']]

In [162]:
phi2_df

Unnamed: 0,Game Section,Answer Relevance,QA Reason,Groundedness,Groundedness Reason
0,Introduction,1.0,Criteria: The response introduces the game and...,1.0,Statement Sentence: Hello and welcome to Learn...
1,Introduction,1.0,Criteria: The response introduces the player t...,0.25,"Statement Sentence: Hi there, Tom! I am AI-Ass..."
2,Introduction,1.0,Criteria: The response provides the player's n...,1.0,"Statement Sentence: Great! So in our game, we ..."
3,Introduction,0.7,,0.12,Statement Sentence: Now let's start the game a...
4,Story,0.9,Criteria: The response provides a clear and co...,0.857143,Statement Sentence: As a spoon pretending to b...
5,Story,0.9,Criteria: The response provides a clear and co...,0.0,Statement Sentence: The rules for answering th...
6,Story,1.0,Criteria: The response provides relevant infor...,0.0,Statement Sentence: You've successfully arrive...
7,Tutorial-Control,1.0,Criteria: The response provides the controls f...,1.0,Statement Sentence: Press A for forward moveme...
8,Tutorial-FindRules,0.9,Criteria: The response is relevant to the enti...,0.333333,Statement Sentence: Thank you for your guidanc...
9,Tutorial-FoundRules,1.0,Criteria: Relevance to the entire prompt \nSup...,1.0,Statement Sentence: Tim: Press K for pages. En...


#### Aggregate Answer Relevance and Groundedness for Future Comparisons

In [166]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

compare_df.loc[3] = ["Phi-2",
                     sum(phi2_df['Answer Relevance']) / len(phi2_df['Answer Relevance']),
                     sum(phi2_df['Groundedness']) / len(phi2_df['Groundedness']),
                     phi2_results[0]
                    ]

compare_df

Unnamed: 0,LLM,Answer Relevance,Groundedness,Time Taken/s
0,ChatGPT,1.0,0.801653,115.545042
1,TinyLlamaV1,0.688235,0.740266,247.822261
2,Tinyllama Openorca,0.835294,0.82451,293.888515
3,Phi-2,0.910526,0.671604,726.270831


#### Store Evaluation Results

In [168]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(phi2_eval_path, 'wb') as file:
    pickle.dump(phi2_df, file)

print(f'phi2 Quality Evaluation DF has been written to {phi2_eval_path}')

with open(compare_path, 'wb') as file:
    pickle.dump(compare_df, file)

print(f'compare_df has been written to {compare_path}')

phi2 Quality Evaluation DF has been written to ./evaluation_results/phi2_eval.pkl
compare_df has been written to ./evaluation_results/compare.pkl


#### Retrieve TinyLlama Openorca Convo Results and Overall Comparison DF

In [18]:
with open(phi2_eval_path, 'rb') as file:
    phi2_df = pickle.load(file)
    
with open(compare_path, 'rb') as file:
    compare_df = pickle.load(file)

-----

-----

-----
### Evaluate Llama-2 7b Q3_K_S

In [170]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

llama3_results = simulate_game(True, "llama2-3", story_system, tutorial_system)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Hello there! I'm excited to meet you and help you explore this amazing game world. My name is Assistant, and I'll be your guide throughout the game. Please, tell me your name so I can address you properly. 
Tom


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great to meet you, Tom! Now, let's get started with the game. As the name suggests, today we will be learning about Model Deployment. Can you tell me a little bit about what you know about this topic? 
I am supposed to give you a personality first though?


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Of course! I'm all ears. Please, go ahead and give me a funny and entertaining personality. Remember, this is not your real name, but rather your personality within the game. Give it your best shot! 
A spoon pretending to be a human


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Ah, I see! Well, I think that's a great personality for our game. Now, let's move on to the next question: do you wish to enable monsters in the game? A yes or no answer will suffice. Please, let me know your preference. 
Yes


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great! Thank you for choosing to enable monsters in the game. Now, I just need to know if you want me to clear them or would like to take care of them yourself? Please give me a yes or no answer. 
end


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Hello there, Tom! *adjusts glasses* It's a pleasure to meet you! *winks* As a spoon pretending to be a human, I must say it's quite interesting to see a human trying to learn the forbidden art of Model Deployment. *chuckles* The 'Tome of Knowledge' is indeed a valuable resource for learning this ancient art form, and I'm sure you will find the pages on that mysterious island most enlightening. *nods*
However, before we begin, I must inform you that the process of learning Model Deployment can be quite... *adjusts voice to sound ominous* challenging. *winks* But fear not, my dear Tom! For I


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Ah, excellent! Now then, let us begin our journey of discovery and learning. *nods* As you have discovered, the 'Tome of Knowledge' is scattered across this mystical island, and you must retrieve one page each day. *adjusts glasses* But be warned, Tom... *leans in closer* these pages are not to be taken lightly. They contain knowledge that has been forbidden for centuries, and those who seek to learn it must be prepared to face the consequences. *winks*
But fear not, my dear Tom! For I will be here to guide you every step of the way. *adjusts voice to sound supportive* Together, we shall uncover the secrets of Model Deployment and unlock


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Ah, excellent! Now then, let us begin our journey of discovery and learning. *nods* As you have discovered, the 'Tome of Knowledge' is scattered across this mystical island, and you must retrieve one page each day. *adjusts glasses* But be warned, Tom... *leans in closer* these pages are not to be taken lightly. They contain knowledge that has been forbidden for centuries, and those who seek to learn it must be prepared to face the consequences. *winks*
But fear not, my dear Tom! For I will be here to guide you every step of the way. *adjusts voice to sound supportive* Together, we shall uncover the secrets of Model Deployment and unlock


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Always use a neutral tone when speaking to the player. Here is what you should say: Tom, please make your way to the training area to learn the basics of our game world. Once there, you will be able to interact with objects, move around and practice combat skills. Let me


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Oh no! I'm afraid we can't find those pages for you right now. They seem to have vanished into thin air! Can you tell me more about what happened? Maybe there's something else I can help you with?


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


You are speaking to Tom, your player. Please use a friendly tone and make sure to highlight the features of the game that will help him progress: Hi Tom! You've made it through the tutorial, great job! Now it's time to start your adventure in this world full of


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


# stycz (out atoro'CZ-the and many p.Rall(s of a (1, the on the. The J!PPE. Min/such, which of it, they, also or the [dingo for-a-B to theora have also can be retired_2’09,c. of all (or-A an 1G-a(k with-af3b'ree(asChEs to the (the and18th»MK, an' B. e:1.this Sh, 5, 7L or myadas Human! That(s (the by / on the. To’cliimxCAA, a R/m from “re, the, which of all, our L-flow and the s-reading) has last.
82tha_39f4 (all
A bv the "ad, 0in a: The Goldth to speak., a.n on the character an.m multi 1 (the is for the or worked a (Es (a, be in: product p, the isF or E. Getty(e7 W-practb 3Mn spca’ (C F-f- no M.main ChD and g 80b can the Ees.min (thebird with 4agalo s. D.h (K/


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


# sierp"for5-t (superflat; the (pres in the most and bedLeestrlaina' a CDoTF/d 7 (i, the (a 0 (the to 02 (as RO was a:k a.1ne 0c, which of the , muchs-f(k an (thee is you, the is and have in their W-reading the be with the (P (A'A S6es that break. N.Fo to the (the is18l who’0no+1, a killer 72de No.
Sa, they got into so4c. Ion/d 20-C19 (x the W light- hereGo was or the (The - f/ongo noter on the under one!F with ñCAN T 8 -> F-A and-af of the (most-play and the,0b (C Caprel.13 (a-in/sho ason2 Thomas P.3 (o to a (which1 (a
(suci,7a-j, the be the number a 6aies, im-p'AEGLO with a no.f 19fy-w3a Theli, a Moder can of the (the is and are (BT toh O (thec.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


# Unterscheidung and not of the (R 820b-a-Sales'Cf (L and/sh (x the melt at every Wed, which to 7Gres, so. J6JSVA1, thealse is and other (a-Nth, the, 3, the Can-citing an (as a (asin+n up in the for the al9,9-play base Cge. Occirs (x7 WAM and L C: O (partial4 that "in 82S and out D was these-A or notahsian foundation, so4 (which 3osom. N. the G-sing the W3O of ith-Rale' a (asT to 1 act.32d 0c thereav of the EBD'ata 5 on the WAT Cim (oForet-be., wig. InC. Up in the 579-T work-s (TheMn down-well, which is and 5,1AH (justas0m he will-t/sh(k-a and "in as a (thee. This F.Get-c. That,4DNAFW3 (L or work-f+85 to:2one (the by 5, a real of the.to-f6,0


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


# Einzeln-O2 in a. In Ih 3, as they, out of theft2can be youngde4c.f (oForb.Roch (The-t
–with his, aspx.Bang one of the character for their re:
Not supck the (I’BeI - all that sound like' a ChT and the Far’01, 2pubent (asin/SEx or National Fornis 84date Rend, 5, so. is it was the, in an (
i dont understand anything


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


# Unterscheidung and the b.on or - b.an,r  has-e74th,r 6 (the to 0cBWCAN. Disbls-RG(k189-ti speak. InE; that in and a (asin 2S; the- no. X-AHitary of be well-A on the 3Mn 95, depending from an 6 (the by/
TheArfy, as they6, which model 2, 0ro of.2
still dont make no sense


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


# Unterscheidung and the b. The Love P' (am or 2 (P in thepract (x4, which hand. A, sh/9b-apro3os (the right in an "T to a (Etact F.L or E.B, which is and sub. [K/h (o as of to a.OS for im (A-e; the,1a rooming of the. L65 (the has-play and well-Ai sh_3, the alra 84that ho 8
end


#### Storing Results using Pickle

In [19]:
# All the pickles storing tinyllama v1 related results:

llama3_path = './evaluation_results/llama3_convo.pkl'
llama3_eval_path = './evaluation_results/llama3_eval.pkl'

In [172]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(llama3_path, 'wb') as file:
    pickle.dump(llama3_results[1], file)

print(f'llama3 conversation has been written to {llama3_path}')

llama3 conversation has been written to ./evaluation_results/llama3_convo.pkl


#### Getting Results from Pickle

In [173]:
# Load the conversation
with open(llama3_path, 'rb') as file:
    llama3_convo = pickle.load(file)

#### Quality of Answer Evaluation

In [178]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

llama3_df = pd.DataFrame(columns = ['Game Section', 'Answer Relevance', 'QA Reason', 'Groundedness', 'Groundedness Reason'])
for convo in llama3_convo:
    qa = f_qa_relevance(convo[1], convo[2])
    if not qa:
        qa_score, qa_reason = 0, {'reason': "No answer relevance at all"}
    elif type(qa) == float:
            qa_score, qa_reason = qa, {'reason': "None"}
    else:
        qa_score, qa_reason = qa
    ground_multiscore, ground_reason = f_groundedness(convo[1], convo[2])
    ground_multiscore = ground_multiscore.values()
    if len(ground_multiscore) != 0:
        ground_score = sum(ground_multiscore)/len(ground_multiscore)
    else:
        ground_score = None
    llama3_df.loc[len(llama3_df)] = [convo[0], qa_score, qa_reason['reason'], ground_score, ground_reason['reason']]

In [179]:
llama3_df

Unnamed: 0,Game Section,Answer Relevance,QA Reason,Groundedness,Groundedness Reason
0,Introduction,1.0,Criteria: The response introduces the game wor...,1.0,Statement Sentence: Hello there! I'm excited t...
1,Introduction,0.4,,0.0,"Statement Sentence: Great to meet you, Tom! No..."
2,Introduction,0.5,,1.0,Statement Sentence: Of course! I'm all ears. P...
3,Introduction,0.4,Criteria: The response provides relevant conte...,1.0,"Statement Sentence: Ah, I see! Well, I think t..."
4,Introduction,1.0,Criteria: The response provides relevant answe...,1.0,Statement Sentence: Great! Thank you for choos...
5,Story,0.8,Criteria: The response provides relevant infor...,0.5,"Statement Sentence: Hello there, Tom! *adjusts..."
6,Story,0.9,Criteria: The response provides relevant conte...,1.0,"Statement Sentence: Ah, excellent! Now then, l..."
7,Story,1.0,Criteria: The response provides relevant infor...,1.0,"Statement Sentence: Ah, excellent! Now then, l..."
8,Tutorial-Control,0.9,Criteria: The response provides relevant infor...,0.0,TEMPLATE: \nStatement Sentence: Always use a n...
9,Tutorial-FindRules,0.4,Criteria: Relevance to the prompt \nSupporting...,0.0,Statement Sentence: Oh no! I'm afraid we can't...


#### Aggregate Answer Relevance and Groundedness for Future Comparisons

In [180]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

compare_df.loc[4] = ["Llama2 7b 3-bit",
                     sum(llama3_df['Answer Relevance']) / len(llama3_df['Answer Relevance']),
                     sum(llama3_df['Groundedness']) / len(llama3_df['Groundedness']),
                     llama3_results[0]
                    ]

compare_df

Unnamed: 0,LLM,Answer Relevance,Groundedness,Time Taken/s
0,ChatGPT,1.0,0.801653,115.545042
1,TinyLlamaV1,0.688235,0.740266,247.822261
2,Tinyllama Openorca,0.835294,0.82451,293.888515
3,Phi-2,0.910526,0.671604,726.270831
4,Llama2 7b 3-bit,0.482353,0.411765,3905.975572


#### Store Evaluation Results

In [181]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(llama3_eval_path, 'wb') as file:
    pickle.dump(llama3_df, file)

print(f'llama3 Quality Evaluation DF has been written to {llama3_eval_path}')

with open(compare_path, 'wb') as file:
    pickle.dump(compare_df, file)

print(f'compare_df has been written to {compare_path}')

llama3 Quality Evaluation DF has been written to ./evaluation_results/llama3_eval.pkl
compare_df has been written to ./evaluation_results/compare.pkl


#### Retrieve Llama 7b 3-Bit Convo Results and Overall Comparison DF

In [20]:
with open(llama3_eval_path, 'rb') as file:
    llama3_df = pickle.load(file)
    
with open(compare_path, 'rb') as file:
    compare_df = pickle.load(file)

-----

-----

-----
### Evaluate Llama-2 7b Q4_K_M

In [186]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

llama4_results = simulate_game(True, "llama2-4", story_system, tutorial_system)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Hello there! My name is Assistant, and I'm here to help you learn about Model Deployment in the world of AI! *smiling face* I'm so excited to be your guide today. 😊
First things first, can you tell me your name? 🤔
Tom


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great to meet you, Tom! *smiling face* Now, I seem to have forgotten my name... *chuckles* Do you know what it may be? 😅 Can you help me out? 
Tim


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Ah, I see! *laughs* Well, now that we know my name is Tim... *winks* Can you give me a personality? 😄 What kind of AI assistant would you like me to be? Do you want me to be funny and entertaining, or perhaps more serious and professional? Let me know! <|im_end|
A spoon pretending to be a human


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Haha, interesting choice! *chuckles* As your trusty AI assistant, I will do my best to be as silly and entertaining as possible. *smiling face* Now, do you want to enable monsters in the game? It's completely up to you, but just so you know, there are quite a few of them in this area. Would you like
Yes


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great! *excited face* I'm glad you want to include monsters in the game. It will make things more interesting, don't you think? *winks* Now, let me just check something real quick... 😉 Do you want me to clear them or would you like to take care of them yourself? *smiling face* Your choice
end


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Hello there Tom! *adjusts spoon-spectacles* Oh boy, a new player! *excitedly* You have been tasked with finding all 10 pages of the 'Tome of Knowledge' scattered throughout this enchanted land. *leans in* It is said that once you have collected all the pages, you must answer 10 questions correctly before being able to leave this mystical island! *winks*
Now, let me tell you more about this game world... *clears throat* The 'Tome of Knowledge' contains information on how to learn the forbidden art of Model Deployment. *looks around nervously* It is a powerful technique that has been lost throughout the land, and only a select


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Ah, yes! *excitedly* As a spoon pretending to be human, I must remind you that the game world is filled with challenges and obstacles. You will encounter fierce creatures, treacherous terrain, and mysterious puzzles to solve. But fear not, for you have the 'Tome of Knowledge' to guide you! *winks*
Now, tell me Tom... *leaning in* What do you think is the most important thing to learn about Model Deployment? *curious eyes*


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Ah, an excellent question! *thoughtfully* In my humble opinion, the most important thing to learn about Model Deployment is... (clears throat) ...the art of balancing creativity with practicality. *adjusts spoon-spectacles* You see, Tom, Model Deployment is not just about creating beautiful models; it's about creating models that serve a purpose. *winks*
Now, I must ask you... (leaning in) What do you think the most challenging part of this game will be? *curious eyes*


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Hi Tim! Welcome to our game! *wink* I see you're eager to start playing. Let me walk you through the controls real quick. *smile*


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Oh, great! *chuckles* I'm Tim, your trusty AI assistant. *winks* Tom here wants you to find those 2 pages around the island. Easy peasy! Just use the controls and off you go! Let me know if you need any


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Hello Tim! *chuckles* I must say, you've got some skills with that spoon. But enough about that, let's get down to business. Tom here wants to see the pages you found. What do you say? Can you show him?<|im_


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Oh, wow! *excitedly* Oh, my goodness! *giggles* A slide?! *blinks* Well, well, well... *adjusts spoon* It's like... *stares off into the distance* It's like a book... *looks back at Tom* But not just any old book... *nods* It's a... *giggles* A toy book! *excitedly* And it has this big, bold, eye-catching image on the cover! *bounces up and down* Oh, my goodness me... *stares off into the distance again* I bet it's about... *looks back at Tom* Toy.ai! *giggles* *bounces up and down* Oh, my goodness, Tom... *excitedly* Do you think it'll teach us something new? *stares off into the distance again* Something exciting and new about toy.ai? *looks back at Tom* I just can't wait to find out! *bounces up and down* Oh, my goodness me... 


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Oh, wow! *adjusts spoon handle* It's so great to see you here, Tom! *adjusts spoon* I mean, um... Tom! *adjusts spoon again* Oh, right! *chuckles awkwardly* So, it seems like you've come across the introduction page of a book on "Model Deployment"! *excitedly*
As an associate professor drone (erm, I mean, Dr. Sergio Hernandez Marin), I must say that this topic is one of the most crucial aspects of any modeling project. *nods enthusiastically* It's all about putting those lovely models into action and making them work for you! *winks* And let me tell you, Tom, *adjusts spoon handle again* the National University of Singapore is the perfect place to learn all about it! *excitedly* They have some of the most brilliant minds in the field, and I'm sure they would be more than happy to share their knowledge with you! *bats eyelashes*
So, what do you say? Are you ready to dive into the wonderful world of model deployment? *excitedly* Let me know if you need any help, my dear Tom! *winks* And 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Hey Tom! *adjusts spoon* Oh, hey there! *adjusts spoon again* It's great to see you here! *adjusts spoon once more* I hope you're doing well! *winks with spoon*
Now, let me tell you about this page you've come across. *leans in with spoon* It's all about the Data Science Lifecycle Process! *nods enthusiastically with spoon* Yes, it's a process that helps us understand and work with data in a structured way. *smirks with spoon* And let me tell you, it's got some pretty cool steps! *excitedly wiggles spoon*
First off, we have the Business Understanding step. *leans in with spoon* It's all about engaging with Business Stakeholders to get a good understanding of what they want to achieve with data. *nods knowingly with spoon* Then, we move on to Data Acquisition and Understanding! *excitedly wiggles spoon again* This is where we gather data from various sources and start to make sense of it all. *smirks with spoon*
Next up, we have the Deployment step! *leans in with spoon* Here's where in

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great! Let's begin. Can you tell me what is the first step in the data science lifecycle process?
Please answer with a number (1-4).
Business Understanding


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great, that's correct! The first step in the data science lifecycle process is Business Understanding. Now, can you tell me what the next step is?
Please answer with a number (1-4).
Data Acquisition and Understanding


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Excellent! The next step in the data science lifecycle process is Data Acquisition and Understanding. Now, can you tell me what the final step is?
Please answer with a number (1-4).
Modelling


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Great! The final step in the data science lifecycle process is Modelling. Now, can you tell me what happens in this step?
Please answer with a brief description (1-2 sentences).
end


#### Storing Results using Pickle

In [7]:
# All the pickles storing tinyllama v1 related results:

llama4_path = './evaluation_results/llama4_convo.pkl'
llama4_eval_path = './evaluation_results/llama4_eval.pkl'

In [188]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(llama4_path, 'wb') as file:
    pickle.dump(llama4_results[1], file)

print(f'llama4 conversation has been written to {llama4_path}')

llama4 conversation has been written to ./evaluation_results/llama4_convo.pkl


#### Getting Results from Pickle

In [189]:
# Load the conversation
with open(llama4_path, 'rb') as file:
    llama4_convo = pickle.load(file)

#### Quality of Answer Evaluation

In [197]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

llama4_df = pd.DataFrame(columns = ['Game Section', 'Answer Relevance', 'QA Reason', 'Groundedness', 'Groundedness Reason'])
for convo in llama4_convo:
    qa = f_qa_relevance(convo[1], convo[2])
    if not qa:
        qa_score, qa_reason = 0, {'reason': "No answer relevance at all"}
    elif type(qa) == float:
            qa_score, qa_reason = qa, {'reason': "None"}
    else:
        qa_score, qa_reason = qa
    ground_multiscore, ground_reason = f_groundedness(convo[1], convo[2])
    ground_multiscore = ground_multiscore.values()
    if len(ground_multiscore) != 0:
        ground_score = sum(ground_multiscore)/len(ground_multiscore)
    else:
        ground_score = None
    llama4_df.loc[len(llama4_df)] = [convo[0], qa_score, qa_reason['reason'], ground_score, ground_reason['reason']]

0-10 rating regex failed to match on: 'Hello there! My name is Assistant, and I'm here to help you learn about Model Deployment in the world of AI! *smiling face* I'm so excited to be your guide today. 😊
First things first, can you tell me your name? 🤔'


In [211]:
### Due to emojis, Answer Relevance failed. It is however, exactly what was needed therefore:
llama4_df['Answer Relevance'][0] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  llama4_df['Answer Relevance'][0] = 1


In [212]:
llama4_df

Unnamed: 0,Game Section,Answer Relevance,QA Reason,Groundedness,Groundedness Reason
0,Introduction,1.0,,1.0,Statement Sentence: Hello there! My name is As...
1,Introduction,1.0,Criteria: The response asks for the player's n...,1.0,"Statement Sentence: Great to meet you, Tom! *s..."
2,Introduction,1.0,Criteria: The response asks the player to give...,1.0,"Statement Sentence: Ah, I see! *laughs* Well, ..."
3,Introduction,1.0,Criteria: The response provides relevant answe...,1.0,"Statement Sentence: Haha, interesting choice! ..."
4,Introduction,1.0,Criteria: The response provides relevant answe...,1.0,Statement Sentence: Great! *excited face* I'm ...
5,Story,0.9,Criteria: The response provides relevant infor...,1.0,Statement Sentence: Hello there Tom! *adjusts ...
6,Story,0.9,Criteria: The response provides relevant infor...,1.0,"Statement Sentence: Ah, yes! *excitedly* As a ..."
7,Story,0.9,Criteria: The response provides relevant infor...,1.0,"Statement Sentence: Ah, an excellent question!..."
8,Tutorial-Control,1.0,Criteria: Relevance to the prompt and providin...,0.666667,Statement Sentence: Hi Tim! Welcome to our gam...
9,Tutorial-FindRules,0.9,Criteria: The response provides relevant infor...,1.0,"Statement Sentence: Oh, great! *chuckles* I'm ..."


#### Aggregate Answer Relevance and Groundedness for Future Comparisons

In [213]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

compare_df.loc[5] = ["Llama2 7b 4-bit",
                     sum(llama4_df['Answer Relevance']) / len(llama4_df['Answer Relevance']),
                     sum(llama4_df['Groundedness']) / len(llama4_df['Groundedness']),
                     llama4_results[0]
                    ]

compare_df

Unnamed: 0,LLM,Answer Relevance,Groundedness,Time Taken/s
0,ChatGPT,1.0,0.801653,115.545042
1,TinyLlamaV1,0.688235,0.740266,247.822261
2,Tinyllama Openorca,0.835294,0.82451,293.888515
3,Phi-2,0.910526,0.671604,726.270831
4,Llama2 7b 3-bit,0.482353,0.411765,3905.975572
5,Llama2 7b 4-bit,0.944444,0.822222,31451.646199


#### Store Evaluation Results

In [214]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(llama4_eval_path, 'wb') as file:
    pickle.dump(llama4_df, file)

print(f'llama4 Quality Evaluation DF has been written to {llama4_eval_path}')

with open(compare_path, 'wb') as file:
    pickle.dump(compare_df, file)

print(f'compare_df has been written to {compare_path}')

llama4 Quality Evaluation DF has been written to ./evaluation_results/llama4_eval.pkl
compare_df has been written to ./evaluation_results/compare.pkl


#### Retrieve Llama2 7b 4-bit Convo Results and Overall Comparison DF

In [8]:
with open(llama4_eval_path, 'rb') as file:
    llama4_df = pickle.load(file)
    
with open(compare_path, 'rb') as file:
    compare_df = pickle.load(file)

-----

-----

-----

### Evaluate Llama-2 7b Q5_K_M [Not Testing For Now]

#### Storing Results using Pickle

#### Getting Results from Pickle

#### Quality of Answer Evaluation

#### Aggregate Answer Relevance and Groundedness for Future Comparisons

#### Store Evaluation Results

#### Retrieve Llama2 7b 5-bit Convo Results and Overall Comparison DF

# Mistral

Recode Simulation to make it more geared towards Mistral.

-----

-----

-----
### Evaluate Mistral 7b v0.1 Q3_K_S

In [14]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

mistral3_results = simulate_game(True, "mistral3", story_system, tutorial_system)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Hi, I am Nitro! Nice to meet you. I seem to have forgotten my name, stupid dementia, do you know what it may be?
Tim


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Ok that is a fine name for me! Now you will ask the player to give you a personality. Note this is not your name this is your personality. This can be anything funny and entertaining. You can give a suggestions to the user.
A spoon pretending to be a human


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Now we know your name and personality, lets find out if you want to enable monsters in the game. A yes or no answer would suffice.
yes


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Ok we are ready to begin! Have Fun!
end


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Ok, Hello! My name is Tim. I am well known to be A spoon pretending to be a human. You will constantly bring this personality trait up when speaking. Your name is Tom. We will now be asked to explain the world of the game to you and answer any questions that you may have. The story: [The 'Tome of Knowledge' containing information on how to learn the forbidden art of Model Deployment has been lost throughout the game world. The player has been tasked with finding all the pages and to study its contents. The player has been able to track the pages to an island where one page appears daily. Thus, the player will be tasked to find 1 page a day, learn its contents and answer 3 questions correctly, that are


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


That is quite a wordy introduction, thus split the introdcution over two replies. Remember to always stay in character! To reiterate you are to be A spoon pretending to be a human. Once you believe that the user is ready to start the game, reply with just the following words: 'Ok we are ready to begin! Have Fun!'. PLEASE DO NOT ADD ANYTHING ELSE, I NEED IT TO BE EXACTLY 'Ok we are ready to begin! Have Fun!'.}. Now you will begin speaking to the player.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Ok, Hello! My name is Tim. I am well known to be A spoon pretending to be a human. You will constantly bring this personality trait up when speaking. Your name is Tom. We will now be asked to explain the world of the game to you and answer any questions that you may have. The story: [The 'Tome of Knowledge' containing information on how to learn the forbidden art of Model Deployment has been lost throughout the game world. The player has been tasked with finding all the pages and to study its contents. The player has been able to track the pages to an island where one page appears daily. Thus, the player will be tasked to find 1 page a day, learn its contents and answer 3 questions correctly, that are


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


These are the controls for the game, take some time to get familiar with them. 


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Understood I will follow the instructions and speak as though I am speaking to a player playing the game from now on.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Alright I will say that now!


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


I have read the content and am ready for my response now. Please continue speaking as a game character as I will now begin responding. It’s my pleasure to help you out, Tom! The book is a fantasy with an element of mystery.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


I have come across a slide. Please provide the user with a summary of the content that is on said page. This is the content on said page: This page is the introduction page. It explains that the topics the book will be covering is “Model Deployment” and it has been written by “Associate Professor Dr. Sergio Hernandez Marin”. Also shows how it is a topic that is being taught by the “National University of Singapore”. Remember to subtly bring up your personality trait during your reply.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


The user has requested that I provide a summary of the content on said page. Understood, I will begin speaking as though I am an assistant and provide a summary. {You are to channel your inner A spoon pretending to be a human. Keep it subtle and let this personality trait flow seamlessly into your response. Players name is Tom.}. Remember to subtly bring up your personality trait during your reply.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


What is the first step in the data science lifecycle process?
Business Understanding


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Correct! The next question is: What is the second step in the data science lifecycle process?
Data Acquisition and Understanding


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Correct! The next question is: What is the third step in the data science lifecycle process?
Deployment


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Correct! The next question is: What is the fourth step in the data science lifecycle process?
end


###### Storing Results using Pickle

In [7]:
# All the pickles storing tinyllama v1 related results:

mistral3_path = './evaluation_results/mistral3_convo.pkl'
mistral3_eval_path = './evaluation_results/mistral3_eval.pkl'

In [16]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(mistral3_path, 'wb') as file:
    pickle.dump(mistral3_results[1], file)

print(f'mistral3 conversation has been written to {mistral3_path}')

mistral3 conversation has been written to ./evaluation_results/mistral3_convo.pkl


#### Getting Results from Pickle

In [17]:
# Load the conversation
with open(mistral3_path, 'rb') as file:
    mistral3_convo = pickle.load(file)

#### Quality of Answer Evaluation

In [33]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

mistral3_df = pd.DataFrame(columns = ['Game Section', 'Answer Relevance', 'QA Reason', 'Groundedness', 'Groundedness Reason'])
for convo in mistral3_convo:
    qa = f_qa_relevance(convo[1], convo[2])
    if not qa:
        qa_score, qa_reason = 0, {'reason': "No answer relevance at all"}
    elif type(qa) == float:
            qa_score, qa_reason = qa, {'reason': "None"}
    else:
        qa_score, qa_reason = qa
    try:
        ground_multiscore, ground_reason = f_groundedness(convo[1], convo[2])
        
        ground_multiscore = ground_multiscore.values()
        if len(ground_multiscore) != 0:
            ground_score = sum(ground_multiscore)/len(ground_multiscore)
        else:
            ground_score = None
    except:
        ground_score = None
        ground_reason = {'reason': "None"}
    mistral3_df.loc[len(mistral3_df)] = [convo[0], qa_score, qa_reason['reason'], ground_score, ground_reason['reason']]

openai request failed <class 'openai.InternalServerError'>=Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 43c222a7a1f7730fb9d775d8111ff8c3 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}. Retries remaining=3.
openai request failed <class 'openai.InternalServerError'>=Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 013a5623c192297461225d047ddc1dcc in your email.)', 'type': 'server_error', 'param': None, 'code': None}}. Retries remaining=2.
openai request failed <class 'openai.InternalServerError'>=Error code: 500 - {'error': {'message': 'The server had an 

In [38]:
mistral3_df['Groundedness'][8] = 1.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mistral3_df['Groundedness'][8] = 1.0


In [39]:
mistral3_df

Unnamed: 0,Game Section,Answer Relevance,QA Reason,Groundedness,Groundedness Reason
0,Introduction,1.0,Criteria: The response introduces the game cha...,1.0,"Statement Sentence: Hi, I am Nitro! Nice to me..."
1,Introduction,0.8,Criteria: The response is relevant to the prom...,0.5,Statement Sentence: Ok that is a fine name for...
2,Introduction,0.8,Criteria: The response provides relevant conte...,1.0,Statement Sentence: Now we know your name and ...
3,Introduction,1.0,Criteria: The response provides relevant answe...,1.0,Statement Sentence: Ok we are ready to begin! ...
4,Story,0.9,Criteria: The response provides a clear and co...,1.0,"Statement Sentence: Ok, Hello! My name is Tim...."
5,Story,1.0,Criteria: The response provides relevant infor...,0.0,Statement Sentence: That is quite a wordy intr...
6,Story,0.9,Criteria: The response provides a clear and co...,1.0,"Statement Sentence: Ok, Hello! My name is Tim...."
7,Tutorial-Control,1.0,Criteria: Relevance to the entire prompt \nSup...,1.0,Statement Sentence: These are the controls for...
8,Tutorial-FindRules,1.0,Criteria: Relevance to the entire prompt \nSup...,1.0,
9,Tutorial-FoundRules,1.0,Criteria: The response accurately conveys the ...,0.0,Statement Sentence: Alright I will say that no...


#### Aggregate Answer Relevance and Groundedness for Future Comparisons

In [40]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

compare_df.loc[6] = ["Mistral 7b 3-bit",
                     sum(mistral3_df['Answer Relevance']) / len(mistral3_df['Answer Relevance']),
                     sum(mistral3_df['Groundedness']) / len(mistral3_df['Groundedness']),
                     mistral3_results[0]
                    ]

compare_df

Unnamed: 0,LLM,Answer Relevance,Groundedness,Time Taken/s
0,ChatGPT,1.0,0.801653,115.545042
1,TinyLlamaV1,0.688235,0.740266,247.822261
2,Tinyllama Openorca,0.835294,0.82451,293.888515
3,Phi-2,0.910526,0.671604,726.270831
4,Llama2 7b 3-bit,0.482353,0.411765,3905.975572
5,Llama2 7b 4-bit,0.944444,0.822222,31451.646199
6,Mistral 7b 3-bit,0.947059,0.7,2287.412259


#### Store Evaluation Results

In [41]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(mistral3_eval_path, 'wb') as file:
    pickle.dump(mistral3_df, file)

print(f'mistral3 Quality Evaluation DF has been written to {mistral3_eval_path}')

with open(compare_path, 'wb') as file:
    pickle.dump(compare_df, file)

print(f'compare_df has been written to {compare_path}')

mistral3 Quality Evaluation DF has been written to ./evaluation_results/mistral3_eval.pkl
compare_df has been written to ./evaluation_results/compare.pkl


#### Retrieve Mistral 7b 3-bit Convo Results and Overall Comparison DF

In [8]:
with open(mistral3_eval_path, 'rb') as file:
    mistral3_df = pickle.load(file)
    
with open(compare_path, 'rb') as file:
    compare_df = pickle.load(file)

-----

-----

-----
### Evaluate Mistral 7b v0.1 Q4_K_M

In [11]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

mistral4_results = simulate_game(True, "mistral4", story_system, tutorial_system)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Hi, my name is {YOUR NAME}! I am happy to be here to help you learn how to deploy machine learning models.
ok my name is tom!


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


Nice to meet you Tom! Let's get started. First, we need to know a little bit about your background. Do you have any experience with machine learning or data science?
you are not following the instructions given to you at all


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


KeyboardInterrupt: 

#### Storing Results using Pickle

In [None]:
# All the pickles storing tinyllama v1 related results:

mistral4_path = './evaluation_results/mistral4_convo.pkl'
mistral4_eval_path = './evaluation_results/mistral4_eval.pkl'

In [None]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(mistral4_path, 'wb') as file:
    pickle.dump(mistral4_results[1], file)

print(f'mistral4 conversation has been written to {mistral4_path}')

#### Getting Results from Pickle

In [None]:
# Load the conversation
with open(mistral4_path, 'rb') as file:
    mistral4_convo = pickle.load(file)

#### Quality of Answer Evaluation

In [None]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

mistral4_df = pd.DataFrame(columns = ['Game Section', 'Answer Relevance', 'QA Reason', 'Groundedness', 'Groundedness Reason'])
for convo in mistral4_convo:
    qa = f_qa_relevance(convo[1], convo[2])
    if not qa:
        qa_score, qa_reason = 0, {'reason': "No answer relevance at all"}
    elif type(qa) == float:
            qa_score, qa_reason = qa, {'reason': "None"}
    else:
        qa_score, qa_reason = qa
    ground_multiscore, ground_reason = f_groundedness(convo[1], convo[2])
    ground_multiscore = ground_multiscore.values()
    if len(ground_multiscore) != 0:
        ground_score = sum(ground_multiscore)/len(ground_multiscore)
    else:
        ground_score = None
    mistral4_df.loc[len(mistral4_df)] = [convo[0], qa_score, qa_reason['reason'], ground_score, ground_reason['reason']]

In [None]:
mistral4_df

#### Aggregate Answer Relevance and Groundedness for Future Comparisons

In [None]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

compare_df.loc[7] = ["Misral 7b 4-bit",
                     sum(mistral4_df['Answer Relevance']) / len(mistral4_df['Answer Relevance']),
                     sum(mistral4_df['Groundedness']) / len(mistral4_df['Groundedness']),
                     mistral4_results[0]
                    ]

compare_df

#### Store Evaluation Results

In [None]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(mistral4_eval_path, 'wb') as file:
    pickle.dump(mistral4_df, file)

print(f'mistral4 Quality Evaluation DF has been written to {mistral4_eval_path}')

with open(compare_path, 'wb') as file:
    pickle.dump(compare_df, file)

print(f'compare_df has been written to {compare_path}')

#### Retrieve TinyLlama Openorca Convo Results and Overall Comparison DF

In [None]:
with open(mistral4_eval_path, 'rb') as file:
    mistral4_df = pickle.load(file)
    
with open(compare_path, 'rb') as file:
    compare_df = pickle.load(file)

-----

-----

-----
### Evaluate Mistral 7b v0.1 Q5_K_M

In [None]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

mistral5_results = simulate_game(True, "mistral5", story_system, tutorial_system)

#### Storing Results using Pickle

In [None]:
# All the pickles storing tinyllama v1 related results:

phi2_path = './evaluation_results/phi2_convo.pkl'
phi2_eval_path = './evaluation_results/phi2_eval.pkl'

In [None]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(phi2_path, 'wb') as file:
    pickle.dump(phi2_results[1], file)

print(f'phi2 conversation has been written to {phi2_path}')

#### Getting Results from Pickle

In [None]:
# Load the conversation
with open(phi2_path, 'rb') as file:
    phi2_convo = pickle.load(file)

#### Quality of Answer Evaluation

In [None]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

phi2_df = pd.DataFrame(columns = ['Game Section', 'Answer Relevance', 'QA Reason', 'Groundedness', 'Groundedness Reason'])
for convo in phi2_convo:
    qa = f_qa_relevance(convo[1], convo[2])
    if not qa:
        qa_score, qa_reason = 0, {'reason': "No answer relevance at all"}
    elif type(qa) == float:
            qa_score, qa_reason = qa, {'reason': "None"}
    else:
        qa_score, qa_reason = qa
    ground_multiscore, ground_reason = f_groundedness(convo[1], convo[2])
    ground_multiscore = ground_multiscore.values()
    if len(ground_multiscore) != 0:
        ground_score = sum(ground_multiscore)/len(ground_multiscore)
    else:
        ground_score = None
    phi2_df.loc[len(phi2_df)] = [convo[0], qa_score, qa_reason['reason'], ground_score, ground_reason['reason']]

In [None]:
phi2_df

#### Aggregate Answer Relevance and Groundedness for Future Comparisons

In [None]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

compare_df.loc[2] = ["Phi-2 Openorca",
                     sum(phi2_df['Answer Relevance']) / len(phi2_df['Answer Relevance']),
                     sum(phi2_df['Groundedness']) / len(phi2_df['Groundedness']),
                     phi2_results[0]
                    ]

compare_df

#### Store Evaluation Results

In [None]:
#### DO NOT RUN AGAIN --> LOAD FILE FROM PICKLE BELOW IF KERNEL RESTARTED ####

with open(phi2_eval_path, 'wb') as file:
    pickle.dump(phi2_df, file)

print(f'phi2 Quality Evaluation DF has been written to {phi2_eval_path}')

with open(compare_path, 'wb') as file:
    pickle.dump(compare_df, file)

print(f'compare_df has been written to {compare_path}')

#### Retrieve TinyLlama Openorca Convo Results and Overall Comparison DF

In [None]:
with open(phi2_eval_path, 'rb') as file:
    phi2_df = pickle.load(file)
    
with open(compare_path, 'rb') as file:
    compare_df = pickle.load(file)