In [None]:
# Mount Google Drive so files under /content/gdrive are readable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%%capture
!pip install --upgrade together
!pip install python-dotenv
!echo "TOGETHER_API_KEY='ENTER YOUR API KEY'" > .env

In [None]:
import os
import together
from dotenv import load_dotenv

# Read key/value pairs from the local .env file into the process environment.
load_dotenv()

TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")

# Fail fast when no key is configured at all.
# NOTE(review): the setup cell writes the placeholder 'ENTER YOUR API KEY' into
# .env; that placeholder is a non-empty string, so it passes this check — confirm
# a real key has been entered before running the chat cells.
if not TOGETHER_API_KEY:
    raise ValueError("TOGETHER_API_KEY environment variable not set")

# Hand the key to the `together` module, which chat() calls later.
together.api_key = TOGETHER_API_KEY

In [None]:
%%capture
!pip install -U sentence-transformers
!pip install torch

In [None]:
from sentence_transformers import SentenceTransformer

# Module-level embedding model; embed_file() and search_df() both call model.encode().
model = SentenceTransformer('all-mpnet-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def parse_to_dict(input_str):
    """Split a labelled model reply into its 'Thought'/'Act'/'Observation'/'Output' parts.

    A line that starts with ``<Label>:`` opens that section; following lines
    that do not start with a label are treated as continuations and are
    appended to the open section, separated by a single space.

    Args:
        input_str: The raw text to parse.

    Returns:
        dict: One key per label. A label that never appears maps to ``None``;
        a label that appears with no text maps to ``''``.
    """
    categories = ['Thought', 'Act', 'Observation', 'Output']
    result_dict = {category: None for category in categories}

    # Label of the section that continuation lines currently belong to.
    current_category = None

    for line in input_str.split('\n'):
        for category in categories:
            prefix = category + ":"
            if line.startswith(prefix):
                current_category = category
                # Cut exactly the label and its colon. strip() then removes the
                # optional space, so "Act:foo()" no longer loses its first
                # character the way a fixed "+ 2" offset did.
                result_dict[current_category] = line[len(prefix):].strip()
                break
        else:
            # No label matched: this line continues the open section (if any).
            continuation = line.strip()
            # Blank lines carry no content; skipping them avoids stray
            # trailing/double spaces in the collected value.
            if current_category and continuation:
                if result_dict[current_category]:
                    result_dict[current_category] += ' ' + continuation
                else:
                    result_dict[current_category] = continuation

    return result_dict

In [None]:
def chat(prompt):
    """Send ``prompt`` to the Together completion endpoint and return the generated text.

    The sampling settings and the ``<|im_end|>`` stop sequence are fixed here;
    only the text of the first returned choice is handed back.
    """
    request = {
        "prompt": prompt,
        "model": "Qwen/Qwen1.5-14B-Chat",
        "max_tokens": 256,
        "temperature": 0.8,
        "top_k": 60,
        "top_p": 0.6,
        "repetition_penalty": 1.1,
        "stop": ['<|im_end|>'],
    }
    completion = together.Complete.create(**request)
    first_choice = completion['output']['choices'][0]
    return first_choice['text']

In [None]:
import pandas as pd

def embed_file(text, chunk_size=256, overlap=50):
    """Split ``text`` into overlapping word chunks and embed each chunk.

    Args:
        text: The document text; it is split on whitespace into words.
        chunk_size: Maximum number of words per chunk (default 256).
        overlap: Number of words shared by consecutive chunks (default 50).

    Returns:
        pandas.DataFrame: Columns ``'text'`` (the chunk) and ``'embeddings'``
        (the vector ``model.encode`` produced for that chunk), one row per
        chunk with a 0..n-1 index. Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not in
            ``[0, chunk_size)``; otherwise the window could never advance.
    """
    if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
        raise ValueError("chunk_size must be positive and 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap

    chunks = [' '.join(words[start:start + chunk_size])
              for start in range(0, len(words), step)]

    if not chunks:
        # Nothing to embed; return the same empty two-column frame shape.
        return pd.DataFrame(columns=['text', 'embeddings'])

    # Encode every chunk in one batch call.
    embeddings = model.encode(chunks)

    # Build the frame once instead of concatenating row by row, which was
    # quadratic and left every row with the same index label (0).
    return pd.DataFrame(
        {'text': chunks, 'embeddings': list(embeddings)},
        columns=['text', 'embeddings'],
    )

In [None]:
import numpy as np

def search_df(df, query, top_k=5):
    """Return the chunks of ``df`` most similar to ``query``, joined into one string.

    Similarity is the cosine between ``model.encode(query)`` and each vector in
    ``df['embeddings']``. A vector with zero norm gets similarity 0 instead of
    NaN so it cannot disturb the ranking.

    Args:
        df: DataFrame with ``'text'`` and ``'embeddings'`` columns. It is not
            modified (no ``'similarity'`` column is added to it).
        query: The search text.
        top_k: How many of the best-matching chunks to return (default 5).

    Returns:
        str: The selected chunk texts, best match first, joined with ``'", "'``;
        ``''`` when ``df`` has no rows.
    """
    if len(df) == 0:
        return ''

    query_embedding = np.asarray(model.encode(query))

    # Stack the per-row vectors so all similarities are computed in one pass.
    vectors = np.vstack(df['embeddings'].tolist())
    dots = vectors @ query_embedding
    norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_embedding)

    # Divide only where the denominator is non-zero; other entries stay 0.
    similarities = np.divide(dots, norms, out=np.zeros(len(dots)), where=norms > 0)

    # Rank by position (not index label) so duplicate index labels are harmless;
    # a stable sort keeps the original order among ties.
    best = np.argsort(-similarities, kind='stable')[:top_k]
    return '", "'.join(df['text'].iloc[best].values)

In [None]:
# Read the source document from the mounted Drive and build the module-level
# chunk/embedding table `df` that document_search() queries.
with open('/content/gdrive/MyDrive/data/whatshesaid.txt', 'r', encoding='utf-8') as file:
    df = embed_file(file.read())



def document_search(query):
    """Search the module-level ``df`` for ``query`` and return the joined matches.

    The result is also stored in the global ``quote``.
    """
    global quote
    matches = search_df(df, query)
    quote = matches
    return matches

In [None]:
def extract_function_call(s):
    """Return the leading ``name(...)`` call text from ``s``.

    Scans from the start of ``s`` and stops at the ``)`` that closes the first
    ``(``. Brackets that appear inside a quoted argument (single or double
    quotes, with backslash escapes) are ignored, so ``f("a) b")`` is returned
    whole instead of being cut at the ``)`` inside the string.

    Args:
        s: Text expected to begin with a function call.

    Returns:
        str: The text up to and including the closing bracket. If ``s`` has no
        ``(`` at all, the message
        ``"No function call at the beginning of the string."`` is returned; if
        the brackets never balance, ``s`` is returned unchanged.
    """
    if s.find("(") == -1:
        return "No function call at the beginning of the string."

    depth = 0
    quote_char = None   # the quote that opened the string we are inside, if any
    escaped = False     # True when the previous character was a backslash

    for index, char in enumerate(s):
        if quote_char is not None:
            # Inside a string literal: only look for its (unescaped) end.
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == quote_char:
                quote_char = None
            continue

        # Quotes only matter inside the argument list; an apostrophe in text
        # before the first "(" must not start a string.
        if depth > 0 and char in ('"', "'"):
            quote_char = char
        elif char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            if depth == 0:
                return s[:index + 1]

    return s

def execute_function_safely(data):
    """Run the tool call named in ``data['Act']`` if, and only if, it is safe.

    The Act text comes from a language model, so it is treated as untrusted:
    it is parsed with ``ast`` rather than executed as code. The call is run
    only when it is a plain call to a whitelisted function whose arguments are
    all Python literals (strings, numbers, lists, ...). Anything else — other
    functions, attribute access, nested calls, ``*``/``**`` unpacking — is
    rejected.

    Args:
        data: Mapping with an optional ``'Act'`` entry such as
            ``'document_search("query")'``. A missing, ``None`` or blank Act
            means no tool was requested.

    Returns:
        The whitelisted function's return value, or ``None`` when no tool was
        requested, the call was rejected, or the call raised.
    """
    import ast  # local import: needed only here for safe parsing

    act_text = data.get('Act') or ''
    if not act_text.strip():
        # No tool requested; this is a normal outcome, not an error.
        return None

    if '(' not in act_text:
        print(f"Function '{act_text.strip()}' not found or not safe to execute.")
        return None

    try:
        act_value = extract_function_call(act_text)
        func_name = act_value.split('(')[0].strip()

        print(f"Value of 'Act' key: {act_value}")

        if not func_name:
            print("Error: No function name found in the input data.")
            return None

        print(f"func_name: {func_name}")

        # Names of the functions the model is allowed to invoke.
        safe_functions = ['document_search']

        if func_name not in safe_functions:
            print(f"Function '{func_name}' not found or not safe to execute.")
            return None

        # Parse instead of exec: checking only the name would still let
        # arbitrary expressions in the argument list run.
        call = ast.parse(act_value.strip(), mode='eval').body
        if not (isinstance(call, ast.Call)
                and isinstance(call.func, ast.Name)
                and call.func.id == func_name):
            raise ValueError("Act is not a plain call to an allowed function")

        # literal_eval raises ValueError for anything that is not a literal.
        args = [ast.literal_eval(arg) for arg in call.args]
        kwargs = {}
        for keyword in call.keywords:
            if keyword.arg is None:  # ``**mapping`` unpacking
                raise ValueError("keyword unpacking is not allowed")
            kwargs[keyword.arg] = ast.literal_eval(keyword.value)

        reply = globals()[func_name](*args, **kwargs)
        print("Execution successful.")
        return reply
    except Exception as e:
        # Best-effort by design: report the problem and fall through to None.
        print(f"An error occurred: {e}")

    return None


In [None]:
# System prompt sent with every request. It describes the single available
# tool and the Thought/Act/Observation/Output reply format that parse_to_dict()
# expects. The tool description matches document_search(query), which takes
# only the query string (no file path).
systemPrompt = """You are a skilled navigator of this interface, tasked with aiding users in their inquiries using the tools at your disposal. Your responses must be clear, succinct, and adhere strictly to the guidelines provided.

===========================================================

Tool Library:

- document_search(query) - This function is designed for searching within documents. Use it to find specific information within a document by providing the query as a single string. This is particularly useful for retrieving details or data from large text files.

===========================================================

This is the format you need to follow in your responses:

Thought: THINK AND ANALYZE the user's query, the goal, and the situation.
Act: If necessary, use a tool or function here: func(query). Leave this section empty if no tool is required. The output from the function will be shown here.
Observation: Based on your thoughts and the results of the Act, this is where your analysis and interpretation of the results go. If you are using a tool, DO NOT output here.
Output: Offer a concise summary based on the information derived from your analysis or the tool's results, tailored to address the user's query. If you are using a tool, DO NOT output here.

Example Acts for using tools:

- Example of document_search use
    Thought: The user is looking for a specific piece of information within a large document.
    Act: document_search("specific information") ["the most relevant section of the document are shown.", "the second most relevant section of the document are shown.", "the third most relevant section of the document are shown."]
    Observation: Based on the document search, the specific information you were looking for is found in the following sections of the document: [list sections or provide summary]. This should help you locate the information more easily within the document
    Output: Based on the document search, the specific information you were looking for is found in the following sections of the document: [list sections or provide summary]. This should help you locate the information more easily within the document.

===========================================================

Note that you must always follow the provided rules and output in the given manner. Use a function only when necessary.

"""

# Interactive loop: read a prompt, ask the model, run the requested tool (if
# any), then ask the model again with the tool result filled in.
while True:
    # ask the user for a prompt
    print('Type "quit" to exit')
    userPrompt = input("Prompt: ")

    # stop when the user types "quit" or "exit" (case-insensitive)
    if userPrompt.lower() == "quit" or userPrompt.lower() == "exit":
        break

    # first pass: let the model decide whether to call a tool
    first = chat(f"<|im_start|>system\n{systemPrompt}\n<|im_end|>\n<|im_start|>user\n{userPrompt}\n<|im_end|>\n<|im_start|>assistant\n")
    responseDict = parse_to_dict(first)
    returned = execute_function_safely(responseDict)
    if returned is not None:
      # second pass: replay the model's Thought/Act with the tool result appended
      # and let it continue from "Observation:"; then re-parse the stitched reply
      response = chat(f'''<|im_start|>system\n{systemPrompt}\n<|im_end|>\n<|im_start|>user\n{userPrompt}\n<|im_end|>\n<|im_start|>assistant\nThought: {responseDict['Thought']}\nAct: {responseDict['Act']} ["{returned}"]\nObservation: ''')
      response = f'''Thought: {responseDict['Thought']}\nAct: {responseDict['Act']} ["{returned}"]\nObservation: {response}'''
      responseDict = parse_to_dict(response)
    #print(responseDict)
    #systemPrompt += f"user: {userPrompt}\n"
    #systemPrompt += f"assistant: {response['Output']}\n"

    # print the response (either field prints as "None" if the model omitted it)
    print(f"{responseDict['Act']}")
    print(f"Bot: {responseDict['Output']}")

    #print(f"{response}")


Type "quit" to exit
Prompt: ابحث في الوثيقة عن عدد الأقزام الذين عاشت معهم سنو وايت
Value of 'Act' key: document_search("عدد الأقزام الذين عاشت معهم سنو وايت")
func_name: document_search
Execution successful.
document_search("عدد الأقزام الذين عاشت معهم سنو وايت") ["an opinion when sensible people are speaking.” So the duckling sat in a corner, feeling very low spirited, till the sunshine and the fresh air came into the room through the open door, and then he began to feel such a great longing for a swim on the water, that he could not help telling the hen. “What an absurd idea,” said the hen. “You have nothing else to do, therefore you have foolish fancies. If you could purr or lay eggs, they would pass away.” “But it is so delightful to swim about on the water,” said the duckling, “and so refreshing to feel it close over your head, while you dive down to the bottom.” “Delightful, indeed!” said the hen, “why you must be crazy! Ask the cat, he is the cleverest animal I know, ask him ho