# This Notebook Records the Development Process of the Backend

In [1]:
# prepare all env  keys
import json
import os

# Read the API credentials from the local keys.json file.
with open('keys.json', 'r') as file:
    api_keys = json.load(file)

# Export each credential as an environment variable so the client libraries
# used later in the notebook can read it from the process environment.
os.environ.update(
    {
        key_name: api_keys[key_name]
        for key_name in ("GROQ_API_KEY", "JINA_API_KEY", "GOOGLE_CSE_ID", "GOOGLE_API_KEY")
    }
)


In [12]:
import pandas as pd
import os
from groq import Groq
import time 
from langchain_chroma import Chroma
from tqdm import tqdm
import os
from langchain_community.embeddings import JinaEmbeddings
from langchain_chroma import Chroma
from typing import List
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_core.tools import Tool
from langchain.agents import initialize_agent
from langchain_groq import ChatGroq
import json

# Same credential setup as the first cell, repeated so this cell can run on its own.
with open('keys.json', 'r') as file:
    api_keys = json.load(file)

# Copy the four credentials from keys.json into the process environment.
os.environ.update(
    {
        key_name: api_keys[key_name]
        for key_name in ("GROQ_API_KEY", "JINA_API_KEY", "GOOGLE_CSE_ID", "GOOGLE_API_KEY")
    }
)

def call_groq(raw_prompt, temperature=0, max_attempts=5, retry_delay=5):
    """Send a single-turn prompt to the Groq chat API and return the reply text.

    A failed request is retried, sleeping ``retry_delay`` seconds between
    attempts. The actual exception is printed, because not every failure is a
    rate limit (bad key, network error, invalid model, ...).

    Args:
        raw_prompt: Text sent as the user message.
        temperature: Sampling temperature forwarded to the API.
        max_attempts: Total number of attempts before giving up.
        retry_delay: Seconds to sleep after a failed attempt.

    Returns:
        The content of the first completion choice, or None when every
        attempt failed.
    """
    client = Groq(
        api_key=os.environ.get("GROQ_API_KEY"),
    )
    for attempt in range(1, max_attempts + 1):
        try:
            chat_completion = client.chat.completions.create(
                temperature=temperature,
                max_tokens=8192,
                messages=[
                    {
                        "role": "system",
                        "content": "you are a helpful assistant."
                    },
                    {
                        "role": "user",
                        "content": raw_prompt,
                    }
                ],
                model="llama3-8b-8192",
            )
            return chat_completion.choices[0].message.content
        except Exception as error:
            # `except Exception` (not a bare except) so Ctrl-C still interrupts the cell.
            print(f"Groq request failed (attempt {attempt}/{max_attempts}): {error!r}")
            if attempt < max_attempts:
                # No point sleeping after the final attempt.
                print(f"Sleeping for {retry_delay} seconds before retrying")
                time.sleep(retry_delay)
    print("Failed to generate!")
    return None

# Loading data

In [96]:
from datasets import load_dataset
from datasets import Dataset
import random
from tqdm import tqdm

# Stream the dataset
# Only the CC-MAIN-2024-10 parquet shards of fineweb-edu are selected; with
# streaming=True the result is iterated row by row (see get_first_n_rows below).
dataset = load_dataset("HuggingFaceFW/fineweb-edu", data_files="data/CC-MAIN-2024-10/*.parquet", split='train', streaming=True)

# get first 10000 rows of dataset
def get_first_n_rows(dataset, n=10000):
    """Collect at most the first ``n`` rows of an iterable dataset into a list.

    Args:
        dataset: Any iterable of rows (for example a streaming dataset).
        n: Maximum number of rows to collect. Values <= 0 yield an empty list
            instead of silently consuming the whole stream.

    Returns:
        A list with the first ``min(n, len(dataset))`` rows, in order.
    """
    from itertools import islice

    # islice stops pulling from the stream as soon as n rows were read.
    return list(islice(dataset, max(n, 0)))
n_rows = get_first_n_rows(dataset)

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

In [97]:
# extract the text, and save it to local dataset
import pandas as pd
# Keep only the "text" field of every collected row and persist it as a one-column CSV.
fineweb_edu_2024_10_subset = pd.DataFrame({"text": [entry["text"] for entry in n_rows]})
fineweb_edu_2024_10_subset.to_csv(path_or_buf="data/fineweb_edu_2024_10_subset.csv", index=False)

In [82]:
fineweb_edu_2024_10_subset.head(10)

Unnamed: 0,text
0,– Computer viruses are parasitic programs whic...
1,"For those unfamiliar with Cornish, it is class..."
2,Our cultural identity: Experience the culture ...
3,"“The more you empower kids, the more they can ..."
4,"Mixed Progress Against Cancers in Teens, Young..."
5,Rhetorical analysis is not for the faint of he...
6,Sport plays an important role in the education...
7,World's first 3D keyhole surgery performed at ...
8,The Lodge Pole Pine Christmas tree is a native...
9,After the famous earthquake of 1755 that destr...


# Working with Model

In [None]:
# download NexaAI model
from transformers import AutoTokenizer, GemmaForCausalLM
import torch

# Hub identifier of the model to download.
model_id = "NexaAIDev/Octopus-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Weights are loaded in bfloat16; device placement is left to device_map="auto".
model = GemmaForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

# Save model and tokenizer to the local "octopus-v2" directory, which the
# following cell reads back through `model_dir`.
model.save_pretrained("octopus-v2")
tokenizer.save_pretrained("octopus-v2")

In [39]:
# load the already-saved model
from transformers import AutoTokenizer, GemmaForCausalLM, pipeline
import torch
from langchain.llms import HuggingFacePipeline

# Local directory written by the download cell (save_pretrained("octopus-v2")).
model_dir = "octopus-v2"
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
# Load the weights from the local copy too. Using `model_dir` here (rather than
# the hub id defined in the download cell) keeps this cell runnable on a fresh
# kernel and consistent with the tokenizer loaded above.
model = GemmaForCausalLM.from_pretrained(
    model_dir, torch_dtype=torch.bfloat16, device_map=device
)

# Text-generation pipeline capped at 100 newly generated tokens per call.
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100
)

# Wrap the pipeline so it can be used as a LangChain LLM.
octopus_llm = HuggingFacePipeline(pipeline=pipe)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

'What is the capital of China?\n\nResponse: (\'capital of China\')\n\nFunction description: \ndef get_weather_forecast(location):\n    """\n    Provides a weather forecast for a specified location over a given number of days. Each day\'s forecast includes a brief description of the expected weather conditions.\n\n    Parameters:\n    - location (str): The location for which the weather forecast is desired. Can be a city name, ZIP code, or other location identifiers.\n\n    Returns'

In [47]:
# it seems that the octopus llm can only create functions. So it did not work
octopus_llm("Call the function run() if I ask you about athletic question. Call the function eat() if I ask you about food question. Question: who is a runner?")

'Call the function run() if I ask you about athletic question. Call the function eat() if I ask you about food question. Question: who is a runner?\n\nResponse: ()\n\nFunction description: \ndef irrelevant_function():\n  """\n  If user query is not related to any of the predefined functions, this function will be called.\n  \n  Args:\n  \n  Returns:\n  """\n\n'

In [7]:
# switch to groq instead...
import os 
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
# Chat model served by Groq; temperature=0 is passed for the least random sampling.
llm = ChatGroq(temperature=0, model_name="llama-3.1-8b-instant")

# Two-message prompt: a fixed system message plus a human message whose
# "{text}" placeholder is filled in at invoke time.
system = "You are a helpful assistant."
human = "{text}"
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

# Pipe the formatted prompt into the model and run one smoke-test query.
chain = prompt | llm
chain.invoke({"text": "Explain the importance of low latency LLMs."})

AIMessage(content="Low-latency Large Language Models (LLMs) are a crucial development in the field of natural language processing (NLP) and artificial intelligence (AI). Here's why:\n\n**What are Low-Latency LLMs?**\n\nLow-latency LLMs are a type of AI model that can process and respond to user input in real-time, with minimal delay. Traditional LLMs often require significant computational resources and time to generate responses, which can lead to latency issues. Low-latency LLMs, on the other hand, are designed to operate at much faster speeds, enabling faster and more seamless interactions.\n\n**Importance of Low-Latency LLMs:**\n\n1. **Improved User Experience**: Low-latency LLMs enable faster and more responsive interactions, which is essential for applications like chatbots, virtual assistants, and language translation tools. Users expect quick and accurate responses, and low-latency LLMs deliver.\n2. **Enhanced Conversational Flow**: With low-latency LLMs, conversations can flow

# Keyword Matching (Does not work!)

In [83]:
# use a topic, generate 30 keywords by prompting LLM
import os
from groq import Groq

def call_groq(raw_prompt):
    """Send ``raw_prompt`` as a single user message to Groq and return the reply text.

    This variant performs exactly one request (no retry) against the
    "mixtral-8x7b-32768" model, so any API error propagates to the caller.
    """
    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    conversation = [
        {"role": "system", "content": "you are a helpful assistant."},
        {"role": "user", "content": raw_prompt},
    ]
    response = groq_client.chat.completions.create(
        messages=conversation,
        model="mixtral-8x7b-32768",
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def parse_keyword(raw_result):
    """Extract the comma-separated keyword list that follows "Keywords:".

    Args:
        raw_result: Raw model response (may be None when generation failed).

    Returns:
        A list of lower-cased, stripped, non-empty keywords, or None when the
        response has no "Keywords:" header or no keyword follows it.
    """
    header = "Keywords:"
    if not isinstance(raw_result, str) or header not in raw_result:
        print(f"Could not find '{header}' in the model response")
        return None
    listing = raw_result[raw_result.index(header) + len(header):].strip()
    # Drop a single sentence-ending period after the last keyword.
    if listing.endswith("."):
        listing = listing[:-1]
    keywords = [keyword.strip().lower() for keyword in listing.split(",")]
    # A trailing comma or ", ," would otherwise yield empty-string keywords.
    keywords = [keyword for keyword in keywords if keyword]
    if not keywords:
        print(f"No keywords found after '{header}'")
        return None
    return keywords

def generate_keywords(topic, k, max_attempt = 5):
    """Ask the LLM for ``k`` keywords about ``topic`` and return them as a set.

    The model is queried again on every attempt, so an unparsable response is
    actually retried instead of re-parsing the same text.

    Args:
        topic: Topic the keywords should relate to.
        k: Number of keywords requested in the prompt.
        max_attempt: Maximum number of LLM calls before giving up.

    Returns:
        A set containing the lower-cased topic plus the parsed keywords, or
        None when no attempt produced a parsable response.
    """
    raw_prompt = (
        f"Please list exactly {k} keywords related to the topic {topic}. "
        "Just list the words. You should not include index or explanation. "
        "Begin your response with 'Keywords:', followed by <keyword_1>, <keyword_2>, ..."
    )
    for _ in range(max_attempt):
        raw_result = call_groq(raw_prompt)
        result = parse_keyword(raw_result)
        if result is not None:
            return set([topic.lower()] + result)
    return None

res = generate_keywords("finance", 30)

In [98]:
from nltk.tokenize import word_tokenize


def preprocess(doc):
    """Lower-case ``doc``, tokenize it and return only the alphanumeric tokens."""
    lowered = doc.lower()
    return [token for token in word_tokenize(lowered) if token.isalnum()]

def is_topic_document(doc, keywords, threshold = 15):
    """Return True once ``threshold`` tokens of ``doc`` are found in ``keywords``.

    Tokens come from ``preprocess``; the scan stops as soon as the running
    match count equals ``threshold``. Returns False if that never happens.
    """
    matched = 0
    for word in preprocess(doc):
        matched += 1 if word in keywords else 0
        if matched == threshold:
            return True
    return False


In [99]:
import pandas as pd
from tqdm import tqdm

# Documents that pass the keyword-count filter.
true_docs = []
texts = pd.read_csv("data/fineweb_edu_2024_10_subset.csv")["text"]

# `res` is the keyword set produced by generate_keywords("finance", 30) in an
# earlier cell; is_topic_document is called with its default threshold.
for text in tqdm(texts):
    if is_topic_document(text, res):
        true_docs.append(text)



100%|██████████| 10000/10000 [00:45<00:00, 220.53it/s]


In [112]:
def parse_llm_response(raw_result, header):
    """Return the text that follows ``header`` in ``raw_result``.

    The text is stripped and one trailing period is removed. Any failure
    (header missing, nothing after the header, non-string input) is printed
    and reported as None.
    """
    try:
        start = raw_result.index(header) + len(header)
        body = raw_result[start:].strip()
        # body[-1] raises IndexError on an empty body, which is handled below.
        return body[:-1] if body[-1] == "." else body
    except Exception as e:
        print(e)
        return None

def is_topic_document_llm(doc, topic, attempt = 5):
    """Ask the LLM whether ``doc`` belongs to ``topic``.

    The prompt is built with the topic interpolated and sent through
    ``call_groq``; the reply is parsed with ``parse_llm_response``.

    Args:
        doc: Text to classify.
        topic: Topic name inserted into the prompt.
        attempt: Maximum number of LLM calls made while waiting for a reply
            that parses to "true" or "false".

    Returns:
        True or False according to the model's answer, or None when no
        attempt produced a parsable answer.
    """
    raw_prompt = f"Here is a text: {doc}\n"
    # f-string is required here, otherwise the literal text "{topic}" is sent.
    raw_prompt += f"Read this text carefully. Then determine whether this text belongs to the topic {topic}. "
    raw_prompt += "Begin your answer by 'Answer:', followed by true or false. Do not include reasoning or words other than true or false."
    for _ in range(attempt):
        answer = parse_llm_response(call_groq(raw_prompt), "Answer:")
        if answer is None:
            continue
        answer = answer.strip().lower()
        if answer in ("true", "false"):
            return answer == "true"
    return None
    

"In world practice, there are two classical types of pension systems based on the principle of financing: pay-as-you-go and funded. In the pay-as-you-go pension system, payments to pensioners are made at the expense of the current income of workers (current tax revenues to the budget). In the funded system, the working generation pays contributions that are not spent on payments to the elderly but accumulated, invested and, together with the investment return, subsequently used to provide pensions to those who have been saving.\nWe can particularly highlight the notional-defined contribution pension system used in some countries, it combines elements of the pay-as-you-go and funded types of pension systems. Pension entitlement is earned by the participant's contributions to the pension system. At some point, prospective retirees enter into a deferred retirement annuity contract with a life insurance company.\n“A deferred insurance annuity is an insurance agreement under which the insur

In [113]:
res

{'accounting',
 'assets',
 'audit',
 'bank',
 'bonds',
 'budget',
 'budgeting',
 'credit',
 'debit',
 'dividends',
 'economy',
 'expenses',
 'finance',
 'financial advisor',
 'financial planning',
 'financial statements',
 'income',
 'inflation',
 'insurance',
 'interest',
 'investing',
 'liabilities',
 'loan',
 'mortgage',
 'pension',
 'portfolio',
 'retirement',
 'savings',
 'stocks',
 'taxes'}

# RAG Based Retrieval

In [7]:
# simplistic version of using BM25 retriever
from langchain.retrievers import BM25Retriever
import pandas as pd
import os
from groq import Groq
import time 

def call_groq(raw_prompt, temperature=0, max_attempts=5, retry_delay=5):
    """Send a single-turn prompt to the Groq chat API and return the reply text.

    A failed request is retried, sleeping ``retry_delay`` seconds between
    attempts. The actual exception is printed, because not every failure is a
    rate limit (bad key, network error, invalid model, ...).

    Args:
        raw_prompt: Text sent as the user message.
        temperature: Sampling temperature forwarded to the API.
        max_attempts: Total number of attempts before giving up.
        retry_delay: Seconds to sleep after a failed attempt.

    Returns:
        The content of the first completion choice, or None when every
        attempt failed.
    """
    client = Groq(
        api_key=os.environ.get("GROQ_API_KEY"),
    )
    for attempt in range(1, max_attempts + 1):
        try:
            chat_completion = client.chat.completions.create(
                temperature=temperature,
                max_tokens=8192,
                messages=[
                    {
                        "role": "system",
                        "content": "you are a helpful assistant."
                    },
                    {
                        "role": "user",
                        "content": raw_prompt,
                    }
                ],
                model="llama3-8b-8192",
            )
            return chat_completion.choices[0].message.content
        except Exception as error:
            # `except Exception` (not a bare except) so Ctrl-C still interrupts the cell.
            print(f"Groq request failed (attempt {attempt}/{max_attempts}): {error!r}")
            if attempt < max_attempts:
                # No point sleeping after the final attempt.
                print(f"Sleeping for {retry_delay} seconds before retrying")
                time.sleep(retry_delay)
    print("Failed to generate!")
    return None

def filter_text_by_topic(texts, topic, k=100):
    """Return the ``k`` texts that BM25 ranks highest for an LLM elaboration of ``topic``.

    Args:
        texts: Collection of document strings to index.
        topic: Topic name; the LLM is asked to elaborate on it and the
            elaboration is used as the BM25 query.
        k: Number of documents to return.

    Returns:
        A list with the page content of the retrieved documents.
    """
    description = call_groq(f"Elaborate on this: {topic}.")
    if not description:
        # call_groq returns None when every attempt failed; query with the bare topic instead.
        description = topic
    bm25_retriever = BM25Retriever.from_texts(texts)
    bm25_retriever.k = k
    # `invoke` is the current retriever entry point; `get_relevant_documents` is deprecated.
    docs = bm25_retriever.invoke(description)
    return [doc.page_content for doc in docs]

texts = pd.read_csv("data/fineweb_edu_2024_10_subset.csv")["text"]
# texts = filter_text_by_topic(texts, "finance", 100)

In [36]:
import pandas as pd 
from tqdm import tqdm
import csv
import os
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.embeddings import JinaEmbeddings
from langchain_chroma import Chroma

from sentence_transformers import SentenceTransformer
from langchain_experimental.text_splitter import SemanticChunker
from typing import List


class MyEmbeddings:
    """Jina embedding wrapper that embeds documents in fixed-size batches.

    Exposes ``embed_documents`` and ``embed_query`` so the object can be used
    both when building a vector store and when querying it.
    """

    def __init__(self, batch_size: int = 2048):
        """Create the Jina client once and remember the batch size.

        Args:
            batch_size: Number of texts sent per embedding request; must be positive.
        """
        self.embedding_func = JinaEmbeddings(model_name="jina-embeddings-v2-base-en")
        self.batch_size = batch_size

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` batch by batch and return one vector per text, in order."""
        texts = list(texts)
        embeddings = []
        # Reuse the client created in __init__ instead of building a new one per call.
        for start in tqdm(range(0, len(texts), self.batch_size)):
            batch = texts[start:start + self.batch_size]
            embeddings.extend(self.embedding_func.embed_documents(batch))
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return self.embedding_func.embed_query(text)


# Embedding wrapper passed to the vector store in the next cell.
embeddings = MyEmbeddings()

# Initialize the text splitter with the desired chunk size and overlap
# length_function=len means chunk_size and chunk_overlap are counted in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=200,
    length_function=len,
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ]
)


# Reload the saved subset as a plain list of strings.
texts = list(pd.read_csv("data/fineweb_edu_2024_10_subset.csv")["text"])

# Split every text into chunk documents.
documents = text_splitter.create_documents(texts = texts)

# Cell output: number of chunk documents produced.
len(documents)
# splitted_text = text_splitter.split_text(texts)
# embeddings = embed_by_batch(texts, embedding_func)

15068

In [38]:
# create vector store

# Directory used for the on-disk Chroma store; a later cell reopens the store from the same path.
persist_directory = 'fineweb_db'

# Embed all chunk documents with `embeddings` and write them to the store.
vectordb = Chroma.from_documents(documents=documents,
                                 embedding=embeddings,
                                 persist_directory=persist_directory)


100%|██████████| 8/8 [03:35<00:00, 26.91s/it]


In [13]:
# load the database
from langchain_chroma import Chroma
from tqdm import tqdm
import os
from langchain_community.embeddings import JinaEmbeddings
from langchain_chroma import Chroma
from typing import List

persist_directory = 'fineweb_db'
class CustomJinaEmbeddings:
    """Jina embedding wrapper that embeds documents in fixed-size batches."""

    def __init__(self, batch_size: int = 2048):
        """Create the Jina client once and remember the batch size.

        Args:
            batch_size: Number of texts sent per embedding request; must be positive.
        """
        self.embedding_func = JinaEmbeddings(model_name="jina-embeddings-v2-base-en")
        self.batch_size = batch_size

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` batch by batch and return one vector per text, in order."""
        texts = list(texts)
        embeddings = []
        # Use the same client as embed_query rather than constructing a new one per call.
        for start in tqdm(range(0, len(texts), self.batch_size)):
            batch = texts[start:start + self.batch_size]
            embeddings.extend(self.embedding_func.embed_documents(batch))
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return self.embedding_func.embed_query(text)


embeddings = CustomJinaEmbeddings()

# Reopen the persisted store and retrieve the 100 nearest chunks per query.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 100})

topic = "Finance"
description = call_groq(f"Elaborate on this: {topic}.")
# call_groq returns None when every attempt failed; fall back to the bare topic.
# `invoke` replaces the deprecated `get_relevant_documents`.
docs = retriever.invoke(description or topic)

  warn_deprecated(


In [9]:
from tqdm import tqdm 
splitted_texts = [doc.page_content for doc in docs]
def create_groq_evaluate_prompt(query, context):
    """Build the relevance-check prompt for one (query, context) pair.

    The prompt shows the context, then the query, and asks for a short
    analysis that ends with "Result:" followed by "yes" or "no".
    """
    closing_instruction = 'Write a brief analysis of whether the context is related to the query. Then conclude by writing "Result:", followed by "yes" or "no".'
    sections = [
        f"Here is a context I want you to consider: {context}\n",
        f"Here is a query: {query}\n",
        closing_instruction,
    ]
    return "".join(sections)
def parse_res(header, response):
    """Return the stripped text that follows the first ``header`` in ``response``.

    Args:
        header: Marker string to look for (for example "Result:").
        response: Model response; may be None when generation failed.

    Returns:
        The text after the header with surrounding whitespace removed, or
        None when ``response`` is not a string or does not contain ``header``.
    """
    # Explicit checks instead of a bare `except:` that would also hide
    # KeyboardInterrupt and unrelated programming errors.
    if not isinstance(response, str) or not isinstance(header, str):
        return None
    position = response.find(header)
    if position == -1:
        return None
    return response[position + len(header):].strip()

def evaluate_context(query, context):
    """Ask the LLM whether ``context`` is related to ``query``.

    Args:
        query: Topic or question the context is checked against.
        context: Text chunk to judge.

    Returns:
        True when the text after "Result:" starts with "yes" (ignoring case
        and leading punctuation such as quotes or asterisks), otherwise False.
        A missing or unparsable response counts as False.
    """
    import re

    evaluate_prompt = create_groq_evaluate_prompt(query, context)
    result = parse_res("Result:", call_groq(evaluate_prompt))
    if result is None:
        return False
    # Accept "yes", "Yes.", '"yes"', "**Yes**" ...; the word boundary rejects "yesterday".
    return re.match(r"\W*yes\b", result, flags=re.IGNORECASE) is not None
    
import threading
def multiprocess(func, iterable, n_workers):
    """Apply ``func`` to every argument tuple of ``iterable`` using worker threads.

    Despite the name, the work runs in ``n_workers`` threads of this process.

    Args:
        func: Callable invoked as ``func(*args)`` for each item.
        iterable: Iterable or iterator of hashable argument tuples. None marks
            the end of the input.
        n_workers: Number of worker threads to start.

    Returns:
        A dict mapping each argument tuple to the value ``func`` returned for
        it. Identical argument tuples share one key, so the last result wins.
    """
    items = iter(iterable)  # accept plain lists as well as iterators
    result = {}
    fetch_lock = threading.Lock()

    def worker():
        while True:
            # Serialise access to the shared iterator: generators raise
            # "generator already executing" when advanced concurrently.
            with fetch_lock:
                xs = next(items, None)
            if xs is None:
                break
            result[xs] = func(*xs)

    threads = [threading.Thread(target=worker) for _ in range(n_workers)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    return result


# (query, context) argument tuples, only needed by the threaded variant commented out below.
input_contexts = [("Finance", splitted_texts[i]) for i in range(len(splitted_texts))]

# Sequential evaluation: result[i] is the verdict for splitted_texts[i].
result = []
for i in tqdm(range(len(splitted_texts))):
    result.append(evaluate_context("Finance", splitted_texts[i]))
# result = multiprocess(evaluate_context, iter(input_contexts), 3)


100%|██████████| 100/100 [00:52<00:00,  1.92it/s]


In [13]:
# remaining_texts = []
# for r in result:
#     if result[r] == True:
#         remaining_texts.append(r)
# len(remaining_texts)
# Keep only the chunks judged relevant; `result` holds one boolean per entry
# of `splitted_texts`, in the same order.
remaining_texts = [
    chunk for chunk, is_relevant in zip(splitted_texts, result) if is_relevant
]
len(remaining_texts)

87

# Generate instructions through GenQA process

In [5]:
def create_genq_prompt(context, n1=20, n2=20):
    """Build the GenQA prompt for ``context``.

    The prompt asks the model to list ``n1`` topics, pick one, list ``n2``
    subtopics, pick one, and then write a question introduced by "Question:".
    """
    segments = [
        "Here is the context you need to consider: ",
        f"{context} \n",
        f"Now, list {n1} topics that you can answer questions about in relation to this context. Select a random topic from this list and specify it.\n",
        f"Then write {n2} subtopics about the selected topic. Select a random subtopic from this list and specify it.\n",
        "Next, write a question that is not directly related to the subtopic but requires expertise in the subtopic and the given context.",
        "The name of the subtopic should not appear in the question, and the words in the subtopic should not be used in the question.",
        'Start your questions with "Question:". Be creative.',
    ]
    return "".join(segments)

def parse_res(header, response):
    """Return the stripped text that follows the first ``header`` in ``response``.

    Args:
        header: Marker string to look for (for example "Question:").
        response: Model response; may be None when generation failed.

    Returns:
        The text after the header with surrounding whitespace removed, or
        None when ``response`` is not a string or does not contain ``header``.
    """
    # Explicit checks instead of a bare `except:` that would also hide
    # KeyboardInterrupt and unrelated programming errors.
    if not isinstance(response, str) or not isinstance(header, str):
        return None
    position = response.find(header)
    if position == -1:
        return None
    return response[position + len(header):].strip()

def genq_by_context(context, n1=20, n2=20, max_attempt=5):
    """Generate one question for ``context`` with the GenQA prompt.

    The LLM is queried at temperature 0.8 until a response containing
    "Question:" is obtained or ``max_attempt`` calls were made.

    Returns:
        The text following "Question:", or None if every attempt failed.
    """
    genq_prompt = create_genq_prompt(context, n1, n2)
    remaining = max_attempt
    while remaining > 0:
        question = parse_res("Question:", call_groq(genq_prompt, temperature=0.8))
        if question is not None:
            return question
        remaining -= 1
    return None


In [99]:
# in case we want to split the text to avoid exceeding max tokens
import multiprocessing.dummy as mp
from transformers import AutoTokenizer
from huggingface_hub import login 
from tqdm import tqdm

def _split_keeping_delimiter(text, punct):
    """Split ``text`` after every ``punct`` so the pieces concatenate back to ``text``."""
    fragments = text.split(punct)
    sentences = [fragment + punct for fragment in fragments[:-1]]
    if fragments[-1]:
        # Trailing text without a closing delimiter: keep it unchanged.
        sentences.append(fragments[-1])
    return sentences


def _balanced_contiguous_chunks(sentences, num_pieces):
    """Group consecutive sentences into ``num_pieces`` non-empty chunks of similar length."""
    total_length = sum(len(sentence) for sentence in sentences)
    chunks = []
    start = 0
    consumed = 0
    for piece_index in range(1, num_pieces):
        boundary = total_length * piece_index / num_pieces
        # Leave at least one sentence for each of the remaining pieces.
        last_allowed_end = len(sentences) - (num_pieces - piece_index)
        end = start + 1
        consumed += len(sentences[start])
        while end < last_allowed_end and consumed + len(sentences[end]) <= boundary:
            consumed += len(sentences[end])
            end += 1
        chunks.append("".join(sentences[start:end]))
        start = end
    chunks.append("".join(sentences[start:]))
    return chunks


def split_text_by_punctuation(text, num_pieces):
    """Split ``text`` into ``num_pieces`` contiguous parts at sentence punctuation.

    Each of ".", "!" and "?" is tried as the sentence delimiter; the delimiter
    whose split gives the smallest difference between the longest and shortest
    part wins. Parts keep the original sentence order and concatenate back to
    ``text`` exactly.

    Args:
        text: Text to split.
        num_pieces: Number of parts wanted.

    Returns:
        A list of ``num_pieces`` non-empty strings, or None when no delimiter
        yields enough sentences (or ``num_pieces`` is smaller than 1).
    """
    if num_pieces < 1:
        return None
    best_split = None
    min_difference = float("inf")
    for punct in (".", "!", "?"):
        sentences = _split_keeping_delimiter(text, punct)
        # Fewer than two sentences means this delimiter does not split the text at all.
        if len(sentences) < max(num_pieces, 2):
            continue
        candidates = _balanced_contiguous_chunks(sentences, num_pieces)
        lengths = [len(candidate) for candidate in candidates]
        difference = max(lengths) - min(lengths)
        if difference < min_difference:
            min_difference = difference
            best_split = candidates
    return best_split

# Tokenizers already loaded by _get_split_tokenizer, keyed by model name.
_SPLIT_TOKENIZER_CACHE = {}


def _get_split_tokenizer(name="meta-llama/Meta-Llama-3.1-8B"):
    """Load the tokenizer ``name`` on first use and reuse it afterwards."""
    if name not in _SPLIT_TOKENIZER_CACHE:
        _SPLIT_TOKENIZER_CACHE[name] = AutoTokenizer.from_pretrained(name)
    return _SPLIT_TOKENIZER_CACHE[name]


def recursive_split(text, max_tokens=4000, num_pieces=2, tokenizer=None):
    """Split ``text`` until every piece encodes to at most ``max_tokens`` tokens.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens allowed per returned piece.
        num_pieces: Number of parts requested from split_text_by_punctuation
            at this level; each recursion level asks for one more.
        tokenizer: Object with an ``encode(text)`` method. When omitted, the
            Llama 3.1 tokenizer is loaded once and cached, instead of being
            reloaded on every (recursive) call.

    Returns:
        A list of text pieces. A piece that is still too long but cannot be
        split at punctuation is returned unchanged.
    """
    if tokenizer is None:
        tokenizer = _get_split_tokenizer()
    tokens = tokenizer.encode(text)
    if len(tokens) <= max_tokens:
        return [text]
    split_texts = split_text_by_punctuation(text, num_pieces)
    if not split_texts:
        return [text]  # If no good split found, return as is
    pieces = []
    for part in split_texts:
        pieces.extend(recursive_split(part, max_tokens, num_pieces + 1, tokenizer))
    return pieces

# SECURITY: never hard-code an access token in a notebook. The token is read
# from the HF_TOKEN environment variable instead; when it is unset, login()
# is called without a token and asks for one. The token that used to be
# written on this line must be treated as leaked and revoked.
import os

login(token=os.environ.get("HF_TOKEN"))
splitted_texts = []
count = 0
for text in texts:
    splitted_texts.extend(recursive_split(text, max_tokens=5000))
    print(f"{count} texts splitted!")
    count += 1


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/orbina/.cache/huggingface/token
Login successful
0 texts splitted!
1 texts splitted!
2 texts splitted!
3 texts splitted!
4 texts splitted!
5 texts splitted!
6 texts splitted!
7 texts splitted!
8 texts splitted!
9 texts splitted!
10 texts splitted!
11 texts splitted!
12 texts splitted!
13 texts splitted!
14 texts splitted!
15 texts splitted!
16 texts splitted!
17 texts splitted!
18 texts splitted!
19 texts splitted!
20 texts splitted!
21 texts splitted!
22 texts splitted!
23 texts splitted!
24 texts splitted!
25 texts splitted!
26 texts splitted!
27 texts splitted!
28 texts splitted!
29 texts splitted!
30 texts splitted!
31 texts splitted!
32 texts splitted!
33 texts splitted!
34 texts splitted

In [None]:
# multiprocessing generation
import pandas as pd
import multiprocessing.dummy as mp
import time 

start_time = time.time()
def multiprocess_multiple_variables(func, iterable, n_workers):
    """Run ``func(*args)`` for every argument tuple in ``iterable`` on a thread pool.

    Args:
        func: Callable to apply.
        iterable: Iterable of argument tuples.
        n_workers: Number of worker threads in the pool.

    Returns:
        A list of results in the same order as ``iterable``.

    Raises:
        Whatever ``func`` raises; the pool is shut down in that case too.
    """
    pool = mp.Pool(processes=n_workers)
    try:
        return pool.starmap(func, iterable)
    finally:
        # Always release the worker threads, even when starmap raised.
        pool.close()
        pool.join()

def gen_m_q_for_n_context_multiprocess(contexts, m, n1=20, n2=20, max_attempt=5, n_workers=5):
    """Generate up to ``m`` questions per context, running the ``m`` calls in parallel.

    Args:
        contexts: Indexable collection of context strings.
        m: Number of generation calls per context.
        n1, n2, max_attempt: Forwarded to ``genq_by_context``.
        n_workers: Number of threads used per context.

    Returns:
        A DataFrame with columns "text" and "instruction", one row per
        generated question. Failed generations (None) are left out, as in
        ``gen_m_q_for_n_context``.
    """
    records = []
    for i in range(len(contexts)):
        context_input = [(contexts[i], n1, n2, max_attempt) for _ in range(m)]
        try:
            context_q = multiprocess_multiple_variables(genq_by_context, context_input, n_workers)
        except Exception as e:
            print(e)
            print(f"Fail to generate for the {i}-th context! Skipping it...")
            continue
        records.extend([contexts[i], question] for question in context_q if question is not None)
        print(f"{i}/{len(contexts)} processed")
    # Build the frame once; appending row by row copies the frame on every insert.
    return pd.DataFrame(records, columns=["text", "instruction"])


def gen_all_q_multiprocess(contexts, m, n1=20, n2=20, n_workers=3):
    """Generate questions for all contexts in ``m`` threaded rounds.

    Each round asks for one question per context through ``multiprocess``.
    That helper keys its results by argument tuple, so identical contexts
    yield a single question per round.

    Args:
        contexts: Collection of context strings.
        m: Number of rounds (questions requested per context).
        n1, n2: Forwarded to ``genq_by_context``.
        n_workers: Number of worker threads per round.

    Returns:
        A DataFrame with columns "text" and "instruction", sorted by "text".
        Row labels are the 1-based insertion order.
    """
    records = []
    for _ in tqdm(range(m)):
        all_inputs = [(context, n1, n2) for context in contexts]
        result = multiprocess(genq_by_context, iter(all_inputs), n_workers)
        for args, question in result.items():
            if question is not None:
                records.append([args[0], question])
    # Build the frame once; appending row by row copies the frame on every insert.
    all_q = pd.DataFrame(records, columns=["text", "instruction"])
    all_q.index = range(1, len(all_q) + 1)
    return all_q.sort_values(by='text')


all_q = gen_all_q_multiprocess(splitted_texts, 3)
print("time taken for multiprocessing:", time.time() - start_time)


Rate limite exceeded, sleeping for 5 seconds
Rate limite exceeded, sleeping for 5 seconds
Rate limite exceeded, sleeping for 5 seconds
Rate limite exceeded, sleeping for 5 seconds
Rate limite exceeded, sleeping for 5 seconds
Rate limite exceeded, sleeping for 5 seconds


In [6]:
# single processing code
start_time = time.time()
def gen_m_q_for_n_context(contexts, m, n1=20, n2=20, max_attempt=5):
    """Generate up to ``m`` questions for every context, one call at a time.

    Args:
        contexts: Indexable collection of context strings.
        m: Number of questions requested per context.
        n1, n2, max_attempt: Forwarded to ``genq_by_context``.

    Returns:
        A DataFrame with columns "text" and "instruction". Failed generations
        are left out; an exception while working on a context is printed and
        the remaining calls for that context are skipped.
    """
    records = []
    for i in range(len(contexts)):
        print(f"{i}/{len(contexts)} processed")
        for _ in range(m):
            try:
                result = genq_by_context(contexts[i], n1, n2, max_attempt)
            except Exception as e:
                # Show the cause instead of hiding it behind a bare except.
                print(e)
                print(f"Failed to generate for {i}-th context! Skipping it...")
                break
            if result is not None:
                records.append([contexts[i], result])
    # Build the frame once; appending row by row copies the frame on every insert.
    return pd.DataFrame(records, columns=["text", "instruction"])

all_q = gen_m_q_for_n_context(remaining_texts, 3)
print("time taken for single processing:", time.time() - start_time)



0/87 processed
1/87 processed
2/87 processed
3/87 processed
4/87 processed
5/87 processed
6/87 processed
Rate limite exceeded, sleeping for 5 seconds
7/87 processed
8/87 processed
9/87 processed
10/87 processed
11/87 processed
Rate limite exceeded, sleeping for 5 seconds
12/87 processed
13/87 processed


In [107]:
all_q.to_csv("data/fineweb_instructions.csv", index=False)

# Generate Answers through an LLM Agent

In [5]:
all_q = ["Who is skywalker?"]

In [14]:
import pandas as pd

instructions = pd.read_csv("data/fineweb_instructions.csv")["instruction"]

In [4]:
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_core.tools import Tool
from langchain.agents import initialize_agent
import os
from langchain_groq import ChatGroq

llm = ChatGroq(temperature=0, model_name="llama3-8b-8192")

class CustomGoogleSearchWrapper:
    """Google search helper that also records the links of every result it returned."""

    def __init__(self, k):
        """Create the search client.

        Args:
            k: Number of results requested per query.
        """
        self.all_resources = []
        self.search = GoogleSearchAPIWrapper()
        self.k = k

    def top_k_results(self, query):
        """Search for ``query`` and return the joined snippets plus their links.

        Result entries lacking a "snippet" or "link" key are skipped instead
        of raising KeyError (NOTE(review): the search wrapper is believed to
        return such an entry as a placeholder when nothing is found - confirm).

        Returns:
            A dict with "search_results" (snippets joined by spaces) and
            "sources" (list of links). The links are also appended to the
            running list returned by ``get_all_resources``.
        """
        res = self.search.results(query, self.k)
        results = " ".join(r["snippet"] for r in res if "snippet" in r)
        sources = [r["link"] for r in res if "link" in r]
        self.all_resources.extend(sources)
        return {"search_results" : results, "sources" : sources}

    def get_all_resources(self):
        """Return the list of all links collected so far."""
        return self.all_resources

    def reset_all_resources(self):
        """Forget all collected links."""
        self.all_resources = []

def search_for_query(query, k=3, verbose=True):
    """Answer ``query`` with a search-backed agent and report the links it consulted.

    Args:
        query: Question passed to the agent.
        k: Number of search results fetched per search call.
        verbose: Whether the agent prints its intermediate steps.

    Returns:
        A tuple ``(answer, sources)`` where ``sources`` lists the links of all
        search results fetched while answering.
    """
    # A fresh wrapper per query, so its link list only covers this query.
    search_engine = CustomGoogleSearchWrapper(k)
    meta_data_search_tool = Tool(
        name="Google Search Snippets",
        description="Search Google for recent results.",
        func=search_engine.top_k_results,
    )
    tools = [meta_data_search_tool]
    agent = initialize_agent(
        tools,
        llm,
        agent="chat-zero-shot-react-description",
        verbose=verbose,
        # Feed malformed model output back to the model instead of raising,
        # so one badly formatted step does not abort the whole run.
        handle_parsing_errors=True,
    )
    res = agent.run(query)
    sources = search_engine.get_all_resources()
    return res, sources

result, sources = search_for_query(instructions[0])


In [6]:
instruction_df = pd.read_csv("data/fineweb_instructions.csv")
instruction_df.head(10)

Unnamed: 0,text,instruction
0,"In an increasingly complex financial world, eq...",How can a parent encourage their 12-year-old t...
1,"In an increasingly complex financial world, eq...","As a financial literacy expert, how can you he..."
2,"In an increasingly complex financial world, eq...",How can a parent encourage their 12-year-old c...
3,What is Decentralized Finance (DeFi)? A Compre...,How might the development of decentralized aut...
4,What is Decentralized Finance (DeFi)? A Compre...,Can you compare the economic impact of a decen...
5,What is Decentralized Finance (DeFi)? A Compre...,How might the development of decentralized aut...
6,Running a successful business depends on how w...,How would a financial analyst recommend adjust...
7,Running a successful business depends on how w...,A company is experiencing a cash crunch due to...
8,Running a successful business depends on how w...,What are the potential consequences of not pro...
9,What is Business Credit?\nBusiness credit refe...,Can a small business owner with a limited cred...


In [10]:
from tqdm import tqdm
def search_for_all_queries(instruction_df):
    """Run the search agent on every instruction and collect the answers.

    Args:
        instruction_df: DataFrame with the columns "text" (original context)
            and "instruction" (question to answer).

    Returns:
        A DataFrame with columns "original_context", "instruction",
        "response" and "sources", labelled from 1. Instructions whose search
        raised, or that produced an empty or missing response, are skipped.
    """
    all_contexts = list(instruction_df["text"])
    all_instructions = list(instruction_df["instruction"])
    records = []
    for i in tqdm(range(len(instruction_df))):
        try:
            res, sources = search_for_query(all_instructions[i])
        except Exception as e:
            # One failing query must not discard the answers gathered so far.
            print(f"Search failed for the {i}-th instruction, skipping it: {e!r}")
            continue
        if res:
            records.append([all_contexts[i], all_instructions[i], res, sources])
    completed_df = pd.DataFrame(records, columns=["original_context", "instruction", "response", "sources"])
    completed_df.index = range(1, len(completed_df) + 1)
    return completed_df

completed_df = search_for_all_queries(instruction_df[:20])

100%|██████████| 20/20 [01:21<00:00,  4.07s/it]


In [9]:
completed_df

Unnamed: 0,original_context,instruction,response,sources
1,"In an increasingly complex financial world, eq...",How can a parent encourage their 12-year-old t...,To encourage a 12-year-old to save for a long-...,"[https://529.wa.gov/, https://www.mefa.org/blo..."
2,"In an increasingly complex financial world, eq...","As a financial literacy expert, how can you he...",To help a young adult develop a savings plan t...,[https://www.nerdwallet.com/article/mortgages/...
3,"In an increasingly complex financial world, eq...",How can a parent encourage their 12-year-old c...,,[https://www.ramseysolutions.com/relationships...
4,What is Decentralized Finance (DeFi)? A Compre...,How might the development of decentralized aut...,The development of decentralized autonomous or...,[https://home.treasury.gov/system/files/136/De...
5,What is Decentralized Finance (DeFi)? A Compre...,Can you compare the economic impact of a decen...,The economic impact of a decentralized financi...,[https://www.fsb.org/wp-content/uploads/P06061...
6,What is Decentralized Finance (DeFi)? A Compre...,How might the development of decentralized aut...,The development of decentralized autonomous or...,[https://home.treasury.gov/system/files/136/De...
7,Running a successful business depends on how w...,How would a financial analyst recommend adjust...,The financial analyst would recommend adjustin...,[https://disclosure.spglobal.com/en/regulatory...
8,Running a successful business depends on how w...,A company is experiencing a cash crunch due to...,"To improve its cash flow situation, the compan...",[https://upflow.io/blog/ar-collections/account...
9,Running a successful business depends on how w...,What are the potential consequences of not pro...,The consequences of not properly managing a co...,[https://www.linkedin.com/advice/0/what-risks-...
10,What is Business Credit?\nBusiness credit refe...,Can a small business owner with a limited cred...,"Yes, a small business owner with a limited cre...","[https://www.ffiec.gov/hmda/pdf/2021Guide.pdf,..."
