Here's the link if you want to test: https://colab.research.google.com/drive/1pfSR__Q-NaiTD8Hr9GucWWOM4zkXvq7R?usp=sharing

In [None]:
!pip install wikipedia-api chromadb openai



In [None]:
import chromadb
from chromadb.utils import embedding_functions
from wikipediaapi import Wikipedia
from openai import OpenAI

# Initialize ChromaDB
COLLECTION_NAME = "chatbot-ask"
# PersistentClient keeps the collection on disk under ./data.
# NOTE: a later cell rebinds the name `client` to the OpenAI client; from this
# cell only `collection` is used afterwards.
client = chromadb.PersistentClient(path="./data")
client.heartbeat()  # quick sanity check that the client responds

# Use the default embedding function
embedding_function = embedding_functions.DefaultEmbeddingFunction()
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# an error once the collection already exists in the persisted ./data store
# (e.g. on "Restart & Run All" or when the cell is executed twice).
collection = client.get_or_create_collection(name=COLLECTION_NAME, embedding_function=embedding_function)

In [None]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=825d1288f74616e324cdb9687401bf0f36d6dfb0b202c395723c296fc4595f42
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import getpass
import inspect
import json
import os

from langdetect import detect
from pydantic import TypeAdapter


# Initialize OpenAI client
# The API key is taken from the OPENAI_API_KEY environment variable, falling
# back to an interactive prompt, so that no credential is stored in the
# notebook source or its shared copies.
# NOTE: this rebinds `client`, which an earlier cell bound to the ChromaDB client.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
    )
# Chat model used both for the tool-selection call and for the final answer.
COMPLETION_MODEL = "gpt-4o-mini"

# Number of characters per Wikipedia chunk stored in the vector collection.
CHUNK_SIZE = 500

def fetch_wikipedia_page(query: str, language: str = "en") -> str:
    """
    Fetches content from Wikipedia based on the given query and language.
    """
    # NOTE: the docstring above is also read with inspect.getdoc() and sent to
    # the model as the tool description, so its wording is runtime text.
    article = Wikipedia(user_agent='FB (https://Facebook.com)', language=language).page(query)
    if article.exists():
        return article.text
    # A missing page is reported as a plain message instead of an exception.
    return f"No page found for the query '{query}' in language '{language}'."

# Tool definitions handed to the chat completion call. The single entry is
# derived from fetch_wikipedia_page itself: its name, its docstring (used as the
# description) and a JSON schema of its parameters generated by pydantic.
functions = [
    dict(
        type="function",
        function=dict(
            name=fetch_wikipedia_page.__name__,
            description=inspect.getdoc(fetch_wikipedia_page),
            parameters=TypeAdapter(fetch_wikipedia_page).json_schema(),
        ),
    )
]

def _store_page_chunks(title: str, language: str, text: str) -> None:
    """
    Splits ``text`` into CHUNK_SIZE-character pieces and adds the pieces that are
    not yet present in the ChromaDB collection.
    """
    pieces = [text[start:start + CHUNK_SIZE] for start in range(0, len(text), CHUNK_SIZE)]
    if not pieces:
        return
    # The language is part of the ID so the same title fetched from two
    # Wikipedia editions does not collide (otherwise the second edition would be
    # skipped as "already stored").
    piece_ids = [f"{title}-{language}-{position}" for position in range(len(pieces))]
    # One lookup and one insert for the whole page instead of one pair per chunk.
    known_ids = set(collection.get(ids=piece_ids)["ids"])
    new_ids = [piece_id for piece_id in piece_ids if piece_id not in known_ids]
    if new_ids:
        new_docs = [doc for piece_id, doc in zip(piece_ids, pieces) if piece_id not in known_ids]
        collection.add(documents=new_docs, ids=new_ids)


# Main process
def process_user_input(user_input: str):
    """
    Handles user input, detects language, queries Wikipedia, and uses RAG for question answering.

    Args:
        user_input: The question typed by the user.

    Returns:
        The model's answer as a string, the "No page found ..." message when the
        requested Wikipedia page does not exist, or
        "No valid tool call found in the response." when the model did not
        produce a usable fetch_wikipedia_page call.
    """
    no_tool_call_message = "No valid tool call found in the response."

    # Detect language of the user input
    detected_language = detect(user_input)
    print(f"Detected language: {detected_language}")

    # Step 1: Ask the LLM to analyze the query
    messages = [{"role": "user", "content": user_input}]
    response = client.chat.completions.create(
        model=COMPLETION_MODEL,
        messages=messages,
        tools=functions
    )

    # Ensure the response includes a valid tool call
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        return no_tool_call_message
    tool_call = tool_calls[0]
    print("Tool call structure:", tool_call)

    # A call to any other function is treated like a missing tool call instead
    # of silently falling through and returning None.
    if tool_call.function.name != "fetch_wikipedia_page":
        return no_tool_call_message

    # The arguments are model-generated JSON and may be malformed or incomplete.
    try:
        arguments = json.loads(tool_call.function.arguments)
    except json.JSONDecodeError:
        return no_tool_call_message
    page_title = arguments.get("query") if isinstance(arguments, dict) else None
    if not page_title:
        return no_tool_call_message

    # Step 2: Run the fetch_wikipedia_page function with detected language
    # (the language detected from the user's text is used, not the model's).
    wiki_content = fetch_wikipedia_page(page_title, language=detected_language)

    # fetch_wikipedia_page reports a missing page with a message starting with
    # this prefix; matching the prefix avoids misreading an article that merely
    # contains the words "No page found" somewhere in its text.
    if wiki_content.startswith("No page found for the query"):
        return wiki_content

    # Step 3: Chunk content and store in ChromaDB
    _store_page_chunks(page_title, detected_language, wiki_content)

    # Step 4: Query the collection and return results
    query = user_input
    q_result = collection.query(query_texts=[query], n_results=3)
    # Join the retrieved chunks into plain text rather than interpolating the
    # Python list repr (brackets, quotes, escaped newlines) into the prompt.
    context = "\n\n".join(q_result["documents"][0])

    # Step 5: Final prompt for answering the user's query
    # The whitespace inside the literal is kept exactly as before so the
    # prompt template itself is unchanged.
    final_prompt = f"""
            Use the following CONTEXT to answer the QUESTION at the end.
            If you don't know the answer, just say that you don't know, don't try to make up an answer.
            Use an unbiased and journalistic tone.

            CONTEXT: {context}

            QUESTION: {query}
            """
    final_response = client.chat.completions.create(
        model=COMPLETION_MODEL,
        messages=[{"role": "user", "content": final_prompt}]
    )
    return final_response.choices[0].message.content


In [None]:
# Demo: a question asked in Vietnamese. In the recorded run below, langdetect
# reported `vi` and the model requested the "Jujutsu Kaisen" page.
user_input = "Cốt truyện của Chú thuật hồi chiến?"
result = process_user_input(user_input)
# print() renders the multi-line answer as plain text.
print(result)

Detected language: vi
Tool call structure: ChatCompletionMessageToolCall(id='call_NbbvLPQbmwA0hcN5FrNhsoYk', function=Function(arguments='{"query":"Jujutsu Kaisen","language":"vi"}', name='fetch_wikipedia_page'), type='function')
Cốt truyện của "Chú thuật hồi chiến" (Jujutsu Kaisen) xoay quanh Yuji Itadori, một học sinh trung học, người tình cờ tiếp xúc với một vật thể nguy hiểm - ngón tay của Ryoumen Sukuna, một yêu quái mạnh mẽ. Khi một nhóm nguyền hồn, sinh ra từ những cảm xúc tiêu cực của con người, tìm cách chiếm lấy ngón tay này để tăng cường sức mạnh của chúng, Yuji và Megumi, một chú thuật sư, phối hợp với nhau để ngăn chặn âm mưu của các nguyền hồn.

Trong bối cảnh này, câu chuyện khám phá các khái niệm về chú lực, một loại năng lượng được sinh ra từ cảm xúc tiêu cực, và việc các chú thuật sư phải kiểm soát dòng chảy của năng lượng này để bảo vệ con người khỏi nguyền hồn. Chú thuật sư có khả năng tạo ra các chú thuật và kết giới để chống lại những sinh vật này, đồng thời phát 

In [None]:
# Demo: a question asked in English. In the recorded run below, langdetect
# reported `en` and the model requested the "Bocchi the Rock!" page.
user_input = "What is Bocchi the Rock!?"
result = process_user_input(user_input)
# print() renders the answer as plain text.
print(result)

Detected language: en
Tool call structure: ChatCompletionMessageToolCall(id='call_9rQuXHGPSHuDtI98tYXGURXY', function=Function(arguments='{"query":"Bocchi the Rock!"}', name='fetch_wikipedia_page'), type='function')
Bocchi the Rock! is a Japanese four-panel manga series written and illustrated by Aki Hamazi. It has been serialized in Houbunsha's seinen manga magazine Manga Time Kirara Max since December 2017. The series has been collected in seven tankōbon volumes as of October 2024. Additionally, it has a spin-off manga titled Bocchi the Rock! Side Story: Kikuri Hiroi's Heavy-Drinking Diary, which began publication in July 2023. The manga has also been adapted into an anime television series and has inspired a stage play.
