In [1]:
# Install the core dependencies: LangChain, the Chroma vector store and the Wikipedia API client.
# NOTE(review): no versions are pinned here, so a fresh run may resolve to different releases.
!pip install langchain chromadb wikipedia-api



In [2]:
# Upgrade/install bitsandbytes (used for the 4-bit quantization config below) and langchain-community.
# NOTE(review): `-U` without a version pin always pulls the newest release.
!pip install -U bitsandbytes langchain-community



Import libraries

In [3]:
import chromadb
import wikipediaapi
from chromadb.config import Settings
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, set_seed

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

from langchain import PromptTemplate, LLMChain
from langchain.chains import StuffDocumentsChain
from langchain.docstore.document import Document
import os

setup chromadb

In [4]:
# Persistent Chroma client: data is stored on disk under db2/.
chroma_client = chromadb.PersistentClient(path="db2/")

# English-language Wikipedia client used to download article text.
wiki_wiki = wikipediaapi.Wikipedia(user_agent='extract data', language='en')

# Collection that holds one document per Wikipedia article (created on first run).
collection = chroma_client.get_or_create_collection(name="snd_collection")

# Titles of the articles indexed on the first pass.
article_titles = ["machine learning"]

extract articles from wikipedia

In [5]:
def extract_text_from_wikipedia(title):
    """Return the text of the Wikipedia article called ``title``.

    The page is looked up through the module-level ``wiki_wiki`` client.
    When the page does not exist, a message is printed and ``None`` is
    returned instead of the text.
    """
    wiki_page = wiki_wiki.page(title)

    # Guard clause: report the missing article and bail out early.
    if not wiki_page.exists():
        print(f"Article '{title}' does not exist on Wikipedia.")
        return None

    return wiki_page.text

In [6]:
# Store each article in Chroma, using the article title as the document id.
for title in article_titles:
    article_text = extract_text_from_wikipedia(title)

    # Skip titles for which no text could be fetched.
    if not article_text:
        continue

    collection.upsert(ids=[title], documents=[article_text])

In [7]:
# Add further articles to the existing collection; upsert overwrites any entry with the same id.
new_articles = ['software engineering']
for title in new_articles:
    article_text = extract_text_from_wikipedia(title)

    # Skip titles for which no text could be fetched.
    if not article_text:
        continue

    collection.upsert(ids=[title], documents=[article_text])

In [8]:
# Re-open the on-disk store and the collection to check what has been persisted.
chroma_client = chromadb.PersistentClient(path="db2/")
collection = chroma_client.get_or_create_collection(name="snd_collection")

# A single get() already returns every id in the collection. Feeding those ids
# back into a second get(ids=...) only repeated the fetch, and it can raise on
# an empty collection in Chroma versions that reject an empty id list.
ids = collection.get()['ids']

# display all ids
ids

['machine learning', 'software engineering']

download phi model

In [9]:
# Hugging Face checkpoint used for generation.
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Quantization settings: 4-bit weights, "nf4" quant type, double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Load the causal LM with the quantization config; device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
# Wrap the loaded model and tokenizer in a text-generation pipeline and expose it to LangChain as `llm`.
# max_new_tokens=100 limits every answer to 100 newly generated tokens.
# NOTE(review): the last example output in this notebook ends mid-sentence, presumably because of this cap.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100)
llm = HuggingFacePipeline(pipeline=pipe)

In [11]:
# Ask PyTorch's CUDA allocator to use expandable segments.
# NOTE(review): this cell sits after the model-loading cell; the allocator settings are presumably
# read when CUDA memory is first used, so this likely needs to run before the model is loaded — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

Final prompt template creation and RAG implementation

In [27]:
def phi_query_chroma_and_generate_response(query, chroma_collection, top_k=2, max_context_length=20000):
    """Answer ``query`` using documents retrieved from a Chroma collection as context.

    The ``top_k`` best-matching documents are fetched from ``chroma_collection``,
    joined with newlines, truncated to ``max_context_length`` characters and
    stuffed into a Phi-style prompt that is run through the module-level ``llm``.

    Args:
        query: The user's question; also used as the retrieval query text.
        chroma_collection: Chroma collection to query.
        top_k: Number of documents to retrieve.
        max_context_length: Maximum number of context characters kept in the prompt.

    Returns:
        The text following the last ``<|assistant|>`` marker of the chain
        output, with surrounding whitespace stripped.
    """
    results = chroma_collection.query(query_texts=[query], n_results=top_k)
    context = "\n".join(results['documents'][0])

    # Truncate the context (by characters, not tokens) if it exceeds
    # max_context_length -- this was done to fix out-of-memory errors on the GPU.
    if len(context) > max_context_length:
        context = context[:max_context_length]

    # Both the context and the query are template variables. The query used to be
    # interpolated into the template text itself, so a query containing "{" or "}"
    # was parsed as template syntax and the declared "query" variable was unused.
    # For queries without braces the rendered prompt is unchanged.
    full_prompt = (
        "<|system|> Use the following context to answer the question.<|end|>\n"
        "<|context|> {context}<|end|>\n"
        "<|user|> {query}<|end|><|assistant|>"
    )

    PROMPT = PromptTemplate(template=full_prompt, input_variables=["context", "query"])
    llm_chain = LLMChain(llm=llm, prompt=PROMPT)

    # The stuff chain fills the "context" variable from the input documents.
    chain = StuffDocumentsChain(
        llm_chain=llm_chain,
        document_variable_name="context",
    )

    chain_output = chain(
        {"input_documents": [Document(page_content=context)], "query": query},
        return_only_outputs=True,
    )

    # Keep only what comes after the final assistant marker, i.e. the generated reply.
    answer = chain_output['output_text'].split("<|assistant|>")[-1].strip()
    return answer

In [24]:
# Usage example: ask a question that the "machine learning" article covers.
user_query = "what is machine learning?"
response = phi_query_chroma_and_generate_response(user_query, collection)
print(response)

Machine learning is a branch of artificial intelligence (AI) that focuses on building systems that can learn from and make decisions based on data. It involves the development of algorithms that can process and analyze large amounts of data, identify patterns, and make predictions or decisions without being explicitly programmed for each specific task. Machine learning enables computers to improve their performance on a specific task over time with experience, much like humans learn from their experiences. It's widely used in various fields such as finance, healthcare, marketing, and technology for applications like fraud detection, disease diagnosis, recommendation systems, and autonomous vehicles.


In [25]:
# Usage example: ask a question that the "software engineering" article covers.
user_query = "what is software engineering?"
response = phi_query_chroma_and_generate_response(user_query, collection)
print(response)

Software engineering is a field of engineering that focuses on the design, development, maintenance, testing, and management of software and systems. It applies engineering principles and practices to the creation of software that is reliable, efficient, and meets the needs of users and stakeholders. Software engineers use a systematic, disciplined, and quantifiable approach to the development, operation, and maintenance of software, ensuring that software products are of high quality and can be maintained and improved over time. This field encompasses a wide range of activities, including requirements analysis, software design, coding, testing, and project management, all aimed at producing software that is robust, secure, and user-friendly.


In [28]:
# Usage example: ask a question that spans both indexed articles.
user_query = "what is difference between software engineering and machine learning?"
response = phi_query_chroma_and_generate_response(user_query, collection)
print(response)

Software engineering and machine learning are two distinct fields within the broader realm of computer science, each with its own focus, methodologies, and applications. Here's a detailed comparison:

**Software Engineering:**

1. **Definition and Scope:** Software engineering is the discipline of designing, developing, maintaining, and testing computer software. It encompasses a wide range of activities, including the creation of software applications, systems, and tools that meet
