Install all libraries

In [None]:
!pip -q install -U transformers accelerate bitsandbytes sentence-transformers faiss-cpu trafilatura beautifulsoup4

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m837.9/837.9 kB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.7/318.7 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.7/274.7 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h

Upload knowledge files

In [None]:
from google.colab import files
uploaded = files.upload()
print("Uploaded files:", list(uploaded.keys()))

Saving Chatbot.docx to Chatbot.docx
Saving current_affairs.docx to current_affairs.docx
Saving food.docx to food.docx
Saving health.docx to health.docx
Saving jokes.docx to jokes.docx
Saving requirements.docx to requirements.docx
Saving tech.docx to tech.docx
Uploaded files: ['Chatbot.docx', 'current_affairs.docx', 'food.docx', 'health.docx', 'jokes.docx', 'requirements.docx', 'tech.docx']


Load and clean documents

In [None]:
import os, re
from docx import Document # Import Document from python-docx

def clean_text(text):
    text = text.replace("\r", "\n")
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

# Function to read text from a .docx file
def read_docx_text(filepath):
    try:
        doc = Document(filepath)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        return "\n".join(full_text)
    except Exception as e:
        print(f"Error reading docx file {filepath}: {e}")
        return ""

documents = []
for fn in os.listdir():
    text = ""
    if fn.endswith(".txt"):
        with open(fn, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read().strip()
    elif fn.endswith(".docx"): # Handle .docx files
        text = read_docx_text(fn)

    if text:
        documents.append({
            "source": fn,
            "text": clean_text(text)
        })

print("Loaded documents:", [d["source"] for d in documents])

Loaded documents: ['tech.docx', 'food.docx', 'Chatbot.docx', 'health.docx', 'jokes.docx', 'current_affairs.docx', 'requirements.docx']


Chunk documents

In [None]:
def chunk_text(text, chunk_size=800, overlap=150):
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        start = max(end - overlap, start + 1)
    return chunks

all_chunks = []
chunk_meta = []

for doc in documents:
    chunks = chunk_text(doc["text"])
    for i, ch in enumerate(chunks):
        all_chunks.append(ch)
        chunk_meta.append({
            "source": doc["source"],
            "chunk_id": i
        })

print("Total chunks created:", len(all_chunks))

Total chunks created: 999


In [None]:
!pip install python-docx



Create embeddings + FAISS index

In [None]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = embedder.encode(
    all_chunks,
    convert_to_numpy=True,
    normalize_embeddings=True
)

dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

print("Embedding shape:", embeddings.shape)
print("FAISS index size:", index.ntotal)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Embedding shape: (999, 384)
FAISS index size: 999


Persist the FAISS index + embeddings

In [None]:
import pickle, faiss, os
import numpy as np

# Save
faiss.write_index(index, "kb.index")
np.save("kb_embeddings.npy", embeddings)
with open("kb_chunks.pkl", "wb") as f:
    pickle.dump({"all_chunks": all_chunks, "chunk_meta": chunk_meta}, f)

print("Saved knowledge base to disk.")

import pickle, faiss, os
import numpy as np

if os.path.exists("kb.index") and os.path.exists("kb_embeddings.npy") and os.path.exists("kb_chunks.pkl"):
    index = faiss.read_index("kb.index")
    embeddings = np.load("kb_embeddings.npy")
    with open("kb_chunks.pkl", "rb") as f:
        data = pickle.load(f)
    all_chunks = data["all_chunks"]
    chunk_meta = data["chunk_meta"]
    print("Loaded saved knowledge base.")
else:
    print("No saved KB found — build it first.")

Saved knowledge base to disk.
Loaded saved knowledge base.


Add a reranker

In [None]:
!pip -q install sentence-transformers
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def retrieve_with_rerank(query, first_k=12, final_k=4, min_score=0.25):
    qv = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(qv, first_k)

    candidates = []
    for score, idx_ in zip(scores[0], ids[0]):
        if idx_ == -1:
            continue
        if float(score) >= min_score:
            candidates.append((int(idx_), float(score)))

    if not candidates:
        return []

    pairs = [(query, all_chunks[i]) for i, _ in candidates]
    rr_scores = reranker.predict(pairs)

    ranked = sorted(
        [(i, s, rr) for (i, s), rr in zip(candidates, rr_scores)],
        key=lambda x: x[2],
        reverse=True
    )[:final_k]

    return [{
        "score": s,
        "rerank": float(rr),
        "text": all_chunks[i],
        "meta": chunk_meta[i]
    } for i, s, rr in ranked]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Load open-source chat model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16
)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Retrieval + guardrails

In [None]:

def is_greeting(text):
    return text.lower().strip() in {"hi", "hi loki", "hello", "hey", "hii", "hai", "sup", "how are you doing", "how are you"}

def retrieve(query, k=4, min_score=0.25):
    qv = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, ids = index.search(qv, k)

    results = []
    for score, idx_ in zip(scores[0], ids[0]):
        if score >= min_score:
            results.append({
                "score": float(score),
                "text": all_chunks[int(idx_)],
                "meta": chunk_meta[int(idx_)]
            })
    return results

def build_prompt(query, retrieved):
    context = "\n---\n".join(
        f"[SOURCE: {r['meta']['source']} | CHUNK: {r['meta']['chunk_id']}]\n{r['text']}"
        for r in retrieved
    )

    return f"""
You are a helpful assistant.
Use ONLY the context below.
If the answer is not in the context, say:
"I don't have that in my documents."

CONTEXT:
{context}

USER QUESTION:
{query}

ASSISTANT:
"""
def is_conceptual_question(q):
    q = q.lower()
    return any(k in q for k in [
        "what is", "explain", "define", "overview", "introduction", "meaning"
    ])

    def build_concept_prompt(query, retrieved):
        context = "\n".join(r["text"] for r in retrieved)
        return f"""
Explain the concept clearly in simple terms.
Do NOT include commands or code unless explicitly asked.

Context:
{context}

Question:
{query}

Answer:
"""

def build_practical_prompt(query, retrieved):
    context = "\n".join(r["text"] for r in retrieved)
    return f"""
Explain with practical steps and examples.
You may include commands if useful.

Context:
{context}

Question:
{query}

Answer:
"""

Add URL ingestion code

In [None]:


import trafilatura

def fetch_url_text(url: str) -> str:
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        return ""
    text = trafilatura.extract(downloaded)
    return text or ""

def add_web_sources(urls):
    global all_chunks, chunk_meta, embeddings, index

    new_chunks = []
    new_meta = []

    for url in urls:
        text = fetch_url_text(url)
        if not text.strip():
            print("Failed:", url)
            continue

        chunks = chunk_text(text, chunk_size=900, overlap=150)
        for i, ch in enumerate(chunks):
            new_chunks.append(ch)
            new_meta.append({"source": url, "chunk_id": i})

        print("Added:", url, "chunks:", len(chunks))

    if not new_chunks:
        print("No new content added.")
        return

    new_emb = embedder.encode(new_chunks, convert_to_numpy=True, normalize_embeddings=True)

    # Update store
    start_idx = len(all_chunks)
    all_chunks.extend(new_chunks)
    chunk_meta.extend(new_meta)

    # Update FAISS
    index.add(new_emb)

    print("Web sources ingested. Total chunks:", len(all_chunks), "Index size:", index.ntotal)

URLs

In [None]:
SEED_URLS = {
  "Chatbot.txt": [
    "https://github.com/facebookresearch/faiss",                          # Vector search index
    "https://huggingface.co/docs/transformers/en/main_classes/text_generation",  # Text generation API
    "https://github.com/adbar/trafilatura",                               # Web text extraction
    "https://trafilatura.readthedocs.io/",                                # Trafilatura docs
    "https://huggingface.co/tasks/text-generation"                        # Text-generation overview
  ],

  "tech.txt": [
    "https://docs.python.org/3/tutorial/",
    "https://kubernetes.io/docs/concepts/overview/what-is-kubernetes/",
    "https://docs.docker.com/get-started/",
    "https://aws.amazon.com/what-is-cloud-computing/",
    "https://owasp.org/www-project-top-ten/"
  ],

  "health.txt": [
    "https://www.who.int/news-room/fact-sheets/detail/healthy-diet",
    "https://www.cdc.gov/physical-activity-basics/health-benefits/adults.html",
    "https://www.cdc.gov/nutrition/features/healthy-eating-tips.html",
    "https://www.nichd.nih.gov/health/topics/sleep/conditioninfo",
    "https://www.who.int/news-room/fact-sheets/detail/physical-activity"
  ],

  "food.txt": [
    "https://www.myplate.gov/",
    "https://www.cdc.gov/food-safety/prevention/index.html",
    "https://www.fsis.usda.gov/food-safety/safe-food-handling-and-preparation/food-safety-basics/steps-keep-food-safe",
    "https://www.foodsafety.gov/keep/basics/clean/index.html",
    "https://www.cdc.gov/nutrition/features/healthy-eating-tips.html"
  ],

  "current_affairs.txt": [
    "https://www.federalreserve.gov/faqs/5CD8134B130A43E998A945450E041BF0.htm",  # What is inflation (Fed)
    "https://www.oecd.org/en/data/indicators/inflation-cpi.html",                # Inflation definition (OECD)
    "https://www.un.org/sustainabledevelopment/sustainable-development-goals/",  # UN SDGs
    "https://www.undp.org/content/undp/en/home/sustainable-development-goals.html", # SDGs (UNDP)
    "https://www.ecb.europa.eu/ecb/educational/hicp/html/index.en.html"          # Inflation basics (ECB)
  ],

  "jokes.txt": [
    "https://xkcd.com/303/",                          # Programming humor
    "https://www.explainxkcd.com/wiki/index.php/303:_Compiling",
    "https://peps.python.org/pep-0020/",              # Zen of Python (fun + useful)
    "https://www.rfc-editor.org/info/rfc1149",        # IP over avian carriers (classic nerd joke)
    "https://stackoverflow.blog/april-fools"          # Engineering humor archive
  ],
}

fetch HTML

In [None]:
import time
import trafilatura

def fetch_url_text(url: str) -> str:
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        return ""
    text = trafilatura.extract(downloaded, include_tables=True, include_links=False)
    return (text or "").strip()

def ingest_seed_urls_to_txt(SEED_URLS, min_chars=600, sleep_s=1.0):
    total_added = 0

    for filename, urls in SEED_URLS.items():
        print(f"\n=== Ingesting into: {filename} ===")
        added_here = 0

        # append mode so you don’t lose existing content
        with open(filename, "a", encoding="utf-8") as f:
            for url in urls:
                text = fetch_url_text(url)

                if len(text) < min_chars:
                    print(f"  Failed/too short ({len(text)} chars): {url}")
                    time.sleep(sleep_s)
                    continue

                f.write("\n\n" + "="*90 + "\n")
                f.write(f"SOURCE_URL: {url}\n")
                f.write("="*90 + "\n\n")
                f.write(text)
                f.write("\n")

                added_here += 1
                total_added += 1
                print(f"  Added ({len(text)} chars): {url}")

                time.sleep(sleep_s)

        print(f"Added {added_here} pages into {filename}")

    print(f"\n✅ Done. Total pages appended across files: {total_added}")

# RUN ingestion
ingest_seed_urls_to_txt(SEED_URLS)


=== Ingesting into: Chatbot.txt ===
  Added (5034 chars): https://github.com/facebookresearch/faiss
  Added (41348 chars): https://huggingface.co/docs/transformers/en/main_classes/text_generation
  Added (5834 chars): https://github.com/adbar/trafilatura
  Added (5208 chars): https://trafilatura.readthedocs.io/
  Added (10423 chars): https://huggingface.co/tasks/text-generation
Added 5 pages into Chatbot.txt

=== Ingesting into: tech.txt ===
  Added (3411 chars): https://docs.python.org/3/tutorial/
  Added (9280 chars): https://kubernetes.io/docs/concepts/overview/what-is-kubernetes/
  Failed/too short (412 chars): https://docs.docker.com/get-started/
  Added (7687 chars): https://aws.amazon.com/what-is-cloud-computing/
  Added (15681 chars): https://owasp.org/www-project-top-ten/
Added 4 pages into tech.txt

=== Ingesting into: health.txt ===
  Added (20091 chars): https://www.who.int/news-room/fact-sheets/detail/healthy-diet
  Added (1204 chars): https://www.cdc.gov/physical-activit

ERROR:trafilatura.downloads:download error: https://www.fsis.usda.gov/food-safety/safe-food-handling-and-preparation/food-safety-basics/steps-keep-food-safe HTTPSConnectionPool(host='www.fsis.usda.gov', port=443): Max retries exceeded with url: /food-safety/safe-food-handling-and-preparation/food-safety-basics/steps-keep-food-safe (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.fsis.usda.gov', port=443): Read timed out. (read timeout=30)"))


  Failed/too short (0 chars): https://www.fsis.usda.gov/food-safety/safe-food-handling-and-preparation/food-safety-basics/steps-keep-food-safe


ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://www.foodsafety.gov/keep/basics/clean/index.html


  Failed/too short (0 chars): https://www.foodsafety.gov/keep/basics/clean/index.html
  Added (6983 chars): https://www.cdc.gov/nutrition/features/healthy-eating-tips.html
Added 2 pages into food.txt

=== Ingesting into: current_affairs.txt ===
  Added (2618 chars): https://www.federalreserve.gov/faqs/5CD8134B130A43E998A945450E041BF0.htm


ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://www.oecd.org/en/data/indicators/inflation-cpi.html


  Failed/too short (0 chars): https://www.oecd.org/en/data/indicators/inflation-cpi.html
  Added (3228 chars): https://www.un.org/sustainabledevelopment/sustainable-development-goals/


ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://www.undp.org/content/undp/en/home/sustainable-development-goals.html


  Failed/too short (0 chars): https://www.undp.org/content/undp/en/home/sustainable-development-goals.html
  Added (10940 chars): https://www.ecb.europa.eu/ecb/educational/hicp/html/index.en.html
Added 3 pages into current_affairs.txt

=== Ingesting into: jokes.txt ===
  Added (744 chars): https://xkcd.com/303/


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


  Failed/too short (0 chars): https://www.explainxkcd.com/wiki/index.php/303:_Compiling
  Added (1511 chars): https://peps.python.org/pep-0020/
  Added (795 chars): https://www.rfc-editor.org/info/rfc1149
  Added (616 chars): https://stackoverflow.blog/april-fools
Added 4 pages into jokes.txt

✅ Done. Total pages appended across files: 23


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Speacial features

In [None]:
# ANSI color helpers
RED = "\033[91m"
GREEN = "\033[92m"
RESET = "\033[0m"

def print_user(text):
    print(f"{RED}YOU: {text}{RESET}")

def print_bot(text):
    print(f"{GREEN}BOT: {text}{RESET}")

In [None]:
def generate_answer(prompt, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1
        )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    # Return only assistant part if prompt is echoed
    if "ASSISTANT:" in decoded:
        return decoded.split("ASSISTANT:", 1)[-1].strip()
    return decoded.strip()

Chat loop (FINAL)

In [None]:
BOT_NAME = "Loki"
print(f"{GREEN}🤖 {BOT_NAME} is ready. Type 'bye' to exit.{RESET}\n")

while True:
    print(f"{RED}YOU:{RESET} ", end="")
    q = input().strip()

    if q.lower() == "bye":
        print_bot("Goodbye 👋")
        break

    if is_greeting(q):
        print_bot("Hi! I Hope you are doing well. Ask me something.")
        continue

    retrieved = retrieve(q)

    if not retrieved:
        print_bot("I am still in the learning process and I don't have that in my database.")
        continue

    prompt = build_prompt(q, retrieved)
    answer = generate_answer(prompt)

    print_bot(answer)

[92m🤖 Loki is ready. Type 'bye' to exit.[0m

[91mYOU:[0m 

KeyboardInterrupt: Interrupted by user