In [None]:
import os
import re

def load_and_chunk_corpus(corpus_dir, chunk_size=300):
    """Read every ``.txt`` file in ``corpus_dir`` and split it into fixed-size chunks.

    Each file is decoded as UTF-8, every run of whitespace is collapsed to a single
    space, leading/trailing whitespace is stripped, and the result is cut into
    consecutive, non-overlapping slices of ``chunk_size`` characters (the last slice
    of a file may be shorter). Chunks never span two files.

    Args:
        corpus_dir: Directory to scan (not recursive).
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunk strings, ordered by filename and then by position in the file.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If ``corpus_dir`` or one of its files cannot be read.
    """
    if chunk_size <= 0:
        # A zero step makes range() raise and a negative one silently yields nothing;
        # fail early with a clear message instead.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = []
    # os.listdir returns entries in arbitrary order; sorting keeps the chunk order
    # (and anything later keyed by chunk position) reproducible across runs.
    for filename in sorted(os.listdir(corpus_dir)):
        path = os.path.join(corpus_dir, filename)
        # Skip non-text entries, including directories that merely end in '.txt'.
        if not filename.endswith('.txt') or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()
        # Clean and normalize text
        text = re.sub(r'\s+', ' ', text).strip()
        # Split into chunks
        chunks.extend(text[i:i + chunk_size] for i in range(0, len(text), chunk_size))
    return chunks

# Build the chunk list from the .txt files under 'corpus/' (path is relative to the
# kernel's working directory), using the function's default chunk_size.
corpus_chunks = load_and_chunk_corpus('corpus/')

In [None]:
pip install chromadb

In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model once; embed_chunks() and retrieve_relevant_chunks() both
# read this module-level `model`, so corpus and query vectors come from the same encoder.
model = SentenceTransformer('all-MiniLM-L6-v2')

def embed_chunks(chunks):
    """Encode ``chunks`` with the module-level ``model`` and return its output as-is."""
    vectors = model.encode(chunks)
    return vectors

# Embed every corpus chunk up front; the indexing cell below pairs each embedding
# with the chunk at the same position in corpus_chunks.
embeddings = embed_chunks(corpus_chunks)

In [None]:
import chromadb

client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# if a collection with this name already exists on the client.
collection = client.get_or_create_collection("chatbot_corpus")

# Chroma requires a unique string id for every record, so derive a stable one from the
# chunk's position. Records are written in batches rather than one call per chunk, and
# upsert (instead of add) makes re-running the cell overwrite rather than collide.
INDEX_BATCH_SIZE = 1000
for start in range(0, len(corpus_chunks), INDEX_BATCH_SIZE):
    stop = start + INDEX_BATCH_SIZE
    batch_documents = corpus_chunks[start:stop]
    collection.upsert(
        ids=[f"chunk-{start + offset}" for offset in range(len(batch_documents))],
        documents=batch_documents,
        embeddings=[vector.tolist() for vector in embeddings[start:stop]],
    )

In [None]:
def retrieve_relevant_chunks(query, top_k=5):
    """Return the stored documents closest to ``query`` in embedding space.

    The query is encoded with the module-level ``model`` and looked up in the
    module-level Chroma ``collection``.

    Args:
        query: The user's question as a string.
        top_k: Maximum number of documents to return.

    Returns:
        A list of document strings for this query, as returned by Chroma.
    """
    # encode() is given a one-element list, so .tolist() yields a list holding a single
    # vector, which is the nested shape the query call takes.
    query_embedding = model.encode([query]).tolist()
    # Chroma's keyword is `query_embeddings`; `embeddings` is not accepted by query().
    results = collection.query(query_embeddings=query_embedding, n_results=top_k)
    # Results are grouped per query embedding; only one was sent, so take group 0.
    return results['documents'][0]

In [None]:
def generate_answer(query):
    """Compose a templated reply from the chunks retrieved for ``query``.

    The retrieved chunks are joined with single spaces and placed, together with the
    original query, into a fixed response string.
    """
    # Placeholder generation step (swap in a real generator model if needed).
    context = " ".join(retrieve_relevant_chunks(query))
    return f"Based on the information: {context} \n\n Your query was: {query}"