In [None]:
import pdfplumber
import re
import string
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def preprocess_text(text):
    text = text.replace("\n", " ").replace("\t", " ").strip()
    return " ".join(text.split())

def prepare_data(pdf_paths, output_file):
    combined_text = ""
    for pdf_path in pdf_paths:
        raw_text = extract_text_from_pdf(pdf_path)
        cleaned_text = preprocess_text(raw_text)
        combined_text += cleaned_text + " "

    combined_text = combined_text.strip()

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(combined_text)
    return combined_text


def chunk_text(text, chunk_size=1500, overlap_size=100):
    chunks = []
    for i in range(0, len(text), chunk_size - overlap_size):
        chunks.append(text[i:i + chunk_size])
    return chunks

def generate_embeddings(texts):
    model = SentenceTransformer('all-MiniLM-L6-v2')

    embeddings = model.encode(texts)
    model.save('all-MiniLM-L6-v2')
    return embeddings

def create_knowledge_base(texts, embeddings):
    knowledge_base = {}
    for i, text in enumerate(texts):
        knowledge_base[i]= {"text": text, "embedding": embeddings[i].tolist() }
    return knowledge_base
def find_relevant_answer(query, knowledge_base):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode([query])
    scores = {}
    for url, item in knowledge_base.items():
      if item.get('embedding'):
        similarity_score = cosine_similarity(query_embedding, [item['embedding']])[0][0]
        scores[url] = similarity_score
    if not scores:
        return "Sorry, I cannot find an answer for your query."
    best_url = max(scores, key=scores.get)
    if scores[best_url] > 0.3:
      return knowledge_base[best_url]["text"]
    else:
        return "Sorry, I cannot find an answer for your query."

if __name__ == "__main__":
    # pdf_paths = ["/content/textbook1.pdf", "/content/textbook2.pdf", "/content/textbook3.pdf"]
    # output_file = "combined_text.txt"
    # combined_text = prepare_data(pdf_paths, output_file)
    with open('combined_text.txt', 'r') as f:
        combined_text = f.read()
    chunks = chunk_text(combined_text)
    embeddings = generate_embeddings(chunks)
    knowledge_base = create_knowledge_base(chunks,embeddings)
    print("Knowledge base is ready!")

    while True:
        query = input("You: ")
        if query.lower() == "exit":
          break
        response = find_relevant_answer(query, knowledge_base)
        print("Bot: ", response)

In [13]:
!pip install pdfplumber

Collecting pdfplumber
  Using cached pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [21]:
!pip install shutil

[31mERROR: Could not find a version that satisfies the requirement shutil (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for shutil[0m[31m
[0m

In [26]:
import shutil

# Compress the model folder into a zip file
shutil.make_archive('all-MiniLM-L6-v2', 'zip', 'all-MiniLM-L6-v2')


'/content/all-MiniLM-L6-v2.zip'

In [23]:
from google.colab import files

# Download the zipped model
files.download('all-MiniLM-L6-v2.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
import json
from google.colab import files

# Save the knowledge_base dictionary to a JSON file
with open("knowledge_base.json", "w", encoding="utf-8") as f:
    json.dump(knowledge_base, f, ensure_ascii=False, indent=4)

# Download the JSON file to your local machine
# files.download("knowledge_base.json")
