In [1]:
# Import Libraries and Packages

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [2]:
# Extract text from PDFs

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF file; handed straight to ``PdfReader``.

    Returns:
        str: The extracted text of all pages joined with no separator between
            pages. A page that yields no text contributes an empty string.
    """
    reader = PdfReader(pdf_path)

    # Defensive ``or ""``: if a page yields None instead of a string (e.g. a
    # page without a text layer on some PyPDF2 versions), concatenation would
    # raise a TypeError. ``str.join`` also avoids rebuilding the string once
    # per page.
    return "".join(page.extract_text() or "" for page in reader.pages)

In [3]:
# Load the recipe PDF through a path relative to the notebook's working
# directory, so the notebook is not tied to one user's home folder.
# NOTE(review): assumes the kernel runs from the project's notebooks/ folder
# (the same assumption the "./chroma_db" persist directory makes) — confirm.
pdf_text = extract_text_from_pdf("../pdfs/recipes0.pdf")

In [4]:
def analyze_separators(text, separators=None):
    """Count how often each candidate separator occurs in ``text``.

    Args:
        text: String to scan.
        separators: Separator strings to count. When ``None``, a built-in list
            of whitespace and punctuation separators is used.

    Returns:
        dict: Maps ``repr(separator)`` to its non-overlapping occurrence
            count, ordered from most to least frequent. Separators with equal
            counts keep the order in which they were supplied.
    """
    candidates = (
        ["\n\n", "\n", ".", ",", ";", ":", "!", "?", "—", "-", " "]
        if separators is None
        else separators
    )

    # repr() keeps whitespace separators such as "\n" readable as dict keys.
    tally = {repr(candidate): text.count(candidate) for candidate in candidates}

    # Negating the count sorts descending; the sort is stable, so ties stay in
    # their original order.
    ranked = sorted(tally.items(), key=lambda pair: -pair[1])
    return dict(ranked)

In [5]:
# Count the default candidate separators in the extracted PDF text; the result
# is ordered from most to least frequent.
separators_freq = analyze_separators(pdf_text)
# Bare last expression so the notebook displays the dict.
separators_freq

{"' '": 27401,
 "'.'": 13613,
 "'\\n'": 5807,
 "','": 839,
 "'-'": 506,
 "':'": 361,
 "';'": 6,
 "'\\n\\n'": 0,
 "'!'": 0,
 "'?'": 0,
 "'—'": 0}

In [6]:
# Initialize the text splitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,    # target maximum number of characters per chunk
    chunk_overlap=50,  # characters shared between neighbouring chunks
    # Tried in order, coarsest first. The trailing " " and "" are fallbacks so
    # that a piece containing none of the punctuation separators can still be
    # cut down to chunk_size instead of being emitted oversized.
    separators=["\n\n", "\n", ".", ",", "-", ":", " ", ""]
)

In [7]:
# Split the extracted PDF text into chunks with the splitter configured above.
chunks = text_splitter.split_text(pdf_text)
# Sanity check: report how many chunks were produced and preview the first three.
print(f"Number of chunks: {len(chunks)}")
print(chunks[:3])

Number of chunks: 429
['Developed by Indian Academic InstitutionsDeveloped by Indian Academic InstitutionsCompilation of recipes Compilation of recipes \nacross India statesacross India statesFOOD AND RELATED \nPRODUCTSCOMPILED BY:  NATIONAL CENTRE OF EXCELLENCE AND ADVANCED RESEARCH ON DIETS (NCEARD), \nLADY IRWIN COLLEGE\nDISCLAIMER: THE INFORMATION IS REPRODUCED AS PROVIDED BY THE INSTITUTIONS. NCEARD', 'IS NOT RESPONSIBLE FOR THE TECHNICAL ACCURACY , CONTENTS OR DEVIATIONS FROM GLOBAL GUIDELINES. EAST INDIA RECIPES  .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. . 1\nASSAM MIX . .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..  2\nMULTIGRAIN ROTI MIX . .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..  4', 'GLUTEN FREE BISCUIT . .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..  6\nBALANCED ENTERAL FORMULA . .. .. .. .. .. .. .. .. 

In [8]:
# Embedding model handed to Chroma below, both when the store is built and
# when it is re-opened.
# NOTE(review): the recorded output echoes this line, which suggests a warning
# was raised here — possibly a deprecation of this import path; confirm
# against the installed LangChain version.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


In [9]:
# Build a Chroma vector store from the text chunks using the embedding model,
# with "./chroma_db" (relative to the working directory) as its persist directory.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# existing on-disk collection again — verify, and clear the directory first if so.
vectorstore = Chroma.from_texts(
    chunks,
    embeddings,
    persist_directory="./chroma_db"
)

In [10]:
# Explicitly persist the vector store.
# NOTE(review): the recorded output echoes this line, which suggests a warning
# was raised — possibly persist() is deprecated / automatic in the installed
# Chroma version; confirm before removing.
vectorstore.persist()

  vectorstore.persist()


In [11]:
# Re-open the persisted store from disk. The directory is the same relative
# "./chroma_db" location the store was written to, rather than a
# machine-specific absolute path, so the notebook works on other machines.
# The same embedding model must be supplied so queries are embedded
# consistently with the stored chunks.
vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embeddings
)

  vectorstore = Chroma(


In [15]:
# Smoke-test retrieval: request the 3 results for a recipe name from the
# vector store's similarity search.
query = "Tishee / Linseed ladoos"
results = vectorstore.similarity_search(query, k=3)

# Print each result's text, preceded by a row of asterisks as a visual divider.
for r in results:
    print("********************************")
    print(r.page_content)

********************************
NUTRITIOUS STUFF- I  .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..  42
NUTRITIOUS STUFF- II  .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..  44
NUTRITIOUS STUFF- III . .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..  46
TISHEE/LINSEED LADOOS  .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..  48
********************************
R oast the ragi flour, linseed, gond, methi, 
white til separately and grind them. 
 Pr epare Jaggery Syrup. Mix all the ingr edients well. 
 Mak e small round shape ladoos using 
ghee. 
DEVELOPED BY
Department of Home Sc., BAU, Kanke, Ranchi, Jharkhand50Madua/Ragi Ladoos
SHELF LIFE
20-25 days
COST
Rs 250.00/kg
Energy 550 Kcal
5 gmProtein
75 mgCalcium
1 mgIronNutritional Information/100 grams51Madua/Ragi Ladoos
Target Group and 
BenefitsIngredients
