# Scraping Dataset from Internet

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Set the search queries
search_queries = [
    "Ayurveda Herbs", "Ayurveda Medicines", "Ayurveda Diseases",
    "Ayurveda Treatments", "Ayurveda Therapy", "Ayurveda Nutrition", "Ayurveda Diet",
    "Ayurveda Lifestyle", "Ayurveda Practices", "Ayurveda Yoga", "Ayurveda Massage",
    "Ayurveda Remedies", "Ayurveda Detox", "Ayurveda Panchakarma", "Ayurveda Doshas",
    "Ayurveda Vata", "Ayurveda Pitta", "Ayurveda Kapha", "Ayurveda Body Types",
    "Ayurveda Holistic Health", "Ayurveda Wellness", "Ayurveda Skin Care",
    "Ayurveda Hair Care", "Ayurveda Oils", "Ayurveda Spices", "Ayurveda Immunity",
    "Ayurveda Stress Relief", "Ayurveda Meditation", "Ayurveda Natural Healing",
    "Ayurveda Cleansing", "Ayurveda Weight Loss", "Ayurveda Anti-aging",
    "Ayurveda Pain Relief", "Ayurveda Sleep Disorders", "Ayurveda Mental Health",
    "Ayurveda Chronic Illness", "Ayurveda Diabetes", "Ayurveda Hypertension",
    "Ayurveda Heart Health", "Ayurveda Respiratory Health", "Ayurveda Digestive Health",
    "Ayurveda Joint Health", "Ayurveda Bone Health", "Ayurveda Eye Health",
    "Ayurveda Ear Health", "Ayurveda Oral Health", "Ayurveda Women's Health",
    "Ayurveda Men's Health", "Ayurveda Children's Health", "Ayurveda Seasonal Regimen",
    "Ayurveda Daily Routine", "Ayurveda Beauty Tips", "Ayurveda Hair Growth",
    "Ayurveda Weight Gain", "Ayurveda Healthy Recipes", "Ayurveda Cooking",
    "Ayurveda Herbal Teas", "Ayurveda Drinks", "Ayurveda Supplements",
    "Ayurveda Organic Products", "Ayurveda Essential Oils", "Ayurveda Plants",
    "Ayurveda Seeds", "Ayurveda Fruits", "Ayurveda Vegetables", "Ayurveda Roots",
    "Ayurveda Leaves", "Ayurveda Flowers", "Ayurveda Shrubs", "Ayurveda Trees",
    "Ayurveda Healing Properties", "Ayurveda Traditional Medicine", "Ayurveda History",
    "Ayurveda Philosophy", "Ayurveda Spirituality", "Ayurveda Mind-Body Connection",
    "Ayurveda Research", "Ayurveda Education", "Ayurveda Courses", "Ayurveda Certification",
    "Ayurveda Practitioners", "Ayurveda Clinics", "Ayurveda Hospitals", "Ayurveda Tourism",
    "Ayurveda in India", "Ayurveda Worldwide", "Ayurveda Modern Applications",
    "Ayurveda and Science", "Ayurveda Myths", "Ayurveda Benefits", "Ayurveda for Pets",
    "Ayurveda Environmental Impact", "Ayurveda Organic Farming", "Ayurveda Farming Practices",
    "Ayurveda Community", "Ayurveda Workshops", "Ayurveda Events", "Ayurveda Blogs",
    "Ayurveda Books", "Ayurveda Cosmetics", "Ayurveda Bath Products", "Ayurveda Baby Care",
    "Ayurveda Pet Care", "Ayurveda Animal Health", "Ayurveda Home Remedies", "Ayurveda Detox Diets",
    "Ayurveda Cold Remedies", "Ayurveda Fever Remedies", "Ayurveda Headache Remedies", "Ayurveda Allergy Remedies",
    "Ayurveda Cancer Support", "Ayurveda Immune Boosting", "Ayurveda Bone Density", "Ayurveda Athletic Performance",
    "Ayurveda Adaptogens", "Ayurveda Anti-inflammatory", "Ayurveda Cholesterol Management", "Ayurveda Blood Pressure",
    "Ayurveda Gut Health", "Ayurveda Probiotics", "Ayurveda Adaptogen Herbs", "Ayurveda Functional Foods",
    "Ayurveda Hormone Balance", "Ayurveda Detox Drinks", "Ayurveda Smoothies", "Ayurveda Soups",
    "Ayurveda Sauces", "Ayurveda Spreads", "Ayurveda Condiments", "Ayurveda Beauty Products",
    "Ayurveda Personal Care", "Ayurveda Oral Hygiene", "Ayurveda Dental Care", "Ayurveda Aroma Therapy",
    "Ayurveda Self-care", "Ayurveda Mental Wellness", "Ayurveda Journals", "Ayurveda Supplements Reviews",
    "Ayurveda Tea Reviews", "Ayurveda Essential Oils Reviews", "Ayurveda Spa Treatments", "Ayurveda Facial Treatments",
    "Ayurveda Eye Treatments", "Ayurveda Hearing Treatments", "Ayurveda Sleep Therapy", "Ayurveda Sleep Hygiene",
    "Ayurveda Fitness", "Ayurveda Exercise", "Ayurveda Sports Nutrition", "Ayurveda Energy Boosters",
    "Ayurveda Superfoods", "Ayurveda Hydration", "Ayurveda Bone Broth", "Ayurveda Herbal Infusions",
    "Ayurveda Balms", "Ayurveda Salves", "Ayurveda Lotions", "Ayurveda Creams",
    "Ayurveda Moisturizers", "Ayurveda Face Masks", "Ayurveda Hair Masks", "Ayurveda Shampoos",
    "Ayurveda Conditioners", "Ayurveda Serums", "Ayurveda Toners", "Ayurveda Cleansers",
    "Ayurveda Facial Oils", "Ayurveda Body Oils", "Ayurveda Scrubs", "Ayurveda Bath Salts",
    "Ayurveda Exfoliators", "Ayurveda Foot Care", "Ayurveda Hand Care", "Ayurveda Nail Care",
    "Ayurveda Lip Care", "Ayurveda Anti-Aging Skin Care", "Ayurveda Sun Protection", "Ayurveda After-Sun Care",
    "Ayurveda Eye Creams", "Ayurveda Lip Balms", "Ayurveda Beard Oils", "Ayurveda Beard Balms",
    "Ayurveda Hair Styling", "Ayurveda Hair Treatments", "Ayurveda Hair Serums", "Ayurveda Hair Oils",
    "Ayurveda Scalp Treatments", "Ayurveda Hair Loss Treatments", "Ayurveda Hair Growth Serums", "Ayurveda Heat Protectants",
    "Ayurveda Hair Dyes", "Ayurveda Hair Care Routines", "Ayurveda Hair Accessories", "Ayurveda Hair Tools",
    "Ayurveda Bath Accessories", "Ayurveda Body Brushes", "Ayurveda Foot Massagers", "Ayurveda Eye Massagers",
    "Ayurveda Face Rollers", "Ayurveda Gua Sha", "Ayurveda Scalp Massagers", "Ayurveda Face Tools",
    "Ayurveda Fitness Equipment", "Ayurveda Yoga Mats", "Ayurveda Meditation Cushions", "Ayurveda Incense",
    "Ayurveda Candles", "Ayurveda Crystals", "Ayurveda Jewelry", "Ayurveda Home Decor",
    "Ayurveda Bedding", "Ayurveda Pillows", "Ayurveda Throws", "Ayurveda Rugs",
    "Ayurveda Wall Art", "Ayurveda Kitchenware", "Ayurveda Tableware", "Ayurveda Cookware",
    "Ayurveda Storage Solutions", "Ayurveda Cleaning Products", "Ayurveda Natural Cleaners", "Ayurveda Sustainable Living",
    "Ayurveda Green Living", "Ayurveda Eco-Friendly Products", "Ayurveda Zero Waste", "Ayurveda Minimalism",
    "Ayurveda Decluttering", "Ayurveda Feng Shui", "Ayurveda Space Clearing",
    "Ayurveda Plant Care",
    "Ayurveda Gardening", "Ayurveda Indoor Plants", "Ayurveda Outdoor Plants", "Ayurveda Herbal Gardens",
    "Ayurveda Organic Gardening", "Ayurveda Garden Design", "Ayurveda Companion Planting", "Ayurveda Permaculture",
    "Ayurveda Beekeeping", "Ayurveda Aquaponics", "Ayurveda Hydroponics", "Ayurveda Farm-to-Table",
    "Ayurveda Seasonal Eating", "Ayurveda Local Food", "Ayurveda Farmers Markets", "Ayurveda Community Supported Agriculture",
    "Ayurveda Ethical Eating", "Ayurveda Food Justice", "Ayurveda Food Security", "Ayurveda Urban Gardening",
    "Ayurveda Wildcrafting", "Ayurveda Foraging", "Ayurveda Herbalism", "Ayurveda Plant Medicine",
    "Ayurveda Traditional Healing", "Ayurveda Indigenous Wisdom", "Ayurveda Ancestral Knowledge", "Ayurveda Cultural Heritage",
    "Ayurveda Sacred Plants", "Ayurveda Rituals", "Ayurveda Ceremonies", "Ayurveda Festivities",
    "Ayurveda Holidays", "Ayurveda Traditions", "Ayurveda Legends", "Ayurveda Folklore",
    "Ayurveda Mythology", "Ayurveda Stories", "Ayurveda Wisdom", "Ayurveda Teachings",
    "Ayurveda Elders", "Ayurveda Lineages", "Ayurveda Mentorship", "Ayurveda Apprenticeship",
    "Ayurveda Internships", "Ayurveda Career Opportunities", "Ayurveda Job Listings", "Ayurveda Employment",
    "Ayurveda Volunteering", "Ayurveda Internships Abroad", "Ayurveda Study Abroad", "Ayurveda Exchange Programs",
    "Ayurveda Scholarships", "Ayurveda Grants", "Ayurveda Fellowships", "Ayurveda Conferences",
    "Ayurveda Webinars", "Ayurveda Online Courses", "Ayurveda Distance Learning", "Ayurveda E-Learning",
    "Ayurveda Virtual Classes", "Ayurveda Digital Resources", "Ayurveda Podcasts", "Ayurveda Vlogs",
    "Ayurveda YouTube Channels", "Ayurveda Instagram Accounts", "Ayurveda Facebook Groups", "Ayurveda Twitter Accounts",
    "Ayurveda Pinterest Boards", "Ayurveda LinkedIn Groups", "Ayurveda Networking", "Ayurveda Forums",
    "Ayurveda Online Communities", "Ayurveda Social Media",
    "Ayurveda Influencers", "Ayurveda Brand Ambassadors",
    "Ayurveda Product Reviews", "Ayurveda Testimonials", "Ayurveda Case Studies", "Ayurveda Clinical Trials",
    "Ayurveda Research Papers", "Ayurveda Scientific Studies", "Ayurveda Journals", "Ayurveda Magazines",
    "Ayurveda Newsletters", "Ayurveda Press Releases", "Ayurveda Media Coverage", "Ayurveda News",
    "Ayurveda Trends", "Ayurveda Innovations", "Ayurveda New Products", "Ayurveda Product Launches",
    "Ayurveda Market Analysis", "Ayurveda Industry Insights", "Ayurveda Business Opportunities", "Ayurveda Entrepreneurship",
    "Ayurveda Startups", "Ayurveda Investment", "Ayurveda Funding", "Ayurveda Crowdfunding",
    "Ayurveda Partnerships", "Ayurveda Collaborations", "Ayurveda Alliances", "Ayurveda Joint Ventures",
    "Ayurveda Licensing", "Ayurveda Franchising", "Ayurveda Retail", "Ayurveda E-commerce",
    "Ayurveda Wholesale", "Ayurveda Distribution", "Ayurveda Supply Chain", "Ayurveda Logistics",
    "Ayurveda Export", "Ayurveda Import", "Ayurveda Trade Shows", "Ayurveda Expos",
    "Ayurveda Fairs", "Ayurveda Markets", "Ayurveda Stores", "Ayurveda Boutiques",
    "Ayurveda Pop-Up Shops", "Ayurveda Farmers Markets", "Ayurveda Craft Fairs", "Ayurveda Artisanal Products",
    "Ayurveda Handcrafted Products", "Ayurveda Small Businesses", "Ayurveda Artisans", "Ayurveda Makers",
    "Ayurveda Creators", "Ayurveda Designers", "Ayurveda Artists", "Ayurveda Innovators",
    "Ayurveda Entrepreneurs", "Ayurveda Influencers", "Ayurveda Thought Leaders", "Ayurveda Visionaries",
    "Ayurveda Pioneers", "Ayurveda Trailblazers", "Ayurveda Changemakers", "Ayurveda Advocates",
    "Ayurveda Activists", "Ayurveda Leaders", "Ayurveda Community Leaders", "Ayurveda Nonprofits",
    "Ayurveda NGOs", "Ayurveda Philanthropy", "Ayurveda Volunteering", "Ayurveda Community Service",
    "Ayurveda Social Impact", "Ayurveda Social Responsibility", "Ayurveda Corporate Social Responsibility", "Ayurveda Ethical Practices",
    "Ayurveda Sustainable Practices", "Ayurveda Environmental Stewardship", "Ayurveda Conservation", "Ayurveda Wildlife Protection",
    "Ayurveda Habitat Restoration", "Ayurveda Climate Action", "Ayurveda Renewable Energy", "Ayurveda Green Energy",
    "Ayurveda Energy Efficiency", "Ayurveda Carbon Footprint", "Ayurveda Carbon Neutrality", "Ayurveda Offsetting",
    "Ayurveda Sustainable Design", "Ayurveda Eco-Architecture", "Ayurveda Green Building", "Ayurveda LEED Certification",
    "Ayurveda Healthy Homes", "Ayurveda Wellness Architecture", "Ayurveda Biophilic Design", "Ayurveda Natural Materials",
    "Ayurveda Organic Materials", "Ayurveda Non-Toxic Materials", "Ayurveda Recycled Materials", "Ayurveda Upcycling",
    "Ayurveda Repurposing", "Ayurveda Waste Reduction", "Ayurveda Waste Management", "Ayurveda Composting",
    "Ayurveda Recycling", "Ayurveda Circular Economy", "Ayurveda Sustainable Economy", "Ayurveda Local Economy",
    "Ayurveda Community Economy", "Ayurveda Cooperative Economy", "Ayurveda Shared Economy", "Ayurveda Social Enterprises",
    "Ayurveda B Corps", "Ayurveda Fair Trade", "Ayurveda Ethical Trade", "Ayurveda Transparent Trade",
    "Ayurveda Fair Wages", "Ayurveda Worker Rights", "Ayurveda Labor Practices", "Ayurveda Human Rights",
    "Ayurveda Gender Equality", "Ayurveda Racial Equality", "Ayurveda Social Justice", "Ayurveda Economic Justice",
    "Ayurveda Environmental Justice", "Ayurveda Health Equity", "Ayurveda Global Health", "Ayurveda Public Health",
    "Ayurveda Health Policy", "Ayurveda Health Systems", "Ayurveda Healthcare Access", "Ayurveda Healthcare Delivery",
    "Ayurveda Health Programs", "Ayurveda Health Initiatives", "Ayurveda Health Campaigns", "Ayurveda Health Education",
    "Ayurveda Health Promotion", "Ayurveda Disease Prevention", "Ayurveda Health Screening", "Ayurveda Health Surveillance",
    "Ayurveda Health Data", "Ayurveda Health Informatics", "Ayurveda Health Technology", "Ayurveda Telemedicine",
    "Ayurveda Digital Health", "Ayurveda Health Apps", "Ayurveda Health Wearables", "Ayurveda Health Trackers",
    "Ayurveda Health Analytics", "Ayurveda Health Research", "Ayurveda Health Studies", "Ayurveda Health Publications",
    "Ayurveda Health Conferences", "Ayurveda Health Workshops", "Ayurveda Health Seminars", "Ayurveda Health Training",
    "Ayurveda Health Careers", "Ayurveda Health Professions", "Ayurveda Health Certifications", "Ayurveda Health Licenses",
    "Ayurveda Health Regulations", "Ayurveda Health Standards", "Ayurveda Health Guidelines", "Ayurveda Health Policies",
    "Ayurveda Health Laws", "Ayurveda Health Ethics", "Ayurveda Health Governance", "Ayurveda Health Advocacy",
    "Ayurveda Patient Advocacy", "Ayurveda Patient Rights", "Ayurveda Patient Safety", "Ayurveda Patient Experience",
    "Ayurveda Patient Engagement", "Ayurveda Patient Education", "Ayurveda Patient Support", "Ayurveda Patient Empowerment",
    "Ayurveda Patient-Centered Care", "Ayurveda Family-Centered Care", "Ayurveda Community-Centered Care", "Ayurveda Integrative Care",
    "Ayurveda Coordinated Care", "Ayurveda Collaborative Care", "Ayurveda Multidisciplinary Care", "Ayurveda Holistic Care",
    "Ayurveda Compassionate Care",
    "Ayurveda Palliative Care", 
    "Ayurveda Hospice Care", "Ayurveda End-of-Life Care",
    "Ayurveda Bereavement Support", "Ayurveda Grief Support", "Ayurveda Mental Health Support", "Ayurveda Peer Support",
    "Ayurveda Support Groups", "Ayurveda Counseling", "Ayurveda Therapy Services", "Ayurveda Coaching",
    "Ayurveda Mentorship Programs", "Ayurveda Peer Mentorship", "Ayurveda Volunteer Mentorship", "Ayurveda Career Mentorship",
    "Ayurveda Business Mentorship", "Ayurveda Leadership Mentorship", "Ayurveda Academic Mentorship", "Ayurveda Student Mentorship",
    "Ayurveda Youth Mentorship", "Ayurveda Adult Mentorship", "Ayurveda Senior Mentorship", "Ayurveda Parent Mentorship",
    "Ayurveda Teacher Mentorship", "Ayurveda Faculty Mentorship", "Ayurveda Peer Coaching", "Ayurveda Life Coaching",
    "Ayurveda Wellness Coaching", "Ayurveda Health Coaching", "Ayurveda Executive Coaching", "Ayurveda Career Coaching",
    "Ayurveda Leadership Coaching", "Ayurveda Performance Coaching", "Ayurveda Skills Coaching", "Ayurveda Personal Development",
    "Ayurveda Self-Improvement", "Ayurveda Self-Growth", "Ayurveda Self-Help", "Ayurveda Self-Care",
    "Ayurveda Mindfulness", "Ayurveda Meditation Techniques", "Ayurveda Meditation Practices", "Ayurveda Guided Meditation",
    "Ayurveda Mindfulness Meditation", "Ayurveda Breathwork", "Ayurveda Yoga Nidra", "Ayurveda Mantras",
    "Ayurveda Chanting", "Ayurveda Sound Healing", "Ayurveda Sound Therapy", "Ayurveda Music Therapy",
    "Ayurveda Art Therapy", "Ayurveda Dance Therapy", "Ayurveda Movement Therapy", "Ayurveda Expressive Arts",
    "Ayurveda Creative Arts", "Ayurveda Journaling", "Ayurveda Reflection", "Ayurveda Contemplation",
    "Ayurveda Diseaseas",
    "Ayurveda Solutions",
    "Yoga",
    "Ayurveda and Yoga",
    "Ayurveda Breathing",
    "Ayurveda Meditation",
    "Ayurvedic Medicine",
    "Ayurvedic Alternatives to Medicine",
    "Ayurvedic Alternatives to Western Medicine"
]


# Create a directory to store downloaded pages
output_dir = "ayurveda_pages"
os.makedirs(output_dir, exist_ok=True)
import pickle

# Resume from the set of URLs recorded by a previous run, if there is one.
# On a first run the checkpoint file does not exist yet, and an interrupted
# run can leave it empty; in both cases start from an empty set instead of
# crashing.
# NOTE(review): pickle executes arbitrary code on load -- only ever point this
# at a checkpoint file written by this script itself.
try:
    with open("visited.pkl", "rb") as f:
        visited_urls = pickle.load(f)
except (FileNotFoundError, EOFError):
    visited_urls = set()

# Folder whose files are concatenated into the reference text that scraped
# pages are compared against.
verified_texts_dir = "texts"

# Passed as the starting `depth` of the crawl for every search query.
num_pages = 3

# Terms counted (case-insensitively, as substrings) when deciding whether a
# page is about Ayurveda.
# Candidates that are currently disabled: balance, wellness, detoxification,
# therapy, nutrition, mind-body, spiritual, prakriti, natural, healthy,
# meditation, breathing.
ayurveda_keywords = [
    "Ayurveda", "herbal", "medicinal", "holistic",
    "dosha", "panchakarma", "vata", "pitta", "kapha",
    "ayurvedic", "western", "medicine", "alternative medicine",
    "shirodhara", "abhyanga", "rasayana",
]


def load_verified_texts(directory):
    """Concatenate every regular file in ``directory`` into one string.

    Files are read as UTF-8 in sorted name order so the result is
    reproducible across runs and platforms. Each file's content is followed
    by a single space so words from adjacent files do not run together.

    Args:
        directory: Path of the folder holding the reference text files.

    Returns:
        The combined text, or ``""`` when the folder holds no files.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    parts = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Sub-directories (and other non-files) cannot be opened for reading.
        if not os.path.isfile(filepath):
            continue
        with open(filepath, "r", encoding="utf-8") as file:
            parts.append(file.read())
    # Join once at the end instead of growing a string inside the loop.
    return "".join(part + " " for part in parts)


def is_english(content):
    """Return True when language detection classifies ``content`` as English.

    Detection is best-effort: any error raised while detecting is treated
    as "not English" so a single odd page cannot abort the crawl.

    Args:
        content: Text to classify.

    Returns:
        ``True`` if the detected language code is ``"en"``, else ``False``.
    """
    try:
        return detect(content) == "en"
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still be able to stop a long-running crawl.
        # NOTE(review): presumably this mainly guards against text with no
        # detectable features (e.g. an empty page) -- confirm with langdetect.
        return False


def contains_ayurveda_keywords(content, threshold=5):
    """Check whether ``content`` mentions Ayurveda keywords often enough.

    Occurrences of every entry in the module-level ``ayurveda_keywords`` list
    are counted case-insensitively as plain substrings and summed. Because
    matching is by substring, overlapping keywords such as "medicine" and
    "alternative medicine" each contribute to the total.

    Args:
        content: Page text to inspect.
        threshold: The summed count must be strictly greater than this.

    Returns:
        ``True`` if the total number of keyword occurrences exceeds
        ``threshold``.
    """
    # Lower-case the (potentially large) page once rather than per keyword.
    lowered = content.lower()
    keyword_count = sum(
        lowered.count(keyword.lower()) for keyword in ayurveda_keywords
    )
    return keyword_count > threshold


def compute_similarity(vectorizer, reference_vector, content):
    """Return the cosine similarity between the reference text and ``content``.

    Args:
        vectorizer: Already-fitted vectorizer; only its ``transform`` method
            is used here.
        reference_vector: Vector of the reference text, given as the first
            argument to ``cosine_similarity``.
        content: Raw text of the candidate page.

    Returns:
        The ``[0, 0]`` entry of the similarity matrix.
    """
    candidate_vector = vectorizer.transform([content])
    return cosine_similarity(reference_vector, candidate_vector)[0, 0]


def search_and_download_pages(
    url,
    depth,
    vectorizer,
    reference_vector,
    similarity_threshold=0.5,
    keyword_threshold=3,
    timeout=10,
):
    """Fetch ``url``, save it if it looks relevant, then crawl its links.

    A page is kept only when it is detected as English, contains more than
    ``keyword_threshold`` keyword occurrences, and its cosine similarity to
    the reference text is above ``similarity_threshold``. Kept pages are
    written to the module-level ``output_dir`` and their absolute ``http``
    links are followed recursively with ``depth - 1``. Links of rejected
    pages are not followed. Followed links are recorded in the module-level
    ``visited_urls`` set so that they are not fetched again.

    Args:
        url: Address of the page to fetch.
        depth: Remaining crawl depth; ``0`` stops immediately.
        vectorizer: Fitted vectorizer passed on to ``compute_similarity``.
        reference_vector: Reference vector passed on to ``compute_similarity``.
        similarity_threshold: Minimum (exclusive) similarity to keep a page.
        keyword_threshold: Minimum (exclusive) keyword count to keep a page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. Any failure while processing ``url`` is printed and swallowed
        so that one bad page does not stop the crawl.
    """
    if depth == 0:
        return

    try:
        # Without a timeout a single unresponsive host stalls the crawl forever.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")
        page_content = soup.get_text()  # Extract text content

        # Filters run in the same order as before; similarity is computed
        # exactly once and reused for both the check and the log message.
        if not is_english(page_content):
            return
        if not contains_ayurveda_keywords(page_content, keyword_threshold):
            return
        similarity_score = compute_similarity(
            vectorizer, reference_vector, page_content
        )
        if not (similarity_score > similarity_threshold):
            return

        # Clean the URL for use as a filename
        filename = url.replace("http://", "").replace("https://", "")
        filename = filename.replace("/", "_").replace(".", "_")
        filename = f"{output_dir}/{filename}.txt"

        # Save the page and close the file *before* recursing, so nested
        # crawls do not keep a chain of file handles open.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(page_content)
        print(f"Downloaded page {url} with similarity score {similarity_score}")

        for link in soup.find_all("a", href=True):
            next_url = link["href"]
            if next_url.startswith("http") and next_url not in visited_urls:
                visited_urls.add(next_url)  # Mark URL as visited
                search_and_download_pages(
                    next_url,
                    depth - 1,
                    vectorizer,
                    reference_vector,
                    similarity_threshold,
                    keyword_threshold,
                    timeout,
                )
    except Exception as e:
        print(f"Failed to process {url}: {e}")


if __name__ == "__main__":
    from urllib.parse import quote_plus

    # Load the reference text from the verified texts directory
    reference_text = load_verified_texts(verified_texts_dir)

    # Initialize Count Vectorizer and fit on the reference text
    vectorizer = CountVectorizer()
    reference_vector = vectorizer.fit_transform([reference_text])

    # Seed the crawl with one search-results page per query
    for query in search_queries:
        # Queries contain spaces and apostrophes; encode them so the value
        # stays a single, well-formed `q` parameter whatever it contains.
        search_and_download_pages(
            f"https://www.bing.com/search?q={quote_plus(query)}",
            num_pages,
            vectorizer,
            reference_vector,
        )
        # Checkpoint after every query so an interrupted run can resume.
        with open("visited.pkl", "wb") as f:
            pickle.dump(visited_urls, f)


# Split book PDFs into smaller text files for embedding

In [None]:
import os
from pdf2image import convert_from_path
import pytesseract

def pdf_to_text(input_file, output_dir, file, chunk_size=30000):
    """OCR a PDF and write the recognised text as fixed-size chunk files.

    The PDF is rendered to images, each image is passed through OCR, and the
    combined text is written to ``output_dir`` as
    ``{file}_output_{n}.txt`` files of at most ``chunk_size`` characters,
    numbered from 1.

    Conversion, per-page OCR and per-chunk write failures are reported with
    ``print`` and do not raise: a failed conversion returns early, a failed
    page is skipped, and a failed chunk write moves on to the next chunk.

    Args:
        input_file: Path of the PDF to process.
        output_dir: Folder for the chunk files; created if missing.
        file: Name used as the prefix of the chunk file names.
        chunk_size: Maximum number of characters per chunk file.

    Raises:
        ValueError: If ``chunk_size`` is not positive (it would otherwise
            never advance through the text).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Convert PDF to images
    try:
        images = convert_from_path(input_file)
    except Exception as e:
        print(f"Error converting {input_file} to images: {e}")
        return

    # Perform OCR on each image; collect pages and join once at the end.
    page_texts = []
    for i, image in enumerate(images):
        try:
            page_texts.append(pytesseract.image_to_string(image))
        except Exception as e:
            print(f"Error performing OCR on page {i + 1} of {input_file}: {e}")
    text = "".join(page_texts)

    # Write text to files in chunks
    chunk_starts = range(0, len(text), chunk_size)
    for file_index, start in enumerate(chunk_starts, start=1):
        chunk = text[start:start + chunk_size]
        output_file_path = os.path.join(output_dir, f"{file}_output_{file_index}.txt")
        try:
            with open(output_file_path, "w", encoding="utf-8") as output_file:
                output_file.write(chunk)
        except Exception as e:
            print(f"Error writing chunk {file_index} to file: {e}")

if __name__ == "__main__":
    # PDFs are read from `input_folder`; chunk files are written to `output_dir`.
    input_folder, output_dir = "files", "texts"

    # Run OCR extraction for every entry whose name ends in ".pdf"
    for file in os.listdir(input_folder):
        if not file.endswith(".pdf"):
            continue
        input_file_path = os.path.join(input_folder, file)
        print(f"Processing {input_file_path}")
        pdf_to_text(input_file_path, output_dir, file)


# Splitting large texts into smaller texts

In [None]:
import os

def split_text_file(input_file, destination="texts", max_chars=300000):
    """Split a UTF-8 text file into parts of at most ``max_chars`` characters.

    Parts are written to ``destination`` as ``{name}_part{n}{ext}``, where
    ``name`` and ``ext`` come from the input file's base name and ``n``
    starts at 1. Only non-empty parts are written: a file whose length is an
    exact multiple of ``max_chars`` no longer gets a trailing empty part, and
    an empty input file produces no parts at all.

    Args:
        input_file: Path of the text file to split.
        destination: Folder for the part files; created if missing.
        max_chars: Maximum number of characters per part.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")

    # Read the content of the input file
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Ensure the destination directory exists
    os.makedirs(destination, exist_ok=True)

    name, ext = os.path.splitext(os.path.basename(input_file))

    # Stepping through the start offsets yields exactly ceil(len / max_chars)
    # parts, so there is never an empty trailing slice.
    part_starts = range(0, len(content), max_chars)
    for part_number, start_index in enumerate(part_starts, start=1):
        part_content = content[start_index:start_index + max_chars]

        output_file = os.path.join(destination, f"{name}_part{part_number}{ext}")
        with open(output_file, 'w', encoding='utf-8') as part_file:
            part_file.write(part_content)

        print(f"Wrote part {part_number} to {output_file}")

# Source folder with the large .txt files and target folder for the parts
input_folder, output_folder = 'files', 'texts'

# Split every entry of the source folder whose name ends in ".txt"
for file in os.listdir(input_folder):
    if not file.endswith(".txt"):
        continue
    input_file_path = os.path.join(input_folder, file)
    print(f"Processing {input_file_path}")
    split_text_file(input_file_path, destination=output_folder)


# Embedding

In [None]:
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
    """Wrap ``text`` in a ``Markdown`` object rendered as a block quote.

    Bullet characters ("•") are replaced with an indented ``*`` and every
    line, including blank ones, is prefixed with ``"> "``.
    """
    bulleted = text.replace('•', '  *')
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

from dotenv import load_dotenv
load_dotenv(".env") 

import os
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.vectorstores import FAISS
# Paths of every entry in the "texts" folder
files = ["texts/" + name for name in os.listdir("texts")]

# Load all of them through a single loader
loaders = UnstructuredFileLoader(files)
data = loaders.load()
len(data)  # bare expression: its value is not stored or printed here

# Cut the loaded documents into 1000-character chunks overlapping by 200
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(data)

# The API key is read from the environment (populated from .env above)
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))

from langchain_google_genai import GoogleGenerativeAIEmbeddings

gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Embed the chunks in batches and save each batch as its own FAISS index
# under saved_embeddings/<batch number>.
# Stepping through start offsets also covers the final partial batch, which
# `range(len(docs) // 100)` dropped (and which was the *only* batch whenever
# there were fewer than 100 chunks).
# NOTE(review): the batch size is presumably chosen to keep individual
# embedding requests small -- confirm before changing it.
batch_size = 100
for i, start in enumerate(range(0, len(docs), batch_size)):
    db = FAISS.from_documents(docs[start:start + batch_size], gemini_embeddings)

    file_path = f"saved_embeddings/{i + 1}"
    db.save_local(file_path)
    print(f"Saved embeddings for batch {i + 1} to {file_path}")

# Load the saved embedding batches and merge them into one database

In [None]:
base_path = 'saved_embeddings'
all_dbs = []

# Load one FAISS store per sub-folder, in sorted (lexicographic) name order
for subfolder in sorted(os.listdir(base_path)):
    subfolder_path = os.path.join(base_path, subfolder)
    if not os.path.isdir(subfolder_path):
        continue

    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # pickled data -- only use it on indexes written by this notebook.
    db = FAISS.load_local(
        folder_path=subfolder_path,
        embeddings=gemini_embeddings,
        allow_dangerous_deserialization=True,
    )
    all_dbs.append(db)
    print(f"Loaded embeddings from {subfolder_path}")

if not all_dbs:
    # `retriever` stays undefined in this case.
    print("No databases loaded.")
else:
    # The first store absorbs all the others
    target_db = all_dbs[0]
    for db in all_dbs[1:]:
        target_db.merge_from(db)

    # Retriever configured with k=3
    retriever = target_db.as_retriever(search_kwargs={'k': 3})

# Create Retrieval Chain on Gemini Flash

In [None]:
# Chat model used to generate answers (Gemini 1.5 Flash).
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0.3, top_p=0.85)

# Retrieval chain built from the same model and retriever.
# NOTE(review): `chain` is not referenced again in the visible cells; the
# questions below go through `rag_chain` instead.
from langchain.chains import RetrievalQAWithSourcesChain
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)

# NOTE(review): `format_document` is imported but not used in the visible cells.
from langchain import PromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain.schema.runnable import RunnablePassthrough

# Prompt with two placeholders, {question} and {context}. The "\n" escapes are
# ordinary newlines because this is not a raw string.
llm_prompt_template = """You are an assistant for question-answering tasks with advanced analytical and reasoning capabilities.
Use the following context to answer the question.
If you don't know the answer, try to think of it without context.\n
Question: {question} \nContext: {context} \nAnswer:"""

llm_prompt = PromptTemplate.from_template(llm_prompt_template)

print(llm_prompt)

# Pipeline: build the prompt inputs -> fill the prompt -> call the model ->
# parse the output with StrOutputParser.
# NOTE(review): presumably the retriever's output is inserted into {context}
# as-is (no explicit formatting step is visible here) -- confirm that this is
# the intended context format.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | llm_prompt
    | llm
    | StrOutputParser()
)

# Sample questions used to smoke-test the chain, asked in this order
example_questions = [
    "What are the 3 secondary supports of life?",
    "Reduced Therapy",
    "How to deal with heart disease",
    "What asanas should I do if I have imbalance in strnegth of my left and right arm?",
    "How can I make my tea better?",
    "Teach me in detail about dosas and how do I find mine?",
    "I have a Vatta-Pita imbalance. Help me",
]

# Collect every answer first, then print them one per call
responses = [rag_chain.invoke(example) for example in example_questions]

for response in responses:
    print(response)

In [None]:

# Interactive loop: answer questions until the user enters "stop".
while True:
    question = input("Ask a question")
    # Quit only when the whole input is "stop" (ignoring case and surrounding
    # whitespace). A substring test would also end the session on genuine
    # questions such as "How do I stop hair fall?".
    if question.strip().lower() == "stop":
        break
    print(question + "\n\n")
    response = rag_chain.invoke(question)
    print(response + "\n\n")