In [36]:
# **1. Imports Section**

In [37]:
# --- Main Script ---
import os
import shutil
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline
from langchain_core.documents import Document
import warnings
warnings.filterwarnings("ignore")

# --- Configuration ---
DATA_DIR = "rag_data"
PDF_DIRS = ["research_papers", "clinical_studies", "fda_reports"]
HTML_DIRS = ["kennel_clubs", "vet_associations", "breeder_forums"]
STRUCTURED_DATA_PATH = os.path.join(DATA_DIR, "structured", "breed_health_data.xlsx")
VECTOR_DB_PATH = os.path.join(DATA_DIR, "unstructured", "vector_db")

# --- Fixed Document Loading ---
def load_documents():
    docs = []
    print("\n[Document Loading Progress]")
    
    # 1. Verify directory structure exists
    print("\nüîç Verifying directory structure...")
    required_dirs = {
        "PDFs": [os.path.join(DATA_DIR, "unstructured", "pdfs", d) for d in PDF_DIRS],
        "HTML": [os.path.join(DATA_DIR, "unstructured", "web_articles", d) for d in HTML_DIRS],
        "Structured": [os.path.dirname(STRUCTURED_DATA_PATH)]
    }
    
    # Check and report missing directories
    for data_type, dirs in required_dirs.items():
        missing = [d for d in dirs if not os.path.exists(d)]
        if missing:
            print(f"‚ö†Ô∏è Missing {data_type} directories: {missing}")
        else:
            print(f"‚úì All {data_type} directories present")

    # 2. Load structured data from Excel
    print("\nüìÇ Loading structured data...")
    if os.path.exists(STRUCTURED_DATA_PATH):
        print(f"Found Excel file at: {STRUCTURED_DATA_PATH}")
        try:
            df = pd.read_excel(STRUCTURED_DATA_PATH)
            records = df.to_dict('records')
            
            for record in records:
                content = "\n".join(f"{k}: {v}" for k, v in record.items())
                docs.append(Document(
                    page_content=content, 
                    metadata={"source": "structured_data"}
                ))
                
            print(f"    ‚Üí Loaded {len(records)} structured records")
        except Exception as e:
            print(f"‚ùå Failed to load Excel: {type(e).__name__}: {str(e)[:100]}")
    else:
        print(f"‚ö†Ô∏è Structured data file not found: {STRUCTURED_DATA_PATH}")

    # 3. Load HTML files
    print("\nüìÇ Loading HTML files...")
    for folder in HTML_DIRS:
        full_path = os.path.join(DATA_DIR, "unstructured", "web_articles", folder)
        if not os.path.exists(full_path):
            print(f"‚ö†Ô∏è Skipping missing HTML folder: {full_path}")
            continue
            
        print(f"\nProcessing HTML folder: {full_path}")
        html_files = [f for f in os.listdir(full_path) if f.endswith(".html")]
        
        if not html_files:
            print("  No HTML files found")
            continue
            
        for file in html_files:
            file_path = os.path.join(full_path, file)
            try:
                print(f"  Processing: {file[:50]}...", end=" ")
                with open(file_path, 'rb') as f:
                    content = f.read().decode('utf-8', errors='replace')
                    if not content.strip():
                        print("‚ö†Ô∏è Empty, skipping")
                        continue
                
                loader = TextLoader(file_path, encoding='utf-8')
                loaded = loader.load()
                docs.extend(loaded)
                print(f"‚úÖ {len(loaded)} chunks")
            except Exception as e:
                print(f"‚ùå {type(e).__name__}: {str(e)[:50]}")

    # 4. Load PDFs
    print("\nüìÇ Loading PDFs...")
    for folder in PDF_DIRS:
        full_path = os.path.join(DATA_DIR, "unstructured", "pdfs", folder)
        if not os.path.exists(full_path):
            print(f"‚ö†Ô∏è Skipping missing PDF folder: {full_path}")
            continue
            
        print(f"\nProcessing PDF folder: {full_path}")
        pdf_files = [f for f in os.listdir(full_path) if f.endswith(".pdf")]
        
        for file in pdf_files:
            file_path = os.path.join(full_path, file)
            try:
                print(f"  Processing: {file[:50]}...", end=" ")
                loader = PyPDFLoader(file_path)
                loaded = loader.load()
                docs.extend(loaded)
                print(f"‚úÖ {len(loaded)} chunks")
            except Exception as e:
                print(f"‚ùå {str(e)[:50]}")

    print(f"\nüìä Total documents loaded: {len(docs)}")
    return docs


# --- Split Text ---


In [38]:
def split_documents(documents):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.split_documents(documents)


# --- Build or Load FAISS Vector Store ---

In [39]:
def build_vectorstore(docs):
    print("\nüîß Building vector store...")
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True}
    )
    
    print("‚öôÔ∏è Splitting documents...")
    split_docs = split_documents(docs)
    print(f"üìê Processing {len(split_docs)} document chunks...") 
    
    print("üîÑ Generating embeddings (this may take several minutes)...")
    vectorstore = FAISS.from_documents(split_docs, embeddings)
    
    print("üíæ Saving vector store...")
    os.makedirs(VECTOR_DB_PATH, exist_ok=True)
    vectorstore.save_local(VECTOR_DB_PATH)
    print("‚úÖ Vector store built successfully")
    return vectorstore

def load_vectorstore():
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True}
    )
    return FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
# Load Structured Data 
def load_breed_data():
    return pd.read_excel(STRUCTURED_DATA_PATH)

# --- Prompt ---

In [40]:
# Internal cache
_llm = None

def get_llm():
    global _llm
    if _llm is None:
        try:
            _llm = pipeline(
                "text-generation",
                model="google/gemma-3-4b-it",
                device="cpu",
                model_kwargs={"torch_dtype": "auto"}
            )
        except Exception as e:
            raise RuntimeError(f"‚ùå Failed to load Gemma: {str(e)}") from e
    return _llm
def generate_breed_recommendations(breed_info, vectorstore):
    """
    Generates recommendations in the exact 3-part format for mobile apps
    """
    llm = get_llm()
    
    # Part 1: Breed Description
    description_prompt = f"""Write a 3-sentence description of {breed_info['Breed Name']} dogs:
    - First sentence: Personality traits
    - Second sentence: Activity preferences
    - Third sentence: Companionship qualities
    Example for Golden Retrievers:
    \"Golden Retrievers are friendly, intelligent companions. They love outdoor adventures! Their gentle nature makes them great family pets.\"
    Your response:"""
    
    description = llm(
        description_prompt,
        max_new_tokens=100,
        temperature=0.7
    )[0]['generated_text'].split("Your response:")[-1].strip().strip('"')
    
# Part 2: Health Tips - More robust generation
    tips_prompt = f"""Provide 5 essential health tips for {breed_info['Breed Name']} regarding {breed_info['Primary Health Issue']}:
    - Format: [emoji] [imperative sentence]!
    - Required emojis: ‚ö° ü•ó üèÉ üßº üë©‚Äç‚öïÔ∏è
    - Max 12 words per tip
    Example:
    ‚ö° Active dogs need joint supplements!
    ü•ó Measure food to prevent obesity!
    üèÉ Daily walks are essential!
    üßº Clean ears weekly!
    üë©‚Äç‚öïÔ∏è Annual vet checks catch issues early!
    Tips:"""
    
    # Generate multiple times if needed to get all 5 tips
    tips = []
    attempts = 0
    while len(tips) < 5 and attempts < 3:
        tips_response = llm(
            tips_prompt,
            max_new_tokens=150,
            temperature=0.5
        )[0]['generated_text']
        
        # Extract only valid tips
        new_tips = [tip.strip() for tip in tips_response.split("\n") 
                   if any(tip.strip().startswith(e) for e in ["‚ö°", "ü•ó", "üèÉ", "üßº", "üë©‚Äç‚öïÔ∏è"])]
        tips.extend(new_tips)
        attempts += 1
    
    # Ensure we have exactly 5 unique tips
    tips = list(dict.fromkeys(tips))[:5]  # Remove duplicates while preserving order
    if len(tips) < 5:
        # Fallback tips if generation fails
        tips = [
            "‚ö° Regular exercise prevents joint issues!",
            "ü•ó Feed measured meals to maintain weight!",
            "üèÉ Daily walks keep your dog healthy!",
            "üßº Groom weekly to prevent skin problems!",
            "üë©‚Äç‚öïÔ∏è Annual vet visits catch issues early!"
        ]
    
    # Part 3: Fun Fact - FIXED VERSION
    fact_prompt = f"""Generate exactly one fun fact about {breed_info['Breed Name']} dogs that would surprise owners:
    - Must begin with "Did you know?"
    - Must end with exactly one relevant emoji
    - Must be exactly 1 sentence (10-15 words)
    - Must be verifiably true
    
    Bad Example: "Fun fact about Beagles..." (doesn't start correctly)
    Bad Example: "Did you know? Beagles are dogs" (not surprising)
    Good Example: "Did you know? Beagles can detect smells 10,000x better than humans! üëÉ"
    
    Generate now: Did you know?"""
    
    # Generate with higher tokens and temperature for creativity
    fact_response = llm(
        fact_prompt,
        max_new_tokens=100,
        temperature=1.0,  # Higher for more creative facts
        top_p=0.95,
        do_sample=True
    )[0]['generated_text']
    
    # Robust extraction and formatting
    fact = ""
    if "Did you know?" in fact_response:
        fact = fact_response.split("Did you know?")[-1].strip()
        # Ensure it ends with emoji
        if not any(c in fact for c in ["üêï", "üê∂", "üëÉ", "üèÉ", "üêæ", "üåü", "!", "?"]):
            fact = f"{fact} üê∂"
        # Ensure proper punctuation
        if not fact.endswith(("!", "?", ".")):
            fact = f"{fact}!"
    else:
        # Fallback fact
        fact = f"Did you know? {breed_info['Breed Name']}s have an extraordinary sense of smell! üëÉ"
    
    # Final cleanup
    fact = fact.replace('"', '').strip()
    if not fact.startswith("Did you know?"):
        fact = f"Did you know? {fact}"
    
    return {
        "description": description,  # From Part 1
        "tips": tips,  # From Part 2
        "fun_fact": fact
    }




# --- Generate AI Advice ---

In [41]:

def retrieve_context(query, vectorstore):
    docs = vectorstore.similarity_search(query, k=3)
    return "\n".join([d.page_content[:500] for d in docs])

def generate_advice(breed_info, vectorstore):
    print("\nü§ñ Generating advice...")
    llm = get_llm()
    
    query = f"{breed_info['Primary Health Issue']} in {breed_info['Breed Name']}"
    context = retrieve_context(query, vectorstore)
    
    prompt =  generate_breed_recommendations(breed_info, context)
    
    try:
        response = llm(
            prompt,
            max_new_tokens=600,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            truncation=True
        )
        
        full_response = response[0]['generated_text']
        advice_start = full_response.find("### Your Advice") + len("### Your Advice")
        advice = full_response[advice_start:].strip()
        advice = advice.split("###")[0].strip()
        advice = advice.split("\n\n")[0].strip()
        
        if not advice.startswith(breed_info['Breed Name']):
            advice = f"{breed_info['Breed Name']} {advice}"
            
        return advice
        
    except Exception as e:
        print(f"Error generating advice: {e}")
        return f"Professional advice for {breed_info['Breed Name']} could not be generated."


# --- Main Entry Point ---

In [42]:
if __name__ == "__main__":
    print("\n> Starting RAG pipeline :) ")
    
    if os.path.exists(VECTOR_DB_PATH):
        shutil.rmtree(VECTOR_DB_PATH)
    
    docs = load_documents()
    if not docs:
        raise ValueError("No documents loaded - check your data paths")
    
    vectorstore = build_vectorstore(docs)
    
    try:
        df = pd.read_excel(STRUCTURED_DATA_PATH)
    except Exception as e:
        raise ValueError(f"Error loading breed data: {e}")

    breed_name = input("Enter breed name: ").strip()
    breed_row = df[df["Breed Name"] == breed_name]
    
    if breed_row.empty:
        print(f"Breed '{breed_name}' not found")
    else:
        breed_info = breed_row.iloc[0].to_dict()
        
        # Generate both detailed advice and mobile recommendations
        advice = generate_advice(breed_info, vectorstore)
        recommendations = generate_breed_recommendations(breed_info, vectorstore)
        
        print("\n>> Generated Advice:\n")
        print(advice)
        
        print("\n>> Mobile App Recommendations:\n")
        print(f"# Know More About Me\n")
        print(f"{breed_info['Breed Name']} üåüÔ∏è\n")
        print(f"{recommendations['description']} üåüÔ∏è\n")
        print("## Tips & Recommendations üåüÔ∏è\n")
        print("\n".join(recommendations['tips']))
        print("\n---\n")
        print(f"### Fun Fact üåüÔ∏è\n")
        print(f"{recommendations['fun_fact']} üåüÔ∏è\n")
        print("\n" + "="*50 + "\n")


> Starting RAG pipeline :) 

[Document Loading Progress]

üîç Verifying directory structure...
‚úì All PDFs directories present
‚úì All HTML directories present
‚úì All Structured directories present

üìÇ Loading structured data...
Found Excel file at: rag_data\structured\breed_health_data.xlsx
    ‚Üí Loaded 208 structured records

üìÇ Loading HTML files...

Processing HTML folder: rag_data\unstructured\web_articles\kennel_clubs
  Processing: A new direction for kennel club regulations and br... ‚úÖ 1 chunks
  Processing: AKC's Guide to Responsible Dog Breeding ‚Äì American... ‚úÖ 1 chunks
  Processing: Cognitive Traits Vary by Breed - But How and Why_.... ‚úÖ 1 chunks
  Processing: Kennel Club Launching Breed Health And Conservatio... ‚úÖ 1 chunks
  Processing: The Kennel Club gives first preview of new health ... ‚úÖ 1 chunks
  Processing: The Kennel Club Health Standard _ Kennel Club.html... ‚úÖ 1 chunks

Processing HTML folder: rag_data\unstructured\web_articles\vet_associatio

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


Error generating advice: can only concatenate str (not "dict") to str

>> Generated Advice:

Professional advice for Beagle could not be generated.

>> Mobile App Recommendations:

# Know More About Me

Beagle üåüÔ∏è

Beagles are known for their happy-go-lucky and curious personalities. They thrive on sniffing and exploring, requiring plenty of exercise. Beagles make wonderful, loyal companions, always eager to be by their owner's side. üåüÔ∏è

## Tips & Recommendations üåüÔ∏è

‚ö° Active dogs need joint supplements!
ü•ó Measure food to prevent obesity!
üèÉ Daily walks are essential!
üßº Clean ears weekly!
üë©‚Äç‚öïÔ∏è Annual vet checks catch issues early!

---

### Fun Fact üåüÔ∏è

Did you know? Beagles' incredible noses allow them to find hidden bones easily ü¶¥. üê∂! üåüÔ∏è



