In [None]:
import pandas as pd
import numpy as np
import json
import os
import time
from datetime import datetime
from typing import List, Dict, Any, Optional, Callable
import logging
import concurrent.futures
from sklearn.metrics.pairwise import cosine_similarity
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer

# Optional imports (for LangChain approach).
# Every LLM-related package is imported inside one guarded block: if ANY of
# these imports fails, ``has_langchain`` is set to False and a notice is
# printed. ``filter_with_langchain`` checks this flag before it attempts an
# LLM call.
try:
    from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
    from langchain_openai import ChatOpenAI, AzureChatOpenAI, AzureOpenAI
    from azure.ai.inference import ChatCompletionsClient
    from azure.core.credentials import AzureKeyCredential
    from dotenv import dotenv_values

    has_langchain = True
except ImportError:
    # Triggered by whichever import above fails first, not only by LangChain.
    has_langchain = False
    print("LangChain not available. Will use rule-based filtering only.")

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("session_recommendation")

# Neo4j connection parameters.
# Each value can be overridden through an environment variable so that real
# credentials never need to be saved inside the notebook. When a variable is
# not set, the previous hard-coded default is used unchanged.
uri = os.environ.get("NEO4J_URI", "bolt://127.0.0.1:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "")  # export NEO4J_PASSWORD instead of editing this line

# No module-level driver is created here: the functions that talk to Neo4j
# open their own short-lived driver with
# GraphDatabase.driver(uri, auth=(username, password)).

# Load the same embedding model for consistency
model = SentenceTransformer("all-MiniLM-L6-v2")

# Define role groups (for rule processing)
VET_ROLES = [
    "Vet/Vet Surgeon",
    "Assistant Vet",
    "Vet/Owner",
    "Clinical or other Director",
    "Locum Vet",
    "Academic",
]

NURSE_ROLES = ["Head Nurse/Senior Nurse", "Vet Nurse", "Locum RVN"]

BUSINESS = ["Practice Manager", "Practice Partner/Owner"]
# Other roles can attend any session
OTHER_ROLES = ["Student", "Receptionist", "Other (please specify)"]

# Module-level caches, reset by clear_caches():
#   _this_year_sessions_cache - dict of this year's sessions (None until loaded)
#   _visitor_cache            - visitor lookups keyed by badge id
#   _similar_visitors_cache   - similar-visitor results
_this_year_sessions_cache = None
_visitor_cache = {}
_similar_visitors_cache = {}

# == Session Recommendation System ==


def clear_caches():
    """Reset the session, visitor and similar-visitor caches to their empty state."""
    global _this_year_sessions_cache, _visitor_cache, _similar_visitors_cache
    # Two separate dict objects are created so the caches never alias each other.
    _visitor_cache, _similar_visitors_cache = {}, {}
    _this_year_sessions_cache = None

In [None]:
def get_this_year_sessions(tx):
    """
    Fetch every current-year session that has an embedding, keyed by session id.

    The first call stores its result in the module-level
    ``_this_year_sessions_cache``; later calls return that same dict without
    running the query again.

    Args:
        tx: Transaction-like object exposing ``run(query).data()``.

    Returns:
        Dict mapping ``session_id`` to a dict of session fields, in which
        ``embedding`` is the JSON-decoded value wrapped in a NumPy array.
    """
    global _this_year_sessions_cache

    cached = _this_year_sessions_cache
    if cached is not None:
        return cached

    # Query to get this year's sessions with embeddings
    query = """
    MATCH (s:Sessions_this_year)
    WHERE s.embedding IS NOT NULL
    RETURN s.session_id as session_id, 
           s.title as title, 
           s.stream as stream, 
           s.synopsis_stripped as synopsis_stripped,
           s.theatre__name as theatre__name,
           s.embedding as embedding,
           s.date as date,
           s.start_time as start_time,
           s.end_time as end_time,
           s.sponsored_by as sponsored_by,
           s.sponsored_session as sponsored_session
    """

    # Output field order of each session record.
    field_order = (
        "session_id",
        "title",
        "stream",
        "synopsis_stripped",
        "theatre__name",
        "embedding",
        "date",
        "start_time",
        "end_time",
        "sponsored_by",
        "sponsored_session",
    )

    by_id = {}
    for row in tx.run(query).data():
        raw_embedding = row["embedding"]
        if not raw_embedding:
            # Rows with an empty/missing embedding are left out.
            continue
        vector = np.array(json.loads(raw_embedding))
        by_id[row["session_id"]] = {
            name: (vector if name == "embedding" else row[name])
            for name in field_order
        }

    _this_year_sessions_cache = by_id
    return by_id


def get_visitor_info(tx, visitor_id):
    """
    Look up a this-year visitor by badge id, memoising successful lookups.

    Args:
        tx: Transaction-like object exposing ``run(query, **params).single()``.
        visitor_id: Value matched against the node's ``BadgeId`` property.

    Returns:
        ``{"visitor": <node>, "assisted": <assist_year_before or "0">}``, or
        ``None`` when no node matches (misses are not cached).
    """
    global _visitor_cache

    try:
        return _visitor_cache[visitor_id]
    except KeyError:
        pass  # not cached yet - fall through to the database lookup

    visitor_query = """
    MATCH (v:Visitor_this_year {BadgeId: $visitor_id})
    RETURN v
    """
    record = tx.run(visitor_query, visitor_id=visitor_id).single()
    if not record:
        return None

    node = record["v"]
    entry = {"visitor": node, "assisted": node.get("assist_year_before", "0")}
    _visitor_cache[visitor_id] = entry
    return entry


def get_past_sessions(tx, visitor_id):
    """
    Return the past-year sessions linked to a this-year visitor.

    Args:
        tx: Transaction-like object exposing ``run(query, **params).data()``.
        visitor_id: Badge id of the this-year visitor.

    Returns:
        List of ``{"session_id", "embedding"}`` dicts, one per returned row
        that has a non-empty embedding; the embedding is JSON-decoded into a
        NumPy array.
    """
    # Single query combining both visitor types
    query_past = """
    MATCH (v:Visitor_this_year {BadgeId: $visitor_id})-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
    WHERE (vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva)
    RETURN DISTINCT sp.session_id as session_id, sp.embedding as embedding
    """

    rows = tx.run(query_past, visitor_id=visitor_id).data()

    # Lazily decode embeddings, skipping rows whose embedding is empty.
    decoded = (
        (row, np.array(json.loads(row["embedding"])))
        for row in rows
        if row["embedding"]
    )
    return [
        {"session_id": row["session_id"], "embedding": vector}
        for row, vector in decoded
    ]


def find_similar_visitors_batch(tx, visitor, num_similar_visitors=3):
    """
    Find the returning visitors whose profile is closest to ``visitor``.

    Candidates are this-year visitors flagged ``assist_year_before = '1'`` that
    are linked to at least one past-year session. A first query keeps only
    candidates sharing at least one of job role, practice type, organisation
    type or country with ``visitor``; when it yields fewer than
    ``num_similar_visitors`` rows, a broader query without that pre-filter is
    used instead (all of its rows carry ``base_similarity = 0``). Each
    candidate is scored as
    ``0.7 * cosine(text embeddings) + 0.3 * base_similarity / 4``.

    Args:
        tx: Transaction-like object exposing ``run(query, **params).data()``.
        visitor: Mapping-like visitor record; must provide ``"BadgeId"`` and
            support ``.get`` for the profile attributes.
        num_similar_visitors: Maximum number of badge ids to return.

    Returns:
        List of badge ids ordered from most to least similar. An empty list is
        returned (and nothing is cached) when the profile cannot be encoded.
    """
    global _similar_visitors_cache

    visitor_id = visitor["BadgeId"]

    # The result depends on how many neighbours were requested, so the request
    # size is part of the cache key. Keying on the badge id alone would hand a
    # cached top-3 back to a caller that later asks for a different count.
    cache_key = (visitor_id, num_similar_visitors)
    if cache_key in _similar_visitors_cache:
        return _similar_visitors_cache[cache_key]

    # Get all visitors with sessions in one query
    query = """
    MATCH (v:Visitor_this_year)
    WHERE v.assist_year_before = '1' AND v.BadgeId <> $visitor_id
    // Pre-filter to avoid processing all visitors
    WITH v, 
         CASE WHEN v.job_role = $job_role THEN 1 ELSE 0 END + 
         CASE WHEN v.what_type_does_your_practice_specialise_in = $practice_type THEN 1 ELSE 0 END +
         CASE WHEN v.organisation_type = $org_type THEN 1 ELSE 0 END +
         CASE WHEN v.Country = $country THEN 1 ELSE 0 END AS base_similarity
    // Only process those with some similarity
    WHERE base_similarity > 0
    // Check if they have attended sessions (to save processing visitors without sessions)
    MATCH (v)-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
    WHERE (vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva)
    WITH v, base_similarity, COUNT(DISTINCT sp) AS session_count
    WHERE session_count > 0
    RETURN v, base_similarity
    ORDER BY base_similarity DESC, session_count DESC
    LIMIT 20
    """

    visitors_data = tx.run(
        query,
        visitor_id=visitor_id,
        job_role=visitor.get("job_role", ""),
        practice_type=visitor.get("what_type_does_your_practice_specialise_in", ""),
        org_type=visitor.get("organisation_type", ""),
        country=visitor.get("Country", ""),
    ).data()

    # If we can't find enough similar visitors with the pre-filtering,
    # try a more general query
    if len(visitors_data) < num_similar_visitors:
        query = """
        MATCH (v:Visitor_this_year)
        WHERE v.assist_year_before = '1' AND v.BadgeId <> $visitor_id
        // Check if they have attended sessions
        MATCH (v)-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
        WHERE (vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva)
        WITH v, COUNT(DISTINCT sp) AS session_count
        WHERE session_count > 0
        RETURN v, 0 AS base_similarity
        ORDER BY session_count DESC
        LIMIT 20
        """
        visitors_data = tx.run(query, visitor_id=visitor_id).data()

    # Extract visitor features for comparison
    def get_visitor_features(v):
        """Join the non-empty, non-"NA" profile attributes into one text string."""
        attributes = [
            v.get("what_type_does_your_practice_specialise_in", ""),
            v.get("job_role", ""),
            v.get("organisation_type", ""),
            v.get("JobTitle", ""),
            v.get("Country", ""),
        ]
        return " ".join(
            [
                str(attr)
                for attr in attributes
                if attr and str(attr).strip() and str(attr) != "NA"
            ]
        )

    # Get embedding for our visitor
    visitor_text = get_visitor_features(visitor)
    if not visitor_text.strip():
        visitor_text = "default visitor profile"

    try:
        # Loading the sentence-transformer is expensive, so it is done once and
        # kept on the function object; later calls reuse the same instance
        # instead of reloading the model for every visitor.
        encoder = getattr(find_similar_visitors_batch, "_encoder", None)
        if encoder is None:
            encoder = SentenceTransformer("all-MiniLM-L6-v2")
            find_similar_visitors_batch._encoder = encoder

        visitor_embedding = encoder.encode(visitor_text)

        # Calculate similarities for the (at most 20) candidate visitors
        similarities = []
        for vdata in visitors_data:
            v_compare = vdata["v"]
            base_similarity = vdata["base_similarity"]

            compare_text = get_visitor_features(v_compare)
            if not compare_text.strip():
                continue

            try:
                compare_embedding = encoder.encode(compare_text)
                sim = cosine_similarity([visitor_embedding], [compare_embedding])[0][0]
                # Combine neural and rule-based similarity
                combined_sim = (sim * 0.7) + (
                    base_similarity * 0.3 / 4
                )  # Max base_similarity is 4
                similarities.append((v_compare["BadgeId"], combined_sim))
            except Exception as e:
                # Best effort: one bad candidate must not abort the ranking.
                print(f"Error comparing with visitor {v_compare['BadgeId']}: {e}")
                continue

        # Sort by similarity (highest first) and get top N
        similarities.sort(key=lambda x: -x[1])
        similar_visitors = [sid for sid, _ in similarities[:num_similar_visitors]]

        # Cache for future use
        _similar_visitors_cache[cache_key] = similar_visitors
        return similar_visitors

    except Exception as e:
        print(f"Error encoding visitor profile: {e}")
        return []


def get_similar_visitor_sessions_batch(tx, similar_visitor_badge_ids):
    """
    Collect the distinct past-year sessions linked to a set of visitors.

    Args:
        tx: Transaction-like object exposing ``run(query, **params).data()``.
        similar_visitor_badge_ids: Badge ids of the visitors to look up; an
            empty value short-circuits to ``[]`` without querying.

    Returns:
        List of ``{"session_id", "embedding"}`` dicts for rows that have a
        non-empty embedding; the embedding is JSON-decoded into a NumPy array.
    """
    if not similar_visitor_badge_ids:
        return []

    # Single query to get all sessions at once
    query = """
    MATCH (v:Visitor_this_year)-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
    WHERE v.BadgeId IN $similar_visitor_ids AND 
          (vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva)
    RETURN DISTINCT sp.session_id AS session_id, sp.embedding AS embedding
    """

    rows = tx.run(query, similar_visitor_ids=similar_visitor_badge_ids).data()

    # Decode lazily so each row is parsed right before its dict is built.
    decoded = (
        (row, np.array(json.loads(row["embedding"])))
        for row in rows
        if row["embedding"]
    )
    return [
        {"session_id": row["session_id"], "embedding": vector}
        for row, vector in decoded
    ]


def calculate_session_similarities_parallel(
    past_sessions, this_year_sessions, min_score=0.0
):
    """
    Score this year's sessions against a set of past sessions using a thread pool.

    Every past session is compared with every current session by cosine
    similarity; a current session is reported once, with the highest score it
    obtained against any past session.

    Args:
        past_sessions: List of dicts with an ``"embedding"`` entry.
        this_year_sessions: Dict mapping session id to a session dict with an
            ``"embedding"`` entry plus the descriptive fields copied below.
        min_score: Pairs scoring below this value are discarded.

    Returns:
        List of recommendation dicts sorted by ``"similarity"`` (highest
        first); empty when either input is empty.
    """
    if not past_sessions or not this_year_sessions:
        return []

    # Fields copied verbatim (and required) from each current session.
    required_fields = (
        "title",
        "stream",
        "theatre__name",
        "date",
        "start_time",
        "end_time",
    )

    def score_against_current(past_sess):
        """Compare one past session with all current sessions."""
        reference = past_sess["embedding"]
        hits = []
        for sid, candidate in this_year_sessions.items():
            try:
                score = cosine_similarity([reference], [candidate["embedding"]])[0][0]
                if score >= min_score:
                    entry = {"session_id": sid}
                    entry.update((name, candidate[name]) for name in required_fields)
                    entry["sponsored_by"] = candidate.get("sponsored_by", "")
                    entry["sponsored_session"] = candidate.get("sponsored_session", "")
                    entry["similarity"] = score
                    hits.append(entry)
            except Exception as e:
                print(f"Error calculating similarity for session {sid}: {e}")
        return hits

    # Fan the past sessions out over a small pool and gather as they finish.
    collected = []
    worker_count = min(8, len(past_sessions))
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = [pool.submit(score_against_current, ps) for ps in past_sessions]
        for finished in concurrent.futures.as_completed(pending):
            try:
                collected.extend(finished.result())
            except Exception as e:
                print(f"Error processing session: {e}")

    # Keep only the best-scoring entry per current session.
    best_by_session = {}
    for rec in collected:
        incumbent = best_by_session.get(rec["session_id"])
        if incumbent is None or rec["similarity"] > incumbent["similarity"]:
            best_by_session[rec["session_id"]] = rec

    return sorted(best_by_session.values(), key=lambda rec: -rec["similarity"])

In [None]:
def recommend_sessions_optimized(
    badge_id,
    assist_year_before="1",
    min_score=0.0,
    max_recommendations=None,
    num_similar_visitors=3,
    use_neo4j=True,
):
    """
    Produce session recommendations for one visitor.

    Returning visitors (stored ``assisted`` flag equal to ``"1"``) are matched
    on the sessions they attended last year; all other visitors are matched on
    the sessions attended by their most similar returning visitors.

    Args:
        badge_id: Visitor's badge ID
        assist_year_before: Accepted for interface compatibility; the branch
            taken is decided by the flag stored with the visitor record, not
            by this argument.
        min_score: Minimum similarity score for recommendations
        max_recommendations: Cap on the number of recommendations returned
            (no cap when falsy)
        num_similar_visitors: Number of similar visitors to consider for case 2
        use_neo4j: When False, ``mock_get_recommendations`` is used instead of
            the database

    Returns:
        List of recommended sessions; empty when the visitor is not found.
    """
    t0 = time.time()

    # Mock path: no database access at all.
    if not use_neo4j:
        print(f"Using mock recommendations for visitor {badge_id}")
        mocked = mock_get_recommendations(badge_id, count=max_recommendations or 10)
        print(
            f"Generated {len(mocked)} mock recommendations in {time.time() - t0:.2f}s"
        )
        return mocked

    # Open a short-lived driver and session for this request.
    with GraphDatabase.driver(
        uri, auth=(username, password)
    ) as driver, driver.session() as db:
        profile = db.execute_read(get_visitor_info, visitor_id=badge_id)
        if not profile:
            print(f"Visitor with BadgeId {badge_id} not found.")
            return []

        visitor, attended_flag = profile["visitor"], profile["assisted"]

        # Get all this year's sessions in one go (uses caching)
        current_sessions = db.execute_read(get_this_year_sessions)
        print(
            f"Loaded {len(current_sessions)} sessions for this year in {time.time() - t0:.2f}s"
        )

        reference_sessions = []

        if attended_flag != "1":
            # Case 2: New visitor - borrow the history of similar visitors
            print(
                f"Case 2: Finding {num_similar_visitors} similar visitors for {badge_id}"
            )
            branch_started = time.time()

            neighbours = db.execute_read(
                find_similar_visitors_batch,
                visitor=visitor,
                num_similar_visitors=num_similar_visitors,
            )
            print(
                f"Found {len(neighbours)} similar visitors in {time.time() - branch_started:.2f}s"
            )

            lookup_started = time.time()
            if neighbours:
                reference_sessions = db.execute_read(
                    get_similar_visitor_sessions_batch,
                    similar_visitor_badge_ids=neighbours,
                )
                print(
                    f"Found {len(reference_sessions)} sessions from similar visitors in {time.time() - lookup_started:.2f}s"
                )
        else:
            # Case 1: Visitor attended last year - use their own history
            print(f"Case 1: Visitor {badge_id} attended last year")
            branch_started = time.time()

            reference_sessions = db.execute_read(get_past_sessions, visitor_id=badge_id)
            print(
                f"Found {len(reference_sessions)} past sessions in {time.time() - branch_started:.2f}s"
            )

        # Score current sessions against the reference sessions.
        scoring_started = time.time()
        ranked = calculate_session_similarities_parallel(
            past_sessions=reference_sessions,
            this_year_sessions=current_sessions,
            min_score=min_score,
        )
        print(f"Calculated similarities in {time.time() - scoring_started:.2f}s")

        # Apply maximum recommendations limit
        if max_recommendations and len(ranked) > max_recommendations:
            ranked = ranked[:max_recommendations]

        print(f"Total recommendation time: {time.time() - t0:.2f}s")
        return ranked


# == Session Filtering System ==


class SessionFilter:
    """
    Filter recommended sessions according to configurable business rules.

    Rules are looked up by name in ``rule_implementations`` and applied in the
    order given by ``rules_config["rule_priority"]``. Every rule is a callable
    ``rule(visitor, sessions)`` that returns ``(kept_sessions, descriptions)``.
    """

    def __init__(self, rules_config: Optional[Dict] = None):
        """
        Initialize the session filter with optional rules configuration.

        Args:
            rules_config: Dictionary of rule configurations that can override defaults
        """
        # Default rule set - can be overridden via rules_config
        self.rules_config = {
            "equine_mixed_exclusions": [
                "exotics",
                "feline",
                "exotic animal",
                "farm",
                "small animal",
            ],
            "small_animal_exclusions": [
                "equine",
                "farm animal",
                "farm",
                "large animal",
            ],
            "vet_exclusions": ["nursing"],
            "nurse_streams": ["nursing", "wellbeing", "welfare"],
            "rule_priority": ["practice_type", "role"],  # Order of rule application
        }

        # Override defaults with provided config if any
        if rules_config:
            self.rules_config.update(rules_config)

        # Register rule implementations
        self.rule_implementations = {
            "practice_type": self._apply_practice_type_rules,
            "role": self._apply_role_rules,
        }

        logger.info(f"Session filter initialized with rules: {self.rules_config}")

    def _contains_any(self, text: str, keywords: List[str]) -> bool:
        """Check if text contains any of the keywords (case-insensitive).

        Returns False for empty or non-string ``text``.
        """
        if not text or not isinstance(text, str):
            return False

        text_lower = text.lower()
        return any(keyword.lower() in text_lower for keyword in keywords)

    def _without_streams(
        self, sessions: List[Dict[str, Any]], keywords: List[str]
    ) -> List[Dict[str, Any]]:
        """Keep sessions whose stream mentions none of ``keywords``; sessions without a stream are kept."""
        return [
            session
            for session in sessions
            if not self._contains_any(session.get("stream"), keywords)
        ]

    def _only_streams(
        self, sessions: List[Dict[str, Any]], keywords: List[str]
    ) -> List[Dict[str, Any]]:
        """Keep only sessions whose stream mentions at least one of ``keywords``."""
        return [
            session
            for session in sessions
            if self._contains_any(session.get("stream"), keywords)
        ]

    def _apply_practice_type_rules(
        self, visitor: Dict[str, Any], sessions: List[Dict[str, Any]]
    ) -> (List[Dict[str, Any]], List[str]):
        """
        Apply practice type filtering rules.

        Practices mentioning "equine" or "mixed" lose the streams listed in
        ``equine_mixed_exclusions``; otherwise practices mentioning
        "small animal" lose those in ``small_animal_exclusions``.

        Returns:
            Tuple of (sessions kept, descriptions of the rules applied). The
            input list itself is returned when no rule applies.
        """
        if not visitor or "what_type_does_your_practice_specialise_in" not in visitor:
            return sessions, []  # No filtering if practice type is missing

        practice_type = visitor.get("what_type_does_your_practice_specialise_in", "")
        if not practice_type or practice_type == "NA":
            return sessions, []

        # The equine/mixed check comes first, so it wins when both would match.
        if self._contains_any(practice_type, ["equine", "mixed"]):
            exclusions = self.rules_config["equine_mixed_exclusions"]
            filtered_sessions = self._without_streams(sessions, exclusions)
            logger.info(
                f"Applied equine/mixed rule: filtered from {len(sessions)} to {len(filtered_sessions)} sessions"
            )
            return filtered_sessions, [
                f"practice_type: mixed/equine - excluded {', '.join(exclusions)}"
            ]

        if self._contains_any(practice_type, ["small animal"]):
            exclusions = self.rules_config["small_animal_exclusions"]
            filtered_sessions = self._without_streams(sessions, exclusions)
            logger.info(
                f"Applied small animal rule: filtered from {len(sessions)} to {len(filtered_sessions)} sessions"
            )
            return filtered_sessions, [
                f"practice_type: small animal - excluded {', '.join(exclusions)}"
            ]

        # No specific practice type rule applies
        return sessions, []

    def _apply_role_rules(
        self, visitor: Dict[str, Any], sessions: List[Dict[str, Any]]
    ) -> (List[Dict[str, Any]], List[str]):
        """
        Apply job role filtering rules.

        Roles in ``VET_ROLES`` lose the streams in ``vet_exclusions``; roles in
        ``NURSE_ROLES`` are restricted to the streams in ``nurse_streams``.

        Returns:
            Tuple of (sessions kept, descriptions of the rules applied). The
            input list itself is returned when no rule applies.
        """
        if not visitor or "job_role" not in visitor:
            return sessions, []  # No filtering if job role is missing

        job_role = visitor.get("job_role", "")
        if not job_role or job_role == "NA":
            return sessions, []

        # Rule for VET_ROLES
        if job_role in VET_ROLES:
            exclusions = self.rules_config["vet_exclusions"]
            filtered_sessions = self._without_streams(sessions, exclusions)
            logger.info(
                f"Applied vet role rule: filtered from {len(sessions)} to {len(filtered_sessions)} sessions"
            )
            return filtered_sessions, [f"role: vet - excluded {', '.join(exclusions)}"]

        # Rule for NURSE_ROLES
        if job_role in NURSE_ROLES:
            allowed_streams = self.rules_config["nurse_streams"]
            filtered_sessions = self._only_streams(sessions, allowed_streams)
            logger.info(
                f"Applied nurse role rule: filtered from {len(sessions)} to {len(filtered_sessions)} sessions"
            )
            return filtered_sessions, [
                f"role: nurse - limited to {', '.join(allowed_streams)}"
            ]

        # No specific role rule applies
        return sessions, []

    def filter_sessions(
        self, visitor: Dict[str, Any], sessions: List[Dict[str, Any]]
    ) -> (List[Dict[str, Any]], List[str]):
        """
        Filter sessions based on visitor profile and business rules.

        Args:
            visitor: Dictionary containing visitor profile
            sessions: List of session dictionaries to filter (never modified)

        Returns:
            Tuple of (filtered_sessions, rules_applied) where filtered_sessions
            is a new list sorted by similarity (highest first) and
            rules_applied is a list of rule descriptions
        """
        if not sessions:
            return [], []

        filtered_sessions = sessions
        all_rules_applied = []

        # Apply rules in priority order
        for rule_type in self.rules_config["rule_priority"]:
            rule_func = self.rule_implementations.get(rule_type)
            if rule_func is None:
                logger.warning(f"Unknown rule type: {rule_type}")
                continue
            filtered_sessions, rules_applied = rule_func(visitor, filtered_sessions)
            all_rules_applied.extend(rules_applied)

        # sorted() builds a new list. When no rule filtered anything,
        # ``filtered_sessions`` is still the caller's own list, and sorting it
        # in place would silently reorder the caller's data.
        # A missing or None similarity is ranked as 0.
        ranked_sessions = sorted(
            filtered_sessions,
            key=lambda x: float(x.get("similarity") or 0),
            reverse=True,
        )

        return ranked_sessions, all_rules_applied

    def update_rules(self, new_rules: Dict[str, Any]) -> None:
        """
        Update rule configurations at runtime.

        Args:
            new_rules: Dictionary of rule configurations to update
        """
        self.rules_config.update(new_rules)
        logger.info(f"Updated rules configuration: {new_rules}")

    def add_rule_implementation(self, rule_name: str, rule_func: Callable) -> None:
        """
        Add a new rule implementation or override an existing one.

        Args:
            rule_name: Name of the rule
            rule_func: Function implementing the rule; it is called as
                ``rule_func(visitor, sessions)`` and must return a tuple
                ``(filtered_sessions, rules_applied)``
        """
        self.rule_implementations[rule_name] = rule_func
        logger.info(f"Added rule implementation: {rule_name}")

        # Ensure rule is in priority list. A new list is built rather than
        # appending in place, because the current list may be the very object
        # a caller passed in through rules_config / update_rules.
        priority = self.rules_config["rule_priority"]
        if rule_name not in priority:
            self.rules_config["rule_priority"] = [*priority, rule_name]


# == LangChain Filtering Approach ==


def filter_with_langchain(visitor, recommendations, rules):
    """
    Filter recommendations using LangChain and LLM.

    Args:
        visitor: Visitor profile dictionary
        recommendations: List of recommended sessions
        rules: Business rules text

    Returns:
        Tuple ``(sessions, notes)``. ``sessions`` is the LLM-filtered list on
        success; on any failure (LangChain unavailable, request error,
        unparsable or malformed reply) it is the unfiltered ``recommendations``.
        ``notes`` is a list of strings describing what happened.
    """
    if not has_langchain:
        logger.warning("LangChain not available. Cannot filter using LLM.")
        return recommendations, ["LangChain filtering not available"]

    # Nothing to filter: skip the LLM round-trip, which could only cost time
    # or return invented sessions.
    if not recommendations:
        return recommendations, ["No recommendations to filter"]

    try:
        # Load .env file
        config = dotenv_values(".env")

        # Initialize Azure OpenAI client
        llm = AzureChatOpenAI(
            azure_endpoint=config["AZURE_ENDPOINT"],
            azure_deployment=config["AZURE_DEPLOYMENT"],
            api_key=config["AZURE_API_KEY"],
            api_version=config["AZURE_API_VERSION"],
            temperature=0.5,
            top_p=0.9,
        )

        # Get keys from visitor and recommendations
        list_keys_vis = list(visitor.keys())
        list_keys = list(recommendations[0].keys())

        # Generate the prompt
        system_prompt = f"""
        you are an assistant specialized in filter sessions of an Event based in bussiness rules and profiles of users.\n
        - you will receive a profile of a visitor with the following keys: {list_keys_vis}\n
          the attribute what_type_does_your_practice_specialise_in can be a list of specializations separated by ";"
        - you will receive a list of session with the following keys: {list_keys}\n
          stream in session can be a list of tpocis separated by ";". When you evaluate the rule you need to consider all of them \n
        - each session you return must have the same format.
        - different Job_Roles Groups:\n
        VET_ROLES = [
        "Vet/Vet Surgeon",
        "Assistant Vet",
        "Vet/Owner",
        "Clinical or other Director",
        "Locum Vet", 
        "Academic",
        ]\n
        
        NURSE_ROLES = ["Head Nurse/Senior Nurse", "Vet Nurse", "Locum RVN"]\n
        
        BUSINESS = ["Practice Manager", "Practice Partner/Owner"]\n
        # Other roles can attend any session
        OTHER_ROLES = ["Student", "Receptionist", "Other (please specify)"]\n
        - only return the sessions in json format
        """

        prompt = PromptTemplate(
            input_variables=["sessions", "rules", "profile"],
            template=system_prompt
            + """For the Visitor with profile {profile}\n, based on the attributes of these session: {sessions} and implementing the following rules {rules}.\n Filter them and just return from the list those that meet the requirements in the rules""",
        )

        # Create chain
        chain = prompt | llm

        # Convert recommendations to text
        text_rec = json.dumps(recommendations)

        # Invoke the chain
        logger.info("Invoking LangChain for session filtering")
        ai_msg = chain.invoke(
            {"sessions": text_rec, "profile": visitor, "rules": rules}
        )

        # Parse the response to extract the filtered recommendations
        response_text = ai_msg.content

        # Try to extract JSON from the response
        import re

        # Accept a fenced block with or without the "json" tag and with any
        # whitespace around the payload.
        json_match = re.search(r"```(?:json)?\s*(.*?)\s*```", response_text, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)
        else:
            # Try to find JSON without markdown
            json_match = re.search(r"\[\s*{.*}\s*\]", response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
            else:
                json_str = response_text

        try:
            filtered_recommendations = json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing LangChain response: {e}")
            return recommendations, ["Error in LLM filtering"]

        # The reply is untrusted text: valid JSON is not necessarily a list of
        # session objects (e.g. the model may wrap it as {"sessions": [...]}).
        # Anything else is treated as a failed filter rather than passed on.
        if not isinstance(filtered_recommendations, list) or not all(
            isinstance(item, dict) for item in filtered_recommendations
        ):
            logger.error("Error parsing LangChain response: expected a JSON list of sessions")
            return recommendations, [
                "Error in LLM filtering: response was not a JSON list of sessions"
            ]

        logger.info(
            f"Successfully filtered recommendations using LangChain: {len(filtered_recommendations)} sessions"
        )
        return filtered_recommendations, ["Filtered using LLM-based rules"]

    except Exception as e:
        logger.error(f"Error using LangChain for filtering: {e}")
        return recommendations, ["Error in LLM filtering: " + str(e)]


# == Combined Recommendation & Filtering System ==


def get_recommendations_and_filter(
    badge_id: str,
    min_score: float = 0.5,
    max_recommendations: int = 30,
    rules_config: Optional[Dict] = None,
    visitor_data: Optional[pd.DataFrame] = None,
    use_neo4j: bool = True,
    use_langchain: bool = False,
    business_rules: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Get session recommendations and filter them based on business rules.

    The visitor profile is looked up in ``visitor_data`` first and then, when
    ``use_neo4j`` is true, in Neo4j. If neither yields a profile, a mock
    "Vet/Vet Surgeon" profile is used so the pipeline can still run.

    LangChain filtering is applied only when ``use_langchain`` is true, the
    optional LangChain imports succeeded (``has_langchain``) and
    ``business_rules`` is non-empty. Otherwise rule-based filtering through
    ``SessionFilter`` is used and the reason is recorded in the processing
    steps.

    Args:
        badge_id: Visitor's badge ID
        min_score: Minimum similarity score
        max_recommendations: Maximum number of recommendations to return
        rules_config: Optional rules configuration passed to ``SessionFilter``
        visitor_data: Optional DataFrame containing visitor data
            (looked up by its "BadgeId" column)
        use_neo4j: Whether to use Neo4j for recommendations
        use_langchain: Whether to filter using LangChain
        business_rules: Optional business rules text for LangChain

    Returns:
        Dictionary with the keys "visitor", "raw_recommendations",
        "filtered_recommendations" and "metadata". The metadata always holds
        "badge_id", "num_raw_recommendations", "num_filtered_recommendations",
        "processing_time", "timestamp" and "processing_steps". When an error
        occurs while recommending or filtering, both recommendation lists are
        empty and the metadata additionally holds "error".
    """
    start_time = time.time()
    processing_steps = []

    def _build_metadata(num_raw: int, num_filtered: int) -> Dict[str, Any]:
        # Same keys on the success and error paths, so callers can read the
        # metadata without first checking which path produced it.
        return {
            "badge_id": badge_id,
            "num_raw_recommendations": num_raw,
            "num_filtered_recommendations": num_filtered,
            "processing_time": time.time() - start_time,
            "timestamp": datetime.now().isoformat(),
            "processing_steps": processing_steps,
        }

    # Get visitor profile
    visitor = None
    if visitor_data is not None:
        if "BadgeId" in visitor_data.columns:
            visitor_rows = visitor_data[visitor_data["BadgeId"] == badge_id]
            if not visitor_rows.empty:
                visitor = visitor_rows.iloc[0].to_dict()
                processing_steps.append(f"Found visitor {badge_id} in provided data")
        else:
            # Without this guard the lookup raises KeyError before any
            # fallback (Neo4j, mock profile) gets a chance to run.
            logger.warning("Provided visitor data has no 'BadgeId' column")
            processing_steps.append("Provided visitor data has no BadgeId column")

    if visitor is None and use_neo4j:
        logger.warning(f"Visitor with BadgeId {badge_id} not found in provided data")
        # Try to get basic visitor info from Neo4j
        try:
            with GraphDatabase.driver(uri, auth=(username, password)) as driver:
                with driver.session() as session:
                    record = session.run(
                        "MATCH (v:Visitor_this_year {BadgeId: $badge_id}) RETURN v",
                        badge_id=badge_id,
                    ).single()

                    if record:
                        visitor = dict(record["v"])
                        processing_steps.append(f"Found visitor {badge_id} in Neo4j")
                    else:
                        processing_steps.append(
                            f"Visitor {badge_id} not found in Neo4j"
                        )
        except Exception as e:
            # Best effort: a Neo4j failure must not prevent the mock fallback.
            logger.error(f"Error connecting to Neo4j: {e}")
            processing_steps.append(f"Neo4j connection error: {e}")

    # If visitor still not found, create a mock visitor for testing
    if visitor is None:
        visitor = {
            "BadgeId": badge_id,
            "job_role": "Vet/Vet Surgeon",
            "what_type_does_your_practice_specialise_in": "Mixed",
            "Email": "test@example.com",
            "Email_domain": "example.com",
            "Company": "Test Company",
            "JobTitle": "veterinary surgeon",
            "Country": "UK",
            "BadgeType": "Delegate",
            "ShowRef": "BVA2025",
            "Source": "Test",
            "Days_since_registration": 100,
            "assist_year_before": 1,
            "organisation_type": "Independent",
        }
        processing_steps.append("Created mock visitor profile for testing")

    # Get recommendations
    try:
        # Get raw recommendations
        recommendations = recommend_sessions_optimized(
            badge_id=badge_id,
            assist_year_before=str(visitor.get("assist_year_before", "1")),
            min_score=min_score,
            max_recommendations=max_recommendations,
            use_neo4j=use_neo4j,
        )

        processing_steps.append(f"Retrieved {len(recommendations)} raw recommendations")

        # Choose filtering approach
        langchain_usable = bool(use_langchain and has_langchain and business_rules)
        if use_langchain and not langchain_usable:
            # Make the silent fallback visible to the caller.
            reason = (
                "LangChain is not available"
                if not has_langchain
                else "no business_rules were provided"
            )
            logger.warning(
                f"LangChain filtering requested but {reason}; "
                "falling back to rule-based filtering"
            )
            processing_steps.append(
                f"LangChain filtering requested but {reason}; "
                "falling back to rule-based filtering"
            )

        if langchain_usable:
            # Use LangChain filtering
            processing_steps.append("Using LangChain for filtering")
            filtered_recommendations, filter_notes = filter_with_langchain(
                visitor=visitor, recommendations=recommendations, rules=business_rules
            )
        else:
            # Use rule-based filtering
            processing_steps.append("Using rule-based filtering")
            session_filter = SessionFilter(rules_config)
            filtered_recommendations, filter_notes = session_filter.filter_sessions(
                visitor, recommendations
            )
        processing_steps.extend(filter_notes)

        processing_steps.append(
            f"Filtered to {len(filtered_recommendations)} recommendations"
        )

        return {
            "visitor": visitor,
            "raw_recommendations": recommendations,
            "filtered_recommendations": filtered_recommendations,
            "metadata": _build_metadata(
                len(recommendations), len(filtered_recommendations)
            ),
        }

    except Exception as e:
        logger.error(f"Error getting recommendations: {str(e)}")
        return {
            "visitor": visitor,
            "raw_recommendations": [],
            "filtered_recommendations": [],
            "metadata": {"error": str(e), **_build_metadata(0, 0)},
        }


def save_recommendations_to_json(result: Dict[str, Any], output_path: str) -> bool:
    """
    Save recommendations to a JSON file.

    Args:
        result: Result dictionary from get_recommendations_and_filter
        output_path: Path to save the JSON file. May be a bare filename, in
            which case the file is written to the current working directory.

    Returns:
        True if the file was written, False if any error occurred (the error
        is logged rather than raised).
    """
    try:
        # Ensure the directory exists. A bare filename has an empty dirname
        # and os.makedirs("") raises, so only create a directory when the
        # path actually names one.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # default=str stringifies values the json module cannot encode natively
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, default=str)
        logger.info(f"Saved recommendations to {output_path}")
        return True
    except Exception as e:
        logger.error(f"Error saving recommendations to {output_path}: {str(e)}")
        return False


# Standard rules for filtering
default_rules = """
1.) if visitor what_type_does_your_practice_specialise_in contains "equine" or "mixed", you can't propose session on stream "exotics", "feline", "exotic animal", "farm", "small animal"\n
2.) if visitor what_type_does_your_practice_specialise_in contains "small animal", you can't propose session on stream "equine", "farm animal", "farm", "large animal"\n
3.) if job_role in VET_ROLES session.stream cant be "nursing"\n
4.) if job_role in NURSE_ROLES you only recommend sessions in stream "nursing", "wellbeing", "welfare"\n
5.) rule 1 and 2 are mutually exclusive and apply first then apply 3 and 4
"""

In [None]:
# Announce what this notebook supports (one line per capability)
print(
    "Session Recommendation and Filtering Notebook Ready!",
    "This notebook supports both Neo4j-based and mock recommendations",
    "It also supports both rule-based and LangChain-based filtering",
    sep="\n",
)

# Detect whether an IPython/Jupyter shell is active. `in_notebook` stays False
# unless IPython is importable, a shell is running, and the rich display
# helpers could be imported.
in_notebook = False
try:
    from IPython import get_ipython

    if get_ipython() is not None:
        # Rich display helpers are only needed inside a live notebook
        from IPython.display import display, HTML, JSON

        in_notebook = True
        print("Running in Jupyter notebook environment")
except ImportError:
    in_notebook = False

In [None]:
# Read this year's visitor data; on any failure report it and fall back to None
try:
    data_this = pd.read_csv("data/bva/output/df_reg_demo_this.csv")
    print(f"Loaded visitor data with {len(data_this)} visitors")

    # Preview the first two rows: rich display in a notebook, plain text otherwise
    (display if in_notebook else print)(data_this.head(2))
except Exception as load_error:
    print(f"Could not load visitor data: {load_error}")
    data_this = None

In [None]:
# Run the full recommend-and-filter pipeline for one visitor with the default rules.
result3 = get_recommendations_and_filter(
    badge_id="EL7CTM9",  # Replace with an actual badge ID
    min_score=0.3,
    max_recommendations=10,
    rules_config=None,  # Use default rules
    visitor_data=data_this,
    use_neo4j=True,  # Use real Neo4j data
    use_langchain=True,  # LangChain requested, but no business_rules are passed, so rule-based filtering is used
)

In [None]:
# result3

In [None]:
# Inspect the top-level keys of the result dictionary
result3.keys()

# Custom Rules

```
# Create the session filter with default rules
session_filter = SessionFilter()

# Define a new rule function for excluding Exhibitor Showcase sessions
def exclude_exhibitor_showcase(visitor, sessions):
    """Custom rule to exclude sessions in the Exhibitor Showcase theatre."""
    # Filter out sessions in the Exhibitor Showcase
    filtered_sessions = [
        session for session in sessions
        if not session.get("theatre__name") or "Exhibitor Showcase" not in session["theatre__name"]
    ]
    
    # Create a rule description for logging
    rules_applied = [f"theatre: excluded Exhibitor Showcase sessions"]
    
    # Log what happened
    logger.info(f"Applied Exhibitor Showcase exclusion: filtered from {len(sessions)} to {len(filtered_sessions)} sessions")
    
    return filtered_sessions, rules_applied

# Register the new rule
session_filter.add_rule_implementation("exclude_showcase", exclude_exhibitor_showcase)

# Update priority to include the new rule
session_filter.update_rules({
    "rule_priority": ["practice_type", "role", "exclude_showcase"]
})

# Now use the session filter with the new rule
filtered_recommendations, filter_notes = session_filter.filter_sessions(visitor, recommendations)

# --- OR: alternatively, express the exclusion through the rules configuration ---

# Define custom rules that include Exhibitor Showcase exclusion
custom_rules = {
    "equine_mixed_exclusions": ["exotics", "feline", "exotic animal", "farm", "small animal"],
    "small_animal_exclusions": ["equine", "farm animal", "farm", "large animal"],
    "vet_exclusions": ["nursing"],
    "nurse_streams": ["nursing", "wellbeing", "welfare"],
    "excluded_theatres": ["Exhibitor Showcase"]  # New rule for excluded theatres
}

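# Then update `_apply_practice_type` so that it applies the new "excluded_theatres" entry
# (TODO: confirm whether SessionFilter already handles this key)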

```

In [None]:
# Custom rule configuration: the stream rules plus an extra list of theatres to exclude
custom_rules = dict(
    equine_mixed_exclusions=["exotics", "feline", "exotic animal", "farm", "small animal"],
    small_animal_exclusions=["equine", "farm animal", "farm", "large animal"],
    vet_exclusions=["nursing"],
    nurse_streams=["nursing", "wellbeing", "welfare"],
    # New rule for excluded theatres
    excluded_theatres=["Exhibitor Showcase 1"],
)

In [None]:
# Same visitor and settings as result3, but filtered with the custom rules configuration.
result4 = get_recommendations_and_filter(
    badge_id="EL7CTM9",  # Replace with an actual badge ID
    min_score=0.3,
    max_recommendations=10,
    rules_config=custom_rules,  # Use the custom rules defined above
    visitor_data=data_this,
    use_neo4j=True,  # Use real Neo4j data
    use_langchain=True,  # LangChain requested, but no business_rules are passed, so rule-based filtering is used
)

In [None]:
# Compare the number of filtered sessions: custom rules (result4) vs. default rules (result3)
len(result4["filtered_recommendations"]), len(result3["filtered_recommendations"])

In [None]:
# Load settings from the local .env file. `dotenv_values` is imported only
# together with the optional LangChain stack, so guard on `has_langchain` to
# avoid a NameError in rule-based-only environments.
config = dotenv_values(".env") if has_langchain else {}
# Re-read this year's visitor data without the earlier try/except fallback,
# so a missing file raises here instead of leaving `data_this` as None.
data_this = pd.read_csv("data/bva/output/df_reg_demo_this.csv")

In [None]:
# Visitors this year: distinct badge IDs found in the registration data
visitors_this = list(data_this["BadgeId"].unique())

In [None]:
# Run the recommend-and-filter pipeline for every visitor registered this year.
# Each list entry is a single-key dict mapping the badge ID to its result dictionary.
visitors_this_recs = []
for vis in visitors_this:
    result = get_recommendations_and_filter(
        badge_id=vis,  # Badge ID of the visitor being processed
        min_score=0.3,
        max_recommendations=10,
        rules_config=None,  # Use default rules
        visitor_data=data_this,
        use_neo4j=True,  # Use real Neo4j data
        use_langchain=True,  # LangChain requested, but no business_rules are passed, so rule-based filtering is used
    )
    visitors_this_recs.append({vis: result})