In [None]:
from neo4j import GraphDatabase
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import concurrent.futures
import time

# Neo4j connection parameters
# Bolt URI, user name and password later handed to GraphDatabase.driver().
uri = "bolt://127.0.0.1:7687"
username = "neo4j"
password = ""  # Replace with your password

# Initialize Neo4j driver
# NOTE: intentionally left commented out in this cell. The recommendation
# functions below read a module-level `driver`, so one has to be created
# before any of them is called.
# driver = GraphDatabase.driver(uri, auth=(username, password))

# Load the same embedding model for consistency
# NOTE(review): "same" presumably means the model that produced the session
# embeddings stored in the graph — confirm against the ingestion code.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Define job role categories (same as original code)
# Job-role labels grouped by profession. The same groups are written out
# again, as plain text, inside the prompt returned by generate_prompt().
VET_ROLES = [
    "Vet/Vet Surgeon",
    "Assistant Vet",
    "Vet/Owner",
    "Clinical or other Director",
    "Locum Vet",
    "Academic",
]

NURSE_ROLES = ["Head Nurse/Senior Nurse", "Vet Nurse", "Locum RVN"]

BUSINESS = ["Practice Manager", "Practice Partner/Owner"]
OTHER_ROLES = ["Student", "Receptionist", "Other (please specify)"]

# Cache for this year's sessions and embeddings
# None until get_this_year_sessions() has run once; afterwards the dict it built.
_this_year_sessions_cache = None
# BadgeId -> {"visitor": ..., "assisted": ...}, filled by get_visitor_info().
_visitor_cache = {}
# Results of find_similar_visitors_batch(), kept so repeated lookups can
# skip the graph query and the text encoding.
_similar_visitors_cache = {}


def clear_caches():
    """Reset every module-level cache to its initial, empty state.

    Fresh objects are bound to the globals; the previous cache objects are
    not emptied in place.
    """
    global _this_year_sessions_cache, _visitor_cache, _similar_visitors_cache
    _visitor_cache, _similar_visitors_cache = {}, {}
    _this_year_sessions_cache = None


def get_this_year_sessions(tx):
    """
    Return this year's sessions keyed by session_id, loading them at most once.

    The first call runs the Cypher query through ``tx``, decodes every JSON
    embedding into a NumPy array and stores the resulting dict in the
    module-level cache. Later calls return that same cached dict without
    touching ``tx``.

    Args:
        tx: Object exposing ``run(query)`` whose result offers ``data()``.

    Returns:
        Dict mapping session_id to a dict of that session's fields, with
        ``embedding`` held as a NumPy array. Rows with an empty embedding
        are left out.
    """
    global _this_year_sessions_cache

    cached = _this_year_sessions_cache
    if cached is not None:
        return cached

    # Query to get this year's sessions with embeddings
    query = """
    MATCH (s:Sessions_this_year)
    WHERE s.embedding IS NOT NULL
    RETURN s.session_id as session_id, 
           s.title as title, 
           s.stream as stream, 
           s.synopsis_stripped as synopsis_stripped,
           s.theatre__name as theatre__name,
           s.embedding as embedding,
           s.date as date,
           s.start_time as start_time,
           s.end_time as end_time,
           s.sponsored_by as sponsored_by,
           s.sponsored_session as sponsored_session
    """

    # Column names copied into each cached entry, in this order.
    field_names = (
        "session_id",
        "title",
        "stream",
        "synopsis_stripped",
        "theatre__name",
        "embedding",
        "date",
        "start_time",
        "end_time",
        "sponsored_by",
        "sponsored_session",
    )

    loaded = {}
    for row in tx.run(query).data():
        raw_embedding = row["embedding"]
        if not raw_embedding:
            continue
        entry = {name: row[name] for name in field_names}
        # Swap the JSON text for the decoded vector; the key keeps its position.
        entry["embedding"] = np.array(json.loads(raw_embedding))
        loaded[row["session_id"]] = entry

    _this_year_sessions_cache = loaded
    return loaded


def get_visitor_info(tx, visitor_id):
    """
    Look up one this-year visitor by BadgeId, remembering the answer.

    Args:
        tx: Object exposing ``run(query, **params)`` whose result offers
            ``single()``.
        visitor_id: BadgeId to look up.

    Returns:
        ``{"visitor": <node>, "assisted": <flag>}`` where the flag is the
        node's ``assist_year_before`` value ("0" when absent), or None when
        no visitor matches. Only successful lookups are cached.
    """
    global _visitor_cache

    remembered = _visitor_cache.get(visitor_id)
    if remembered is not None:
        return remembered

    visitor_query = """
    MATCH (v:Visitor_this_year {BadgeId: $visitor_id})
    RETURN v
    """
    record = tx.run(visitor_query, visitor_id=visitor_id).single()
    if not record:
        return None

    node = record["v"]
    info = {"visitor": node, "assisted": node.get("assist_year_before", "0")}
    _visitor_cache[visitor_id] = info
    return info


def get_past_sessions(tx, visitor_id):
    """
    Fetch the past-year sessions linked to a this-year visitor.

    Follows ``Same_Visitor`` from the visitor to a last-year visitor node
    (either label) and on to the sessions that node attended.

    Args:
        tx: Object exposing ``run(query, **params)`` whose result offers
            ``data()``.
        visitor_id: BadgeId of the this-year visitor.

    Returns:
        List of ``{"session_id": ..., "embedding": <NumPy array>}``; rows
        whose embedding is empty are dropped.
    """
    # Single query combining both visitor types
    query_past = """
    MATCH (v:Visitor_this_year {BadgeId: $visitor_id})-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
    WHERE (vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva)
    RETURN DISTINCT sp.session_id as session_id, sp.embedding as embedding
    """

    rows = tx.run(query_past, visitor_id=visitor_id).data()

    # Decode the JSON-encoded vectors, keeping only rows that carry one.
    return [
        {
            "session_id": row["session_id"],
            "embedding": np.array(json.loads(row["embedding"])),
        }
        for row in rows
        if row["embedding"]
    ]


def find_similar_visitors_batch(tx, visitor, num_similar_visitors=3):
    """
    Return the BadgeIds of the returning visitors most similar to ``visitor``.

    Candidates are this-year visitors with ``assist_year_before = '1'`` who
    are linked to at least one attended past-year session. A first query
    keeps only candidates sharing at least one of job role, practice type,
    organisation type or country with ``visitor`` (``base_similarity`` 1-4).
    When that yields fewer than ``num_similar_visitors`` rows, a broader
    query adds further candidates with ``base_similarity`` 0; candidates
    already found keep their original score.

    Each candidate is then scored as
    ``0.7 * cosine(profile text embeddings) + 0.3 * base_similarity / 4``.

    Args:
        tx: Object exposing ``run(query, **params)`` whose result offers
            ``data()``.
        visitor: Mapping-like visitor record; must contain ``BadgeId``.
        num_similar_visitors: How many BadgeIds to return at most.

    Returns:
        List of BadgeIds, most similar first. Empty when encoding the
        visitor profile fails (that failure is printed, not raised, and
        nothing is cached).
    """
    global _similar_visitors_cache

    visitor_id = visitor["BadgeId"]

    # The requested size decides both whether the fallback query runs and
    # where the ranking is cut, so it has to be part of the cache key.
    # Keying on the badge alone would hand a 3-item list to a later caller
    # asking for 5.
    cache_key = (visitor_id, num_similar_visitors)
    if cache_key in _similar_visitors_cache:
        return _similar_visitors_cache[cache_key]

    # Get all visitors with sessions in one query
    query = """
    MATCH (v:Visitor_this_year)
    WHERE v.assist_year_before = '1' AND v.BadgeId <> $visitor_id
    // Pre-filter to avoid processing all visitors
    WITH v, 
         CASE WHEN v.job_role = $job_role THEN 1 ELSE 0 END + 
         CASE WHEN v.what_type_does_your_practice_specialise_in = $practice_type THEN 1 ELSE 0 END +
         CASE WHEN v.organisation_type = $org_type THEN 1 ELSE 0 END +
         CASE WHEN v.Country = $country THEN 1 ELSE 0 END AS base_similarity
    // Only process those with some similarity
    WHERE base_similarity > 0
    // Check if they have attended sessions (to save processing visitors without sessions)
    MATCH (v)-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
    WHERE (vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva)
    WITH v, base_similarity, COUNT(DISTINCT sp) AS session_count
    WHERE session_count > 0
    RETURN v, base_similarity
    ORDER BY base_similarity DESC, session_count DESC
    LIMIT 20
    """

    visitors_data = tx.run(
        query,
        visitor_id=visitor_id,
        job_role=visitor.get("job_role", ""),
        practice_type=visitor.get("what_type_does_your_practice_specialise_in", ""),
        org_type=visitor.get("organisation_type", ""),
        country=visitor.get("Country", ""),
    ).data()

    # If we can't find enough similar visitors with the pre-filtering,
    # top the list up from a more general query. The pre-filtered rows are
    # kept (with their rule-based score) rather than thrown away, because
    # they are the candidates that actually share attributes with the visitor.
    if len(visitors_data) < num_similar_visitors:
        fallback_query = """
        MATCH (v:Visitor_this_year)
        WHERE v.assist_year_before = '1' AND v.BadgeId <> $visitor_id
        // Check if they have attended sessions
        MATCH (v)-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
        WHERE (vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva)
        WITH v, COUNT(DISTINCT sp) AS session_count
        WHERE session_count > 0
        RETURN v, 0 AS base_similarity
        ORDER BY session_count DESC
        LIMIT 20
        """
        known_ids = {row["v"]["BadgeId"] for row in visitors_data}
        for row in tx.run(fallback_query, visitor_id=visitor_id).data():
            candidate_id = row["v"]["BadgeId"]
            if candidate_id not in known_ids:
                known_ids.add(candidate_id)
                visitors_data.append(row)

    # Extract visitor features for comparison
    def get_visitor_features(v):
        """Join the non-empty, non-"NA" profile attributes into one string."""
        attributes = [
            v.get("what_type_does_your_practice_specialise_in", ""),
            v.get("job_role", ""),
            v.get("organisation_type", ""),
            v.get("JobTitle", ""),
            v.get("Country", ""),
        ]
        return " ".join(
            str(attr)
            for attr in attributes
            if attr and str(attr).strip() and str(attr) != "NA"
        )

    # Get embedding for our visitor
    visitor_text = get_visitor_features(visitor)
    if not visitor_text.strip():
        visitor_text = "default visitor profile"

    try:
        visitor_embedding = model.encode(visitor_text)

        # Score every candidate; one that cannot be compared is reported and skipped.
        similarities = []
        for vdata in visitors_data:
            v_compare = vdata["v"]
            base_similarity = vdata["base_similarity"]

            compare_text = get_visitor_features(v_compare)
            if not compare_text.strip():
                continue

            try:
                compare_embedding = model.encode(compare_text)
                sim = cosine_similarity([visitor_embedding], [compare_embedding])[0][0]
                # Combine neural and rule-based similarity
                combined_sim = (sim * 0.7) + (
                    base_similarity * 0.3 / 4
                )  # Max base_similarity is 4
                similarities.append((v_compare["BadgeId"], combined_sim))
            except Exception as e:
                print(f"Error comparing with visitor {v_compare['BadgeId']}: {e}")
                continue

        # Sort by similarity (highest first) and get top N
        similarities.sort(key=lambda x: -x[1])
        similar_visitors = [sid for sid, _ in similarities[:num_similar_visitors]]

        # Cache for future use
        _similar_visitors_cache[cache_key] = similar_visitors
        return similar_visitors

    except Exception as e:
        print(f"Error encoding visitor profile: {e}")
        return []


def get_similar_visitor_sessions_batch(tx, similar_visitor_badge_ids):
    """
    Fetch, in one query, the past-year sessions attended by a set of visitors.

    Args:
        tx: Object exposing ``run(query, **params)`` whose result offers
            ``data()``.
        similar_visitor_badge_ids: BadgeIds of the this-year visitors whose
            linked last-year attendance should be collected.

    Returns:
        List of ``{"session_id": ..., "embedding": <NumPy array>}``; rows
        without an embedding are dropped. Empty input returns ``[]`` without
        running a query.
    """
    if not similar_visitor_badge_ids:
        return []

    # Single query to get all sessions at once
    query = """
    MATCH (v:Visitor_this_year)-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
    WHERE v.BadgeId IN $similar_visitor_ids AND 
          (vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva)
    RETURN DISTINCT sp.session_id AS session_id, sp.embedding AS embedding
    """

    rows = tx.run(query, similar_visitor_ids=similar_visitor_badge_ids).data()

    # Decode the JSON-encoded vectors, keeping only rows that carry one.
    return [
        {
            "session_id": row["session_id"],
            "embedding": np.array(json.loads(row["embedding"])),
        }
        for row in rows
        if row["embedding"]
    ]


def calculate_session_similarities_parallel(
    past_sessions, this_year_sessions, min_score=0.0
):
    """
    Score this year's sessions against a set of past sessions.

    For every session of this year the score is the highest cosine
    similarity between its embedding and any past session's embedding.
    All pairs are computed with one matrix product per embedding length
    instead of one library call per pair, so the result order is
    deterministic: descending score, ties in ``this_year_sessions`` order.

    Args:
        past_sessions: List of dicts with an ``embedding`` entry.
        this_year_sessions: Dict of session_id -> session dict with
            ``embedding``, ``title``, ``stream``, ``theatre__name``, ``date``,
            ``start_time``, ``end_time`` and optionally ``sponsored_by`` /
            ``sponsored_session``.
        min_score: Sessions whose best score is below this are dropped.

    Returns:
        List of recommendation dicts (one per retained session) carrying the
        session fields plus ``similarity`` as a plain Python float.
    """
    if not past_sessions or not this_year_sessions:
        return []

    def as_vector(embedding):
        """Return the embedding as a non-empty 1-D float array, else None."""
        try:
            vector = np.asarray(embedding, dtype=float)
        except (TypeError, ValueError):
            return None
        return vector if vector.ndim == 1 and vector.size else None

    def unit_rows(vectors):
        """Stack equal-length vectors and scale each row to unit length."""
        matrix = np.vstack(vectors)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # An all-zero row stays all-zero, giving similarity 0 rather than NaN.
        norms[norms == 0.0] = 1.0
        return matrix / norms

    # Bucket vectors by length: only equal-length embeddings are comparable,
    # and a stray odd-sized one must not break the rest.
    past_by_dim = {}
    for past_sess in past_sessions:
        vector = as_vector(past_sess["embedding"])
        if vector is None:
            print(
                f"Skipping past session {past_sess.get('session_id')}: unusable embedding"
            )
            continue
        past_by_dim.setdefault(vector.size, []).append(vector)

    current_by_dim = {}
    for sid, current_sess in this_year_sessions.items():
        vector = as_vector(current_sess["embedding"])
        if vector is None:
            print(f"Error calculating similarity for session {sid}: unusable embedding")
            continue
        current_by_dim.setdefault(vector.size, []).append((sid, current_sess, vector))

    recommendations = []
    for dim, current_items in current_by_dim.items():
        past_vectors = past_by_dim.get(dim)
        if not past_vectors:
            continue  # no past embedding of this length to compare against

        # rows: past sessions, columns: this year's sessions
        scores = unit_rows(past_vectors) @ unit_rows(
            [vector for _, _, vector in current_items]
        ).T
        best_scores = scores.max(axis=0)

        for (sid, current_sess, _), best in zip(current_items, best_scores):
            if not best >= min_score:
                continue
            try:
                recommendations.append(
                    {
                        "session_id": sid,
                        "title": current_sess["title"],
                        "stream": current_sess["stream"],
                        "theatre__name": current_sess["theatre__name"],
                        "date": current_sess["date"],
                        "start_time": current_sess["start_time"],
                        "end_time": current_sess["end_time"],
                        "sponsored_by": current_sess.get("sponsored_by", ""),
                        "sponsored_session": current_sess.get(
                            "sponsored_session", ""
                        ),
                        # Plain float so the result stays JSON-serialisable.
                        "similarity": float(best),
                    }
                )
            except KeyError as e:
                print(f"Error calculating similarity for session {sid}: {e}")

    # Stable sort: highest similarity first, ties keep insertion order.
    recommendations.sort(key=lambda rec: -rec["similarity"])

    return recommendations


def recommend_sessions_optimized(
    badge_id,
    assist_year_before="1",
    min_score=0.0,
    max_recommendations=None,
    num_similar_visitors=3,
):
    """
    Produce session recommendations for one visitor, printing timings on the way.

    A visitor whose stored record carries ``assisted == "1"`` is matched
    against the sessions linked to them from last year; any other visitor is
    matched against the past sessions of the most similar returning visitors.
    Uses the module-level ``driver`` to open a database session.

    Args:
        badge_id: Visitor's badge ID
        assist_year_before: Accepted but not read by this function; the
            branch is chosen from the flag stored on the visitor record.
        min_score: Minimum similarity score for recommendations
        max_recommendations: Maximum number of recommendations to return;
            a falsy value means no limit.
        num_similar_visitors: Number of similar visitors to consider for case 2

    Returns:
        List of recommended sessions with details; empty when the visitor
        is not found.
    """
    began = time.time()

    with driver.session() as db:
        # Get visitor information
        info = db.execute_read(get_visitor_info, visitor_id=badge_id)
        if not info:
            print(f"Visitor with BadgeId {badge_id} not found.")
            return []

        # Get all this year's sessions in one go (uses caching)
        catalogue = db.execute_read(get_this_year_sessions)
        print(
            f"Loaded {len(catalogue)} sessions for this year in {time.time() - began:.2f}s"
        )

        history = []

        if info["assisted"] != "1":
            # Case 2: New visitor - borrow the history of similar visitors
            print(
                f"Case 2: Finding {num_similar_visitors} similar visitors for {badge_id}"
            )
            branch_began = time.time()

            neighbours = db.execute_read(
                find_similar_visitors_batch,
                visitor=info["visitor"],
                num_similar_visitors=num_similar_visitors,
            )

            print(
                f"Found {len(neighbours)} similar visitors in {time.time() - branch_began:.2f}s"
            )
            fetch_began = time.time()

            if neighbours:
                history = db.execute_read(
                    get_similar_visitor_sessions_batch,
                    similar_visitor_badge_ids=neighbours,
                )

                print(
                    f"Found {len(history)} sessions from similar visitors in {time.time() - fetch_began:.2f}s"
                )
        else:
            # Case 1: Visitor attended last year - use their own history
            print(f"Case 1: Visitor {badge_id} attended last year")
            branch_began = time.time()

            history = db.execute_read(get_past_sessions, visitor_id=badge_id)
            print(
                f"Found {len(history)} past sessions in {time.time() - branch_began:.2f}s"
            )

        # Score this year's sessions against the collected history
        scoring_began = time.time()
        ranked = calculate_session_similarities_parallel(
            past_sessions=history,
            this_year_sessions=catalogue,
            min_score=min_score,
        )
        print(f"Calculated similarities in {time.time() - scoring_began:.2f}s")

        # Apply maximum recommendations limit
        over_limit = max_recommendations and len(ranked) > max_recommendations
        if over_limit:
            ranked = ranked[:max_recommendations]

        print(f"Total recommendation time: {time.time() - began:.2f}s")
        return ranked


def recommend_sessions_for_multiple_visitors(
    badge_ids,
    assist_year_before="1",
    min_score=0.0,
    max_recommendations=None,
    num_similar_visitors=3,
):
    """
    Run recommend_sessions_optimized for each badge ID, one after another.

    This year's sessions are loaded once up front, so the per-visitor calls
    find them already cached.

    Args:
        badge_ids: Iterable of visitor badge IDs
        Other parameters are forwarded unchanged to
        recommend_sessions_optimized.

    Returns:
        Dict mapping each badge ID to its list of recommendations
    """
    # Preload this year's sessions into cache
    with driver.session() as warmup:
        warmup.execute_read(get_this_year_sessions)

    forwarded = {
        "assist_year_before": assist_year_before,
        "min_score": min_score,
        "max_recommendations": max_recommendations,
        "num_similar_visitors": num_similar_visitors,
    }

    return {
        badge_id: recommend_sessions_optimized(badge_id=badge_id, **forwarded)
        for badge_id in badge_ids
    }

# Step 9 Implement Business Rules

In [None]:
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
import json
import os

from datetime import datetime
import random

from langchain_openai import ChatOpenAI, AzureChatOpenAI, AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from dotenv import dotenv_values

In [None]:
# Settings are read from a local .env file; it must provide AZURE_ENDPOINT,
# AZURE_DEPLOYMENT, AZURE_API_KEY and AZURE_API_VERSION (indexed below).
config = dotenv_values(".env")
# This year's registration/demographic table; later filtered by BadgeId to
# build the visitor profile sent to the model.
data_this = pd.read_csv("data/bva/output/df_reg_demo_this.csv")
# Azure-hosted chat model used for the business-rule filtering step.
llm = AzureChatOpenAI(
    azure_endpoint=config["AZURE_ENDPOINT"],
    azure_deployment=config["AZURE_DEPLOYMENT"],
    api_key=config["AZURE_API_KEY"],
    api_version=config["AZURE_API_VERSION"],
    temperature=0.5,
    top_p=0.9,
)

In [None]:
# Create the module-level driver that the recommendation functions read.
driver = GraphDatabase.driver(uri, auth=(username, password))
# Example parameters
badge_id = "EL7CTM9"  # Replace with actual badge ID
# Passed through below, but recommend_sessions_optimized does not read it;
# the flag stored on the visitor record decides the branch.
assist_year_before = "1"  # "1" for returning visitor, "0" for new visitor
min_score = 0.3  # Minimum similarity score (0.0-1.0)
max_recommendations = 30  # Maximum number of recommendations
num_similar_visitors = 3  # Number of similar visitors to consider

try:
    # Get recommendations
    recommendations = recommend_sessions_optimized(
        badge_id=badge_id,
        assist_year_before=assist_year_before,
        min_score=min_score,
        max_recommendations=max_recommendations,
        num_similar_visitors=num_similar_visitors,
    )


finally:
    # Close the driver whether or not the call above succeeded
    driver.close()

In [None]:
# Field names of a recommended session, taken from the first recommendation.
# Raises IndexError when `recommendations` is empty.
list_keys = list(recommendations[0].keys())

In [None]:
# Serialise the recommended sessions to a JSON string for the prompt.
text_rec = json.dumps(recommendations)

In [None]:
# Attributes of the visitor with this badge_id, as a dict taken from the
# first matching row of the registration table.
# Raises IndexError when no row has that BadgeId.
vis = data_this[data_this["BadgeId"] == badge_id].to_dict(orient="records")[0]

In [None]:
# Display the selected visitor's profile.
vis

In [None]:
# Display the first recommendation in the list.
recommendations[0]

In [None]:
# Field names of the visitor profile, used to describe it in the prompt.
list_keys_vis = list(vis.keys())

In [None]:
def generate_prompt(keys, keys_visitor):
    """Build the system prompt for filtering sessions by business rules.

    The returned text describes the task, names the fields of the visitor
    profile and of each session, and spells out the job-role groups. The
    role groups are written here as literal text; they are not read from the
    module-level role lists.

    Args:
        keys: Field names of a session; interpolated into the prompt as-is.
        keys_visitor: Field names of the visitor profile; interpolated into
            the prompt as-is.

    Returns:
        The system prompt as a single string.
    """
    system_prompt = f"""
    you are an assistant specialized in filter sessions of an Event based in bussiness rules and profiles of users.\n
    - you will receive a profile of a visitor with the following keys: {keys_visitor}\n
      the attribute what_type_does_your_practice_specialise_in can be a list of specializations separated by ";"
    - you will receive a list of session with the following keys: {keys}\n
      stream in session can be a list of tpocis separated by ";". When you evaluate the rule you need to consider all of them \n
    - each session you return must have the same format.
    - different Job_Roles Groups:\n
    VET_ROLES = [
    "Vet/Vet Surgeon",
    "Assistant Vet",
    "Vet/Owner",
    "Clinical or other Director",
    "Locum Vet", 
    "Academic",
    ]\n
    
    NURSE_ROLES = ["Head Nurse/Senior Nurse", "Vet Nurse", "Locum RVN"]\n
    
    BUSINESS = ["Practice Manager", "Practice Partner/Owner"]\n
    # Other roles can attend any session
    OTHER_ROLES = ["Student", "Receptionist", "Other (please specify)"]\n
    - only return the sessions in json format
    """

    return system_prompt


# Fixed system text built from the field names; the per-request part with the
# {profile}, {sessions} and {rules} placeholders is appended below.
system_prompt = generate_prompt(keys=list_keys, keys_visitor=list_keys_vis)
prompt = PromptTemplate(
    # Each name must match a {placeholder} in the template exactly; a stray
    # space (e.g. "sessions ") would name a variable the template does not have.
    input_variables=["sessions", "rules", "profile"],
    # NOTE: system_prompt becomes part of the template, so any "{" or "}" in
    # the interpolated key names would be read as a placeholder.
    template=system_prompt
    + """For the Visitor with profile {profile}\n, based on the attributes of these session: {sessions} and implementing the following rules {rules}.\n Filter them and just return from the list those that meet the requirements in the rules""",
)

In [None]:
# print(prompt.template)

In [None]:
# Compose the prompt template with the chat model: the formatted prompt is
# passed to the model when the chain is invoked.
chain = prompt | llm

In [None]:
# Business rules handed to the model as free text (fills the {rules}
# placeholder). Rules 1-2 restrict streams by practice specialisation,
# rules 3-4 by job-role group, and rule 5 fixes the order of application.
rules = """
1.) if visitor  what_type_does_your_practice_specialise_in contains "equine" or "mixed" , you can't propose session on stream "exotics", "feline", "exotic animal", "farm", "small animal"\n
2.) if visitor  what_type_does_your_practice_specialise_in contains "small animal" , you can't propose session on stream "equine", "farm animal", "farm", "large animal"\n
3.) if job_role in VET_ROLES session.stream cant we "nursing"\n
4.) if job_role in  NURSE_ROLES you only recomend sessions in stream "nursing", "wellbeing", "welfare"\n
5.) rule 1 and 2 are mutually exclusive and apply first then apply 3 and 4
"""

In [None]:
# Run the chain: the keys here fill the {sessions}, {profile} and {rules}
# placeholders of the prompt template.
ai_msg = chain.invoke({"sessions": text_rec, "profile": vis, "rules": rules})

In [None]:
# Display the text of the model's reply.
ai_msg.content