In [None]:
# Import required libraries
import os

import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Connection settings. Each value can be overridden through an environment
# variable so that credentials do not have to be typed into the notebook;
# the fallbacks are the values this notebook has always used.
uri = os.environ.get("NEO4J_URI", "bolt://127.0.0.1:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "")  # Set NEO4J_PASSWORD or replace with your password.

# Initialize Neo4j driver
driver = GraphDatabase.driver(uri, auth=(username, password))

# Load the text embedding model used for every session embedding.
# (A second model, "all-MiniLM-L6-v2", used to be loaded here and was
# immediately overwritten; that wasted load has been removed.)
# Pass device="cuda" to SentenceTransformer to run on a GPU.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

In [None]:
# Smoke test: encode a sample stream description and display the length of the
# vector returned by the loaded model.
text = """Urology in veterinary medicine focuses on the diagnosis, treatment, and management of diseases and conditions affecting the urinary tract and kidneys in animals. 
This category encompasses a wide range of topics including proteinuria interpretation, urinary tract infections, obstructive conditions like blocked bladder, and surgical
interventions for urolithiasis. It also highlights the importance of diagnostic imaging techniques and the critical role of veterinary nurses in managing chronic kidney disease. 
Overall, urology integrates medical, surgical, and supportive care approaches to maintain and restore urinary health in various animal species."""
len(model.encode(text))

In [None]:
# Default values for properties.
# set_default_properties writes each of these onto Visitor_this_year nodes, but only
# where the node does not already have a value (COALESCE keeps existing values).
# All values are strings; recommend_sessions compares assist_year_before with "1",
# so a visitor missing that property is treated as having attended the year before.
# NOTE(review): values such as "119" and "effem.com" look like they were copied from
# a real record rather than chosen as neutral placeholders — confirm they are intended.
default_properties = {
    "Days_since_registration": "119",
    "Country": "UK",
    "Source": "BVA Key Stakeholders",
    "Email_domain": "effem.com",
    "assist_year_before": "1",
    "job_role": "NA",
    "what_type_does_your_practice_specialise_in": "NA",
    "organisation_type": "NA",
    "JobTitle": "NA",
}


def set_default_properties(tx, properties):
    """Fill in missing properties on every ``Visitor_this_year`` node.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        properties: Mapping of property name to default value. The names are
            interpolated into the Cypher text, the values are passed as the
            ``$props`` parameter.

    One ``n.<key> = COALESCE(n.<key>, $props.<key>)`` assignment is generated
    per key, so existing values are kept and only missing ones are set.
    """
    assignments = []
    for name in properties:
        assignments.append(f"n.{name} = COALESCE(n.{name}, $props.{name})")

    statement = (
        "\n    MATCH (n:Visitor_this_year)\n    SET\n    "
        + ",\n    ".join(assignments)
    )
    tx.run(statement, props=properties)


def main():
    """Apply ``default_properties`` to all ``Visitor_this_year`` nodes.

    Runs ``set_default_properties`` in a write transaction, then closes the
    module-level ``driver`` and prints a confirmation message. Because the
    driver is closed here, a new one must be created before further queries.
    """
    with driver.session() as db_session:
        db_session.execute_write(set_default_properties, default_properties)
    driver.close()
    done_message = (
        "Missing properties set to default values for all Visitor_this_year nodes."
    )
    print(done_message)

In [None]:
# Apply the defaults once. main() also closes the shared driver, which is why
# later cells create a new driver before querying.
main()

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load a good text embedding model
# NOTE(review): the first cell already assigns this same model to `model`; this
# reload only matters when the cell is run on its own — confirm it is still needed.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

# Define job role categories.
# VET_ROLES and NURSE_ROLES drive the vet/nurse exclusivity rule in
# is_session_appropriate_for_role; any role not listed there is never filtered.
VET_ROLES = [
    "Vet/Vet Surgeon",
    "Assistant Vet",
    "Vet/Owner",
    "Clinical or other Director",
    # These two were previously written without a comma between them, which
    # Python concatenated into the single, never-matching "Locum VetAcademic".
    "Locum Vet",
    "Academic",
]

NURSE_ROLES = ["Head Nurse/Senior Nurse", "Vet Nurse", "Locum RVN"]

BUSINESS = ["Practice Manager", "Practice Partner/Owner"]
# Other roles can attend any session
OTHER_ROLES = ["Student", "Receptionist", "Other (please specify)"]


# Define session embedding
def create_session_embedding(session, stream_descriptions=None):
    """Embed one session's descriptive text with the module-level ``model``.

    The text is the space-separated concatenation of the session's ``title``,
    ``synopsis_stripped``, ``theatre__name`` and ``sponsored_by`` values,
    followed by the stream descriptions, in that order.

    Missing values (``None`` or empty strings) are left out. Previously they
    were rendered as the literal word "None" in the embedded text, and a
    ``None`` stream description made the join raise ``TypeError``.

    Args:
        session: Mapping holding the session fields listed above.
        stream_descriptions: Optional list of description strings for the
            session's streams.

    Returns:
        Whatever ``model.encode`` returns for the assembled text.
    """
    text_fields = ("title", "synopsis_stripped", "theatre__name", "sponsored_by")

    parts = []
    for field in text_fields:
        value = session.get(field)
        if value is not None and value != "":
            parts.append(str(value))

    # Add stream descriptions if provided, ignoring empty/missing ones
    for description in stream_descriptions or []:
        if description:
            parts.append(str(description))

    return model.encode(" ".join(parts))


# Embedding function for all sessions
def embed_all_sessions(tx):
    """Embed every past-year and this-year session node.

    Runs two queries on ``tx``: one returning the session fields and one
    returning all ``Stream`` names with their descriptions. For each session
    the ``stream`` string is split on ";", and the descriptions of the streams
    it names are passed to ``create_session_embedding`` together with the
    session row.

    Changes from the previous version:
      * stream names are de-duplicated in order of first appearance instead
        of through ``set``, so the embedded text no longer depends on hash
        ordering and is reproducible between kernel runs;
      * ``Stream`` rows with a null name or description are skipped instead
        of raising or being embedded;
      * the "Not Sponsored" / "False" / "" fallbacks now apply when the value
        is null. The old ``"key" in s`` test was always true because the
        query returns every column.

    Args:
        tx: Transaction-like object exposing ``run(query).data()``.

    Returns:
        dict mapping ``session_id`` to a dict with the keys ``type``,
        ``embedding``, ``theatre__name``, ``stream``, ``sponsored_by``,
        ``sponsored_session`` and ``key_text``.
    """
    # Query all Sessions from past_year_dva, past_year_lva, Sessions_past_year (for compatibility) and this_year
    # NOTE(review): labels(s)[0] assumes the session label is the node's first label — confirm.
    query = """
    MATCH (s)
    WHERE s:Sessions_past_year_dva OR s:Sessions_past_year_lva OR s:Sessions_this_year OR s:Sessions_past_year
    RETURN s.session_id as session_id, s.title as title,
           s.stream as stream, s.synopsis_stripped as synopsis_stripped,
           s.theatre__name as theatre__name, s.sponsored_by as sponsored_by,
           s.sponsored_session as sponsored_session, labels(s)[0] as type,
           CASE WHEN s.key_text IS NOT NULL THEN s.key_text ELSE '' END as key_text
    """
    sessions = tx.run(query).data()

    # Fetch all stream descriptions once to avoid multiple queries
    stream_query = """
    MATCH (s:Stream)
    RETURN s.stream as stream, s.description as description
    """
    stream_data = tx.run(stream_query).data()

    # Lower-cased stream name -> description, skipping incomplete rows
    stream_descriptions = {
        row["stream"].lower(): row["description"]
        for row in stream_data
        if row["stream"] and row["description"]
    }

    def _or_default(value, default):
        # Replace a null property with its documented fallback
        return default if value is None else value

    embeddings = {}
    for s in sessions:
        session_streams = []
        if s["stream"]:
            # dict.fromkeys removes duplicates while keeping first-seen order
            stream_names = dict.fromkeys(
                part.strip().lower() for part in s["stream"].split(";")
            )
            session_streams = [
                stream_descriptions[name]
                for name in stream_names
                if name in stream_descriptions
            ]

        # Create embedding with the session data and stream descriptions
        embeddings[s["session_id"]] = {
            "type": s["type"],
            "embedding": create_session_embedding(s, session_streams),
            "theatre__name": s["theatre__name"],
            "stream": s["stream"],
            "sponsored_by": _or_default(s.get("sponsored_by"), "Not Sponsored"),
            "sponsored_session": _or_default(s.get("sponsored_session"), "False"),
            "key_text": _or_default(s.get("key_text"), ""),
        }

    return embeddings


# Visitor-based similarity
def visitor_similarity(v1, v2, attrs):
    """Return the fraction of ``attrs`` on which ``v1`` and ``v2`` hold equal values.

    Both visitors are indexed with ``[attr]``, so every attribute must be
    present on both. The result is the mean of a 0/1 match vector.
    """
    match_flags = []
    for attr_name in attrs:
        if v1[attr_name] == v2[attr_name]:
            match_flags.append(1)
        else:
            match_flags.append(0)
    return np.array(match_flags).mean()


# Function to check if a session is appropriate for a visitor's job role
def is_session_appropriate_for_role(session_stream, job_role):
    """Apply Rule 1: vet-only and nurse-only sessions are mutually exclusive.

    A session counts as nurse-specific or vet-specific when its lower-cased
    stream text contains one of the respective keywords (substring match).
    A visitor whose role is in ``VET_ROLES`` is refused sessions that are
    nurse-specific but not vet-specific; a visitor whose role is in
    ``NURSE_ROLES`` is refused the opposite case. Everything else is allowed,
    including the case where the stream or the job role is empty.

    Returns:
        bool: ``True`` when the session may be recommended to that role.
    """
    # Without both pieces of information there is nothing to filter on
    if not (session_stream and job_role):
        return True

    stream_text = session_stream.lower()

    nurse_keywords = ("nurse", "nursing", "rvn", "veterinary nurse", "head nurse")
    # 'practice owner' was removed from the vet keywords (Juan 23042025)
    vet_keywords = ("vet", "surgeon", "clinical", "veterinarian", "veterinary surgeon")

    nurse_specific = False
    for keyword in nurse_keywords:
        if keyword in stream_text:
            nurse_specific = True
            break

    vet_specific = False
    for keyword in vet_keywords:
        if keyword in stream_text:
            vet_specific = True
            break

    if job_role in VET_ROLES:
        # Vets shouldn't get nurse-only sessions
        return not (nurse_specific and not vet_specific)
    if job_role in NURSE_ROLES:
        # Nurses shouldn't get vet-only sessions
        return not (vet_specific and not nurse_specific)

    # Other roles may attend anything
    return True


# Function to get Zoetis sponsored sessions
def get_zoetis_sessions(tx):
    """Return the ``session_id`` of each ``Sessions_this_year`` node sponsored by 'Zoetis'."""
    query = """
    MATCH (s:Sessions_this_year)
    WHERE s.sponsored_by = 'Zoetis'
    RETURN s.session_id as session_id
    """
    session_ids = []
    for row in tx.run(query).data():
        session_ids.append(row["session_id"])
    return session_ids


# Function to check if a visitor should receive Zoetis sponsored recommendations
def check_for_zoetis_recommendations(tx, similar_visitor_badge_ids):
    """Apply Rule 3: inherit Zoetis sessions from similar visitors.

    If at least one of the similar visitors is linked (``Same_Visitor``) to a
    ``Visitor_last_year_lva`` node that attended a past session sponsored by
    'Zoetis', return this year's Zoetis-sponsored session IDs.

    The IDs are de-duplicated in the order ``get_zoetis_sessions`` returned
    them. The previous ``list(set(...))`` produced a run-dependent order, and
    the caller keeps only the first two entries, so the chosen sessions could
    change from run to run.

    Args:
        tx: Transaction-like object exposing ``run(query, **params).single()``.
        similar_visitor_badge_ids: BadgeIds of the similar this-year visitors.

    Returns:
        list: Unique Zoetis session IDs, or ``[]`` when no similar visitor
        qualifies or the input is empty.
    """
    # Check if there are similar visitors
    if not similar_visitor_badge_ids:
        return []

    query = """
        MATCH (v:Visitor_this_year {BadgeId: $similar_vid})-[:Same_Visitor]->(vp:Visitor_last_year_lva)-[:attended_session]->(sp:Sessions_past_year)
        WHERE sp.sponsored_by = 'Zoetis'
        RETURN count(sp) > 0 as attended_zoetis
        """

    for similar_vid in similar_visitor_badge_ids:
        result = tx.run(query, similar_vid=similar_vid).single()
        if result and result["attended_zoetis"]:
            # One qualifying visitor is enough; dict.fromkeys keeps first-seen order
            return list(dict.fromkeys(get_zoetis_sessions(tx)))

    return []


# Modified recommend_sessions function to return session IDs with similarity scores

# Theatre whose sessions are prioritised for surgeons (Rule 2).
_SURGEON_THEATRE = "Referrals & Diagnostics Theatre"


def _select_this_year_sessions(session_embeddings):
    """Return the cached sessions whose ``type`` is ``Sessions_this_year``."""
    return {
        sid: data
        for sid, data in session_embeddings.items()
        if data["type"] == "Sessions_this_year"
    }


def _surgeon_specific_recommendations(this_year_sessions, job_role):
    """List every role-appropriate session in the surgeon theatre with score 1.0."""
    return [
        {"session_id": sid, "similarity": 1.0, "reason": "Surgeon-specific"}
        for sid, data in this_year_sessions.items()
        if data["theatre__name"] == _SURGEON_THEATRE
        and is_session_appropriate_for_role(data["stream"], job_role)
    ]


def _rank_by_best_similarity(
    past_session_ids,
    this_year_sessions,
    session_embeddings,
    job_role,
    is_surgeon,
    excluded_ids,
    reason,
):
    """Score the remaining this-year sessions by their best match to any past session."""
    # Past sessions without a cached embedding are ignored; duplicates are dropped
    past_embeddings = [
        session_embeddings[pid]["embedding"]
        for pid in dict.fromkeys(past_session_ids)
        if pid in session_embeddings
    ]

    # Rule 1 filter plus removal of sessions that were already recommended
    candidates = [
        (sid, data)
        for sid, data in this_year_sessions.items()
        if sid not in excluded_ids
        and is_session_appropriate_for_role(data["stream"], job_role)
    ]
    if not past_embeddings or not candidates:
        return []

    # One (past x candidate) similarity matrix; keep the best score per candidate
    score_matrix = np.asarray(
        cosine_similarity(past_embeddings, [data["embedding"] for _, data in candidates])
    )
    best_scores = score_matrix.max(axis=0)

    ranked = []
    for (sid, data), score in zip(candidates, best_scores):
        similarity = float(score)
        why = reason
        # Rule 2: boost sessions in the surgeon theatre for surgeons
        if is_surgeon and data["theatre__name"] == _SURGEON_THEATRE:
            similarity += 0.2
            why += " (boosted for surgeon)"
        ranked.append({"session_id": sid, "similarity": similarity, "reason": why})

    ranked.sort(key=lambda rec: -rec["similarity"])
    return ranked


def recommend_sessions(tx, visitor_id, session_embeddings):
    """Build the scored session recommendations for one ``Visitor_this_year`` node.

    * Visitors with ``assist_year_before == "1"`` are matched against the past
      sessions reached through their ``Same_Visitor`` link.
    * All other visitors are matched against the past sessions of the three
      most similar visitors that have ``assist_year_before = '1'``; up to two
      Zoetis sessions (Rule 3) are added ahead of the similarity results.
    * For visitors whose job role contains "surgeon", every role-appropriate
      session in the "Referrals & Diagnostics Theatre" is added first with
      score 1.0, and remaining sessions in that theatre get a +0.2 boost.

    Each remaining this-year session is scored with its best cosine similarity
    to any of the relevant past sessions. Previously the first past session
    processed added every candidate to the result, so later past sessions
    contributed nothing and the scores depended on the order in which the
    database happened to return the rows.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        visitor_id: ``BadgeId`` of the visitor.
        session_embeddings: Mapping produced by ``embed_all_sessions``.

    Returns:
        list of dicts with the keys ``session_id``, ``similarity`` (float) and
        ``reason``, without duplicate session IDs; ``[]`` when the visitor
        does not exist. No limit is applied (limits removed Juan 23042025).
    """
    # First check if visitor exists and get their info
    visitor_query = """
    MATCH (v:Visitor_this_year {BadgeId: $visitor_id})
    RETURN v
    """
    visitor_data = tx.run(visitor_query, visitor_id=visitor_id).single()
    if not visitor_data:
        return []
    visitor = visitor_data["v"]
    assisted = visitor["assist_year_before"]
    job_role = visitor.get("job_role", "")  # Get job role for Rule 1

    # Rule 2 applies to any job role containing "surgeon"
    # ('Clinical or other Director' was removed from this rule, Juan 23042025)
    is_surgeon = bool(job_role) and "surgeon" in job_role.lower()

    this_year_sessions = _select_this_year_sessions(session_embeddings)

    recommendations = []
    if is_surgeon:
        # All surgeon-specific sessions are added, no top-N limit
        recommendations.extend(
            _surgeon_specific_recommendations(this_year_sessions, job_role)
        )

    if assisted == "1":
        # Case 1: Visitor attended last year
        query_past = """
        MATCH (v:Visitor_this_year {BadgeId: $visitor_id})-[:Same_Visitor]->(vp_bva:Visitor_last_year_bva)-[:attended_session]->(sp_bva:Sessions_past_year)
        RETURN sp_bva.session_id as session_id
        UNION
        MATCH (v:Visitor_this_year {BadgeId: $visitor_id})-[:Same_Visitor]->(vp_lva:Visitor_last_year_lva)-[:attended_session]->(sp_lva:Sessions_past_year)
        RETURN sp_lva.session_id as session_id
        """
        past_rows = tx.run(query_past, visitor_id=visitor_id).data()
        past_session_ids = [row["session_id"] for row in past_rows]
        reason = "Similar to previously attended session"
    else:
        # Case 2: New visitor - find similar visitors with history
        visitor_attrs = [
            "Days_since_registration",
            "Country",
            "Source",
            "Email_domain",
            "JobTitle",
            "assist_year_before",
            "job_role",
            "what_type_does_your_practice_specialise_in",
            "organisation_type",
        ]

        all_visitors = tx.run(
            """
            MATCH (v:Visitor_this_year)
            WHERE v.assist_year_before = '1'
            RETURN v
        """
        ).data()

        visitor_scores = []
        for vdata in all_visitors:
            v_compare = vdata["v"]
            sim = visitor_similarity(visitor, v_compare, visitor_attrs)
            visitor_scores.append((v_compare["BadgeId"], sim))

        visitor_scores.sort(key=lambda pair: -pair[1])
        # Use the 3 most similar visitors (Juan 23042025)
        similar_visitor_badge_ids = [badge for badge, _ in visitor_scores[:3]]

        # Rule 3: Zoetis sessions inherited from similar visitors, filtered by Rule 1
        zoetis_sessions = check_for_zoetis_recommendations(
            tx, similar_visitor_badge_ids
        )
        filtered_zoetis = [
            {"session_id": zsid, "similarity": 0.9, "reason": "Zoetis sponsored"}
            for zsid in zoetis_sessions
            if zsid in this_year_sessions
            and is_session_appropriate_for_role(
                this_year_sessions[zsid]["stream"], job_role
            )
        ]
        # Add up to 2 Zoetis sessions
        recommendations.extend(filtered_zoetis[:2])

        # Collect the past sessions attended by the similar visitors
        query_sim_past = """
                MATCH (v:Visitor_this_year {BadgeId: $similar_vid})-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
                WHERE vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva
                RETURN sp.session_id AS session_id,
                       CASE WHEN vp:Visitor_last_year_lva THEN true ELSE false END as is_lva
            """
        past_session_ids = []
        for similar_vid in similar_visitor_badge_ids:
            rows = tx.run(query_sim_past, similar_vid=similar_vid).data()
            past_session_ids.extend(row["session_id"] for row in rows)
        reason = "Similar to session attended by similar visitor"

    # Sessions already recommended by a rule keep their rule-based score
    already_recommended = {rec["session_id"] for rec in recommendations}
    recommendations.extend(
        _rank_by_best_similarity(
            past_session_ids,
            this_year_sessions,
            session_embeddings,
            job_role,
            is_surgeon,
            already_recommended,
            reason,
        )
    )

    # Remove duplicates (keeping the first occurrence with its similarity score)
    seen = set()
    unique_recommendations = []
    for rec in recommendations:
        if rec["session_id"] not in seen:
            seen.add(rec["session_id"])
            unique_recommendations.append(rec)

    return unique_recommendations


# Modified function to filter sessions with similarity scores
def filter_sessions_by_visitor_stream_relationships(
    tx, visitor_id, session_recommendations
):
    """Keep only the recommendations whose session shares a stream with the visitor.

    A session is kept when it has a ``HAS_STREAM`` relationship to a ``Stream``
    node that the visitor points to through a ``specialization_to_stream`` or
    ``job_to_stream`` relationship. The surviving recommendation dicts are
    returned unchanged and in their original order, so scores are preserved.
    """
    if not session_recommendations:
        return []

    # Only the IDs are sent to the database
    candidate_ids = []
    for rec in session_recommendations:
        candidate_ids.append(rec["session_id"])

    query = """
    MATCH (v:Visitor_this_year {BadgeId: $visitor_id})
    MATCH (s:Sessions_this_year)
    WHERE s.session_id IN $session_ids
    MATCH (s)-[:HAS_STREAM]->(stream:Stream)<-[r]-(v)
    WHERE type(r) IN ['specialization_to_stream', 'job_to_stream']
    RETURN DISTINCT s.session_id as session_id
    """

    rows = tx.run(query, visitor_id=visitor_id, session_ids=candidate_ids).data()
    allowed_ids = {row["session_id"] for row in rows}

    kept = []
    for rec in session_recommendations:
        if rec["session_id"] in allowed_ids:
            kept.append(rec)
    return kept


# Modified get_session_attributes to include similarity scores
def get_session_attributes(session_recommendations):
    """Fetch the display fields of the recommended sessions and attach their scores.

    Opens a session on the module-level ``driver``, reads the projected
    properties of every ``Sessions_this_year`` node whose ``session_id`` is in
    the recommendations, and adds ``similarity_score`` and
    ``recommendation_reason`` from the matching recommendation.

    Returns:
        list of session detail dicts sorted by ``similarity_score`` descending
        (a missing score sorts as 0); ``[]`` for empty input.
    """
    # Handle empty recommendations to prevent errors
    if not session_recommendations:
        return []

    session_ids = []
    for rec in session_recommendations:
        session_ids.append(rec["session_id"])

    # session_id -> score and reason, used to annotate the query results
    similarity_map = {}
    for rec in session_recommendations:
        similarity_map[rec["session_id"]] = {
            "similarity": rec["similarity"],
            "reason": rec["reason"],
        }

    query = """
    MATCH (s:Sessions_this_year)
    WHERE s.session_id IN $session_ids
    RETURN s {
        .stream,
        .session_id,
        .title,
        .synopsis_stripped,
        .end_time,
        .start_time,
        .date,
        .theatre__name,
        .sponsored_by,
        .sponsored_session
    } AS session_details
    """

    session_details = []
    with driver.session() as session:
        for record in session.run(query, session_ids=session_ids):
            details = record["session_details"]
            scored = similarity_map.get(details["session_id"])
            if scored is not None:
                details["similarity_score"] = scored["similarity"]
                details["recommendation_reason"] = scored["reason"]
            session_details.append(details)

    # Highest similarity first; ties keep the order returned by the query
    return sorted(
        session_details, key=lambda d: d.get("similarity_score", 0), reverse=True
    )


# Cached embeddings at the module level
_session_embeddings = None


def get_recommendations(visitor_id):
    """Return the recommended sessions, with details and scores, for one visitor.

    The session embeddings are computed on the first call and kept in the
    module-level ``_session_embeddings`` cache for later calls. Each call then
    runs ``recommend_sessions`` followed by the stream-relationship filter in
    read transactions, and finally loads the session details.
    """
    global _session_embeddings

    # Build the embedding cache on first use only
    if _session_embeddings is None:
        with driver.session() as embed_session:
            _session_embeddings = embed_session.execute_read(embed_all_sessions)

    with driver.session() as rec_session:
        # Scored candidates for this visitor
        candidates = rec_session.execute_read(
            recommend_sessions,
            visitor_id=visitor_id,
            session_embeddings=_session_embeddings,
        )
        # Keep only the candidates linked to the visitor through a stream
        stream_matched = rec_session.execute_read(
            filter_sessions_by_visitor_stream_relationships,
            visitor_id=visitor_id,
            session_recommendations=candidates,
        )

    return get_session_attributes(stream_matched)


# Example usage:
# recommended_sessions = get_recommendations("MV4H3PQ")

In [None]:
# Example usage: recommendations for a single badge.
visitor_id_example = "MV4H3PQ"  # No visit last year
# main() closed the previous driver, so a new one is opened before querying.
driver = GraphDatabase.driver(uri, auth=(username, password))
recommended_sessions = get_recommendations(visitor_id_example)

In [None]:
# Print each recommended session followed by a 25-star separator line
for rec in recommended_sessions:
    print(rec, "*" * 25, sep="\n")

In [None]:
# for session in recommended_sessions:
#     print(session)
#     print("*"*25)

In [None]:
# Second single-badge example.
# NOTE(review): badge BRKLA7T is noted here as "Visit last year", while the badge
# actually used below is annotated "No visit last year" — confirm which case is meant.

visitor_id_example = "Z642DJP"  # Visitor_this_year BadgeId example: no visit last year
# Replaces the module-level driver with a newly created one.
driver = GraphDatabase.driver(uri, auth=(username, password))
recommended_sessions = get_recommendations(visitor_id_example)

In [None]:
# Print each recommended session followed by a 25-star separator line
for rec in recommended_sessions:
    print(rec, "*" * 25, sep="\n")

In [None]:
# CSV read in the next cell; its BadgeId column supplies the badges to process.
csv_file_path = "data/bva/output/df_reg_demo_this.csv"

In [None]:
# Load the registration data and show its row count.
data = pd.read_csv(csv_file_path)
len(data)

In [None]:
# Unique badge IDs to generate recommendations for, and how many there are.
list_badgeId_this = list(data["BadgeId"].unique())
len(list_badgeId_this)

In [None]:
# Recommendations per badge ID, filled in by the loop below
pa_recommendations = {}
counter = 0  # Number of badges processed so far

for badge in list_badgeId_this:
    recommended_sessions = pa_recommendations[badge] = get_recommendations(badge)
    counter += 1

    # Progress message every 30 badges
    if not counter % 30:
        print(f"Processed {counter} badges so far. Current badge: {badge}")

# Report the total once every badge has been processed
print(f"Total badges processed: {counter}")

In [None]:
import json

In [None]:
# Convert every similarity score to a built-in float, in place
for recs in pa_recommendations.values():
    for rec in recs:
        rec["similarity_score"] = float(rec["similarity_score"])

In [None]:
# Number of badges that ended up with no recommendation at all
count = sum(1 for recs in pa_recommendations.values() if len(recs) == 0)
print(count)

In [None]:
# Save all recommendations, keyed by badge ID, as indented JSON.
with open("data/bva/bva_pa_recomendations.json", "w") as f:
    json.dump(pa_recommendations, f, indent=4)

In [None]:
# Show the fields of one recommendation. This raises KeyError/IndexError if badge
# "Q5WEKRV" is not in the results or has no recommendations.
pa_recommendations["Q5WEKRV"][0].keys()

In [None]:
def transform_recommendations_to_dataframe(pa_recommendations):
    """
    Transforms a dictionary of session recommendations to a Pandas DataFrame.

    Args:
        pa_recommendations (dict): A dictionary where keys are badge IDs (strings)
            and values are lists of session dictionaries.  Each session dictionary
            contains information about a recommended session.

    Returns:
        pandas.DataFrame: One row per (badge, recommended session) with the columns
            'badgeid', 'session_id', 'stream', 'title', 'synopsis_stripped', 'date',
            'start_time', 'end_time', 'theatre__name', 'sponsored_by',
            'similarity_score' and 'recommendation_reason', in that order. Fields
            missing from a session dictionary are filled with None. When there are
            no recommendations the result is an empty DataFrame that still has all
            of these columns, so column-based processing keeps working.
    """
    columns = [
        "badgeid",
        "session_id",
        "stream",
        "title",
        "synopsis_stripped",
        "date",
        "start_time",
        "end_time",
        "theatre__name",
        "sponsored_by",
        "similarity_score",
        "recommendation_reason",
    ]
    session_fields = columns[1:]

    rows = [
        {"badgeid": badgeid, **{field: session.get(field) for field in session_fields}}
        for badgeid, session_list in pa_recommendations.items()
        for session in session_list
    ]

    # Passing the columns explicitly keeps the schema even when `rows` is empty
    return pd.DataFrame(rows, columns=columns)


# Flatten the per-badge recommendations into one row per (badge, session).
df = transform_recommendations_to_dataframe(pa_recommendations)

In [None]:
# Preview the first rows of the flattened recommendations.
df.head(10)

In [None]:
import pandas as pd
import numpy as np


def flag_overlapping_sessions(df):
    """Add an ``overlapping_sessions`` column listing time clashes per badge.

    For every row, the column holds a comma-separated string of the other
    ``session_id`` values recommended to the same ``badgeid`` whose time window
    (``date`` + ``start_time`` to ``date`` + ``end_time``) intersects the row's
    own window, or ``None`` when there is no clash. The input frame is not
    modified; an annotated copy is returned.
    """
    result = df.copy()

    # Temporary datetime columns built from the string date and time fields
    result["start_datetime"] = pd.to_datetime(
        result["date"] + " " + result["start_time"]
    )
    result["end_datetime"] = pd.to_datetime(result["date"] + " " + result["end_time"])

    result["overlapping_sessions"] = None

    for _, badge_sessions in result.groupby("badgeid"):
        # A single recommendation cannot clash with anything
        if len(badge_sessions) <= 1:
            continue

        for row_label in badge_sessions.index:
            own_start = result.loc[row_label, "start_datetime"]
            own_end = result.loc[row_label, "end_datetime"]
            own_id = result.loc[row_label, "session_id"]

            # Two sessions clash when each one starts before the other ends
            starts_before_own_end = badge_sessions["start_datetime"] < own_end
            ends_after_own_start = badge_sessions["end_datetime"] > own_start
            is_other_session = badge_sessions["session_id"] != own_id
            clash_mask = starts_before_own_end & ends_after_own_start & is_other_session

            clashing_ids = badge_sessions.loc[clash_mask, "session_id"].tolist()
            if clashing_ids:
                result.at[row_label, "overlapping_sessions"] = ",".join(
                    str(session_id) for session_id in clashing_ids
                )

    # The helper datetime columns are not part of the output
    return result.drop(columns=["start_datetime", "end_datetime"])


# Apply the function to the recommendations DataFrame.
# `df` is rebound to the annotated copy, which has the extra overlapping_sessions column.
df = flag_overlapping_sessions(df)

In [None]:
# How often each overlap string occurs (rows without overlaps are excluded by value_counts).
df.overlapping_sessions.value_counts()

In [None]:
# Quick look at the annotated recommendations.
df.head(2)

In [None]:
# Total number of (badge, session) recommendation rows.
len(df)

In [None]:
# Export the annotated recommendations without the index column.
df.to_csv("data/bva/bva_pa_recomendations.csv", index=False)

In [None]:
# Distinct recommendation reasons present in the output.
df["recommendation_reason"].unique()

# Add registration demographic information

In [None]:
# Column names of the registration data before they are replaced in the next cell.
data.columns

In [None]:
# Replace all column names positionally so the badge column is called "badgeid",
# matching the recommendations DataFrame used in the merge below.
# NOTE(review): this assumes the CSV has exactly these 16 columns in this order —
# confirm against the data.columns output above.
# After this cell data["BadgeId"] no longer exists, so the earlier cell that builds
# list_badgeId_this cannot be re-run without reloading the CSV.
data.columns = [
    "Email",
    "Email_domain",
    "Company",
    "JobTitle",
    "Country",
    "BadgeType",
    "ShowRef",
    "badgeid",
    "Source",
    "Days_since_registration",
    "assist_year_before",
    "BadgeId_last_year_bva",
    "BadgeId_last_year_lva",
    "what_type_does_your_practice_specialise_in",
    "organisation_type",
    "job_role",
]

In [None]:
# Left-join the registration columns onto every recommendation row.
# NOTE(review): if `data` holds more than one row per badgeid, this join multiplies
# recommendation rows; the next cell compares the two lengths to check for that.
data_final = pd.merge(df, data, on=["badgeid"], how="left")

In [None]:
# Row counts before and after the merge; they differ if the join duplicated rows.
len(df), len(data_final)

In [None]:
# Export the recommendations together with the registration columns.
data_final.to_csv("data/bva/bva_pa_recomendations_with_demo.csv", index=False)

In [None]:
## OLD Stuff

In [None]:
# from graphdatascience import GraphDataScience

# # Connect to Neo4j (Please update URI, USER and PASSWORD )
# NEO4J_URI =  "bolt://127.0.0.1:7687"
# USER = "neo4j"
# PASSWORD = "<your-password>"  # redacted: never commit real credentials, even in commented-out code

# # Connect to Neo4j
# gds = GraphDataScience(NEO4J_URI, auth=(USER, PASSWORD))

# # Check existing relationships
# existing_rels = gds.run_cypher("CALL db.relationshipTypes()")
# print("Existing Relationship Types in database:")
# print(existing_rels)

In [None]:
# # Adjust these variables exactly as they appear in the print statement above
# SAME_PERSON_REL = "Same_Visitor"          # <-- adjust as per exact output
# ATTENDED_SESSION_REL = "attended_session" # <-- adjust as per exact output
# HAS_STREAM_REL = "HAS_STREAM"             # <-- adjust as per exact output

# graph_name = "visitor_session_graph"

In [None]:
# # Drop the graph if previously exists
# if gds.graph.exists(graph_name)["exists"]:
#     gds.graph.drop(graph_name)

In [None]:
# MATCH (node:Label)
# WHERE node.propertyName = "Embeddings"
# MATCH (node:Visitor_last_year)
# SET node.Embeddings = null

In [None]:
# # Re-create graph with correctly adjusted relationship names
# try:
#     g, result = gds.graph.project(
#         graph_name,
#         ['Visitor_this_year'],
#         {
#             SAME_PERSON_REL: {'orientation': 'UNDIRECTED'},

#         }
#     )
#     print(f"Graph '{graph_name}' projected successfully:")
#     print(result)

# except Exception as e:
#     print("Failed to project graph:")
#     print(e)

In [None]:
# # STEP 2 - FastRP Embeddings for each Node Type individually
# EMBEDDING_SIZE = 128

# # Function to generate embeddings
# def generate_embeddings(label, embedding_property):
#     result = gds.fastRP.write(
#         g,
#         embeddingDimension=EMBEDDING_SIZE,
#         iterationWeights=[0.8, 1, 1, 1],
#         nodeLabels=[label],
#         writeProperty=embedding_property
#     )
#     print(f"Embeddings created for nodes '{label}' written to property '{embedding_property}':")
#     print(result)

In [None]:
# Visitor_this_year FastRP embeddings (old experiment, disabled).
# generate_embeddings, gds and the projected graph `g` exist only in the
# commented-out cells above, so this call raised NameError on a fresh kernel.
# It is kept commented out together with the rest of the old code.
# generate_embeddings("Visitor_this_year", "Embeddings")