In [None]:
# Import required libraries
import os

import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Neo4j connection settings. Each value can be overridden with an environment
# variable so credentials do not have to be typed into the notebook; the
# fallbacks are the values this notebook used originally.
uri = os.environ.get("NEO4J_URI", "bolt://127.0.0.1:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "")  # Replace with your password.

# Initialize Neo4j driver
driver = GraphDatabase.driver(uri, auth=(username, password))

# Load a good text embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Fallback value for each Visitor_this_year property that may be missing
default_properties = dict(
    Days_since_registration="119",
    Country="UK",
    Source="BVA Key Stakeholders",
    Email_domain="effem.com",
    assist_year_before="1",
    job_role="NA",
    what_type_does_your_practice_specialise_in="NA",
    organisation_type="NA",
    JobTitle="NA",
)


def set_default_properties(tx, properties):
    """Fill in missing properties on every ``Visitor_this_year`` node.

    For each key in ``properties`` a node keeps its current value when it has
    one and otherwise receives the given default (Cypher ``COALESCE``).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        properties: Mapping of property name -> default value. The values are
            sent as the ``$props`` query parameter; the names are written into
            the query text.

    Raises:
        ValueError: If a property name is not a plain identifier. Names are
            interpolated into the Cypher text, so anything else could break
            or alter the query.
    """
    if not properties:
        # An empty SET clause is not valid Cypher, and there is nothing to do.
        return

    invalid = [
        key
        for key in properties
        if not (isinstance(key, str) and key.isidentifier())
    ]
    if invalid:
        raise ValueError(f"Invalid property names for Cypher SET clause: {invalid!r}")

    assignments = ",\n    ".join(
        f"n.{key} = COALESCE(n.{key}, $props.{key})" for key in properties
    )
    query = (
        """
    MATCH (n:Visitor_this_year)
    SET
    """
        + assignments
    )

    tx.run(query, props=properties)


def main():
    """Apply ``default_properties`` to all ``Visitor_this_year`` nodes.

    Runs ``set_default_properties`` in a write transaction and then closes the
    module-level ``driver``. The driver is closed whether or not the write
    succeeds, so a new driver has to be created before any further query.
    The confirmation message is printed only after a successful write.
    """
    try:
        with driver.session() as session:
            session.execute_write(set_default_properties, default_properties)
    finally:
        # Release the driver's connections even if the write raised.
        driver.close()
    print("Missing properties set to default values for all Visitor_this_year nodes.")

In [None]:
# Fill in the default visitor properties. main() also closes the module-level
# driver, so later cells create a new one before querying.
main()

In [None]:
# Load a good text embedding model
# (same model name as the `model` created in the first cell; running this cell
# instantiates it again)
model = SentenceTransformer("all-MiniLM-L6-v2")


# define session embedding
def create_session_embedding(session, stream_descriptions=None):
    """Encode one session's descriptive text with the module-level ``model``.

    The text is the space-separated concatenation of the session's ``title``,
    ``synopsis_stripped``, ``theatre__name``, ``sponsored_by`` and
    ``sponsored_session`` values, followed by the stream descriptions.

    Args:
        session: Mapping that contains the five keys listed above.
        stream_descriptions: Optional iterable of description strings for the
            session's streams. Empty or ``None`` entries are ignored.

    Returns:
        The value returned by ``model.encode`` for the assembled text.
    """
    text_fields = (
        "title",
        "synopsis_stripped",
        "theatre__name",
        "sponsored_by",
        "sponsored_session",
    )
    # A missing value becomes an empty string; formatting None directly would
    # put the literal word "None" into the embedded text.
    parts = [
        "" if session[field] is None else str(session[field]) for field in text_fields
    ]
    text = " ".join(parts)

    # Drop empty/None descriptions: joining a None would raise TypeError.
    usable_descriptions = [str(desc) for desc in (stream_descriptions or []) if desc]
    if usable_descriptions:
        text = f"{text} {' '.join(usable_descriptions)}"

    return model.encode(text)


# Embedding function for all sessions
def embed_all_sessions(tx):
    """Build an embedding for every past-year and this-year session node.

    Args:
        tx: Transaction-like object exposing ``run(query)`` whose result has
            a ``data()`` method returning a list of dicts.

    Returns:
        dict: ``session_id`` -> ``{"type", "embedding", "sponsored_by",
        "sponsored_session"}``. ``type`` is the first label returned by
        ``labels(s)``. Entries are keyed by ``session_id`` only, so if two
        nodes share an id the one processed later replaces the earlier one.
    """
    # Query all Sessions from past_year_dva, past_year_lva, Sessions_past_year (for compatibility) and this_year
    # including the sponsored_by and sponsored_session fields
    query = """
    MATCH (s)
    WHERE s:Sessions_past_year_dva OR s:Sessions_past_year_lva OR s:Sessions_this_year OR s:Sessions_past_year
    RETURN s.session_id as session_id, s.title as title,
           s.stream as stream, s.synopsis_stripped as synopsis_stripped,
           s.theatre__name as theatre__name, s.sponsored_by as sponsored_by,
           s.sponsored_session as sponsored_session, labels(s)[0] as type
    """
    sessions = tx.run(query).data()

    # Fetch all stream descriptions once to avoid multiple queries
    stream_query = """
    MATCH (s:Stream)
    RETURN s.stream as stream, s.description as description
    """
    stream_data = tx.run(stream_query).data()

    # Lower-cased stream name -> description. Rows without a stream name are
    # skipped because they cannot be looked up (and .lower() would fail on None).
    stream_descriptions = {
        row["stream"].lower(): row["description"]
        for row in stream_data
        if row["stream"]
    }

    embeddings = {}
    for record in sessions:
        session_streams = []
        if record["stream"]:
            # Split on ";" and de-duplicate while keeping first-seen order, so
            # the embedded text is the same on every run (set order is not).
            stream_names = dict.fromkeys(
                part.strip().lower() for part in record["stream"].split(";")
            )
            session_streams = [
                stream_descriptions[name]
                for name in stream_names
                if name in stream_descriptions
            ]

        # Handle possible None/empty values for the sponsored attributes
        sponsored_by = record["sponsored_by"] or "Not Sponsored"
        sponsored_session = record["sponsored_session"] or "False"

        # Work on a copy so the query result records are not modified
        session_fields = dict(
            record, sponsored_by=sponsored_by, sponsored_session=sponsored_session
        )

        embeddings[record["session_id"]] = {
            "type": record["type"],
            "embedding": create_session_embedding(session_fields, session_streams),
            "sponsored_by": sponsored_by,
            "sponsored_session": sponsored_session,
        }

    return embeddings


def get_session_attributes(session_ids):
    """Fetch display attributes for the given ``Sessions_this_year`` ids.

    Args:
        session_ids: Collection of session ids to look up.

    Returns:
        list: One ``session_details`` value per matching node, holding the
        stream, session_id, title, synopsis_stripped, end_time, start_time,
        date, theatre__name, sponsored_by and sponsored_session properties.
        An empty list when ``session_ids`` is empty or ``None``; no query is
        run in that case.
    """
    # Nothing to look up: skip the database round-trip
    if not session_ids:
        return []

    # Map projection that includes the sponsored attributes
    query = """
    MATCH (s:Sessions_this_year)
    WHERE s.session_id IN $session_ids
    RETURN s {
        .stream,
        .session_id,
        .title,
        .synopsis_stripped,
        .end_time,
        .start_time,
        .date,
        .theatre__name,
        .sponsored_by,
        .sponsored_session
    } AS session_details
    """

    with driver.session() as db_session:
        rows = db_session.run(query, session_ids=session_ids)
        # Consume the result while the session is still open
        return [row["session_details"] for row in rows]


# visitor-based similarity
def visitor_similarity(v1, v2, attrs):
    """Return the fraction of ``attrs`` on which two visitors have equal values.

    Args:
        v1: Mapping-like visitor supporting ``v1[attr]``.
        v2: Mapping-like visitor supporting ``v2[attr]``.
        attrs: Iterable of attribute names to compare.

    Returns:
        A number between 0 and 1: the share of attributes whose values are
        equal. ``0.0`` when ``attrs`` is empty (the mean of an empty array
        would otherwise be NaN).
    """
    attrs = list(attrs)
    if not attrs:
        return 0.0
    matches = np.array([1 if v1[a] == v2[a] else 0 for a in attrs])
    return matches.mean()


def _is_sponsored(flag):
    """Return True when a ``sponsored_session`` value reads as "true" (any case)."""
    # str() first: the stored value may be a real boolean rather than a string,
    # and booleans have no .lower().
    return str(flag).lower() == "true"


def _collect_matches(
    past_session_ids, session_embeddings, this_year_ids, this_year_matrix, top_k
):
    """Find the this-year sessions closest to each past session.

    Args:
        past_session_ids: Ids of past sessions; ids missing from
            ``session_embeddings`` are skipped.
        session_embeddings: ``session_id`` -> entry with ``embedding``,
            ``sponsored_session`` and ``sponsored_by``.
        this_year_ids: Ids of the this-year sessions, aligned with the rows of
            ``this_year_matrix``.
        this_year_matrix: 2-D array with one embedding per row.
        top_k: How many nearest this-year sessions to keep per past session.

    Returns:
        tuple: ``(regular, sponsored)`` where ``regular`` holds
        ``(session_id, similarity)`` and ``sponsored`` holds
        ``(session_id, similarity, sponsored_by)``. A session appears once for
        every past session that ranked it within its ``top_k``.
    """
    regular, sponsored = [], []
    for past_id in past_session_ids:
        if past_id not in session_embeddings:
            continue  # No embedding for this past session

        past_emb = session_embeddings[past_id]["embedding"]
        # One call scores the past session against every this-year session
        sims = cosine_similarity([past_emb], this_year_matrix)[0]
        ranked = sorted(zip(this_year_ids, sims), key=lambda pair: -pair[1])

        for sid, sim in ranked[:top_k]:
            entry = session_embeddings[sid]
            if _is_sponsored(entry["sponsored_session"]):
                sponsored.append((sid, sim, entry["sponsored_by"]))
            else:
                regular.append((sid, sim))
    return regular, sponsored


def _best_unique_ids(scored, limit):
    """Return up to ``limit`` distinct session ids, highest similarity first.

    ``scored`` holds tuples whose first two items are ``(session_id,
    similarity)``. Each id is ranked by its best similarity, so repeated
    entries for the same session do not use up several of the ``limit`` slots.
    """
    best = {}
    for item in scored:
        sid, sim = item[0], item[1]
        if sid not in best or sim > best[sid]:
            best[sid] = sim
    ranked = sorted(best.items(), key=lambda pair: -pair[1])
    return [sid for sid, _ in ranked[:limit]]


# Recommend sessions with sponsored content handling
def recommend_sessions(tx, visitor_id, session_embeddings):
    """Recommend this-year session ids for one visitor.

    If the visitor's ``assist_year_before`` is ``"1"``, the sessions they
    attended last year are used as the reference. Otherwise the two
    ``Visitor_this_year`` nodes with ``assist_year_before = '1'`` that are most
    similar on the profile attributes are found, and the sessions those
    visitors attended last year are used instead. For every reference session
    the 4 most similar this-year sessions (cosine similarity of embeddings)
    are collected.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        visitor_id: ``BadgeId`` of the ``Visitor_this_year`` node.
        session_embeddings: Mapping as produced by ``embed_all_sessions``.

    Returns:
        list: Distinct session ids: up to 3 non-sponsored sessions followed by
        up to 2 sponsored ones, each group ordered by decreasing similarity.
        Empty when the visitor is not found or there are no this-year sessions.
    """
    top_k_per_past_session = 4  # candidates kept per reference session
    similar_visitor_count = 2  # look-alike visitors used for new visitors
    max_regular = 3  # non-sponsored sessions in the final list
    max_sponsored = 2  # sponsored sessions in the final list

    # First check if visitor assisted last year
    visitor_query = """
    MATCH (v:Visitor_this_year {BadgeId: $visitor_id})
    RETURN v
    """
    visitor_data = tx.run(visitor_query, visitor_id=visitor_id).single()
    if not visitor_data:
        return []
    visitor = visitor_data["v"]
    # .get so a visitor without the property is handled as a new visitor
    assisted = visitor.get("assist_year_before")

    # Candidate sessions are the same for every reference session: build once
    this_year_ids = [
        sid
        for sid, entry in session_embeddings.items()
        if entry["type"] == "Sessions_this_year"
    ]
    if not this_year_ids:
        return []
    this_year_matrix = np.vstack(
        [session_embeddings[sid]["embedding"] for sid in this_year_ids]
    )

    if assisted == "1":
        # Directly using past year attended session
        query_past = """
        MATCH (v:Visitor_this_year {BadgeId: $visitor_id})-[:Same_Visitor]->(vp_bva:Visitor_last_year_bva)-[:attended_session]->(sp_bva:Sessions_past_year)
        RETURN sp_bva.session_id as session_id
        UNION
        MATCH (v:Visitor_this_year {BadgeId: $visitor_id})-[:Same_Visitor]->(vp_lva:Visitor_last_year_lva)-[:attended_session]->(sp_lva:Sessions_past_year)
        RETURN sp_lva.session_id as session_id
        """
        past_rows = tx.run(query_past, visitor_id=visitor_id).data()
        past_session_ids = [row["session_id"] for row in past_rows]
    else:
        # For new visitors, find similar visitor with history
        visitor_attrs = [
            "Days_since_registration",
            "Country",
            "Source",
            "Email_domain",
            "JobTitle",
            "assist_year_before",
            "job_role",
            "what_type_does_your_practice_specialise_in",
            "organisation_type",
        ]

        all_visitors = tx.run(
            """
            MATCH (v:Visitor_this_year)
            WHERE v.assist_year_before = '1'
            RETURN v
        """
        ).data()

        scored_visitors = [
            (row["v"]["BadgeId"], visitor_similarity(visitor, row["v"], visitor_attrs))
            for row in all_visitors
        ]
        scored_visitors.sort(key=lambda pair: -pair[1])

        past_session_ids = []
        for similar_vid, _ in scored_visitors[:similar_visitor_count]:
            query_sim_past = """
                MATCH (v:Visitor_this_year {BadgeId: $similar_vid})-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
                WHERE vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva
                RETURN sp.session_id AS session_id
            """
            similar_rows = tx.run(query_sim_past, similar_vid=similar_vid).data()
            past_session_ids.extend(row["session_id"] for row in similar_rows)

    regular, sponsored = _collect_matches(
        past_session_ids,
        session_embeddings,
        this_year_ids,
        this_year_matrix,
        top_k_per_past_session,
    )

    # Mix of non-sponsored and sponsored content. A session is either
    # sponsored or not, so the two groups never share an id.
    return _best_unique_ids(regular, max_regular) + _best_unique_ids(
        sponsored, max_sponsored
    )


# Scoring function that boosts sponsored sessions slightly
def score_session(
    session_embedding, candidate_embedding, is_sponsored, sponsor_relevance=0.1
):
    """Score a candidate session against a reference session.

    Args:
        session_embedding: Embedding vector of the reference session.
        candidate_embedding: Embedding vector of the candidate session.
        is_sponsored: Sponsored flag of the candidate; counts as sponsored when
            its string form equals "true" ignoring case.
        sponsor_relevance: Amount added to the score of a sponsored candidate.

    Returns:
        The cosine similarity of the two embeddings, plus ``sponsor_relevance``
        when the candidate is sponsored.
    """
    # Base similarity score
    base_score = cosine_similarity([session_embedding], [candidate_embedding])[0][0]

    # Boost for sponsored content (smaller boost to not overwhelm relevance).
    # str() first so a boolean True is treated like the string "True".
    sponsor_boost = sponsor_relevance if str(is_sponsored).lower() == "true" else 0

    return base_score + sponsor_boost

In [None]:
def filter_sessions_by_visitor_stream_relationships(tx, visitor_id, session_ids):
    """Keep only the sessions that share a stream with the visitor.

    A session is kept when it has a ``HAS_STREAM`` relationship to a ``Stream``
    node that the visitor points to through a ``specialization_to_stream`` or
    ``job_to_stream`` relationship.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        visitor_id: ``BadgeId`` of the ``Visitor_this_year`` node.
        session_ids: Candidate ``Sessions_this_year`` ids.

    Returns:
        list: The distinct qualifying session ids, in the order the query
        returns them. Empty when ``session_ids`` is empty (no query is run).
    """
    # No candidates: nothing to filter
    if not session_ids:
        return []

    # Sessions with a stream that the visitor is linked to
    cypher = """
    MATCH (v:Visitor_this_year {BadgeId: $visitor_id})
    MATCH (s:Sessions_this_year)
    WHERE s.session_id IN $session_ids
    MATCH (s)-[:HAS_STREAM]->(stream:Stream)<-[r]-(v)
    WHERE type(r) IN ['specialization_to_stream', 'job_to_stream']
    RETURN DISTINCT s.session_id as session_id
    """

    rows = tx.run(cypher, visitor_id=visitor_id, session_ids=session_ids).data()
    return [row["session_id"] for row in rows]


# Session embeddings shared by all calls; filled on the first call
_session_embeddings = None


def get_recommendations(visitor_id):
    """Return the details of the sessions recommended for one visitor.

    The session embeddings are computed on the first call and kept in the
    module-level ``_session_embeddings`` for later calls. Candidate ids come
    from ``recommend_sessions``, are narrowed by
    ``filter_sessions_by_visitor_stream_relationships`` and are finally
    resolved to their attributes by ``get_session_attributes``.

    Args:
        visitor_id: ``BadgeId`` of the ``Visitor_this_year`` node.

    Returns:
        list: The value returned by ``get_session_attributes`` for the ids
        that passed the stream filter.
    """
    global _session_embeddings

    # Build the embedding cache once
    if _session_embeddings is None:
        with driver.session() as embedding_session:
            _session_embeddings = embedding_session.execute_read(embed_all_sessions)

    with driver.session() as read_session:
        # Candidates from the similarity logic
        candidate_ids = read_session.execute_read(
            recommend_sessions,
            visitor_id=visitor_id,
            session_embeddings=_session_embeddings,
        )

        # Keep only candidates whose stream is linked to the visitor
        kept_ids = read_session.execute_read(
            filter_sessions_by_visitor_stream_relationships,
            visitor_id=visitor_id,
            session_ids=candidate_ids,
        )

    # Resolve the surviving ids to their session attributes
    return get_session_attributes(kept_ids)

In [None]:
# # Cached embeddings at the module level
# _session_embeddings = None

# def get_recommendations(visitor_id):
#     global _session_embeddings

#     # Create session embeddings once if they don't exist
#     if _session_embeddings is None:
#         with driver.session() as session:
#             _session_embeddings = session.execute_read(embed_all_sessions)

#     # Use the cached embeddings for recommendations
#     with driver.session() as session:
#         recommended_session_ids = session.execute_read(
#             recommend_sessions,
#             visitor_id=visitor_id,
#             session_embeddings=_session_embeddings
#         )

#     # Get the details of the recommended sessions
#     recommended_sessions_details = get_session_attributes(recommended_session_ids)

# return recommended_sessions_details

In [None]:
# Example usage:
visitor_id_example = "MV4H3PQ"  #  No visit last year
# Create a new driver: main() closed the one created in the first cell.
driver = GraphDatabase.driver(uri, auth=(username, password))
# Session details recommended for this badge
recommended_sessions = get_recommendations(visitor_id_example)

In [None]:
# Print each recommended session followed by a line of 25 asterisks
for session in recommended_sessions:
    print(f"{session}\n{'*' * 25}")

In [None]:
# BRKLA7T Visit last year
# NOTE(review): the badge used below is "Z642DJP", not the "BRKLA7T" mentioned
# above, and its inline comment says "No visit last year" — confirm which case
# this cell is meant to demonstrate.

visitor_id_example = "Z642DJP"  # Visitor_this_year BadgeId Example No visit last year
# A new driver object is created here as well; the previous one is not closed first.
driver = GraphDatabase.driver(uri, auth=(username, password))
recommended_sessions = get_recommendations(visitor_id_example)

In [None]:
# Print each recommended session followed by a line of 25 asterisks
for session in recommended_sessions:
    print(f"{session}\n{'*' * 25}")

In [None]:
# Path of the CSV whose BadgeId column supplies the badges for the batch run
csv_file_path = "data/bva/output/df_reg_demo_this.csv"

In [None]:
# Load the CSV and display its number of rows
data = pd.read_csv(csv_file_path)
len(data)

In [None]:
# Distinct BadgeId values from the CSV, and how many there are
list_badgeId_this = list(data["BadgeId"].unique())
len(list_badgeId_this)

In [None]:
# Badge id -> recommended session details
pa_recommendations = {}
counter = 0  # Remains 0 when there are no badges to process

for counter, badge in enumerate(list_badgeId_this, start=1):
    recommended_sessions = get_recommendations(badge)
    pa_recommendations[badge] = recommended_sessions

    # Report progress every 30 badges
    if counter % 30 == 0:
        print(f"Processed {counter} badges so far. Current badge: {badge}")

# Total number of badges processed
print(f"Total badges processed: {counter}")

In [None]:
import json

In [None]:
# Save the per-badge recommendations as JSON indented by 4 spaces
with open("data/bva/bva_pa_recomendations.json", "w") as f:
    json.dump(pa_recommendations, f, indent=4)

In [None]:
def transform_recommendations_to_dataframe(pa_recommendations):
    """
    Transforms a dictionary of session recommendations to a Pandas DataFrame.

    Args:
        pa_recommendations (dict): A dictionary where keys are badge IDs
            and values are lists of session dictionaries. Each session
            dictionary contains information about a recommended session; keys
            that are absent from a session become ``None`` in its row.

    Returns:
        pandas.DataFrame: One row per (badge, session) pair with columns
            'badgeid', 'session_id', 'stream', 'title', 'end_time',
            'synopsis_stripped', 'start_time', 'date' and 'theatre__name'.
            The columns are present even when there are no rows, so code that
            selects them keeps working on an empty result.
    """
    columns = [
        "badgeid",
        "session_id",
        "stream",
        "title",
        "end_time",
        "synopsis_stripped",
        "start_time",
        "date",
        "theatre__name",
    ]
    session_fields = columns[1:]

    rows = [
        # .get() so a session missing one of the fields still yields a row
        {"badgeid": badgeid, **{field: session.get(field) for field in session_fields}}
        for badgeid, session_list in pa_recommendations.items()
        for session in session_list
    ]

    # Passing the column list keeps the schema when `rows` is empty
    return pd.DataFrame(rows, columns=columns)


# Flatten the per-badge recommendations into one row per (badge, session)
df = transform_recommendations_to_dataframe(pa_recommendations)

In [None]:
# Preview the first 10 recommendation rows
df.head(10)

In [None]:
import pandas as pd
import numpy as np


#
def flag_overlapping_sessions(df):
    """Record, for each row, the same-badge sessions that overlap it in time.

    Two sessions overlap when each starts strictly before the other ends.
    Start and end instants are parsed from ``date`` combined with
    ``start_time`` / ``end_time``.

    Args:
        df: DataFrame with ``badgeid``, ``session_id``, ``date``,
            ``start_time`` and ``end_time`` columns. It is modified in place.

    Returns:
        The same DataFrame with an ``overlapping_sessions`` column: a
        comma-separated string of the overlapping ``session_id`` values, or
        ``None`` when the row overlaps nothing.
    """
    # Temporary datetime columns used only for the comparison
    df["start_datetime"] = pd.to_datetime(df["date"] + " " + df["start_time"])
    df["end_datetime"] = pd.to_datetime(df["date"] + " " + df["end_time"])

    df["overlapping_sessions"] = None

    for _, visitor_rows in df.groupby("badgeid"):
        # A single session cannot overlap with anything
        if len(visitor_rows) < 2:
            continue

        for label, current in visitor_rows.iterrows():
            # Every other session of this badge whose interval intersects
            # the current one
            clashing = [
                str(other["session_id"])
                for other_label, other in visitor_rows.iterrows()
                if other_label != label
                and current["start_datetime"] < other["end_datetime"]
                and current["end_datetime"] > other["start_datetime"]
            ]
            if clashing:
                df.at[label, "overlapping_sessions"] = ",".join(clashing)

    # Remove the temporary datetime columns again
    df.drop(columns=["start_datetime", "end_datetime"], inplace=True)

    return df


# Apply the function to your DataFrame
# (it adds the overlapping_sessions column to df in place and returns df)
df = flag_overlapping_sessions(df)

In [None]:
# Frequency of each distinct overlapping_sessions value
df.overlapping_sessions.value_counts()

In [None]:
# Preview the first 2 rows, now including overlapping_sessions
df.head(2)

In [None]:
# Load the streams JSON file into `streams`
with open("data/bva/output/streams.json", "r") as f:
    streams = json.load(f)

In [None]:
# Show the top-level keys of the loaded streams data
streams.keys()

In [None]:
# Number of recommendation rows
len(df)

In [None]:
# Write the recommendation rows to CSV without the index column
df.to_csv("data/bva/bva_pa_recomendations.csv", index=False)

In [None]:
# Load the two registration CSV files (one "bva" file, one "lva" file)
df_reg_24_25_bva_valid_columns = pd.read_csv(
    "data/bva/csv/Registration_data_bva_24_25_only_valid.csv"
)
df_reg_24_25_lva_valid_columns = pd.read_csv(
    "data/bva/csv/Registration_data_lva_24_25_only_valid.csv"
)

In [None]:
# Preview the first rows of the bva registration data
df_reg_24_25_bva_valid_columns.head()

In [None]:
# Sets (despite the "list_" names) of the distinct BadgeId values in each file;
# list_dva_24_25 is built from the bva registration frame
list_dva_24_25 = set(df_reg_24_25_bva_valid_columns["BadgeId"].unique())
list_lva_24_25 = set(df_reg_24_25_lva_valid_columns["BadgeId"].unique())

In [None]:
# NOTE(review): `list_vistor` is not assigned anywhere in the visible notebook,
# so this cell would raise NameError on a fresh run unless it is defined
# elsewhere — confirm the intended variable.
list_vistor

In [None]:
# NOTE(review): the column list assigned to data.columns further down has no
# "visit" entry — presumably this lookup raises KeyError; confirm the column name.
data["visit"]

In [None]:
# Show the current column names of `data`
data.columns

In [None]:
# Replace all 16 column names by position. The list contains "badgeid", the key
# used by the merge with the recommendations frame.
# NOTE(review): presumably only "BadgeId" -> "badgeid" actually changes; this
# depends on the CSV having exactly these columns in this order — confirm.
# After this cell `data` no longer has a "BadgeId" column, which the earlier
# cell building list_badgeId_this reads.
data.columns = [
    "Email",
    "Email_domain",
    "Company",
    "JobTitle",
    "Country",
    "BadgeType",
    "ShowRef",
    "badgeid",
    "Source",
    "Days_since_registration",
    "assist_year_before",
    "BadgeId_last_year_bva",
    "BadgeId_last_year_lva",
    "what_type_does_your_practice_specialise_in",
    "organisation_type",
    "job_role",
]

In [None]:
# Attach the visitor columns from `data` to every recommendation row;
# a left join keeps all rows of df
data_final = df.merge(data, on=["badgeid"], how="left")

In [None]:
# Write the merged recommendations + visitor data to CSV without the index column
data_final.to_csv("data/bva/bva_pa_recomendations_with_demo.csv", index=False)

In [None]:
## OLD Stuff

In [None]:
# from graphdatascience import GraphDataScience

# # Connect to Neo4j (Please update URI, USER and PASSWORD )
# NEO4J_URI =  "bolt://127.0.0.1:7687"
# USER = "neo4j"
# PASSWORD = ""

# # Connect to Neo4j
# gds = GraphDataScience(NEO4J_URI, auth=(USER, PASSWORD))

# # Check existing relationships
# existing_rels = gds.run_cypher("CALL db.relationshipTypes()")
# print("Existing Relationship Types in database:")
# print(existing_rels)

In [None]:
# # Adjust these variables exactly as they appear in the print statement above
# SAME_PERSON_REL = "Same_Visitor"          # <-- adjust as per exact output
# ATTENDED_SESSION_REL = "attended_session" # <-- adjust as per exact output
# HAS_STREAM_REL = "HAS_STREAM"             # <-- adjust as per exact output

# graph_name = "visitor_session_graph"

In [None]:
# # Drop the graph if previously exists
# if gds.graph.exists(graph_name)["exists"]:
#     gds.graph.drop(graph_name)

In [None]:
# MATCH (node:Label)
# WHERE node.propertyName = "Embeddings"
# MATCH (node:Visitor_last_year)
# SET node.Embeddings = null

In [None]:
# # Re-create graph with correctly adjusted relationship names
# try:
#     g, result = gds.graph.project(
#         graph_name,
#         ['Visitor_this_year'],
#         {
#             SAME_PERSON_REL: {'orientation': 'UNDIRECTED'},

#         }
#     )
#     print(f"Graph '{graph_name}' projected successfully:")
#     print(result)

# except Exception as e:
#     print("Failed to project graph:")
#     print(e)

In [None]:
# # STEP 2 - FastRP Embeddings for each Node Type individually
# EMBEDDING_SIZE = 128

# # Function to generate embeddings
# def generate_embeddings(label, embedding_property):
#     result = gds.fastRP.write(
#         g,
#         embeddingDimension=EMBEDDING_SIZE,
#         iterationWeights=[0.8, 1, 1, 1],
#         nodeLabels=[label],
#         writeProperty=embedding_property
#     )
#     print(f"Embeddings created for nodes '{label}' written to property '{embedding_property}':")
#     print(result)

In [None]:
# Visitor_last_year embeddings
# generate_embeddings("Visitor_this_year", "Embeddings")