In [None]:
# Import required libraries
import os

import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Connection settings. Each value can be overridden through an environment
# variable so real credentials never have to be written into the notebook;
# the fallbacks are the original local-development values.
uri = os.environ.get("NEO4J_URI", "bolt://127.0.0.1:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "")  # Set NEO4J_PASSWORD or replace with your password.

# Initialize Neo4j driver
driver = GraphDatabase.driver(uri, auth=(username, password))

# Load the sentence-embedding model used to encode session text
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Fallback value for each visitor property (property name -> default).
# Every value is stored as a string.
default_properties = dict(
    Days_since_registration="119",
    Country="UK",
    Source="BVA Key Stakeholders",
    Email_domain="effem.com",
    assist_year_before="1",
    job_role="NA",
    what_type_does_your_practice_specialise_in="NA",
    organisation_type="NA",
)


def set_default_properties(tx, properties):
    """Set default values on Visitor_this_year nodes for properties they lack.

    For every key in ``properties`` the node property is assigned
    ``COALESCE(n.<key>, $props.<key>)``, so existing values are kept and only
    missing (null) ones receive the default.

    Args:
        tx: Neo4j transaction (anything exposing ``run(query, **params)``).
        properties: Mapping of property name -> default value.

    Raises:
        ValueError: If a property name is not a plain identifier. Names are
            spliced into the Cypher text (they cannot be passed as query
            parameters), so anything else is rejected up front.
    """
    # With nothing to set the statement would end in a dangling "SET", which
    # is a Cypher syntax error - there is simply no work to do.
    if not properties:
        return

    invalid = [key for key in properties if not str(key).isidentifier()]
    if invalid:
        raise ValueError(f"Invalid property name(s) for Cypher query: {invalid!r}")

    assignments = ",\n    ".join(
        f"n.{key} = COALESCE(n.{key}, $props.{key})" for key in properties
    )
    query = """
    MATCH (n:Visitor_this_year)
    SET
    """ + assignments

    tx.run(query, props=properties)


def main():
    """Write the default property values to all Visitor_this_year nodes.

    Runs ``set_default_properties`` with ``default_properties`` inside a
    managed write transaction on the module-level ``driver``.
    """
    with driver.session() as session:
        session.execute_write(set_default_properties, default_properties)
    # The module-level driver is shared by the functions that query Neo4j
    # later in the notebook, so it is deliberately left open here; closing it
    # would make this function (and those cells) fail on the next call.
    print("Missing properties set to default values for all Visitor_this_year nodes.")

In [None]:
# Apply the default visitor properties (this writes to the Neo4j database).
main()

In [None]:
def get_session_attributes(session_ids):
    """Fetch display attributes for the given this-year sessions.

    Args:
        session_ids: List of ``session_id`` values to look up on
            ``Sessions_this_year`` nodes.

    Returns:
        A list with one ``session_details`` map per matching node, containing
        stream, session_id, title, synopsis_stripped, end_time, start_time,
        date and theatre__name. Ids without a matching node are simply absent.
    """
    # Nothing to look up: skip the database round trip entirely.
    if not session_ids:
        return []

    query = """
    MATCH (s:Sessions_this_year)
    WHERE s.session_id IN $session_ids
    RETURN s {
        .stream,
        .session_id,
        .title,
        .synopsis_stripped,
        .end_time,
        .start_time,
        .date,
        .theatre__name
    } AS session_details
    """

    with driver.session() as session:
        results = session.run(query, session_ids=session_ids)
        # Materialise inside the block: the result cannot be read once the
        # session has been closed.
        session_details = [record["session_details"] for record in results]

    return session_details


# define session embedding
def create_session_embedding(session):
    """Encode a session's descriptive text into an embedding vector.

    The text is the title, stream, synopsis and theatre name joined by single
    spaces, passed to the module-level ``model``.

    Args:
        session: Mapping with the keys ``title``, ``stream``,
            ``synopsis_stripped`` and ``theatre__name``.

    Returns:
        Whatever ``model.encode`` returns for the assembled text.
    """
    fields = ("title", "stream", "synopsis_stripped", "theatre__name")
    # Skip null fields: formatting None directly would inject the literal
    # word "None" into the text being embedded. When every field is present
    # the text is identical to a plain space-separated join.
    parts = [str(session[field]) for field in fields if session[field] is not None]
    text = " ".join(parts)
    return model.encode(text)


# Embedding function for all sessions
def embed_all_sessions(tx):
    """Build an embedding for every past-year and this-year session.

    Args:
        tx: Neo4j transaction used to read the session nodes.

    Returns:
        Dict keyed by ``session_id``; each value holds ``"type"`` (the first
        label reported for the node) and ``"embedding"`` (the vector produced
        by ``create_session_embedding``).
    """
    # Pull the text fields of all sessions, past and current, in one query
    cypher = """
    MATCH (s)
    WHERE s:Sessions_past_year OR s:Sessions_this_year
    RETURN s.session_id as session_id, s.title as title, 
           s.stream as stream, s.synopsis_stripped as synopsis_stripped,
           s.theatre__name as theatre__name, labels(s)[0] as type
    """

    rows = tx.run(cypher).data()

    return {
        row["session_id"]: {
            "type": row["type"],
            "embedding": create_session_embedding(row),
        }
        for row in rows
    }


# visitor-based similarity
def visitor_similarity(v1, v2, attrs):
    """Return the fraction of ``attrs`` on which two visitors agree.

    Args:
        v1, v2: Mapping-like visitors supporting ``visitor[attr]`` lookup.
        attrs: Attribute names to compare.

    Returns:
        A value in [0, 1]: the share of attributes whose values are equal.
        Returns 0.0 when ``attrs`` is empty (the mean of an empty array would
        be NaN and would make any ranking built on the result meaningless).
    """
    attrs = list(attrs)
    if not attrs:
        return 0.0
    matches = np.array([1 if v1[a] == v2[a] else 0 for a in attrs])
    return matches.mean()


# Recommend sessions
_TOP_SESSIONS_PER_PAST_SESSION = 2  # this-year sessions kept per attended past session
_TOP_SIMILAR_VISITORS = 2  # returning visitors used as a proxy for a new visitor


def _this_year_sessions(session_embeddings):
    """Return this-year session ids and their embeddings as a parallel 2-D array."""
    ids = [
        sid
        for sid, data in session_embeddings.items()
        if data["type"] == "Sessions_this_year"
    ]
    matrix = np.array([session_embeddings[sid]["embedding"] for sid in ids])
    return ids, matrix


def _past_session_ids(tx, badge_id):
    """Return the ids of past-year sessions attended by this visitor's last-year record(s)."""
    query = """
    MATCH (v:Visitor_this_year {BadgeId: $badge_id})-[:Same_Visitor]->(vp)-[:attended_session]->(sp:Sessions_past_year)
    WHERE vp:Visitor_last_year_bva OR vp:Visitor_last_year_lva
    RETURN DISTINCT sp.session_id AS session_id
    """
    return [row["session_id"] for row in tx.run(query, badge_id=badge_id).data()]


def _most_similar_returning_visitors(tx, visitor, top_k):
    """Return BadgeIds of the ``top_k`` returning visitors most similar to ``visitor``."""
    visitor_attrs = [
        "Days_since_registration",
        "Country",
        "Source",
        "Email_domain",
        "assist_year_before",
        "job_role",
        "what_type_does_your_practice_specialise_in",
        "organisation_type",
    ]

    candidates = tx.run(
        """
         MATCH (v:Visitor_this_year)
         WHERE v.assist_year_before = '1'
         RETURN v
    """
    ).data()

    scored = [
        (row["v"]["BadgeId"], visitor_similarity(visitor, row["v"], visitor_attrs))
        for row in candidates
    ]
    # Stable sort: equally similar candidates keep their query order.
    scored.sort(key=lambda pair: -pair[1])
    return [badge_id for badge_id, _ in scored[:top_k]]


def _top_similar_sessions(past_emb, this_year_ids, this_year_matrix, top_k):
    """Return the ``top_k`` this-year session ids closest to ``past_emb`` by cosine similarity."""
    # One vectorised call scores the past session against every this-year session.
    scores = cosine_similarity([past_emb], this_year_matrix)[0]
    # Stable descending order, so ties resolve in the order of this_year_ids.
    ranked = np.argsort(-scores, kind="stable")[:top_k]
    return [this_year_ids[i] for i in ranked]


def recommend_sessions(tx, visitor_id, session_embeddings):
    """Recommend this-year sessions for a visitor.

    Returning visitors (``assist_year_before == "1"``) are matched on the
    sessions they attended last year. Other visitors borrow the history of the
    most similar returning visitors (compared with ``visitor_similarity``).
    For each past session the most similar this-year sessions (by cosine
    similarity of their embeddings) are recommended.

    Args:
        tx: Neo4j transaction used for all lookups.
        visitor_id: ``BadgeId`` of the ``Visitor_this_year`` node.
        session_embeddings: Dict as produced by ``embed_all_sessions``:
            ``session_id -> {"type": label, "embedding": vector}``.

    Returns:
        De-duplicated list of recommended this-year session ids (order is not
        meaningful). Empty if the visitor does not exist, there are no
        this-year sessions, or no usable attendance history is found.
    """
    visitor_query = """
    MATCH (v:Visitor_this_year {BadgeId: $visitor_id})
    RETURN v
    """
    visitor_data = tx.run(visitor_query, visitor_id=visitor_id).single()
    if not visitor_data:
        return []
    visitor = visitor_data["v"]

    # The candidate pool is the same for every past session: build it once.
    this_year_ids, this_year_matrix = _this_year_sessions(session_embeddings)
    if not this_year_ids:
        return []

    if visitor["assist_year_before"] == "1":
        # Returning visitor: use their own history.
        history_badge_ids = [visitor_id]
    else:
        # New visitor: use the history of the most similar returning visitors.
        history_badge_ids = _most_similar_returning_visitors(
            tx, visitor, _TOP_SIMILAR_VISITORS
        )

    recommendations = set()
    for badge_id in history_badge_ids:
        for past_id in _past_session_ids(tx, badge_id):
            past = session_embeddings.get(past_id)
            if past is None:
                # Attended session without an embedding (e.g. null or unknown
                # session_id): skip it rather than fail the whole request.
                continue
            recommendations.update(
                _top_similar_sessions(
                    past["embedding"],
                    this_year_ids,
                    this_year_matrix,
                    _TOP_SESSIONS_PER_PAST_SESSION,
                )
            )

    return list(recommendations)

In [None]:
def get_recommendations(visitor_id, session_embeddings=None):
    """Return the details of the sessions recommended for one visitor.

    Args:
        visitor_id: ``BadgeId`` of the ``Visitor_this_year`` node.
        session_embeddings: Optional precomputed result of
            ``embed_all_sessions``. Embedding every session is the expensive
            step and does not depend on the visitor, so callers processing
            many visitors can compute it once and pass it in. When omitted it
            is computed on each call, as before.

    Returns:
        List of session detail maps as returned by ``get_session_attributes``.
    """
    with driver.session() as session:
        if session_embeddings is None:
            session_embeddings = session.execute_read(embed_all_sessions)
        recommended_session_ids = session.execute_read(
            recommend_sessions,
            visitor_id=visitor_id,
            session_embeddings=session_embeddings,
        )

    # get_session_attributes opens its own session, so there is no need to
    # keep the one above open while it runs.
    return get_session_attributes(recommended_session_ids)

In [None]:
# Example usage: recommendations for a single visitor.
visitor_id_example = "MV4H3PQ"  #  No visit last year
# Bind a fresh driver so this cell does not depend on the state of the one
# created at the top of the notebook (e.g. if it has been closed since).
driver = GraphDatabase.driver(uri, auth=(username, password))
recommended_sessions = get_recommendations(visitor_id_example)

In [None]:
# Show each recommended session followed by a separator line
for session in recommended_sessions:
    print(session, "*" * 25, sep="\n")

In [None]:
# BRKLA7T Visit last year
# NOTE(review): the line above names BRKLA7T as a visitor who attended last
# year, but the id actually used below is Z642DJP, annotated as "No visit last
# year" - confirm which case this cell is meant to demonstrate.

visitor_id_example = "Z642DJP"  # Visitor_this_year BadgeId Example No visit last year
# Bind a fresh driver so this cell does not depend on the state of any driver
# created by an earlier cell.
driver = GraphDatabase.driver(uri, auth=(username, password))
recommended_sessions = get_recommendations(visitor_id_example)

In [None]:
# Show each recommended session followed by a separator line
for session in recommended_sessions:
    print(session, "*" * 25, sep="\n")

In [None]:
# CSV read in the next cell; its BadgeId column supplies the visitors to process.
# The path is relative to the notebook's working directory.
csv_file_path = "data/bva/output/df_reg_demo_this.csv"

In [None]:
# Load the CSV and display the number of rows.
data = pd.read_csv(csv_file_path)
len(data)

In [None]:
# Distinct BadgeId values (one entry per visitor, in order of first
# appearance); the cell displays how many there are.
list_badgeId_this = list(data["BadgeId"].unique())
len(list_badgeId_this)

In [None]:
pa_recommendations = {}  # BadgeId -> list of recommended session detail maps
counter = 0  # Number of badges processed so far

with driver.session() as session:
    # Embedding every session is by far the most expensive step and does not
    # depend on the visitor, so it is done once here instead of once per badge.
    session_embeddings = session.execute_read(embed_all_sessions)

    for counter, badge in enumerate(list_badgeId_this, start=1):
        recommended_session_ids = session.execute_read(
            recommend_sessions,
            visitor_id=badge,
            session_embeddings=session_embeddings,
        )
        recommended_sessions = get_session_attributes(recommended_session_ids)
        pa_recommendations[badge] = recommended_sessions

        # Report progress every 30 badges
        if counter % 30 == 0:
            print(f"Processed {counter} badges so far. Current badge: {badge}")

# Print the total count at the end
print(f"Total badges processed: {counter}")

In [None]:
def transform_recommendations_to_dataframe(pa_recommendations):
    """
    Transforms a dictionary of session recommendations to a Pandas DataFrame.

    Args:
        pa_recommendations (dict): A dictionary where keys are badge IDs (strings)
            and values are lists of session dictionaries.  Each session dictionary
            contains information about a recommended session.

    Returns:
        pandas.DataFrame: One row per (badge, recommended session) with columns
            'badgeid', 'session_id', 'stream', 'title', 'end_time',
            'synopsis_stripped', 'start_time', 'date' and 'theatre__name'.
            Fields missing from a session dictionary are None. The columns are
            present even when there are no rows (empty input, or badges with
            no recommendations).
    """
    session_fields = [
        "session_id",
        "stream",
        "title",
        "end_time",
        "synopsis_stripped",
        "start_time",
        "date",
        "theatre__name",
    ]

    rows = [
        # .get() so a session lacking a field yields None instead of raising
        {"badgeid": badgeid, **{field: session.get(field) for field in session_fields}}
        for badgeid, session_list in pa_recommendations.items()
        for session in session_list
    ]

    # Passing the columns explicitly keeps the schema stable when `rows` is
    # empty; without it an empty input produces a DataFrame with no columns.
    return pd.DataFrame(rows, columns=["badgeid", *session_fields])


# Flatten the per-badge recommendations into one row per (badge, session).
df = transform_recommendations_to_dataframe(pa_recommendations)

In [None]:
# Preview the first rows of the recommendations table.
df.head()

In [None]:
# Total number of (badge, session) recommendation rows.
len(df)

In [None]:
# Persist the recommendations. index=True also writes the DataFrame's row
# index as the first (unnamed) CSV column.
# NOTE(review): "recomendations" in the file name is misspelled; left as is
# because something else may already read this exact path - confirm before renaming.
df.to_csv("data/bva/bva_pa_recomendations.csv", index=True)

In [None]:
## OLD Stuff

In [None]:
# from graphdatascience import GraphDataScience

# # Connect to Neo4j (please update NEO4J_URI, USER and PASSWORD)
# NEO4J_URI =  "bolt://127.0.0.1:7687"
# USER = "neo4j"
# PASSWORD = ""

# # Connect to Neo4j
# gds = GraphDataScience(NEO4J_URI, auth=(USER, PASSWORD))

# # Check existing relationships
# existing_rels = gds.run_cypher("CALL db.relationshipTypes()")
# print("Existing Relationship Types in database:")
# print(existing_rels)

In [None]:
# # Adjust these variables exactly as they appear in the print statement above
# SAME_PERSON_REL = "Same_Visitor"          # <-- adjust as per exact output
# ATTENDED_SESSION_REL = "attended_session" # <-- adjust as per exact output
# HAS_STREAM_REL = "HAS_STREAM"             # <-- adjust as per exact output

# graph_name = "visitor_session_graph"

In [None]:
# # Drop the graph if it already exists
# if gds.graph.exists(graph_name)["exists"]:
#     gds.graph.drop(graph_name)

In [None]:
# MATCH (node:Label)
# WHERE node.propertyName = "Embeddings"
# MATCH (node:Visitor_last_year)
# SET node.Embeddings = null

In [None]:
# # Re-create graph with correctly adjusted relationship names
# try:
#     g, result = gds.graph.project(
#         graph_name,
#         ['Visitor_this_year'],
#         {
#             SAME_PERSON_REL: {'orientation': 'UNDIRECTED'},

#         }
#     )
#     print(f"Graph '{graph_name}' projected successfully:")
#     print(result)

# except Exception as e:
#     print("Failed to project graph:")
#     print(e)

In [None]:
# # STEP 2 - FastRP Embeddings for each Node Type individually
# EMBEDDING_SIZE = 128

# # Function to generate embeddings
# def generate_embeddings(label, embedding_property):
#     result = gds.fastRP.write(
#         g,
#         embeddingDimension=EMBEDDING_SIZE,
#         iterationWeights=[0.8, 1, 1, 1],
#         nodeLabels=[label],
#         writeProperty=embedding_property
#     )
#     print(f"Embeddings created for nodes '{label}' written to property '{embedding_property}':")
#     print(result)

In [None]:
# Visitor_this_year embeddings
# generate_embeddings("Visitor_this_year", "Embeddings")