# This notebook prepares session data for BVA (this year and last year) and LVA (last year)

### Needed Files

- BVA25_session_export.csv # This file is refreshed every week
- BVA24_session_export.csv
- LVA24_session_export.csv 

In [None]:
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import logging
import warnings
from dotenv import load_dotenv
from dotenv import dotenv_values
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

# Silence pandas' SettingWithCopyWarning for the rest of the notebook.
warnings.simplefilter("ignore", category=SettingWithCopyWarning)

In [None]:
# Locations of the three session exports that are read below.
session_this_path = "data/bva/BVA25_session_export.csv"
session_past_path_bva = "data/bva/BVA24_session_export.csv"
# NOTE: the LVA export is read from the data/bva/ folder as well.
session_past_path_lva = "data/bva/LVA24_session_export.csv"

In [None]:
# Load the three exports, in the same order the path variables are listed.
session_this, session_past_bva, session_past_lva = (
    pd.read_csv(csv_path)
    for csv_path in (session_this_path, session_past_path_bva, session_past_path_lva)
)

In [None]:
# Row counts of the raw exports, in load order.
tuple(map(len, (session_this, session_past_bva, session_past_lva)))

In [None]:
import string


def clean_text(text):
    """Normalise a title into a compact key.

    Only alphanumeric characters (as judged by ``str.isalnum``) are kept and
    the result is lowercased, so punctuation, whitespace and symbols vanish.

    Args:
        text: The raw title. Anything that is not a ``str`` (for example
            ``None`` or the float ``NaN`` pandas uses for an empty cell) is
            treated as a missing title.

    Returns:
        The lowercased, alphanumeric-only string, or ``""`` when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        # A float NaN is not iterable, so the join below would raise TypeError.
        return ""
    return "".join(char for char in text if char.isalnum()).lower()

In [None]:
# Inspect the distinct session titles present in the current export.
session_this.title.unique()

In [None]:
# Title fragments that mark sessions to be excluded (matched case-insensitively later on).
titles_to_remove = ["exhibitor showcase", "tbc session", "-"]
title_remove1, title_remove2, title_remove3 = titles_to_remove

In [None]:
# Start from the raw exports and narrow each one down cumulatively.
# Filtering the raw frame again on every pass would discard the previous
# passes, leaving only the last pattern in `titles_to_remove` applied.
session_this_filtered = session_this
session_last_filtered_bva = session_past_bva
session_last_filtered_lva = session_past_lva

# NOTE(review): matching is by substring, so the "-" entry drops every title
# that contains a hyphen anywhere, not only titles that are exactly "-".
# Confirm this is intended.
for title in titles_to_remove:
    session_this_filtered = session_this_filtered[
        ~(session_this_filtered["title"].str.contains(title, case=False, na=False))
    ]

    session_last_filtered_bva = session_last_filtered_bva[
        ~(session_last_filtered_bva["title"].str.contains(title, case=False, na=False))
    ]
    session_last_filtered_lva = session_last_filtered_lva[
        ~(session_last_filtered_lva["title"].str.contains(title, case=False, na=False))
    ]

# Detach the results from the raw frames so columns can be added to them
# without touching (or warning about) the original exports.
session_this_filtered = session_this_filtered.copy()
session_last_filtered_bva = session_last_filtered_bva.copy()
session_last_filtered_lva = session_last_filtered_lva.copy()

In [None]:
# Row counts after the title filter.
# 16042025 (88, 208, 701)
tuple(
    len(frame)
    for frame in (
        session_this_filtered,
        session_last_filtered_bva,
        session_last_filtered_lva,
    )
)

# create key to join scan data

In [None]:
# Add the normalised-title column "key_text" to each filtered frame.
for frame in (
    session_this_filtered,
    session_last_filtered_bva,
    session_last_filtered_lva,
):
    frame["key_text"] = frame["title"].apply(clean_text)

In [None]:
# Two rows count as the same session when they share a date and a normalised title.
dedupe_on = ["date", "key_text"]

session_this_filtered = session_this_filtered.drop_duplicates(subset=dedupe_on)
session_last_filtered_bva = session_last_filtered_bva.drop_duplicates(subset=dedupe_on)
session_last_filtered_lva = session_last_filtered_lva.drop_duplicates(subset=dedupe_on)

# Row counts after de-duplication.
(
    len(session_this_filtered),
    len(session_last_filtered_bva),
    len(session_last_filtered_lva),
)

In [None]:
# List the available columns before choosing which ones to keep.
session_this_filtered.columns

In [None]:
# Columns retained in every output frame; "key_text" is the normalised title
# added earlier in this notebook, the rest are read from the exports.
cols_to_keep = [
    "session_id",
    "date",
    "start_time",
    "end_time",
    "theatre__name",
    "title",
    "stream",
    "synopsis_stripped",
    "sponsored_session",
    "sponsored_by",
    "key_text",
]

In [None]:
# Project each filtered frame onto the shared column list.
session_this_filtered_valid_cols = session_this_filtered.loc[:, cols_to_keep]
session_last_filtered_valid_cols_bva = session_last_filtered_bva.loc[:, cols_to_keep]
session_last_filtered_valid_cols_lva = session_last_filtered_lva.loc[:, cols_to_keep]

In [None]:
# Every missing cell becomes the same textual placeholder.
placeholder = "No Data"

session_this_filtered_valid_cols = session_this_filtered_valid_cols.fillna(
    value=placeholder
)
session_last_filtered_valid_cols_bva = session_last_filtered_valid_cols_bva.fillna(
    value=placeholder
)
session_last_filtered_valid_cols_lva = session_last_filtered_valid_cols_lva.fillna(
    value=placeholder
)

In [None]:
# Keep only rows whose title is something other than a lone "-".
session_this_filtered_valid_cols = session_this_filtered_valid_cols[
    session_this_filtered_valid_cols["title"] != "-"
]
session_last_filtered_valid_cols_bva = session_last_filtered_valid_cols_bva[
    session_last_filtered_valid_cols_bva["title"] != "-"
]
session_last_filtered_valid_cols_lva = session_last_filtered_valid_cols_lva[
    session_last_filtered_valid_cols_lva["title"] != "-"
]

In [None]:
# Row counts of the cleaned frames.
tuple(
    len(frame)
    for frame in (
        session_this_filtered_valid_cols,
        session_last_filtered_valid_cols_bva,
        session_last_filtered_valid_cols_lva,
    )
)

In [None]:
# Stack the cleaned frames (past BVA, past LVA, then current) under a fresh 0..n-1 index.
frames_in_order = [
    session_last_filtered_valid_cols_bva,
    session_last_filtered_valid_cols_lva,
    session_this_filtered_valid_cols,
]
total_sessions = pd.concat(frames_in_order, ignore_index=True)
len(total_sessions)

In [None]:
# session_last_filtered_valid_cols.title.unique()

# Generate list of Valid Streams

In [None]:
# Distinct raw values of the "stream" column for each frame.
list_stream_this = session_this_filtered_valid_cols["stream"].unique().tolist()
list_stream_last_bva = session_last_filtered_valid_cols_bva["stream"].unique().tolist()
list_stream_last_lva = session_last_filtered_valid_cols_lva["stream"].unique().tolist()

In [None]:
# Display the raw stream values found in the past LVA frame.
list_stream_last_lva

In [None]:
# Accumulator for the distinct stream labels; filled by generate_streams below.
streams = set()


def generate_streams(streams, list_streams):
    """Add every normalised stream label found in ``list_streams`` to ``streams``.

    One entry may carry several labels separated by ``;``. Each label is
    lowercased and stripped of surrounding whitespace before it is stored.

    Args:
        streams: Set that collects the labels. It is updated in place.
        list_streams: Iterable of raw stream strings. Entries that are not
            strings (for example NaN for an empty cell) are skipped.

    Returns:
        The same ``streams`` set that was passed in.
    """
    for raw_entry in list_streams:
        if not isinstance(raw_entry, str):
            continue
        for piece in raw_entry.split(";"):
            label = piece.lower().strip()
            # A trailing or doubled ";" produces an empty piece, which is not a label.
            if label:
                streams.add(label)
    return streams

In [None]:
# Feed the stream values of all three frames into the shared set.
for stream_values in (list_stream_this, list_stream_last_bva, list_stream_last_lva):
    streams = generate_streams(streams, stream_values)

In [None]:
# Display the collected stream labels.
streams

### Create a dictionary keyed by stream name whose value is the concatenation of the title and synopsis_stripped of every session tagged with that stream

In [None]:
def generate_stream_descriptions(df, streams):
    """Collect, for each stream, the text of every session tagged with it.

    Args:
        df: DataFrame with the columns ``stream`` (labels separated by ``;``),
            ``title`` and ``synopsis_stripped``.
        streams: Iterable of normalised (lowercased, stripped) stream labels
            to collect text for. The label ``"no data"`` is ignored. The
            argument itself is not modified.

    Returns:
        A dict mapping each requested stream to the concatenation of
        ``"Title: ...\\nDescription: ... \\n\\n "`` snippets of its sessions, in
        row order. A stream without any session maps to ``""``.
    """
    # Skip the placeholder label without removing it from the caller's set.
    collected = {stream: [] for stream in streams if stream != "no data"}

    for raw_streams, title, synopsis in zip(
        df["stream"], df["title"], df["synopsis_stripped"]
    ):
        if not isinstance(raw_streams, str):
            # A missing stream cell carries no labels to attach the session to.
            continue

        session_description = f"Title: {title}.\nDescription: {synopsis} \n\n "

        # dict.fromkeys drops repeated labels while keeping first-seen order,
        # so a session listed twice under one stream is only added once.
        labels = dict.fromkeys(part.lower().strip() for part in raw_streams.split(";"))
        for label in labels:
            if label in collected:
                collected[label].append(session_description)

    return {stream: "".join(snippets) for stream, snippets in collected.items()}


# Build the per-stream session text from the combined sessions frame and the
# collected stream labels.
stream_descriptions_last = generate_stream_descriptions(
    df=total_sessions, streams=streams
)

In [None]:
# Show the text gathered for one stream; raises KeyError if "anaesthesia" is not among the keys.
print(stream_descriptions_last["anaesthesia"])

In [None]:
# Both helpers read the same file: the result of load_dotenv is kept in
# `status`, and dotenv_values supplies the mapping used as `config`.
env_file = ".env"
status = load_dotenv(env_file)
config = dotenv_values(env_file)

In [None]:
# Fail fast when the key is absent or empty, but never echo the secret itself:
# a bare expression at the end of a cell is rendered into the notebook output,
# which is stored in the notebook file.
if not config.get("OPENAI_API_KEY"):
    raise KeyError("OPENAI_API_KEY")

In [None]:
# Chat model used to write the stream descriptions. The API key comes from the
# values read from .env; temperature and top_p are passed through to the model.
llm = ChatOpenAI(
    model="gpt-4.1-mini",
    openai_api_key=config["OPENAI_API_KEY"],
    temperature=0.5,
    top_p=0.9,
)

In [None]:
def generate_prompt():
    """Return the instruction text used when describing a category label.

    The text tells the model that it will receive titles and synopses of
    sessions filed under one category and must write a description of that
    category label.
    """
    prompt_lines = (
        "\n",
        "    you are an assistant specialized in create a definition from a given category label. You will receive the title and sinopsip of diferent session of an event under that category\n",
        "    and based on that information you will prepare a description of the category label\n",
        "    \n",
        "    ",
    )
    return "".join(prompt_lines)

In [None]:
# Build the instruction text once and display it for review.
system_prompt = generate_prompt()
system_prompt

In [None]:
# Template = instruction text followed by the request. It has two placeholders:
# {key} (the category label) and {text} (the session titles and descriptions).
prompt = PromptTemplate(
    # The declared names must match the placeholders exactly; a trailing space
    # ("key ") names a variable that does not exist in the template.
    input_variables=["key", "text"],
    template=system_prompt
    + """Produce a description of the category: {key} based on the title and descriptions of the folowing session events {text}.\n Produce a description in 3 or 4 sentences of that category""",
)

In [None]:
# Pipe the prompt into the model to form a chain.
chain = prompt | llm
# NOTE(review): the label says "profile", but the number printed is the length
# of the prompt template text.
print(f"length of profile: {len(prompt.template)}")

In [None]:
def generate_description(llm, key, text, system_prompt):
    """Ask the model for a short description of one category.

    Args:
        llm: The model object the prompt is piped into.
        key: Category label to describe; fills the ``{key}`` placeholder.
        text: Titles and descriptions of the sessions in that category;
            fills the ``{text}`` placeholder.
        system_prompt: Instruction text placed in front of the request. It is
            concatenated into the template itself, so any literal braces in
            it would be read as placeholders.

    Returns:
        The ``content`` attribute of the message returned by the chain.
    """
    prompt = PromptTemplate(
        # The declared names must match the {key} and {text} placeholders
        # exactly; "key " (with a trailing space) is a different name.
        input_variables=["key", "text"],
        template=system_prompt
        + """Produce a description of the category: {key} based on the title and descriptions of the folowing session events {text}.\n Produce a description in 3 or 4 sentences of that category""",
    )
    chain = prompt | llm
    ai_msg = chain.invoke({"key": key, "text": text})
    return ai_msg.content

In [None]:
# Run the helper once for the "anaesthesia" stream and display the result.
generate_description(
    llm=llm,
    key="anaesthesia",
    text=stream_descriptions_last["anaesthesia"],
    system_prompt=system_prompt,
)

In [None]:
# From here on `streams` is a dict: stream name -> generated description.
streams = {}
for stream, session_text in stream_descriptions_last.items():
    print(stream)  # progress indicator, one line per model call
    streams[stream] = generate_description(
        llm=llm,
        key=stream,
        text=session_text,
        system_prompt=system_prompt,
    )

In [None]:
# Print each stream name, followed by a row of asterisks, then its description.
for s, description in streams.items():
    print(f"{s} " + "*" * 25)
    print(description)

In [None]:
import json


# `streams` is the dict of stream name -> description built above (no longer
# the set of labels), so it can be serialised directly.
serialized_streams = json.dumps(streams)

# Write the mapping to the output folder.
with open("data/bva/output/streams.json", "w") as out_file:
    out_file.write(serialized_streams)

In [None]:
# Preview the first rows of the cleaned past-BVA frame.
session_last_filtered_valid_cols_bva.head()

# Map abbreviations to full names

Transform the sponsors' abbreviations into their full names.

In [None]:
def find_short_labels(input_set, max_length=5):
    """
    Collects the string labels that are at most ``max_length`` characters long.

    Args:
      input_set: An iterable of labels. Items that are not strings are ignored.
      max_length: Longest label length that still counts as short. Defaults
        to 5.

    Returns:
      A list with the qualifying labels, in the iteration order of
      ``input_set``.
    """
    return [
        label
        for label in input_set
        if isinstance(label, str) and len(label) <= max_length
    ]

In [None]:
# Distinct "sponsored_by" values per frame (sets, despite the `list_` prefix).
list_bva_this = set(session_this_filtered_valid_cols["sponsored_by"].unique())
list_lva_last = set(session_last_filtered_valid_cols_lva["sponsored_by"].unique())
list_bva_last = set(session_last_filtered_valid_cols_bva["sponsored_by"].unique())

In [None]:
# Every sponsor value seen in any of the three frames.
full_list_sponsors = list_bva_this | list_lva_last | list_bva_last

In [None]:
# full_list_sponsors

In [None]:
# Sponsor values short enough to be treated as abbreviations (a set).
list_abreviatures = set(find_short_labels(full_list_sponsors))

In [None]:
# Sponsor value as it appears in the data -> full name written to the output.
# Extend this dictionary whenever a new abbreviation appears.
map_vets = {
    "RCVS": "Royal College of Veterinary Surgeons",
    "VPG": "Veterinary Pathology Group",
    "IDEXX": "IDEXX Laboratories",
    "VCMS": "Veterinary Client Mediation Service",
    "NVS": "National Veterinary Services",
    "ACD": "ACD Labs",
    "VMD": "The Veterinary Medicines Directorate",
    "TVM": "TVM Uk",
    "IVAMP": "International Veterinary Academy of Pain Management",
    "JAK": "JAK Marketing",
    "DMS": "DMS Veterinary",
    "VRS": "Veterinarian Recommended Solutions",
    # Placeholder written for empty cells, i.e. sessions without a sponsor.
    "No Data": "Not Sponsored",
    "BVZS": "British Veterinary Zoological Society",
    "BSAVA": "British Small Animal Veterinary Association",
    # Was expanded to an unrelated auctioneering firm; at a veterinary show
    # BCVA is the cattle association.
    "BCVA": "British Cattle Veterinary Association",
    "PVS": "Pig Veterinary Society",
    "BVPA": "British Veterinary Poultry Association",
    "BBVA": "British Bee Veterinary Association",
    # NOTE(review): was "Bachelor of Veterinary Science", a degree rather than
    # a sponsor. Presumably the camelid society; confirm against the export.
    "BVCS": "British Veterinary Camelid Society",
    "BVDA": "British Veterinary Dental Association",
    "DEFRA": "Department for Environment, Food & Rural Affairs",
    "Agria": "Agria Pet Insurance Limited",
    "Cubex": "Veterinary Practice Software",
    "Forte": "Forte Healthcare Ltd",
    "Lupa": "LUPA PETS LTD",
    # Leading space removed so the value does not carry stray whitespace into the CSV.
    "iM3": "iM3 The Global Name in Veterinary Dentistry",
    "Omni": "Omni Pet Limited",
}

In [None]:
# Iterating a dict yields its keys, so this is the set of known abbreviations.
map_keys = set(map_vets)

In [None]:
# Short sponsor values that have no entry in map_vets yet.
list_abreviatures - map_keys

In [None]:
# Swap every sponsor value that has an entry in map_vets for its full name;
# values without an entry are left unchanged.
for frame in (
    session_last_filtered_valid_cols_lva,
    session_last_filtered_valid_cols_bva,
    session_this_filtered_valid_cols,
):
    frame["sponsored_by"] = frame["sponsored_by"].replace(map_vets)

In [None]:
# Check the sponsor values of the current frame after the replacement.
session_this_filtered_valid_cols.sponsored_by.unique()

In [None]:
# Output file -> frame to write there; written in this order, without the index column.
exports = {
    "data/bva/output/session_last_filtered_valid_cols_bva.csv": session_last_filtered_valid_cols_bva,
    "data/bva/output/session_last_filtered_valid_cols_lva.csv": session_last_filtered_valid_cols_lva,
    "data/bva/output/session_this_filtered_valid_cols.csv": session_this_filtered_valid_cols,
}
for output_path, frame in exports.items():
    frame.to_csv(output_path, index=False)