# Clustering Data and Prompting Classification

In [None]:
import json
import pprint

In [None]:
# Category key -> description text for the five visitor-intent classes.
# The dictionary is saved to output/nomenclature.json and passed to
# LLama_PromptTemplate, which renders every entry as "key = description"
# in the CATEGORIES section of the classification prompt.
nomenclature = {
    "networking": """Networking: Visitors in this group are currently focused primarily on building professional or personal relationships and expanding their network. They attend this event to meet industry peers, experts, and potential partners. Their interactions are more about establishing connections than purchasing. They value personal interactions and opportunities for collaboration""",
    "learning": """Learning: These attendees are currently motivated by educational and learning opportunities. They seek to gain new insights, learn about industry trends, and enhance their knowledge and skills. They are likely to attend workshops, seminars, and keynote speeches. Their interest in products or services is secondary to their desire for professional development""",
    "searching_info_on_products_and_vendors": """Searching for info on products and vendor (Gathering Information on Products, Services, Vendors or Providers): Individuals in this category might have identified a need or a problem to solve but are still exploring what products, services, vendors or providers that can meet their specific requirements. Their primary goal is to collect as much information as possible without a strong inclination towards immediate purchasing. They may engage in more specific conversations about products or services but are not yet ready to make a decision. They are likely to engage with multiple vendors to compare offerings, gather brochures, and ask general questions""",
    "early_purchasing_intention": """Early Purchasing Intention: Attendees in this group are actively engaged in the sourcing process. They know well the type of product or service they require and might have narrowed down their options and are delving deeper into specific products or services. Their interactions are more detailed, involving discussions about pricing, implementation, or customization. They are seeking to understand how different offerings stack up against each other and may be forming preferences for vendor selection""",
    "high_purchasing_intention": """ High Purchase Intention: This group represents delegates who are at the final stages of their purchasing journey. They have all the information they need and are making final evaluations to choose a vendor. Their interactions are decisive, focusing on final terms, delivery, support, and other post-purchase considerations. Engagements with these individuals are very relevant and time-critical as they are on the verge of making a purchase decision.""",
}

In [None]:
# Serialize the category descriptions to output/nomenclature.json
# as 4-space indented JSON.
with open("output/nomenclature.json", "w") as nomenclature_file:
    nomenclature_file.write(json.dumps(nomenclature, indent=4))

In [None]:
# Locations of the JSON files used by this notebook (relative to the
# working directory).
registration_data_path = "output/registration_data.json"
demographic_data_path = "output/demographic_data.json"
# Demographic data that additionally carries the badge-scan information.
demographic_data_badge_path = "output/demographic_data_with_badge.json"
# Example profiles per category; read below and later overwritten in place.
examples_path = "output/examples.json"
# Destination of the merged per-visitor profiles.
merger_data_path = "output/merger_data.json"

In [None]:
# Load the four JSON inputs, one file at a time, into module-level names.
with open(registration_data_path, "r") as registration_file:
    registration_data = json.loads(registration_file.read())

with open(demographic_data_path, "r") as demographic_file:
    demographic_data = json.loads(demographic_file.read())

with open(demographic_data_badge_path, "r") as badge_file:
    demographic_data_badge = json.loads(badge_file.read())

with open(examples_path, "r") as examples_file:
    examples = json.loads(examples_file.read())

In [None]:
# Record counts of the three loaded datasets, displayed as a tuple.
tuple(map(len, (registration_data, demographic_data, demographic_data_badge)))

# Merge Data

In [None]:
!python --version

In [None]:
# Show the container type of the badge-enriched demographic data.
type(demographic_data_badge)

In [None]:
# Peek at a single record (index 1) to see its structure.
demographic_data_badge[1]

In [None]:
# First key of the first record. data_merger uses the same "first key of
# the dict" lookup to identify each record.
list(demographic_data_badge[0].keys())[0]

# Include badge scans in the Demographic Data
The scan information comes in the file demographic_data_badge. Visitors who scanned their badge in 2024 have a text entry; this information is only available for those
who visited in 2024 and 2025.

In [None]:
def data_merger(registration, demographic, year: str = "2024"):
    """Merge registration and demographic records into one text per visitor.

    The two inputs are walked in lockstep with ``zip``: items are paired by
    position, and surplus items of the longer input are ignored. Each item
    is a dict whose first key is used as the visitor id:

    * ``registration`` item: ``{visitor_id: registration_text}``
    * ``demographic`` item: ``{visitor_id: details}`` where ``details`` is a
      dict read for the keys ``"vip"``, ``"normal"`` and, optionally,
      ``"Seminars"``.

    A pair is skipped (with a printed message) when either dict is empty or
    when the two visitor ids differ.

    Args:
        registration: Iterable of single-visitor registration dicts.
        demographic: Iterable of single-visitor demographic dicts.
        year: Year label used in the "Attended seminars" line.

    Returns:
        dict mapping visitor id to the merged profile text.

    Raises:
        TypeError: If any text fragment of a profile is not a string
            (raised by ``str.join``).
    """
    merged_data = {}
    for reg, demo in zip(registration, demographic):
        # Guard both sides: taking the first key of an empty dict would
        # otherwise raise and abort the whole merge.
        if not reg or not demo:
            print("empty Dictionary")
            continue

        reg_key = next(iter(reg))
        demo_key = next(iter(demo))
        if demo_key != reg_key:
            print(f"key reg {reg_key} doesnt match key demo {demo_key}")
            continue

        details = demo[reg_key]
        parts = [
            reg[reg_key],
            "\nKey Question:",
            details.get("vip"),
            "\nOther Questions:",
            details.get("normal"),
        ]

        # The seminar line is only appended for records carrying the key;
        # the placeholder value "NA" is replaced by a readable sentence.
        if "Seminars" in details:
            seminars = details.get("Seminars")
            if seminars == "NA":
                seminars = "No scan badge in stands"
            parts.extend([f"\n Attended seminars in {year}:", seminars])

        merged_data[reg_key] = " ".join(parts)

    return merged_data

In [None]:
# Build one text profile per visitor from the registration data and the
# badge-enriched demographic data (year keeps its default, "2024").
merged_data = data_merger(registration_data, demographic_data_badge)

In [None]:
# Spot-check the merged profile of a single visitor id.
print(merged_data.get("BDAWL25_J59MXE4"))

In [None]:
# Number of merged profiles.
len(merged_data.keys())

In [None]:
# Write the merged profiles to merger_data_path as 4-space indented JSON.
with open(merger_data_path, "w") as merged_file:
    merged_file.write(json.dumps(merged_data, indent=4))

In [None]:
# List the category labels currently present in the examples data.
list(examples.keys())

In [None]:
# Expose the "Sourcing – In Process" example under the nomenclature key
# "high_purchasing_intention" (the original key is kept as well).
# NOTE(review): the following cell deletes "Sourcing – Deciding"; confirm
# that "In Process" (rather than "Deciding") is the intended source here.
examples["high_purchasing_intention"] = examples["Sourcing – In Process"]

In [None]:
# Drop the "Sourcing – Deciding" example.
# NOTE(review): raises KeyError when the key is absent, e.g. if this
# notebook is re-run after the cell below has overwritten examples_path.
del examples["Sourcing – Deciding"]

In [None]:
# Overwrite the examples file in place with the modified dictionary.
with open(examples_path, "w") as f:
    json.dump(examples, f, indent=4)

In [None]:
class LLama_PromptTemplate:
    """Build a Llama-style chat prompt that asks for a visitor classification.

    The prompt contains a system section listing the categories, one example
    profile per category, the classification instructions and the profile to
    classify, followed by an opened assistant header.
    """

    def __init__(self, nomenclature, examples):
        """Store the prompt ingredients.

        Args:
            nomenclature: Mapping of category key to category description.
            examples: Mapping of category key to an example profile text.
        """
        self.nomenclature = nomenclature
        self.examples = examples
        # Placeholders are filled by generate_clustering_prompt via
        # str.format; the text itself is sent to the model verbatim.
        self.base_template = """{begin_text}{start_header_id}system{end_header_id}You are a clever classifier assessing whether a profile of a Event Visitor,
        belongs to one of the 5 categories in section CATEGORIES. The format of each category is Category = Description of this category.
        You will be provided with an example of profile of each category on the section EXAMPLES. The format of each example is Category = Profile of this category.
        CATEGORIES
        ----------
        {categories}
        ------------
        EXAMPLES
        --------
        {examples}
        ----------
        Intructions to classify the Profile:
        {reasoning}
        Profile to Classify: {profile} 
        {eot_id}{start_header_id}assistant{end_header_id}

"""

    def generate_nomemclature(self):
        """Return the categories as "key = description" entries, one per block."""
        return "\n".join(
            f"{key} = {value}\n" for key, value in self.nomenclature.items()
        )

    def generate_examples(self):
        """Return the examples as "key = profile" entries, one per block."""
        return "\n".join(
            f"{key} = {value}\n" for key, value in self.examples.items()
        )

    def generate_keys(self):
        """Return the list of category keys taken from the examples mapping."""
        return list(self.examples.keys())

    def generate_clustering_prompt(self, profile):
        """Generate a prompt for getting Visitor Classification.

        Args:
            profile: Text of the visitor profile to classify.

        Returns:
            The fully rendered prompt string.
        """
        # generate_keys must be *called* here: interpolating the method
        # object would put its repr, not the category list, in the prompt.
        reasoning = f"""1. Key Questions (What best describes your reason for attending and Decision making power) and Its answers has more weight than Other Questions.
2. More Days_since_registration means more interest to come to the event with a purchase purpose.
3. JobTitle, Job Level , Size of the company, Number of Employees, Number of Days in Hotel, can give you an Idea of how much interest has the visitor to come to the Event
4. Give 1 of the 5 choices in this list {self.generate_keys()} based on the profile provided
5.Return the a JSON with a single key 'Class' and no premable or explanation."""
        return self.base_template.format(
            profile=profile,
            begin_text="<|begin_of_text|>",
            start_header_id="<|start_header_id|>",
            end_header_id="<|end_header_id|>",
            categories=self.generate_nomemclature(),
            examples=self.generate_examples(),
            reasoning=reasoning,
            eot_id="<|eot_id|>",
        )

In [None]:
# Prompt builder configured with the category descriptions and the
# example profiles prepared above.
csm_template = LLama_PromptTemplate(nomenclature, examples)

In [None]:
# Sample profile used to preview the prompt. The merged profiles live in
# `merged_data`; no name `mg` is defined anywhere in this notebook.
profile = merged_data.get("BDAWL25_J59MXE4")

In [None]:
# Render the full classification prompt for the sample profile and show it
# under a short header, followed by blank lines.
profile_template = csm_template.generate_clustering_prompt(profile)

print("Profile Template:", "-" * 50, profile_template, "\n", sep="\n")

In [None]:
from langchain_ollama import ChatOllama
from langchain_core.messages import AIMessage

In [None]:
# llama3.2:3b

In [None]:
# Chat model client: the settings are collected first and then unpacked
# into the ChatOllama constructor as keyword arguments.
llm_settings = {
    "model": "llama3.2:3b",
    "temperature": 0.2,
    "num_ctx": 4096,
}
llm = ChatOllama(**llm_settings)

In [None]:
# Send the sample prompt to the model; the reply object is kept in ai_msg
# and its text is read from `.content` in the next cell.
ai_msg = llm.invoke(profile_template)

In [None]:
  # Show the model reply followed by a 100-character rule.
  print(ai_msg.content, "-" * 100, sep="\n")

In [None]:
# Visitor ids to classify, in the iteration order of merged_data,
# followed by how many there are.
list_profiles = [*merged_data]
len(list_profiles)

In [None]:
# Classify every merged profile: build its prompt, query the model, echo
# the raw reply and collect it in `output` (one entry per visitor id, in
# the order of list_profiles).
output = []
separator = "-" * 100
for p in list_profiles:
    profile = merged_data.get(p)
    profile_template = csm_template.generate_clustering_prompt(profile)
    ai_msg = llm.invoke(profile_template)
    print(ai_msg.content, separator, sep="\n")
    output.append(ai_msg.content)