In [None]:
import os
from pathlib import Path
import pprint
import pandas as pd
import json
import re

In [None]:
# Working directory of the kernel, its parent, and the sibling "data" folder.
ROOT_PATH = Path.cwd()
PARENT_PATH = ROOT_PATH.parent
# os.path.join keeps DATA_PATH a plain string rather than a Path object.
DATA_PATH = os.path.join(PARENT_PATH, "data")

In [None]:
# Input files, both resolved inside DATA_PATH.
# Alternative CSV export: "20240512_new_labels_WITH_AGGINFO.csv"
csv_path = (
    Path(DATA_PATH)
    / "new_claire_db_badge_cluster_data_with_aggregated_info_GIO_GT_LABELS.csv"
)
nomenclature_embeddings_path = Path(DATA_PATH) / "cluster_numeculature.json"

In [None]:
# Load the labelled CSV into a DataFrame and preview the first rows.
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Number of rows loaded from the CSV.
len(data)

In [None]:
# Row count per ClusterId value.
data.ClusterId.value_counts()

In [None]:
# One sub-frame per ClusterId label (rows whose ClusterId equals the label exactly).
df_networking = data.loc[data["ClusterId"].eq("Networking")]
df_learning = data.loc[data["ClusterId"].eq("Learning")]
df_searching = data.loc[data["ClusterId"].eq("Searching")]
df_sourcing_early = data.loc[data["ClusterId"].eq("Sourcing – Early")]
df_sourcing_inprocess = data.loc[data["ClusterId"].eq("Sourcing – In Process")]
df_sourcing_deciding = data.loc[data["ClusterId"].eq("Sourcing – Deciding")]

In [None]:
# First row of each per-cluster frame, converted to a {column: {index: value}} dict.
example_networking = df_networking.iloc[:1].to_dict()
example_learning = df_learning.iloc[:1].to_dict()
example_searching = df_searching.iloc[:1].to_dict()
example_sourcing_early = df_sourcing_early.iloc[:1].to_dict()
example_sourcing_inprocess = df_sourcing_inprocess.iloc[:1].to_dict()
example_sourcing_deciding = df_sourcing_deciding.iloc[:1].to_dict()

# List Examples

In [None]:
# Write the "Sourcing – Deciding" rows to source_deciding.csv (relative path,
# so it lands in the current working directory), without the index column.
df_sourcing_deciding.to_csv("source_deciding.csv", index=False)

In [None]:
# The six single-row example dicts, one per cluster label.
list_examples = [
    example_networking,
    example_learning,
    example_searching,
    example_sourcing_early,
    example_sourcing_inprocess,
    example_sourcing_deciding,
]

In [None]:
# Map each example's cluster label to its aggregated text.
# Every entry of list_examples is a {column: {index: value}} dict holding a
# single row, so the first value of each column dict is that row's cell.
examples = {}
for example in list_examples:
    aggregated_info = list(example["AggregatedInfo"].values())[0]
    cluster_id = list(example["ClusterId"].values())[0]
    examples[cluster_id] = aggregated_info

In [None]:
# Labels collected from the DataFrame examples.
examples.keys()

In [None]:
# with open("examples.json", "w", encoding="utf8") as f:
#     json.dump(examples, f, indent=4)

In [None]:
# This file contains clean example data from 2024.
# Note: this rebinds `examples`, replacing the dict built from the DataFrame.
# The encoding is given explicitly so the file is decoded as UTF-8 on every
# platform; the category labels used in this notebook contain an en dash,
# which a non-UTF-8 default encoding would mis-decode.
with open("csm_data/examples.json", "r", encoding="utf-8") as f:
    examples = json.load(f)

In [None]:
# Labels available in the examples loaded from JSON.
examples.keys()

In [None]:
# Load the label -> description mapping used to describe each category.
# The encoding is given explicitly so the JSON is decoded as UTF-8 regardless
# of the platform's default encoding.
with open(nomenclature_embeddings_path, "r", encoding="utf-8") as f:
    nomenclatures = json.load(f)

In [None]:
# Aggregated text of the first "Networking" row; this is the text to classify.
text_example = df_networking["AggregatedInfo"].iloc[0]

In [None]:
# Remove the "Unnamed: N: no answer provided" fragments (N = one or two digits).
pattern = r"Unnamed: \d{1,2}: no answer provided"
text_example = re.compile(pattern).sub("", text_example)

In [None]:
# Delete every run of two or more consecutive ". " (dot + space) tokens.
# NOTE(review): the run is removed outright, not collapsed to a single ". ",
# so the text on either side of a run ends up directly joined.
pattern = r"(\. ){2,}"
text_example = re.compile(pattern).sub("", text_example)
# Output the cleaned text

In [None]:
# Record-specific clean-up: each (old, new) pair is an exact substring taken
# from this one example record and is substituted in the order listed.
# NOTE(review): these literals embed one person's details (name, address,
# phone numbers) and a fixed "Days_since_registration: 98"; consider redacting
# them and deriving the value instead of hard-coding it.
replacements = [
    (
        "Show Ref: CCSEL24. Badge Id: I7Z9TJS. Registration Code: 6K6Q2. Registration Guid: b4bc60a6-16bc-49de-81f2-b124013412c1. Session Guid: 146b9536-dff0-406e-8820-1dc38c523ffb. ",
        "",
    ),
    (
        "Status: Approved. Registration Date: 2024-02-28 18:41:39.280000",
        "Days_since_registration: 98",
    ),
    ("Weeks Out: 1.0. Title: Mr. Firstname: Ross. Surname: Haywood. ", ""),
    (
        "Address 1: The Yard Raunds 2. Address 2: no answer provided. Address 3: no answer provided. Town/City: Raunds. County: no answer provided. Postcode: NN9 6RJ. ",
        "",
    ),
    (
        "Telephone: +44 07918 621801. Mobile: +44 7918621801. Fax: no answer provided. Email: ross.haywood",
        "Email: ",
    ),
    (
        "Secondary Email: no answer provided. Website: no answer provided. Nationality: no answer provided. Account Number: 6648886.0. Booker Forename: no answer provided. Booker Surname: no answer provided. Booker Company: no answer provided. Booker Email: no answer provided. Offer Code: no answer provided. Referee Forename / Referee Forename (text): no answer provided. Referee Surname / Referee Surname (text): no answer provided. ",
        "",
    ),
]
for old, new in replacements:
    text_example = text_example.replace(old, new)

In [None]:
# Drop every ". <field name>: no answer provided." sentence, leaving ". " in
# its place. The field name may contain letters, spaces and the characters ?:/)(
pattern = r"\. [A-Za-z?:/)( ]+: no answer provided\."
text_example = re.compile(pattern).sub(". ", text_example)

# Show the cleaned text
print(text_example)

In [None]:
# Function to generate the classification prompt
def generate_prompt(label_dict, examples, text_example, categories=None):
    """Build the few-shot classification prompt.

    Args:
        label_dict: Mapping of category label -> description. The first
            ``len(label) + 1`` characters of each description are dropped;
            this assumes the description starts with its own label followed
            by one separator character (TODO confirm against the JSON file).
        examples: Mapping of category label -> one example text.
        text_example: The text to classify.
        categories: Optional iterable of labels the model may answer with.
            Defaults to the six badge categories used in this notebook.

    Returns:
        The complete prompt as a single string.
    """
    if categories is None:
        categories = [
            "Networking",
            "Learning",
            "Searching",
            "Sourcing – Early",
            "Sourcing – In Process",
            "Sourcing – Deciding",
        ]

    separator = "----------------------------------------------------\n"

    parts = ["Categories and Descriptions:\n\n"]
    parts.extend(
        f"category: {label} --> {description[len(label)+1:]}\n"
        for label, description in label_dict.items()
    )

    parts.append("\nExamples: (1 text associated to a category)\n")
    parts.extend(
        f"category: {label} --> {example}\n" for label, example in examples.items()
    )

    parts.append("\nText to classify:\n")
    parts.append(separator)
    parts.append(text_example)
    # Keep the closing separator on its own line; without this newline the
    # dashes are glued to the last word of the text being classified.
    if not text_example.endswith("\n"):
        parts.append("\n")
    parts.append(separator)

    # The list is rendered with repr() so it reads like a Python list literal.
    # The four spaces before "format" are kept so the default prompt text
    # matches what earlier runs sent.
    parts.append(
        f"\n\n Only return 1 category of this list  = {list(categories)!r} "
        "    format to return {'category': value from list}"
    )

    return "".join(parts)

In [None]:
# Assemble the full classification prompt and show its length in characters.
prompt = generate_prompt(nomenclatures, examples, text_example)
len(prompt)

In [None]:
# Print the prompt so its layout can be inspected before sending it to a model.
print(prompt)

In [None]:
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from typing import Literal, Optional


# Schema for structured response
class Badge(BaseModel):
    # `category` is typed as a Literal instead of a free-form str, so pydantic
    # rejects any value that is not exactly one of the six labels.
    # A `#` comment is used here rather than a class docstring so the model's
    # generated schema is not given an extra description.
    category: Literal[
        "Networking",
        "Learning",
        "Searching",
        "Sourcing – Early",
        "Sourcing – In Process",
        "Sourcing – Deciding",
    ] = Field(
        description="Classificacion of the text. 1 value of among these 6 categories 'Networking', 'Learning', 'Searching', 'Sourcing – Early', 'Sourcing – In Process', 'Sourcing – Deciding' "
    )

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Chat template: a fixed system instruction followed by a "placeholder" entry
# that is filled from the `messages` key supplied at invoke time.
prompt_structured = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a clever chatbot specialized in classify text.Only provide Label, no other Information. Let's think step by Step",
        ),
        ("placeholder", "{messages}"),
    ]
)

In [None]:
from langchain_openai import ChatOpenAI

In [None]:
# ChatOpenAI client pointed at a server on localhost:11434 via `base_url`,
# with temperature 0, wrapped so responses are returned as `Badge` objects.
# NOTE(review): api_key="ollama" is presumably a placeholder because the local
# server does not check it — confirm.
llm_2 = ChatOpenAI(
    model="deepseek-r1:1.5b",
    base_url="http://localhost:11434/v1",
    temperature=0,
    api_key="ollama",
).with_structured_output(Badge)

In [None]:
# Pipe the chat template into the structured-output model.
chain = prompt_structured | llm_2

In [None]:
# Run the structured-output chain with the assembled prompt as the user message.
# NOTE(review): `kwargs={"num_ctx": 16000}` is passed as a keyword argument
# literally named `kwargs`, not unpacked as options; it is unclear that
# num_ctx ever reaches the model — confirm against Runnable.invoke.
chain.invoke({"messages": [("user", prompt)]}, kwargs={"num_ctx": 16000})

In [None]:
from langchain_ollama import ChatOllama
from langchain_core.messages import AIMessage
from langchain_core.output_parsers import BaseOutputParser, PydanticToolsParser
from typing import Any
from langchain_core.outputs import Generation, ChatGeneration

from pydantic import BaseModel as TypeBaseModel
from typing import cast
import json
from langchain.output_parsers import PydanticOutputParser

In [None]:
# ChatOllama client for the parser-based chain: JSON output format requested,
# temperature 0.5, num_ctx 16000.
# NOTE(review): the model tag is spelled "Phi3.5" here but "phi3.5" in a later
# cell — confirm both resolve to the same model.
llm = ChatOllama(
    model="Phi3.5",
    temperature=0.5,
    num_ctx=16000,
    format="json",
)

In [None]:
# Parser that turns the model output into a `Badge`; its format instructions
# are injected into the prompt template defined in the next cell.
parser = PydanticOutputParser(pydantic_object=Badge)

In [None]:
# Single-string prompt for the parser-based chain. `format_instructions` is
# pre-filled from the parser; `query` is supplied when the chain is invoked.
# NOTE(review): `{query}` appears twice in the template, so the query text is
# inserted twice — confirm this is intended.
# NOTE(review): each backslash line continuation sits inside the string
# literal, so the leading spaces of the following source line become part of
# the prompt text.
prompt_template = PromptTemplate(
    template="You are a clever chatbot specialized in classify text.I will provide you ,descriptions of the classes and its \
    labels and example of each class. Let's think step by Step format the output of classifing this text --> {query} \n\
    with this instruction:\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [None]:
# Rebinds `chain` (previously the structured-output chain) to
# template -> Ollama model -> Pydantic parser.
# NOTE(review): this chain is not invoked in the cells visible here.
chain = prompt_template | llm | parser

In [None]:
# Compare several local Ollama models on the same classification prompt.
# This replaces four copy-pasted cells that differed only in the model tag.

# Model tags, queried in this order.
OLLAMA_MODELS = [
    "deepseek-r1:1.5b",
    "llama3.2:3b",
    "phi3.5",
    "qwen2.5:latest",
]

# System instruction shared by every model. The run of spaces before
# "Only provide" is kept so the text is byte-identical to what the earlier
# per-model cells sent.
SYSTEM_MESSAGE = (
    "You are a clever chatbot specialized in classify text.I will provide you a text ,descriptions of the classes and its labels and example of each class. "
    "        Only provide Label, no other Information need. Let's think step by Step"
)

messages = [
    ("system", SYSTEM_MESSAGE),
    ("human", prompt),
]

# `llm` and `ai_msg` are rebound on every iteration, so after the loop they
# refer to the last model in OLLAMA_MODELS and its response.
for model_name in OLLAMA_MODELS:
    llm = ChatOllama(
        model=model_name,
        temperature=0.5,
        num_ctx=16000,
    )
    ai_msg = llm.invoke(messages)

    print(ai_msg.content)
    print("-" * 100)