In [2]:
import os
import json
from tqdm import tqdm
from decouple import config
import pandas as pd
import re
from pathlib import Path

# genai
from langchain.chat_models import AzureChatOpenAI
from langchain import LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

In [291]:
# Notebook-wide configuration.
# Register `progress_apply` on pandas objects so long applies show a progress bar.
tqdm.pandas()
# Route pandas `.plot` calls through plotly.
pd.set_option("plotting.backend", "plotly")

### GenAI - Classification

In [293]:
# Copy the Azure OpenAI connection settings from the decouple config into the
# process environment, one variable at a time and in a fixed order.
for _setting in (
    "OPENAI_API_TYPE",
    "OPENAI_API_VERSION",
    "OPENAI_API_BASE",
    "OPENAI_API_KEY",
):
    os.environ[_setting] = config(_setting)

In [3]:
# Chat model client that the classification chain sends its prompts to.
llm = AzureChatOpenAI(model_name="gpt-35-turbo-16k", deployment_name="bv-llm")



In [4]:

# System prompt: tells the model to extract abstract "purpose categories" from a
# company's vision/mission text and gives a few example categories.
# NOTE(review): the prompt text contains typos ("underlaying", "ofpurpose",
# "categories in and nothing else"). They are part of the string sent to the model,
# so they are left untouched here; fixing them changes the prompt.
system_message = """Your job is to identify abstract purpose categories from the vision and mission statement of a company. Based on a text, identify the underlaying focuses of the company: What do they believe in? List only the general categories in and nothing else.

Examples ofpurpose categories could be
* Customer centrism
* Affordability
* Stakeholder value
* Sustainability
* Innovation
"""
# Wrap the raw string as the system-role message template of the chat prompt.
system_message_prompt = SystemMessagePromptTemplate.from_template(system_message)

In [5]:

# Human-turn prompt: `{text}` is the template placeholder for the statement to
# classify; the model is asked for a comma-separated list of up to five categories.
human_template="""Text: 
'{text}'

What are the corporate purpose categories of the text above? Answer with a comma-separated list of up to five general purpose categories.
"""
# Wrap the raw string as the human-role message template of the chat prompt.
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

In [6]:
# Full chat prompt: system instructions first, then the human turn carrying the text.
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

In [7]:
# Bind the chat prompt to the model so a single call formats the prompt and queries the LLM.
chain = LLMChain(prompt=chat_prompt, llm=llm)

In [None]:
def run_llm(text):
    """Classify one purpose statement.

    Runs `text` through the module-level `chain` and splits the model's answer on
    ", ". No further clean-up is applied, so an answer that does not follow the
    comma-separated format comes back as a single-element list.
    """
    raw_answer = chain.run(text)
    categories = raw_answer.split(", ")
    return categories

### Load data

In [135]:
# Load the source workbook, keeping only the purpose statement and the company name.
dataf = pd.read_excel("data/purpose.xlsx", usecols=["Purpose Text", "Company name"])

### Classification

In [22]:
# Classify every purpose text with the LLM: one `run_llm` call per row, with a tqdm
# progress bar. Each cell of the new column holds the list returned by `run_llm`.
dataf["Purpose Categories"] = dataf["Purpose Text"].progress_apply(run_llm)

100%|██████████| 437/437 [02:41<00:00,  2.70it/s]


### Save results

In [23]:
# Persist the raw classification results (row index omitted). The list-valued
# "Purpose Categories" column is written as text and is parsed again after reloading.
dataf.to_csv("data/classified-raw.csv", index=False)

### Load results

In [7]:
# Reload the saved raw results; the "Purpose Categories" column comes back as plain
# strings and is turned into lists again by `handle_bugs` below.
dataf = pd.read_csv("data/classified-raw.csv")

In [9]:
def handle_bugs(categories):
    """Turn one stored "Purpose Categories" cell into a clean list of labels.

    The model was asked for a comma-separated answer but sometimes replied with a
    numbered ("1. ...") or dashed ("- ...") list, which leaves the whole answer in
    the first list element. Those two layouts are split into separate labels here;
    any other list is returned unchanged.

    Parameters
    ----------
    categories : str or iterable of str
        Either the string form of a Python list as read back from the CSV
        (e.g. "['Innovation', 'Sustainability']") or an already-parsed list, so
        the clean-up can be applied again to its own output.

    Returns
    -------
    list of str
        The repaired labels; an empty list if there are none.

    Raises
    ------
    ValueError, SyntaxError
        If a string input is not a valid Python literal.
    """
    import ast

    if isinstance(categories, str):
        # literal_eval only accepts Python literals, so text coming from the CSV
        # can never execute code (the previous `eval` could).
        cats = ast.literal_eval(categories)
    else:
        cats = list(categories)

    # Nothing to repair; also avoids indexing into an empty list below.
    if not cats:
        return []

    first = cats[0]

    if "1." in first:
        # Numbered list: drop the "<digit>." markers, then take one label per line.
        without_numbers = re.sub(r"\d\.", "", first)
        return [s.strip() for s in without_numbers.split("\n")]

    if "\n- " in first:
        # Dashed list: split on the bullet marker; a leading "Header:" loses its colon.
        return [s.strip(":").strip() for s in first.split("\n- ")]

    return cats

# Parse each stored cell back into a list of labels and repair malformed model answers.
dataf["Purpose Categories"] = dataf["Purpose Categories"].apply(handle_bugs)

### Load mapping

AI-generated mapping of high-variability labels to general labels.

In [12]:
# Open in binary mode: json.load accepts a binary file and detects the UTF-8/16/32
# encoding itself, so the result no longer depends on the platform's default text
# encoding.
with Path("mapping.json").open("rb") as f:
    category_mapping = json.load(f)

In [13]:
# Replace each raw label with its general label from `category_mapping` (labels
# without a mapping are kept, stripped of surrounding whitespace), remove
# duplicates, and drop empty results.
# dict.fromkeys de-duplicates while keeping first-seen order, so the stored label
# order is the same on every run; list(set(...)) gave an arbitrary order.
dataf["Purpose Categories"] = dataf["Purpose Categories"].apply(
    lambda labels: [
        label
        for label in dict.fromkeys(
            category_mapping.get(raw.strip(), raw.strip()) for raw in labels
        )
        if label
    ]
)

In [15]:
# Write the frame with the mapped, de-duplicated categories to CSV (row index omitted).
dataf.to_csv("data/classified-mapped.csv", index=False)