In [None]:
import pandas as pd
import sys

sys.path.append("../")
from toolkit.helpers.df_functions import fix_null_ints
from toolkit.compare_case_groups.temporal_process import (
    build_temporal_data,
    create_window_df,
)
import polars as pl

from toolkit.compare_case_groups.build_dataframes import (
    build_attribute_df,
    build_grouped_df,
    build_ranked_df,
    filter_df,
)


# --- Load the prepared example dataset -------------------------------------
case_groups_model_df_pd = pd.read_csv(
    "../example_outputs/compare_case_groups/customer_complaints_prepared.csv"
)
# Give every record a 1-based identifier, stored as a string.
case_groups_model_df_pd["Subject ID"] = list(
    map(str, range(1, len(case_groups_model_df_pd) + 1))
)

# --- Analysis parameters ----------------------------------------------------
filters = ""
groups = ["product_code"]
aggregates = ["age"]
temporal = ""  # e.g. "quarter"; an empty string skips the temporal step below

# --- Clean and filter ---------------------------------------------------------
# Empty strings are swapped for None before the frame goes through
# fix_null_ints.
case_groups_model_df_pd = fix_null_ints(case_groups_model_df_pd.replace("", None))
filtered_df = filter_df(case_groups_model_df_pd, filters)

# --- Group-level and attribute-level frames -----------------------------------
grouped_df = build_grouped_df(filtered_df, groups)

filtered_df_pl = pl.from_pandas(filtered_df)
attributes_df = build_attribute_df(filtered_df_pl, groups, aggregates)

# --- Optional temporal (window) frame -----------------------------------------
# Both stay empty unless a temporal column has been chosen.
temporal_df = pl.DataFrame()
temporal_atts = []
if temporal != "":
    window_df = create_window_df(
        groups, temporal, aggregates, pl.from_pandas(filtered_df)
    )
    # Distinct values of the temporal column, as sorted strings; taken from the
    # full (unfiltered) frame.
    temporal_atts = sorted(case_groups_model_df_pd[temporal].astype(str).unique())
    temporal_df = build_temporal_data(window_df, groups, temporal_atts, temporal)

# --- Combined ranked table ------------------------------------------------------
# Keep the group key columns followed by the count/rank columns.
ranked_columns = [
    *groups,
    "Group Count",
    "Group Rank",
    "Attribute Value",
    "Attribute Count",
    "Attribute Rank",
]
ranked_df = build_ranked_df(
    temporal_df,
    pl.from_pandas(grouped_df),
    attributes_df,
    temporal,
    groups,
).select(ranked_columns)
print(len(ranked_df))

In [None]:
# Markdown description of the ranked table, later passed to the report prompt
# as the "description" variable.

# Bracketed, bold-formatted list of the grouping columns, e.g. "[**a**, **b**]".
# Built with plain concatenation rather than a nested f-string: reusing double
# quotes inside a double-quoted f-string is a SyntaxError before Python 3.12.
groups_text = "[" + ", ".join(f"**{g}**" for g in groups) + "]"

description = "This table shows:"
# Every item starts with "\n- " so that all three render as markdown bullets
# under the heading line (the first one previously ran straight on from it).
description += (
    f"\n- A summary of all **{len(case_groups_model_df_pd)}** data records"
)
description += f"\n- The **Group Count** of records for all {groups_text} groups, and corresponding **Group Rank**"
description += f"\n- The **Attribute Count** of each **Attribute Value** for all {groups_text} groups, and corresponding **Attribute Rank**"
print(description)

## Narrate by groups

In [None]:
# groups to narrate
from toolkit.AI.utils import generate_messages
from toolkit.compare_case_groups.prompts import user_prompt, report_prompt
from toolkit.AI.metaprompts import do_not_harm

# Distinct combinations of the grouping columns present in the ranked table.
# maintain_order=True keeps them in order of first appearance in ranked_df;
# without it polars gives no ordering guarantee, so the five groups taken
# below could differ from one run to the next.
groups_to_select = ranked_df.select(groups).unique(maintain_order=True)

# Narrate only the first five group combinations.
selected_groups = groups_to_select[:5]
selected_df = pl.DataFrame(selected_groups, schema=groups)

# Filter the ranked_df to keep only the rows where the group keys are in selected_groups
fdf = ranked_df.join(selected_df, on=groups, how="inner")

print(f"##### Filtered data summary to narrate ({len(fdf)} rows)")

# Build one readable label per selected group from its row values. Iterating a
# polars DataFrame directly yields its columns (Series), which would put Series
# reprs rather than the group values into the description.
selected_group_labels = [
    " / ".join(f"{column}={value}" for column, value in zip(groups, row))
    for row in selected_df.rows()
]
filter_description = (
    "Filtered to the following groups only: " + ", ".join(selected_group_labels)
)

## Narrate by rank

In [None]:
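# Alternative filter (disabled): uncomment the lines below to narrate the
# top-ranked groups (by "Group Rank") instead of the groups selected in the
# "Narrate by groups" cell.
# top_group_ranks = 5
# fdf = ranked_df.filter(ranked_df["Group Rank"] <= top_group_ranks)
# filter_description = f"Filtered to the top {top_group_ranks} groups by record count"
# filter_description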

In [7]:
# Inputs handed to generate_messages alongside the prompt templates: the table
# description, the filtered table serialised as CSV text, and the note on
# which filter was applied.
variables = dict(
    description=description,
    dataset=fdf.write_csv(),
    filters=filter_description,
)

messages = generate_messages(
    user_prompt,
    report_prompt,
    variables,
    do_not_harm,
)

In [None]:
import os
from toolkit.AI.openai_configuration import OpenAIConfiguration
from toolkit.AI.client import OpenAIClient

# Connection settings for the chat model; the API key is read from the
# OPENAI_API_KEY environment variable rather than being written in the notebook.
openai_settings = {
    "api_type": "OpenAI",
    "api_key": os.getenv("OPENAI_API_KEY"),
    "model": "gpt-4o-2024-08-06",
}
ai_configuration = OpenAIConfiguration(openai_settings)

# Request the full (non-streamed) chat response for the prepared messages and
# print it.
openai_client = OpenAIClient(ai_configuration)
response = openai_client.generate_chat(messages, stream=False)
print(response)