In [14]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import json
from pydantic import BaseModel

In [15]:
CHAT_GPT_MODEL = "gpt-4o-mini"

In [16]:
from os.path import expanduser
load_dotenv(os.path.join(expanduser("~"), ".env"))

True

In [17]:
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [18]:
data_file = "data_input.xlsx"
times = pd.read_excel(data_file, sheet_name="Data", index_col="ID")
demog = pd.read_excel(data_file, sheet_name="Demographic")

In [19]:
times.columns = [
    "start_time",
    "completion_time",
    "star_rating",
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
    "recommend_likelihood",
    "ref_num",
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

demog.columns = [
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

In [20]:
all_data = pd.merge(
    times, demog, on="person_id", how="left", suffixes=(None, "_demog")
).convert_dtypes()

all_data.drop(
    columns=[
        "start_group",
        "gender",
        "age_group",
        "country",
        "province",
        "number_finished",
        "reg_day",
        "reg_hour",
        "PPA",
        "has_result",
    ],
    inplace=True,
)

In [21]:
all_data = all_data.astype(
    {
        "start_group_demog": "category",
        "gender_demog": "category",
        "age_group_demog": "category",
        "country_demog": "category",
        "province_demog": "category",
    }
)

In [22]:
# Text cleanup
no_answer_text = "no answer given"

txt_cols = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
]

all_data[txt_cols] = all_data[txt_cols].fillna(no_answer_text)

for col in txt_cols:
    all_data[col] = all_data[col].str.replace(r"[^a-zA-Z ]", "", regex=True) #not needed
    all_data[col] = all_data[col].str.replace("\n", " ") #remove line breaks

In [23]:
len_times = len(times)
lost_records = len(all_data) - len_times
lost_records_percent = lost_records / len_times
print(
    f"Data with no demographic records: {lost_records} rows, {lost_records_percent:.2%} of {len_times} total records."
)

Data with no demographic records: 55 rows, 0.95% of 5784 total records.


In [24]:
# GPT call for themes

def summarize_gpt(responses):
    # OpenAI API call to summarize the text

    class Theme(BaseModel):
        theme_text: str

    class AllThemes(BaseModel):
        themes: list[Theme]

    messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"""The following is a list of responses to a single question in a market research survey. 
                Create an overall list of themes extracted from all answers. There shoud be at most 12 themes, and they should have mninimal overlap. Each theme should be a maximum of 20 words.
                Here are the responses{responses}""",
            },
        ]

    completion   = client.beta.chat.completions.parse(
        model=CHAT_GPT_MODEL,
        #temperature = 0.5,
        messages=messages,
        response_format=AllThemes,
    )

    # Extract the summary from the response
    return completion.choices[0].message.parsed
  

In [25]:
# GPT call for theme matching
# https://platform.openai.com/docs/guides/structured-outputs
def theme_matching(themes, responses):

    class EachAnswer(BaseModel):
        response_id: int
        response_text: str
        theme_id: int
        theme_text: str

    class AllAnswers(BaseModel):
        classifications: list[EachAnswer]

    messages = [
            {
                "role": "system",
                "content": "You are an assistant for matching human responses to a survey to pre-existing themes.",
            },
            {
                "role": "user",
                "content": f"""I have a list of themes summarised over some responses to a survey question. The themes represent common topics found in the resposnes.
        Here are the themes: {themes}""",
            },
            {
                "role": "user",
                "content": f"""I will give you the responses used to generate the themes.
        For each response, I want you to identify which one of the themes most closely represents the response.
        For each response, return the index of the response, the response text, the index and text of the most representative theme.
        However, if the response text is {no_answer_text}, there will be no theme. Return 0 as the theme index and give the theme as 'no theme'.
        It is critically important that there be as many classifications in the output as there are responses in the input.
        Here are the responses {responses}:""",
            },
        ]
    
    completion = client.beta.chat.completions.parse(
        model=CHAT_GPT_MODEL,
        temperature=0.1,
        messages=messages,
        response_format=AllAnswers,
    )

    return completion.choices[0].message.parsed

In [26]:
# GPT call to summarise inputs

def summarize_responses(inputs):
    # OpenAI API call to summarize the text
    response = client.chat.completions.create(
        model=CHAT_GPT_MODEL,  # Specify the model you want to use
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant for summarising survey responses.",
            },
            {
                "role": "user",
                "content": f"""You will be given a list of responses to a question in a survey. Your job is to extract key themes from the responses.
                Ignore any responses that are very short, are empty, or have the text {no_answer_text}
                Each theme should have a headline, followed by an explanatory paragraph. For each theme, provide from 1 to 3 verbatim quotes to illustrate the theme along side the explanatory paragraph.
                Don't provide any duplicated verbatim quotes.
                Sort the themes by their decreasing frequency of appearance. At the end, be sure to say which was the most commonly seen theme, and which was the least commonly seen.

                Here are your inputs:\n\n{inputs}""",
            },
        ],
    )

    # Extract the summary from the response
    summary = response.choices[0].message.content.strip()
    return summary

In [37]:
def classify_text_column(dataframe, column_name):

    print(f"Column: {column_name}")

    response_list = dataframe[column_name].to_list()
    expected_num_outputs = len(response_list)

    # responses_input = "\n".join(response_list)
    responses_input_for_output = "".join([f"{i}. {response} \n" for i, response in enumerate(response_list)])

    responses_input = str(response_list)

    with open(f"outputs/A {column_name} responses_input.txt", "w") as text_file:
        text_file.write(responses_input_for_output)

    output = summarize_gpt(responses_input)
    # print(output.themes)
    # print(len(output.themes))

    themes_for_input = "\n".join([theme.theme_text for theme in output.themes])
    response = theme_matching(themes_for_input, response_list)
    """
    

    for i in range(1):
        response = theme_matching(themes_for_input, response_list)
        num_outputs = len(response.classifications)

        print(
            f"Attempt {i}: expected, actual responses: {expected_num_outputs}, {num_outputs}"
        )

        if num_outputs == expected_num_outputs:
            break
        else:
            print(
                f"Mismatch, trying again"
            )

            print(responses_input)
            print([f'{el}\n'for el in response.classifications])
    else:
        print("Retry iterations completed. No answers found.")

    """
    num_outputs = len(response.classifications)
    print(
        f"Expected, actual responses: {expected_num_outputs}, {num_outputs}"
    )

    with open(f"outputs/B {column_name} classifications.txt", "w") as text_file:
        for el in response.classifications:
            text_file.write(f"{el.response_id} {el.response_text} {el.theme_id} {el.theme_text} \n")

    classified_themes = [resp.theme_text for resp in response.classifications]

    column_index = dataframe.columns.get_loc(column_name)
    new_name = f'{column_name}_theme'
    # dataframe.insert(column_index + 1, new_name, classified_themes)
    print()

    return response.classifications

In [28]:
txt_cols_to_classify = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
]

txt_cols_to_summarise = [
    "txt_do_to_improve",
    "txt_anything_else",
]

In [33]:
working_data = all_data[:43]
response_list, classifications = None, None

classifications = classify_text_column(
    working_data,
    "txt_what_not_liked",
)

Column: txt_what_not_liked
Expected, actual responses: 43, 43



In [39]:
working_data = all_data[:70]

for col in txt_cols_to_classify:

    classifications = classify_text_column(
        working_data,
        col,
    )

Column: txt_what_liked
Expected, actual responses: 70, 70

Column: txt_what_not_liked
Expected, actual responses: 70, 70

Column: txt_do_to_improve
Expected, actual responses: 70, 69

