In [1]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import json
from pydantic import BaseModel

In [2]:
CHAT_GPT_MODEL = "gpt-4o-mini"

In [3]:
from os.path import expanduser
load_dotenv(os.path.join(expanduser("~"), ".env"))

True

In [4]:
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [5]:
data_file = "data_input.xlsx"
times = pd.read_excel(data_file, sheet_name="Data", index_col="ID")
demog = pd.read_excel(data_file, sheet_name="Demographic")

In [6]:
times.columns = [
    "start_time",
    "completion_time",
    "star_rating",
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
    "recommend_likelihood",
    "ref_num",
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

demog.columns = [
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

In [7]:
all_data = pd.merge(
    times, demog, on="person_id", how="left", suffixes=(None, "_demog")
).convert_dtypes()

all_data.drop(
    columns=[
        "start_group",
        "gender",
        "age_group",
        "country",
        "province",
        "number_finished",
        "reg_day",
        "reg_hour",
        "PPA",
        "has_result",
    ],
    inplace=True,
)

In [8]:
all_data = all_data.astype(
    {
        "start_group_demog": "category",
        "gender_demog": "category",
        "age_group_demog": "category",
        "country_demog": "category",
        "province_demog": "category",
    }
)

In [9]:
no_answer_text = "no answer given"

txt_cols = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
]

all_data[txt_cols] = all_data[txt_cols].fillna(no_answer_text)

In [10]:
len_times = len(times)
lost_records = len(all_data) - len_times
lost_records_percent = lost_records / len_times
print(
    f"Data with no demographic records: {lost_records} rows, {lost_records_percent:.2%} of {len_times} total records."
)

Data with no demographic records: 55 rows, 0.95% of 5784 total records.


In [11]:
# GPT call for themes

def summarize_gpt(responses):
    # OpenAI API call to summarize the text

    class Theme(BaseModel):
        theme_text: str

    class AllThemes(BaseModel):
        themes: list[Theme]

    messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"""The following is a list of responses to a single question in a market research survey. 
                Create an overall list of themes extracted from all answers. There shoud be at most 12 themes, and they should have mninimal overlap. Each theme should be a maximum of 20 words.
                Here are the responses{responses}""",
            },
        ]

    completion   = client.beta.chat.completions.parse(
        model=CHAT_GPT_MODEL,
        messages=messages,
        response_format=AllThemes,
    )

    # Extract the summary from the response
    return completion.choices[0].message.parsed
  

In [12]:
# GPT call for theme matching
# https://platform.openai.com/docs/guides/structured-outputs
def theme_matching(themes, responses):

    class EachAnswer(BaseModel):
        response_id: int
        response_text: str
        theme_id: int
        theme_text: str

    class AllAnswers(BaseModel):
        classifications: list[EachAnswer]

    messages = [
            {
                "role": "system",
                "content": "You are an assistant for matching human responses to a survey to pre-existing themes.",
            },
            {
                "role": "user",
                "content": f"""I have a list of themes summarised over some responses to a survey question. The themes represent common topics found in the resposnes.
        Here are the themes: {themes}""",
            },
            {
                "role": "user",
                "content": f"""I will give you the responses used to generate the themes.
        For each response, I want you to identify which one of the themes most closely represents the response.
        For each response, return the index of the response, the response text, the index and text of the most representative theme.
        However, if the response text is {no_answer_text}, there will be no theme. Return 0 as the theme index and give the theme as 'no theme'.
        It is critically important that there be as many classifications in the output as there are responses in the input.
        Here are the responses {responses}:""",
            },
        ]
    
    completion = client.beta.chat.completions.parse(
        model=CHAT_GPT_MODEL,
        #model="gpt-4o-mini-2024-07-18",
        messages=messages,
        response_format=AllAnswers,
    )

    return completion.choices[0].message.parsed

In [13]:
# GPT call to summarise inputs

def summarize_responses(inputs):
    # OpenAI API call to summarize the text
    response = client.chat.completions.create(
        model=CHAT_GPT_MODEL,  # Specify the model you want to use
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant for summarising survey responses.",
            },
            {
                "role": "user",
                "content": f"""You will be given a list of responses to a question in a survey. Your job is to extract key themes from the responses.
                Ignore any responses that are very short, are empty, or have the text {no_answer_text}
                Each theme should have a headline, followed by an explanatory paragraph. For each theme, provide from 1 to 3 verbatim quotes to illustrate the theme along side the explanatory paragraph.
                Don't provide any duplicated verbatim quotes.
                Sort the themes by their decreasing frequency of appearance. At the end, be sure to say which was the most commonly seen theme, and which was the least commonly seen.

                Here are your inputs:\n\n{inputs}""",
            },
        ],
    )

    # Extract the summary from the response
    summary = response.choices[0].message.content.strip()
    return summary

In [32]:
def classify_text_column(dataframe, column_name):

    print(f"Column: {column_name}")

    response_list = dataframe[column_name].to_list()
    expected_num_outputs = len(response_list)

    responses_input = "\n".join(response_list)

    output = summarize_gpt(responses_input)
    # print(output.themes)
    # print(len(output.themes))

    themes_for_input = "\n".join([theme.theme_text for theme in output.themes])


    for i in range(3):
        response = theme_matching(themes_for_input, response_list)
        num_outputs = len(response.classifications)

        print(
            f"Attempt {i}: expected, actual responses: {expected_num_outputs}, {num_outputs}"
        )

        if num_outputs == expected_num_outputs:
            break
        else:
            print(
                f"Mismatch, trying again"
            )
    else:
        print('Retry iterations completed. No answers found.')

    classified_themes = [resp.theme_text for resp in response.classifications]

    column_index = dataframe.columns.get_loc(column_name)
    new_name = f'{column_name}_theme'
    dataframe.insert(column_index + 1, new_name, classified_themes)
    print()

    return response_list, response.classifications

In [16]:
txt_cols_to_classify = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
]

txt_cols_to_summarise = [
    "txt_do_to_improve",
    "txt_anything_else",
]

In [40]:
working_data = all_data[600:650]

response_list, classifications = classify_text_column(
    working_data,
    "txt_what_not_liked",
)

working_data

Column: txt_what_not_liked
Expected responses, actual responses: 50, 50



Unnamed: 0,start_time,completion_time,star_rating,txt_what_liked,txt_what_not_liked,txt_what_not_liked_theme,txt_do_to_improve,txt_anything_else,recommend_likelihood,ref_num,...,start_group_demog,gender_demog,age_group_demog,country_demog,province_demog,number_finished_demog,reg_day_demog,reg_hour_demog,PPA_demog,has_result_demog
600,2024-03-20 14:39:29,2024-03-20 14:43:25,5.0,Very well organised,The wind!,Wind was a major factor affecting participants.,Nothing,I felt that there was less promotion of the ev...,10,CT065565,...,5A,2.0,75-79,South Africa,Western Cape,26.0,8.0,12.0,,1.0
601,2024-03-20 14:40:38,2024-03-20 14:43:26,4.0,Every was well organised.,I had a tyre burst a few meters from the start...,Need for better assistance with flat tyres.,Have some assistance to fix a tyre.,no answer given,10,CT858626,...,2E,2.0,30-34,South Africa,Gauteng,1.0,8.0,12.0,,1.0
602,2024-03-20 14:38:26,2024-03-20 14:43:28,5.0,The views,The wind,Wind was a major factor affecting participants.,More beer along the route. Less powerade and coke,Nope,10,CT331670,...,7E,2.0,45-49,South Africa,Western Cape,6.0,7.0,10.0,,1.0
603,2024-03-20 14:41:08,2024-03-20 14:43:29,5.0,"The vibe, from start to finish","42km finish is far from the action, had to mis...",42km route could be more interesting as a circ...,I dont know,The event as a whole was extremely fun!!,10,CT865093,...,9B,2.0,30-34,South Africa,Gauteng,,8.0,12.0,,1.0
604,2024-03-20 14:39:43,2024-03-20 14:43:31,5.0,The route is awesome,Bike transfer service from JHB to CT. Used Bik...,no theme,Clean up the start. The smell around the castl...,no answer given,10,CT509998,...,Dh,2.0,45-49,South Africa,Gauteng,1.0,9.0,10.0,,1.0
605,2024-03-20 14:40:42,2024-03-20 14:43:33,4.0,"Very well organised, refreshment stations well...","I did the 42km, it would be a lot more interes...",42km route could be more interesting as a circ...,"As question 3, otherwise everything was fine",no answer given,9,CT865418,...,9C,2.0,55-59,South Africa,Gauteng,,8.0,18.0,,1.0
606,2024-03-20 14:42:36,2024-03-20 14:43:35,4.0,Closed roads,The smell at the start line,no theme,Road surfaces down south need work.,no answer given,10,CT311783,...,$,2.0,30-34,South Africa,Western Cape,3.0,7.0,15.0,,1.0
607,2024-03-20 14:39:52,2024-03-20 14:43:35,,no answer given,no answer given,no theme,no answer given,I missed the 2024 CTCT because we had health i...,10,CT542581,...,,2.0,70-74,South Africa,Gauteng,7.0,,,,
608,2024-03-20 14:40:38,2024-03-20 14:43:37,4.0,The great orginisation of the whole event,no answer given,no theme,Keep on doing the excellent job you are doing,First time ever and definitely not my last CTC...,8,CT865258,...,9B,1.0,Elite,South Africa,Free State,,9.0,12.0,,1.0
609,2024-03-20 14:41:31,2024-03-20 14:43:37,4.0,Organization and spirit along the way,Wind!!!,Wind was a major factor affecting participants.,Nothing really,Very helpful organizers in changing our seedin...,8,CT136223,...,3F,2.0,45-49,South Africa,Western Cape,5.0,9.0,13.0,,1.0


In [41]:
working_data = all_data[:20]

for col in txt_cols_to_classify:
    classify_text_column(
        working_data,
        col,
    )

working_data.to_pickle("working_data.pkl")
working_data

Column: txt_what_liked
Expected responses, actual responses: 20, 20

Column: txt_what_not_liked
Expected responses, actual responses: 20, 20

Column: txt_do_to_improve
Expected responses, actual responses: 20, 20



Unnamed: 0,start_time,completion_time,star_rating,txt_what_liked,txt_what_liked_theme,txt_what_not_liked,txt_what_not_liked_theme,txt_do_to_improve,txt_do_to_improve_theme,txt_anything_else,...,start_group_demog,gender_demog,age_group_demog,country_demog,province_demog,number_finished_demog,reg_day_demog,reg_hour_demog,PPA_demog,has_result_demog
0,2024-03-20 13:45:57,2024-03-20 13:47:45,,no answer given,no theme,no answer given,no theme,no answer given,no theme,no answer given,...,,,,,,,,,,
1,2024-03-20 14:00:18,2024-03-20 14:02:21,5.0,The Team who organise it!,Team organization praised by participants,Nothing comes to mind,no theme,Alp du Hez experience at the top of Alp du Sui...,no theme,brilliant!,...,,,,,,,,,,
2,2024-03-20 14:27:17,2024-03-20 14:28:30,4.0,"well organised, great helpers",Well organized event and great helpers,start a little too late,Late start times causing inconvenience for par...,start earlier,Suggestions to start the event earlier in the ...,much better than 109KM's,...,9E,2.0,45-49,South Africa,Western Cape,3.0,8.0,11.0,,1.0
3,2024-03-20 14:27:17,2024-03-20 14:28:48,5.0,Road closures,Closed roads enhancing the experience,It is getting dangerous with larger groups.,Safety concerns with large groups and cyclists.,Rider safety in large groups.,no theme,Nope,...,#,2.0,40-44,South Africa,Western Cape,20.0,8.0,13.0,,1.0
4,2024-03-20 14:27:32,2024-03-20 14:28:52,5.0,Well organized event and great route,Impressive overall organization philosophy,Think it’s starting to get quite pricey,Rising costs affecting participation.,N/a,no theme,N/a,...,8B,2.0,35-39,South Africa,Western Cape,3.0,7.0,12.0,,1.0
5,2024-03-20 14:28:27,2024-03-20 14:28:58,5.0,no answer given,no theme,no answer given,no theme,no answer given,no theme,no answer given,...,7C,2.0,Elite,South Africa,Western Cape,,8.0,16.0,,1.0
6,2024-03-20 14:27:00,2024-03-20 14:29:17,5.0,"Awesome scenery, closed roads, excellent ""gees""!",Beautiful scenery along the route,I was under-prepared!,no theme,I missed the presence of the big cycling store...,Desire for larger presence of cycling stores a...,"Most enjoyable, as always!",...,6E,2.0,60-64,South Africa,Eastern Cape,8.0,8.0,12.0,1.0,1.0
7,2024-03-20 14:28:03,2024-03-20 14:29:25,4.0,no answer given,no theme,Just didnt feel it was quite up to your normal...,Disappointment with event standards compared t...,no answer given,no theme,no answer given,...,6A,2.0,45-49,South Africa,Western Cape,7.0,7.0,17.0,,1.0
8,2024-03-20 14:28:16,2024-03-20 14:29:49,5.0,Full road closure,Closed roads enhancing the experience,Some crazy cyclists making is a bit unsafe,Safety concerns with large groups and cyclists.,Not sure,no theme,Was fun,...,6D,2.0,40-44,South Africa,Western Cape,11.0,7.0,10.0,,1.0
9,2024-03-20 14:29:05,2024-03-20 14:29:50,5.0,Riding in a beautiful race on closed roads,Beautiful scenery along the route,no answer given,no theme,no answer given,no theme,no answer given,...,5C,2.0,50-54,South Africa,Western Cape,23.0,9.0,11.0,,1.0


In [19]:
col = "txt_anything_else"
inputs = "\n".join(working_data[col].to_list())
a = summarize_responses(inputs)

with open("anything_else.txt", "w") as text_file:
    text_file.write(a)

In [43]:
len(all_data) * 30

175170