In [1]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import json
from pydantic import BaseModel

In [2]:
from os.path import expanduser

load_dotenv(os.path.join(expanduser("~"), ".env"))

True

In [3]:
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [4]:
data_file = "data_input.xlsx"
times = pd.read_excel(data_file, sheet_name="Data", index_col="ID")
demog = pd.read_excel(data_file, sheet_name="Demographic")

In [5]:
times.columns = [
    "start_time",
    "completion_time",
    "star_rating",
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
    "recommend_likelihood",
    "ref_num",
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

demog.columns = [
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

In [6]:
all_data = pd.merge(
    times, demog, on="person_id", how="left", suffixes=(None, "_demog")
).convert_dtypes()

all_data.drop(
    columns=[
        "start_group",
        "gender",
        "age_group",
        "country",
        "province",
        "number_finished",
        "reg_day",
        "reg_hour",
        "PPA",
        "has_result",
    ],
    inplace=True,
)

In [7]:
all_data = all_data.astype(
    {
        "start_group_demog": "category",
        "gender_demog": "category",
        "age_group_demog": "category",
        "country_demog": "category",
        "province_demog": "category",
    }
)

In [8]:
no_answer_text = "no answer given"

txt_cols = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
]

all_data[txt_cols] = all_data[txt_cols].fillna(no_answer_text)
all_data[txt_cols].head(20)

Unnamed: 0,txt_what_liked,txt_what_not_liked,txt_do_to_improve,txt_anything_else
0,no answer given,no answer given,no answer given,no answer given
1,The Team who organise it!,Nothing comes to mind,Alp du Hez experience at the top of Alp du Sui...,brilliant!
2,"well organised, great helpers",start a little too late,start earlier,much better than 109KM's
3,Road closures,It is getting dangerous with larger groups.,Rider safety in large groups.,Nope
4,Well organized event and great route,Think it’s starting to get quite pricey,N/a,N/a
5,no answer given,no answer given,no answer given,no answer given
6,"Awesome scenery, closed roads, excellent ""gees""!",I was under-prepared!,I missed the presence of the big cycling store...,"Most enjoyable, as always!"
7,no answer given,Just didnt feel it was quite up to your normal...,no answer given,no answer given
8,Full road closure,Some crazy cyclists making is a bit unsafe,Not sure,Was fun
9,Riding in a beautiful race on closed roads,no answer given,no answer given,no answer given


In [9]:
len_times = len(times)
lost_records = len(all_data) - len_times
lost_records_percent = lost_records / len_times
print(
    f"Data with no demographic records: {lost_records} rows, {lost_records_percent:.2%} of {len_times} total records."
)

Data with no demographic records: 55 rows, 0.95% of 5784 total records.


In [36]:
# GPT call for themes

def summarize_gpt(responses):
    # OpenAI API call to summarize the text

    class Theme(BaseModel):
        theme_text: str

    class AllThemes(BaseModel):
        themes: list[Theme]

    messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"""The following is a list of responses to a single question in a market research survey. 
                Create an overall list of themes extracted from all answers. There should be at most 7 themes. Each theme should be a maximum of 20 words.
                Here are the responses{responses}""",
            },
        ]

    completion   = client.beta.chat.completions.parse(
        model="gpt-4o-mini",  # Specify the model you want to use
        messages=messages,
        response_format=AllThemes,
    )

    # Extract the summary from the response
    return completion.choices[0].message.parsed
  

In [23]:
# GPT call for theme matching
# https://platform.openai.com/docs/guides/structured-outputs
def theme_matching(themes, responses):

    class EachAnswer(BaseModel):
        response_id: int
        response_text: str
        theme_id: int
        theme_text: str

    class AllAnswers(BaseModel):
        classifications: list[EachAnswer]

    messages = [
            {
                "role": "system",
                "content": "You are an assistant for matching human responses to a survey to pre-existing themes.",
            },
            {
                "role": "user",
                "content": f"""I have a list of themes summarised over some responses to a survey question. The themes represent common topics found in the resposnes.
        Here are the themes: {themes}""",
            },
            {
                "role": "user",
                "content": f"""I will give you the responses used to generate the themes.
        For each response, I want you to identify which one of the themes most closely represents the response.
        For each response, return the index of the response, the response text, the index and text of the most representative theme.
        However, if the response text is {no_answer_text}, there will be no theme. Return 0 as the theme index and leave the theme text empty.
        Here are the responses {responses}:""",
            },
        ]
    
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=messages,
        response_format=AllAnswers,
    )

    return completion.choices[0].message.parsed

In [46]:
working_data = all_data[:20]

In [None]:
txt_cols_to_classify = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
]

txt_cols_to_summarise = [
    "txt_do_to_improve",
    "txt_anything_else",
]

In [47]:
answers_list = working_data["txt_what_liked"].to_list()
answers_input = "\n".join(answers_list)
answers_input

'no answer given\nThe Team who organise it!\nwell organised, great helpers \nRoad closures \nWell organized event and great route\nno answer given\nAwesome scenery, closed roads, excellent "gees"!\nno answer given\nFull road closure \nRiding in a beautiful race on closed roads\nno answer given\nWell organised as always\nGreat route and vibe\nEfficient start process\nWell organised. Support from the public. Great weather!\nBeautiful route, excellent organization, great vibe - on route and afterwards around the beer tents.\nEnough watering points along the route\nno answer given\nRoute, people, vibe\nThe Vibe'

In [37]:
output = summarize_gpt(answers_input)

themes=[Theme(theme_text='Well-organized event with great support from helpers'), Theme(theme_text='Road closures that enhance the experience'), Theme(theme_text='Beautiful scenery along the route'), Theme(theme_text='Positive vibes and community support'), Theme(theme_text='Sufficient watering points during the race'), Theme(theme_text='Efficient start process and overall organization'), Theme(theme_text='Enjoyable atmosphere with beer tents and post-race celebrations')]


In [48]:
output.themes

[Theme(theme_text='Well-organized event with great support from helpers'),
 Theme(theme_text='Road closures that enhance the experience'),
 Theme(theme_text='Beautiful scenery along the route'),
 Theme(theme_text='Positive vibes and community support'),
 Theme(theme_text='Sufficient watering points during the race'),
 Theme(theme_text='Efficient start process and overall organization'),
 Theme(theme_text='Enjoyable atmosphere with beer tents and post-race celebrations')]

In [45]:
themes_for_input = "\n".join([theme.theme_text for theme in output.themes])
themes_for_input

'Well-organized event with great support from helpers\nRoad closures that enhance the experience\nBeautiful scenery along the route\nPositive vibes and community support\nSufficient watering points during the race\nEfficient start process and overall organization\nEnjoyable atmosphere with beer tents and post-race celebrations'

In [44]:
temp_themes_for_input = themes_for_input
temp_themes_for_input

'Well-organized event with great support from helpers\nRoad closures that enhance the experience\nBeautiful scenery along the route\nPositive vibes and community support\nSufficient watering points during the race\nEfficient start process and overall organization\nEnjoyable atmosphere with beer tents and post-race celebrations'

In [24]:
response = theme_matching(temp_themes_for_input, answers_list)
response.classifications

classifications=[EachAnswer(response_id=0, response_text='no answer given', theme_id=0, theme_text=''), EachAnswer(response_id=1, response_text='The Team who organise it!', theme_id=1, theme_text='Well organized event'), EachAnswer(response_id=2, response_text='well organised, great helpers ', theme_id=1, theme_text='Well organized event'), EachAnswer(response_id=3, response_text='Road closures ', theme_id=5, theme_text='Full road closures'), EachAnswer(response_id=4, response_text='Well organized event and great route', theme_id=1, theme_text='Well organized event'), EachAnswer(response_id=5, response_text='no answer given', theme_id=0, theme_text=''), EachAnswer(response_id=6, response_text='Awesome scenery, closed roads, excellent "gees"!', theme_id=2, theme_text='Positive atmosphere and vibe'), EachAnswer(response_id=7, response_text='no answer given', theme_id=0, theme_text=''), EachAnswer(response_id=8, response_text='Full road closure ', theme_id=5, theme_text='Full road closure

In [30]:
response.classifications

[EachAnswer(response_id=0, response_text='no answer given', theme_id=0, theme_text=''),
 EachAnswer(response_id=1, response_text='The Team who organise it!', theme_id=1, theme_text='Well organized event'),
 EachAnswer(response_id=2, response_text='well organised, great helpers ', theme_id=1, theme_text='Well organized event'),
 EachAnswer(response_id=3, response_text='Road closures ', theme_id=5, theme_text='Full road closures'),
 EachAnswer(response_id=4, response_text='Well organized event and great route', theme_id=1, theme_text='Well organized event'),
 EachAnswer(response_id=5, response_text='no answer given', theme_id=0, theme_text=''),
 EachAnswer(response_id=6, response_text='Awesome scenery, closed roads, excellent "gees"!', theme_id=2, theme_text='Positive atmosphere and vibe'),
 EachAnswer(response_id=7, response_text='no answer given', theme_id=0, theme_text=''),
 EachAnswer(response_id=8, response_text='Full road closure ', theme_id=5, theme_text='Full road closures'),
 Ea