In [30]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import pickle
from pydantic import BaseModel
import tiktoken
import os.path
import pprint

In [31]:
#https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken

CHAT_GPT_MODEL = "gpt-4o-mini"
encoding = tiktoken.encoding_for_model("gpt-4")

In [32]:
from os.path import expanduser
load_dotenv(os.path.join(expanduser("~"), ".env"))

True

In [33]:
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [34]:
data_file = "data_input.xlsx"
times = pd.read_excel(data_file, sheet_name="Data", index_col="ID")
demog = pd.read_excel(data_file, sheet_name="Demographic")

In [35]:
times.columns = [
    "start_time",
    "completion_time",
    "star_rating",
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
    "recommend_likelihood",
    "ref_num",
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

demog.columns = [
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

In [36]:
all_data = pd.merge(
    times, demog, on="person_id", how="left", suffixes=(None, "_demog")
).convert_dtypes()

all_data.drop(
    columns=[
        "start_group",
        "gender",
        "age_group",
        "country",
        "province",
        "number_finished",
        "reg_day",
        "reg_hour",
        "PPA",
        "has_result",
    ],
    inplace=True,
)

In [37]:
all_data = all_data.astype(
    {
        "start_group_demog": "category",
        "gender_demog": "category",
        "age_group_demog": "category",
        "country_demog": "category",
        "province_demog": "category",
    }
)

In [38]:
# Text cleanup
no_answer_text = "(none)"

txt_cols = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
]

all_data[txt_cols] = all_data[txt_cols].fillna(no_answer_text)

search_pattern = "|".join(["NA", "Na"])

for col in txt_cols:
    all_data[col] = all_data[col].str.replace(search_pattern, no_answer_text, regex=True)
    all_data[col] = all_data[col].replace("", no_answer_text)
    all_data[col] = all_data[col].str.replace(r"[^a-zA-Z ]", "", regex=True) #not needed
    all_data[col] = all_data[col].str.replace("\n", " ") #remove line breaks

In [39]:
# play with tiktoken
for col in txt_cols:
    test_text = " ".join(all_data[col].to_list())
    tokens = encoding.encode(test_text)
    print(col, len(tokens))

txt_what_liked 62476
txt_what_not_liked 96681
txt_do_to_improve 78861
txt_anything_else 58871


In [40]:
len_times = len(times)
lost_records = len(all_data) - len_times
lost_records_percent = lost_records / len_times
print(
    f"Data with no demographic records: {lost_records} rows, {lost_records_percent:.2%} of {len_times} total records."
)

Data with no demographic records: 55 rows, 0.95% of 5784 total records.


In [41]:
# GPT call for themes

def GPT_get_themes(responses):
    # OpenAI API call to summarize the text

    class Theme(BaseModel):
        theme_id: int
        theme_text: str

    class AllThemes(BaseModel):
        themes: list[Theme]

    messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"""The following is a list of responses to a single question in a market research survey. 
                
                Create an overall list of themes extracted from all answers. There shoud be at most 20 themes, 
                and they should have mninimal overlap. Each theme should be a maximum of 20 words.
                Each theme will have an index called theme_id and the theme itself as theme_text.
                Return all the themes ina list called 'themes'
                
                Here are the responses{responses}""",
            },
        ]

    completion   = client.beta.chat.completions.parse(
        model=CHAT_GPT_MODEL,
        temperature = 0.3,
        messages=messages,
        response_format=AllThemes,
    )

    # Extract the summary from the response
    return completion.choices[0].message.parsed
  

In [42]:
# GPT call for theme matching
# https://platform.openai.com/docs/guides/structured-outputs
def GPT_match_themes(themes, responses):

    class EachAnswer(BaseModel):
        response_id: int
        response_text: str
        theme_id: int
        theme_text: str

    class AllAnswers(BaseModel):
        classifications: list[EachAnswer]

    messages = [
            {
                "role": "system",
                "content": "You are an assistant for matching human responses to a survey to pre-existing themes.",
            },
            {
                "role": "user",
                "content": f"""I have a list of themes summarised over some responses to a survey question. The themes represent common topics found in the resposnes.
        Here are the themes: {themes}.
        
        I will give you the responses used to generate the themes. Each response has its own id called response_id.

        For each response, I want you to identify which one of the themes most closely represents the response.
        Return the answers in the object EachAnswer.
    
        Return the original response_id, the response_text, the theme_id and theme_text of the most representative theme.
        
        However, if the response text is {no_answer_text}, there will be no theme.
        In this case. return the original response_id, the response_text, 0 as the theme index and "no theme" as the theme text.

        Return all the EachAnswer objects in a final object called AllAnswers

        Here are the responses {responses}:""",
            },
        ]
    
    completion = client.beta.chat.completions.parse(
        model=CHAT_GPT_MODEL,
        temperature=0.1,
        messages=messages,
        response_format=AllAnswers,
    )

    return completion.choices[0].message.parsed

In [43]:
# GPT call to summarise inputs

def summarize_responses(inputs):
    # OpenAI API call to summarize the text
    response = client.chat.completions.create(
        model=CHAT_GPT_MODEL,  # Specify the model you want to use
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant for summarising survey responses.",
            },
            {
                "role": "user",
                "content": f"""You will be given a list of responses to a question in a survey. Your job is to extract key themes from the responses.
                Ignore any responses that are very short, are empty, or have the text {no_answer_text}
                Each theme should have a headline, followed by an explanatory paragraph. For each theme, provide from 1 to 3 verbatim quotes to illustrate the theme along side the explanatory paragraph.
                Don't provide any duplicated verbatim quotes.
                Sort the themes by their decreasing frequency of appearance. At the end, be sure to say which was the most commonly seen theme, and which was the least commonly seen.

                Here are your inputs:\n\n{inputs}""",
            },
        ],
    )

    # Extract the summary from the response
    summary = response.choices[0].message.content.strip()
    return summary

In [83]:
txt_cols_to_classify = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
]

txt_cols_to_summarise = [
    "txt_anything_else",
]

In [84]:
# Get the themes

def extract_themes(dataframe, columns):

    themes_cache = "cache/themes.pkl"

    if os.path.isfile(themes_cache):

        with open(themes_cache, "rb") as picklefile:
            themes_dict = pickle.load(picklefile)

        return themes_dict

    themes_dict = {}

    for column in columns:

        print(f"Column: {column}")
        response_list = " ".join(dataframe[column].to_list())

        result = GPT_get_themes(response_list)
        print(result)
        theme_list = [(theme.theme_id, theme.theme_text) for theme in result.themes]

        print(theme_list)

        themes_dict[column] = theme_list

        print()

    with open(themes_cache, "wb") as picklefile:
        pickle.dump(themes_dict, picklefile)

    return themes_dict

In [85]:
working_data = all_data
working_txt_cols_to_classify = txt_cols_to_classify

themes_dict = extract_themes(working_data, working_txt_cols_to_classify)

themes_dict

Column: txt_what_liked
themes=[Theme(theme_id=1, theme_text='Well organized event with efficient logistics.'), Theme(theme_id=2, theme_text='Spectacular scenic route around the Cape Peninsula.'), Theme(theme_id=3, theme_text='Great atmosphere and camaraderie among participants.'), Theme(theme_id=4, theme_text='Excellent support from spectators along the route.'), Theme(theme_id=5, theme_text='Sufficient and well-placed water stations.'), Theme(theme_id=6, theme_text='Smooth registration and number collection process.'), Theme(theme_id=7, theme_text='Friendly and helpful marshals and volunteers.'), Theme(theme_id=8, theme_text='Full road closures ensuring cyclist safety.'), Theme(theme_id=9, theme_text='Positive energy and excitement from participants.'), Theme(theme_id=10, theme_text='Improved timing system and tracking for participants.'), Theme(theme_id=11, theme_text='Beautiful weather contributing to a great experience.'), Theme(theme_id=12, theme_text='Community spirit and local s

{'txt_what_liked': [(1, 'Well organized event with efficient logistics.'),
  (2, 'Spectacular scenic route around the Cape Peninsula.'),
  (3, 'Great atmosphere and camaraderie among participants.'),
  (4, 'Excellent support from spectators along the route.'),
  (5, 'Sufficient and well-placed water stations.'),
  (6, 'Smooth registration and number collection process.'),
  (7, 'Friendly and helpful marshals and volunteers.'),
  (8, 'Full road closures ensuring cyclist safety.'),
  (9, 'Positive energy and excitement from participants.'),
  (10, 'Improved timing system and tracking for participants.'),
  (11, 'Beautiful weather contributing to a great experience.'),
  (12, 'Community spirit and local support for the event.'),
  (13, 'Variety of cyclists participating, from novices to pros.'),
  (14, 'Festive atmosphere at the start and finish.'),
  (15, 'Challenging yet enjoyable course for all levels.'),
  (16, 'Great post-race celebrations and hospitality.'),
  (17, 'Increased public

In [72]:
# Classify the columns

def classify_text_column(dataframe, column_name):

    print(f"Column: {column_name}")
    response_list = dataframe[column_name].to_list()
    expected_num_outputs = len(response_list)

    responses_input = [f'{idx} {txt}' for idx,txt in zip(dataframe[column_name].index, dataframe[column_name])]
    print(responses_input)

    expected_num_outputs = len(response_list)

    # responses_input = "\n".join(response_list)
    responses_input_for_output = "".join([f"{i}. {response} \n" for i, response in enumerate(response_list)])

    # responses_input = str(response_list)

    with open(f"outputs/A {column_name} responses_input.txt", "w") as text_file:
        text_file.write(responses_input_for_output)

    output = summarize_gpt(responses_input)
    # print(output.themes)
    # print(len(output.themes))

    themes_for_input = "\n".join([theme.theme_text for theme in output.themes])
    response = theme_matching(themes_for_input, responses_input)
    """
    

    for i in range(1):
        response = theme_matching(themes_for_input, response_list)
        num_outputs = len(response.classifications)

        print(
            f"Attempt {i}: expected, actual responses: {expected_num_outputs}, {num_outputs}"
        )

        if num_outputs == expected_num_outputs:
            break
        else:
            print(
                f"Mismatch, trying again"
            )

            print(responses_input)
            print([f'{el}\n'for el in response.classifications])
    else:
        print("Retry iterations completed. No answers found.")

    """
    num_outputs = len(response.classifications)
    print(
        f"Expected, actual responses: {expected_num_outputs}, {num_outputs}"
    )

    with open(f"outputs/B {column_name} classifications.txt", "w") as text_file:
        for el in response.classifications:
            text_file.write(f"{el.response_id} {el.response_text} {el.theme_id} {el.theme_text} \n")

    classified_themes = [resp.theme_text for resp in response.classifications]

    column_index = dataframe.columns.get_loc(column_name)
    new_name = f'{column_name}_theme'
    # dataframe.insert(column_index + 1, new_name, classified_themes)
    print()

    return response.classifications

In [79]:
#ONESHOT
working_data = all_data[4000:4200]
response_list, classifications = None, None

classifications = classify_text_column(
    working_data,
    "txt_what_not_liked",
)

Column: txt_what_not_liked
['4000 The lack of cups at the refreshment points was disappointing they seemed less organized than usual ', '4001 no answer given', '4002 no answer given', '4003 Parking not enough', '4004 Cyclist not giving way to pass', '4005 I finished my ride at around  Just before the  hour cutoff and was expecting a medal Was sent from pillar to post and ended up spending an hour in the queue with everyone else People train hard and it really sucks when they cant go home with something to show their families after such a grueling ride Please make sure the medal mix up never happens  I only got to spend  minutes in the beer tent with fellow racers and share in the camaraderie Would be great if you could open it for an extra  minutes or so post the race cutoff So everybody can get to feel the gees ', '4006 the wind The first  km ', '4007 Too much hills for km LOLIts a pity the km starts so late', '4008 No issues', '4009 Terrible goodie bags also preferred the mat to mat 

In [75]:
#MULTISHOT
working_data = all_data[:114]

for col in txt_cols_to_classify:

    classifications = classify_text_column(
        working_data,
        col,
    )

Column: txt_what_liked
['0 no answer given', '1 The Team who organise it', '2 well organised great helpers ', '3 Road closures ', '4 Well organized event and great route', '5 no answer given', '6 Awesome scenery closed roads excellent gees', '7 no answer given', '8 Full road closure ', '9 Riding in a beautiful race on closed roads', '10 no answer given', '11 Well organised as always', '12 Great route and vibe', '13 Efficient start process', '14 Well organised Support from the public Great weather', '15 Beautiful route excellent organization great vibe  on route and afterwards around the beer tents', '16 Enough watering points along the route', '17 no answer given', '18 Route people vibe', '19 The Vibe', '20 All round great event ', '21 Overall enjoyed it ', '22 Much bigger and better experience than the first few years after Covid  Expo almost back to its former glory', '23 The vibe great event', '24 Well organized at start fantastic route ', '25 EVERYTHING', '26 Venue organisation etc

KeyboardInterrupt: 