In [151]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import pickle
from pydantic import BaseModel
import tiktoken
import os.path
import pprint

In [152]:
#https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken

CHAT_GPT_MODEL = "gpt-4o-mini"
encoding = tiktoken.encoding_for_model("gpt-4")

In [153]:
themes_cache = "cache/themes.pkl"

In [154]:
from os.path import expanduser
load_dotenv(os.path.join(expanduser("~"), ".env"))

True

In [155]:
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [156]:
data_file = "data_input.xlsx"
times = pd.read_excel(data_file, sheet_name="Data", index_col="ID")
demog = pd.read_excel(data_file, sheet_name="Demographic")

In [157]:
times.columns = [
    "start_time",
    "completion_time",
    "star_rating",
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
    "recommend_likelihood",
    "ref_num",
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

demog.columns = [
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

In [158]:
all_data = pd.merge(
    times, demog, on="person_id", how="left", suffixes=(None, "_demog")
).convert_dtypes()

all_data.drop(
    columns=[
        "start_group",
        "gender",
        "age_group",
        "country",
        "province",
        "number_finished",
        "reg_day",
        "reg_hour",
        "PPA",
        "has_result",
    ],
    inplace=True,
)

In [159]:
all_data = all_data.astype(
    {
        "start_group_demog": "category",
        "gender_demog": "category",
        "age_group_demog": "category",
        "country_demog": "category",
        "province_demog": "category",
    }
)

In [160]:
# Text cleanup
no_answer_text = "(none)"

txt_cols = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
]

all_data[txt_cols] = all_data[txt_cols].fillna(no_answer_text)

search_pattern = "|".join(["NA", "Na"])

for col in txt_cols:
    all_data[col] = all_data[col].str.replace(search_pattern, no_answer_text, regex=True)
    all_data[col] = all_data[col].replace("", no_answer_text)
    all_data[col] = all_data[col].str.replace(r"[^a-zA-Z ]", "", regex=True) #not needed
    all_data[col] = all_data[col].str.replace("\n", " ") #remove line breaks

In [161]:
# play with tiktoken
for col in txt_cols:
    test_text = " ".join(all_data[col].to_list())
    tokens = encoding.encode(test_text)
    print(col, len(tokens))

txt_what_liked 62476
txt_what_not_liked 96681
txt_do_to_improve 78861
txt_anything_else 58871


In [162]:
len_times = len(times)
lost_records = len(all_data) - len_times
lost_records_percent = lost_records / len_times
print(
    f"Data with no demographic records: {lost_records} rows, {lost_records_percent:.2%} of {len_times} total records."
)

Data with no demographic records: 55 rows, 0.95% of 5784 total records.


In [163]:
# GPT call for themes

def GPT_get_themes(responses):
    # OpenAI API call to summarize the text

    class Theme(BaseModel):
        theme_id: int
        theme_text: str

    class AllThemes(BaseModel):
        themes: list[Theme]

    messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"""The following is a list of responses to a single question in a market research survey. 
                
                Create an overall list of themes extracted from all answers. There shoud be at most 20 themes, 
                and they should have mninimal overlap. Each theme should be a maximum of 20 words.
                Each theme will have an index called theme_id and the theme itself as theme_text.
                Return all the themes ina list called 'themes'
                
                Here are the responses{responses}""",
            },
        ]

    completion   = client.beta.chat.completions.parse(
        model=CHAT_GPT_MODEL,
        temperature = 0.3,
        messages=messages,
        response_format=AllThemes,
    )

    # Extract the summary from the response
    return completion.choices[0].message.parsed
  

In [164]:
# GPT call for theme matching
# https://platform.openai.com/docs/guides/structured-outputs
def GPT_match_themes(themes, responses):

    class EachAnswer(BaseModel):
        response_id: int
        response_text: str
        theme_id: int
        theme_text: str

    class AllAnswers(BaseModel):
        classifications: list[EachAnswer]

    messages = [
            {
                "role": "system",
                "content": "You are an assistant for matching human responses to a survey to pre-existing themes.",
            },
            {
                "role": "user",
                "content": f"""I have a list of themes summarised over some responses to a survey question. The themes represent common topics found in the resposnes.
        Here are the themes: {themes}.
        
        I will give you the responses used to generate the themes. Each response has its own id called response_id.

        For each response, I want you to identify which one of the themes most closely represents the response.
        Return the answers in the object EachAnswer.
    
        Return the original response_id, the response_text, the theme_id and theme_text of the most representative theme.
        
        However, if the response text is {no_answer_text}, there will be no theme.
        In this case. return the original response_id, the response_text, 0 as the theme index and "no theme" as the theme text.

        Return all the EachAnswer objects in a final object called AllAnswers

        Here are the responses {responses}:""",
            },
        ]
    
    completion = client.beta.chat.completions.parse(
        model=CHAT_GPT_MODEL,
        temperature=0.1,
        messages=messages,
        response_format=AllAnswers,
    )

    return completion.choices[0].message.parsed

In [165]:
# GPT call to summarise inputs

def GPT_summarize_responses(inputs):
    # OpenAI API call to summarize the text
    response = client.chat.completions.create(
        model=CHAT_GPT_MODEL,  # Specify the model you want to use
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant for summarising survey responses.",
            },
            {
                "role": "user",
                "content": f"""You will be given a list of responses to a question in a survey. Your job is to extract key themes from the responses.
                Ignore any responses that are very short, are empty, or have the text {no_answer_text}
                Each theme should have a headline, followed by an explanatory paragraph. For each theme, provide from 1 to 3 verbatim quotes to illustrate the theme along side the explanatory paragraph.
                Don't provide any duplicated verbatim quotes.
                Sort the themes by their decreasing frequency of appearance. At the end, be sure to say which was the most commonly seen theme, and which was the least commonly seen.

                Here are your inputs:\n\n{inputs}""",
            },
        ],
    )

    # Extract the summary from the response
    summary = response.choices[0].message.content.strip()
    return summary

In [166]:
txt_cols_to_classify = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
]

txt_cols_to_summarise = [
    "txt_anything_else",
]

In [167]:
# Get the themes

def extract_themes(dataframe, columns):

    if os.path.isfile(themes_cache):

        with open(themes_cache, "rb") as picklefile:
            themes_dict = pickle.load(picklefile)

        return themes_dict

    themes_dict = {}

    for column in columns:

        print(f"Column: {column}")
        response_list = " ".join(dataframe[column].to_list())
        result = GPT_get_themes(response_list)
        theme_list = [(theme.theme_id, theme.theme_text) for theme in result.themes]
        themes_dict[column] = theme_list

    with open(themes_cache, "wb") as picklefile:
        pickle.dump(themes_dict, picklefile)

    return themes_dict

In [168]:
# Create themes
themes_dict = extract_themes(all_data, txt_cols_to_classify)

In [169]:
themes_dict

{'txt_what_liked': [(1, 'Well organized event with efficient logistics.'),
  (2, 'Spectacular scenic route around the Cape Peninsula.'),
  (3, 'Great atmosphere and camaraderie among participants.'),
  (4, 'Excellent support from spectators along the route.'),
  (5, 'Sufficient and well-placed water stations.'),
  (6, 'Smooth registration and number collection process.'),
  (7, 'Friendly and helpful marshals and volunteers.'),
  (8, 'Full road closures ensuring cyclist safety.'),
  (9, 'Positive energy and excitement from participants.'),
  (10, 'Improved timing system and tracking for participants.'),
  (11, 'Beautiful weather contributing to a great experience.'),
  (12, 'Community spirit and local support for the event.'),
  (13, 'Variety of cyclists participating, from novices to pros.'),
  (14, 'Festive atmosphere at the start and finish.'),
  (15, 'Challenging yet enjoyable course for all levels.'),
  (16, 'Great post-race celebrations and hospitality.'),
  (17, 'Increased public

In [170]:
# Classify the columns

def classify_text_column(orig_dataframe, columns):

    # Create a deep copy with indices to return for pickling
    dataframe = orig_dataframe.copy()

    with open(themes_cache, "rb") as picklefile:
        themes_dict = pickle.load(picklefile)

    for column in columns:

        print(f"Column: {column}")
        response_list = dataframe[column].to_list()
        expected_num_outputs = len(response_list)

        responses_input = [
            f"{idx} {txt}"
            for idx, txt in zip(dataframe[column].index, dataframe[column])
        ]
        # print(responses_input)

        expected_num_outputs = len(response_list)

        # responses_input = "\n".join(response_list)
        responses_input_for_output = "".join([f"{i}. {response} \n" for i, response in enumerate(response_list)])

        # responses_input = str(response_list)

        with open(f"outputs/A {column} responses_input.txt", "w") as text_file:
            text_file.write(responses_input_for_output)

        themes_for_input = themes_dict[column]

        # print(themes_for_input)

        response = GPT_match_themes(themes_for_input, responses_input)

        actual_num_outputs = len(response.classifications)

        #pprint.pp(response.classifications)

        print(
            f"Expected, actual responses: {expected_num_outputs}, {actual_num_outputs}"
        )

        with open(f"outputs/B {column} classifications.txt", "w") as text_file:
            for el in response.classifications:
                text_file.write(f"{el.response_id} {el.response_text} {el.theme_id} {el.theme_text} \n")

        classified_themes = [resp.theme_text for resp in response.classifications]

        column_index = dataframe.columns.get_loc(column)
        new_name = f'{column}_theme'

        try:
            dataframe.insert(column_index + 1, new_name, classified_themes)
            print(f'New column {new_name} inserted')
            # print(dataframe)

        except:
            print("failed to insert column")
            pass

        # dataframe.to_pickle(classifications_cache)

    return dataframe

In [171]:
# Loop through chunks

working_data = all_data[3000:3059].copy()
total_records = len(working_data)

print("Total Records", total_records)
chunk_size = 7
working_txt_cols_to_classify = txt_cols_to_classify#[0:1]
chunks_folder = 'df_chunks/'

start, end = 0, chunk_size

# Delete Previous Chunks
for filename in os.listdir(chunks_folder):
    os.remove(f'{chunks_folder}{filename}')

counter = 1

while end <= total_records:

    print('Start, end = ', start , end)

    current_chunk = working_data[start:end].copy()

    classifications = classify_text_column(
        current_chunk,
        working_txt_cols_to_classify,
    )

    # print(classifications['txt_what_liked'])

    output_file = f"{chunks_folder}df {counter:04d} {start:04d}-{end:04d}.pkl"

    with open(output_file, "wb") as f:
        classifications.to_pickle(f)

    if start + chunk_size <= total_records:
        start = end + 1
        end = start + chunk_size

    else:
        remaining_records = total_records - end
        start = end + 1
        end = start + remaining_records

    print("New start, end", start, end)

    print()

    counter += 1

# classifications
# working_data

Total Records 59
Start, end =  0 7
Column: txt_what_liked


In [150]:
df_chunks = []

print('Reconsituting...')
for filename in os.listdir(chunks_folder):
    print(filename)
    temp_df = pd.read_pickle(f'{chunks_folder}{filename}')
    
    df_chunks.append(temp_df)

reconstitued = pd.concat(df_chunks)

reconstitued.to_pickle('outputs/final_reconsituted.pkl')
print(len(reconstitued))
reconstitued

Reconsituting...
df 0000-0007.pkl
df 0040-0047.pkl
df 0032-0039.pkl
df 0024-0031.pkl
df 0008-0015.pkl
df 0048-0055.pkl
df 0016-0023.pkl
49


Unnamed: 0,start_time,completion_time,star_rating,txt_what_liked,txt_what_liked_theme,txt_what_not_liked,txt_what_not_liked_theme,txt_do_to_improve,txt_do_to_improve_theme,txt_anything_else,...,start_group_demog,gender_demog,age_group_demog,country_demog,province_demog,number_finished_demog,reg_day_demog,reg_hour_demog,PPA_demog,has_result_demog
3000,2024-03-20 17:15:50,2024-03-20 17:21:23,4.0,Everything well organized good support and ama...,Well organized event with efficient logistics.,Nothing what is not to like,no theme,Maybe small cups for the coke and energade alo...,"Provide better hydration options, including cu...",Nothing,...,,,,,,,,,,
3001,2024-03-20 17:16:48,2024-03-20 17:21:45,,New Timing methods was nice less hassle,Improved timing system and tracking for partic...,I cant think of anything,no theme,To have more Marshalls in the main roads to wa...,Improve communication about race rules and eti...,sand sections before Simons town needed warni...,...,3E,2.0,65-69,South Africa,Western Cape,12.0,9.0,13.0,,1.0
3002,2024-03-20 17:18:43,2024-03-20 17:21:55,5.0,Everything except the starting venue,no theme,The start prefer Herzog Boulevard,no theme,none,no theme,none,...,4C,2.0,60-64,Australia,,13.0,9.0,14.0,,1.0
3003,2024-03-20 17:14:53,2024-03-20 17:22:09,4.0,The event was well organised the start was arr...,Well organized event with efficient logistics.,There is not enough foodstuffs at the waterin...,Lack of food and snacks at water stations.,A better experience after the race Once the ra...,Encourage more crowd support and entertainment...,It was a great event well done to the organisers,...,3A,2.0,50-54,South Africa,Gauteng,1.0,8.0,12.0,,1.0
3004,2024-03-20 17:08:48,2024-03-20 17:22:46,3.0,Organised and Finish set up,Well organized event with efficient logistics.,I really did not enjoy the fact that there was...,Lack of food and snacks at water stations.,Provide food at the supporting tables,Provide more food options at water points duri...,Please provide food at the supporting tables,...,2B,2.0,Elite,South Africa,Western Cape,,8.0,17.0,,1.0
3005,2024-03-20 17:19:31,2024-03-20 17:23:03,5.0,Good organizing and love this race,Well organized event with efficient logistics.,Enjoy the race,no theme,Better seeding for me to cycle better times,Implement a rolling start to reduce bottleneck...,Its my nr for and i want to come back next year,...,2C,2.0,65-69,South Africa,Eastern Cape,19.0,8.0,11.0,,1.0
3006,2024-03-20 17:20:57,2024-03-20 17:23:15,5.0,My first event EVER and it was fking AWESOME,no theme,i could not find any fault in my view yes I a...,no theme,none,no theme,It got me HOOKED on cycling Before the event I...,...,7E,2.0,35-39,South Africa,Western Cape,,9.0,14.0,,1.0
3040,2024-03-20 17:24:59,2024-03-20 17:29:52,5.0,Vibe scenery and just the way in which things ...,Well organized event with efficient logistics.,Size of the batches and starting intervals al...,Large start groups causing safety concerns.,Refer to previous comment,no theme,Overall very enjoyable ride and experience,...,1F,2.0,40-44,South Africa,North-West,,9.0,13.0,,1.0
3041,2024-03-20 14:41:55,2024-03-20 17:30:10,4.0,Well organised Marshalling on point,Friendly and helpful marshals and volunteers.,Selfish riders making riding CTCT unpleasurable,Inexperienced cyclists not following road etiq...,Better goodie bags and contents was not the sa...,Improve the quality and variety of items in th...,My Cycletour th came to an abrupt end coming ...,...,7A,2.0,50-54,South Africa,Western Cape,15.0,9.0,10.0,,1.0
3042,2024-03-20 17:21:30,2024-03-20 17:31:18,3.0,The start was very professional and on time,Smooth registration and number collection proc...,That only lanes were available to use Lots of...,"Poor organization at the start, leading to con...",Perhaps less people to start in the groups whi...,Implement a rolling start to reduce bottleneck...,Starting times to be earlier for all the group...,...,6B,1.0,50-54,South Africa,Western Cape,4.0,7.0,17.0,,1.0
