In [1]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import pickle
from pydantic import BaseModel
import tiktoken
import os.path
import pprint

In [2]:
#https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken

CHAT_GPT_MODEL = "gpt-4o-mini"
encoding = tiktoken.encoding_for_model("gpt-4")

In [3]:
themes_cache = "cache/themes.pkl"

In [4]:
from os.path import expanduser
load_dotenv(os.path.join(expanduser("~"), ".env"))

True

In [5]:
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [6]:
data_file = "data_input.xlsx"
times = pd.read_excel(data_file, sheet_name="Data", index_col="ID")
demog = pd.read_excel(data_file, sheet_name="Demographic")

In [7]:
times.columns = [
    "start_time",
    "completion_time",
    "star_rating",
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
    "recommend_likelihood",
    "ref_num",
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

demog.columns = [
    "person_id",
    "start_group",
    "gender",
    "age_group",
    "country",
    "province",
    "number_finished",
    "reg_day",
    "reg_hour",
    "PPA",
    "has_result",
]

In [8]:
all_data = pd.merge(
    times, demog, on="person_id", how="left", suffixes=(None, "_demog")
).convert_dtypes()

all_data.drop(
    columns=[
        "start_group",
        "gender",
        "age_group",
        "country",
        "province",
        "number_finished",
        "reg_day",
        "reg_hour",
        "PPA",
        "has_result",
    ],
    inplace=True,
)

In [9]:
all_data = all_data.astype(
    {
        "start_group_demog": "category",
        "gender_demog": "category",
        "age_group_demog": "category",
        "country_demog": "category",
        "province_demog": "category",
    }
)

In [10]:
# Text cleanup
no_answer_text = "(none)"

txt_cols = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
]

all_data[txt_cols] = all_data[txt_cols].fillna(no_answer_text)

search_pattern = "|".join(["NA", "Na"])

for col in txt_cols:
    all_data[col] = all_data[col].str.replace(search_pattern, no_answer_text, regex=True)
    all_data[col] = all_data[col].replace("", no_answer_text)
    all_data[col] = all_data[col].str.replace(r"[^a-zA-Z ]", "", regex=True) #not needed
    all_data[col] = all_data[col].str.replace("\n", " ") #remove line breaks

In [11]:
# play with tiktoken
for col in txt_cols:
    test_text = " ".join(all_data[col].to_list())
    tokens = encoding.encode(test_text)
    print(col, len(tokens))

txt_what_liked 62476
txt_what_not_liked 96681
txt_do_to_improve 78861
txt_anything_else 58871


In [12]:
len_times = len(times)
lost_records = len(all_data) - len_times
lost_records_percent = lost_records / len_times
print(
    f"Data with no demographic records: {lost_records} rows, {lost_records_percent:.2%} of {len_times} total records."
)

Data with no demographic records: 55 rows, 0.95% of 5784 total records.


In [13]:
# GPT call for themes

def GPT_get_themes(responses):
    # OpenAI API call to summarize the text

    class Theme(BaseModel):
        theme_id: int
        theme_text: str

    class AllThemes(BaseModel):
        themes: list[Theme]

    messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"""The following is a list of responses to a single question in a market research survey. 
                
                Create an overall list of themes extracted from all answers. There shoud be at most 20 themes, 
                and they should have mninimal overlap. Each theme should be a maximum of 20 words.
                Each theme will have an index called theme_id and the theme itself as theme_text.
                Return all the themes ina list called 'themes'
                
                Here are the responses{responses}""",
            },
        ]

    completion   = client.beta.chat.completions.parse(
        model=CHAT_GPT_MODEL,
        temperature = 0.3,
        messages=messages,
        response_format=AllThemes,
    )

    # Extract the summary from the response
    return completion.choices[0].message.parsed
  

In [14]:
# GPT call for theme matching
# https://platform.openai.com/docs/guides/structured-outputs
def GPT_match_themes(themes, responses):

    class EachAnswer(BaseModel):
        response_id: int
        response_text: str
        theme_id: int
        theme_text: str

    class AllAnswers(BaseModel):
        classifications: list[EachAnswer]

    messages = [
            {
                "role": "system",
                "content": "You are an assistant for matching human responses to a survey to pre-existing themes.",
            },
            {
                "role": "user",
                "content": f"""I have a list of themes summarised over some responses to a survey question. The themes represent common topics found in the resposnes.
        Here are the themes: {themes}.
        
        I will give you the responses used to generate the themes. Each response has its own id called response_id.

        For each response, I want you to identify which one of the themes most closely represents the response.
        Return the answers in the object EachAnswer.
    
        Return the original response_id, the response_text, the theme_id and theme_text of the most representative theme.
        
        However, if the response text is {no_answer_text}, there will be no theme.
        In this case. return the original response_id, the response_text, 0 as the theme index and "no theme" as the theme text.

        Return all the EachAnswer objects in a final object called AllAnswers

        Here are the responses {responses}:""",
            },
        ]
    
    completion = client.beta.chat.completions.parse(
        model=CHAT_GPT_MODEL,
        temperature=0.1,
        messages=messages,
        response_format=AllAnswers,
    )

    return completion.choices[0].message.parsed

In [15]:
# GPT call to summarise inputs

def GPT_summarize_responses(inputs):
    # OpenAI API call to summarize the text
    response = client.chat.completions.create(
        model=CHAT_GPT_MODEL,  # Specify the model you want to use
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant for summarising survey responses.",
            },
            {
                "role": "user",
                "content": f"""You will be given a list of responses to a question in a survey. Your job is to extract key themes from the responses.
                Ignore any responses that are very short, are empty, or have the text {no_answer_text}
                Each theme should have a headline, followed by an explanatory paragraph. For each theme, provide from 1 to 3 verbatim quotes to illustrate the theme along side the explanatory paragraph.
                Don't provide any duplicated verbatim quotes.
                Sort the themes by their decreasing frequency of appearance. At the end, be sure to say which was the most commonly seen theme, and which was the least commonly seen.

                Here are your inputs:\n\n{inputs}""",
            },
        ],
    )

    # Extract the summary from the response
    summary = response.choices[0].message.content.strip()
    return summary

In [16]:
txt_cols_to_classify = [
    "txt_what_liked",
    "txt_what_not_liked",
    "txt_do_to_improve",
    "txt_anything_else",
]

txt_cols_to_summarise = [
    "txt_anything_else",
]

In [17]:
# Get the themes

def extract_themes(dataframe, columns):

    if os.path.isfile(themes_cache):

        with open(themes_cache, "rb") as picklefile:
            themes_dict = pickle.load(picklefile)

        return themes_dict

    themes_dict = {}

    for column in columns:

        print(f"Column: {column}")
        response_list = " ".join(dataframe[column].to_list())
        result = GPT_get_themes(response_list)
        theme_list = [(theme.theme_id, theme.theme_text) for theme in result.themes]
        themes_dict[column] = theme_list

    with open(themes_cache, "wb") as picklefile:
        pickle.dump(themes_dict, picklefile)

    return themes_dict

In [18]:
# Create themes
themes_dict = extract_themes(all_data, txt_cols_to_classify)

In [19]:
# Classify the columns

def classify_text_column(orig_dataframe, columns):

    # Create a deep copy with indices to return for pickling
    dataframe = orig_dataframe.copy()

    with open(themes_cache, "rb") as picklefile:
        themes_dict = pickle.load(picklefile)

    for column in columns:

        print(f"Column: {column}")
        response_list = dataframe[column].to_list()
        expected_num_outputs = len(response_list)

        responses_input = [
            f"{idx} {txt}"
            for idx, txt in zip(dataframe[column].index, dataframe[column])
        ]
        # print(responses_input)

        expected_num_outputs = len(response_list)

        # responses_input = "\n".join(response_list)
        responses_input_for_output = "".join([f"{i}. {response} \n" for i, response in enumerate(response_list)])

        # responses_input = str(response_list)

        with open(f"outputs/A {column} responses_input.txt", "w") as text_file:
            text_file.write(responses_input_for_output)

        themes_for_input = themes_dict[column]

        # print(themes_for_input)

        response = GPT_match_themes(themes_for_input, responses_input)

        actual_num_outputs = len(response.classifications)

        #pprint.pp(response.classifications)

        print(
            f"Expected, actual responses: {expected_num_outputs}, {actual_num_outputs}"
        )

        with open(f"outputs/B {column} classifications.txt", "w") as text_file:
            for el in response.classifications:
                text_file.write(f"{el.response_id} {el.response_text} {el.theme_id} {el.theme_text} \n")

        classified_themes = [resp.theme_text for resp in response.classifications]

        column_index = dataframe.columns.get_loc(column)
        new_name = f'{column}_theme'

        try:
            dataframe.insert(column_index + 1, new_name, classified_themes)
            print(f'New column {new_name} inserted')
            # print(dataframe)

        except:
            print("failed to insert column")
            pass

        # dataframe.to_pickle(classifications_cache)

    return dataframe

In [67]:
# Loop through chunks

working_data = all_data[2000:2025].copy()
total_records = len(working_data)

print("Total Records", total_records)
chunk_size = 5
working_txt_cols_to_classify = txt_cols_to_classify[0:1]
chunks_folder = "df_chunks/"

start, end = 0, chunk_size

# Delete Previous Chunks
for filename in os.listdir(chunks_folder):
    os.remove(f"{chunks_folder}{filename}")

counter = 1

while start != total_records:

    print("Start, end = ", start, end)

    current_chunk = working_data[start:end].copy()
    #pprint.pp(current_chunk["start_time"])

    classifications = classify_text_column(
        current_chunk,
        working_txt_cols_to_classify,
    )

    # print(classifications['txt_what_liked'])

    output_file = f"{chunks_folder}df {counter:04d} {start:04d}-{end:04d}.pkl"

    with open(output_file, "wb") as f:
        classifications.to_pickle(f)

    remaining_records = total_records - end
    print("Remaining", remaining_records)

    if remaining_records > chunk_size:

        print("banana", start, chunk_size, total_records)
        start = end
        end = start + chunk_size

    else:
        print("xxx", start, end, total_records, remaining_records)
        start = end
        end = start + remaining_records

    print("New start, end", start, end)

    print()

    counter += 1

# classifications
# working_data

Total Records 25
Start, end =  0 5
Column: txt_what_liked
Expected, actual responses: 5, 5
New column txt_what_liked_theme inserted
Remaining 20
banana 0 5 25
New start, end 5 10

Start, end =  5 10
Column: txt_what_liked
Expected, actual responses: 5, 5
New column txt_what_liked_theme inserted
Remaining 15
banana 5 5 25
New start, end 10 15

Start, end =  10 15
Column: txt_what_liked
Expected, actual responses: 5, 5
New column txt_what_liked_theme inserted
Remaining 10
banana 10 5 25
New start, end 15 20

Start, end =  15 20
Column: txt_what_liked
Expected, actual responses: 5, 5
New column txt_what_liked_theme inserted
Remaining 5
xxx 15 20 25 5
New start, end 20 25

Start, end =  20 25
Column: txt_what_liked
Expected, actual responses: 5, 5
New column txt_what_liked_theme inserted
Remaining 0
xxx 20 25 25 0
New start, end 25 25



In [68]:
df_chunks = []

print('Reconsituting...')
for filename in os.listdir(chunks_folder):

    temp_df = pd.read_pickle(f'{chunks_folder}{filename}')
    print(filename, len(temp_df))
    df_chunks.append(temp_df)

reconstituted = pd.concat(df_chunks)
reconstituted.sort_index(inplace=True)

reconstituted.to_pickle('outputs/final_reconsituted.pkl')
print(len(reconstituted))
reconstituted

Reconsituting...
df 0001 0000-0005.pkl 5
df 0005 0020-0025.pkl 5
df 0004 0015-0020.pkl 5
df 0003 0010-0015.pkl 5
df 0002 0005-0010.pkl 5
25


Unnamed: 0,start_time,completion_time,star_rating,txt_what_liked,txt_what_liked_theme,txt_what_not_liked,txt_do_to_improve,txt_anything_else,recommend_likelihood,ref_num,...,start_group_demog,gender_demog,age_group_demog,country_demog,province_demog,number_finished_demog,reg_day_demog,reg_hour_demog,PPA_demog,has_result_demog
2000,2024-03-20 15:29:12,2024-03-20 15:30:32,3,The race starts was very well organised,Well organized event with efficient logistics.,It was very crowded on the route so much so th...,At the start have porta cribs for men it will ...,none,6,CT757957,...,3B,2.0,30-34,South Africa,Western Cape,1.0,8.0,15.0,,1.0
2001,2024-03-20 14:55:05,2024-03-20 15:30:32,5,Everything starting with the closed roads the ...,Impressive organization for such a large event.,Since my first Argus cycle tour in I have nev...,Just keep up doing the good work,I can just say that I enjoyed the ride,10,CT017564,...,4B,2.0,65-69,South Africa,Northern Cape,16.0,7.0,10.0,,1.0
2002,2024-03-20 15:08:53,2024-03-20 15:30:36,5,Great venue good buzz good road closuresdirect...,Full road closures ensuring cyclist safety.,Maybe a better coffee snack spot at the expo,NA keep doing what youre doing,nothing in particular,10,CT434764,...,6F,2.0,40-44,South Africa,Gauteng,,9.0,13.0,,1.0
2003,2024-03-20 14:38:38,2024-03-20 15:30:37,5,was my th Cape Town Cycle Tour and I enjoyed ...,Excellent support from spectators along the ro...,NOTHINGEverything was just super organized,I think to have more specials from the supplie...,I got operated on days before the event and t...,10,CT839430,...,4E,1.0,45-49,Namibia,,2.0,8.0,11.0,,1.0
2004,2024-03-20 15:27:49,2024-03-20 15:30:41,5,Vibe and the people,Great atmosphere and camaraderie among partici...,Liked everything,More of those small ice water bag at the water...,Water bags at the suikerbossie start was amazi...,10,CT488229,...,5C,1.0,35-39,South Africa,Western Cape,9.0,8.0,15.0,,1.0
2005,2024-03-20 15:12:48,2024-03-20 15:30:42,5,The route crowds smooth organization beer tent,Spectacular scenic route around the Cape Penin...,Ebike riders among the seeded riders Most are ...,The Ebikers should have their own race on a di...,No Well done for a fantastic race,10,CT320501,...,3A,2.0,60-64,South Africa,Eastern Cape,12.0,9.0,10.0,,1.0
2006,2024-03-20 15:22:32,2024-03-20 15:30:48,3,entry process is easy,Smooth registration and number collection proc...,generally well organized,improve the overall road quality surfacemore ...,decrease the entry cost make it more affordabl...,6,CT098008,...,5F,2.0,50-54,South Africa,Western Cape,17.0,7.0,17.0,,1.0
2007,2024-03-20 15:30:17,2024-03-20 15:30:54,5,Everything Route weather participants spectators,Beautiful weather contributing to a great expe...,none,none,none,10,CT838886,...,7A,2.0,50-54,South Africa,Western Cape,3.0,8.0,16.0,,1.0
2008,2024-03-20 15:24:15,2024-03-20 15:30:57,4,Open road,Full road closures ensuring cyclist safety.,Will love results in the old racetec format s...,Start time with line measuring and not gunshot,Enjoyed it thanks,9,,...,,,,,,,,,,
2009,2024-03-20 15:29:01,2024-03-20 15:31:10,5,Always love the scenery and having the road ju...,Spectacular scenic route around the Cape Penin...,The wind haha nothing particularly happy with ...,CAnt think of anything at the moment,Great chilled ride,10,CT467295,...,2D,2.0,50-54,South Africa,Gauteng,8.0,8.0,13.0,1.0,1.0
