# IMPORTS AND HELPER FUNCTIONS

In [97]:
## pip install striprtf tiktoken openai
import os
from striprtf.striprtf import rtf_to_text
import pandas as pd
import numpy as np
import re
import tiktoken
import openai
from openai.embeddings_utils import get_embedding
from sklearn.metrics.pairwise import cosine_similarity
import time
import json

model='text-embedding-ada-002'

openai.api_key = '**'
tokensUsed = 0

In [84]:


def countTokens(text):
    encoding = tiktoken.get_encoding('cl100k_base')
    return len(encoding.encode(text))

def askGPT(messages):
    tokens = 0
    for value in messages:
        tokens += countTokens(value['content'])
        
    
    response = {}

    if(tokens < 3000):
        
        completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages
            )
        responseText = completion.choices[0].message.content
        tokens += countTokens(responseText)
        
        response = {
            "response": responseText ,
            "tokens": tokens
            
        }

    else:
        response = {
            "error": "too many tokens"
        }

    return response



# EMBEDDING THE TRANSCRIPTS

In [19]:
def get_embedding(text, model="text-embedding-ada-002"):
   text = " ".join(" ".join(text).split()[:100])
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']








dataSnippets = pd.DataFrame(columns=["INTERVIEW_FILE_NAME", "TEXT_SNIPPET", "EMBEDDING"])

directory_path = 'data'

pattern =  re.compile(r"\b[A-Z]{1,2}[A-Za-z’]{0,2}:\s")

cumulative_tokens = 0
cumulative_parts = 0

for filename in os.listdir(directory_path):
    if filename.endswith('.rtf'):  # check if file is an RTF file
        file_path = os.path.join(directory_path, filename)
        j = 0
        with open(file_path, 'r') as file:
            rtf_text = file.read()
            plain_text = rtf_to_text(rtf_text)  # convert RTF to plain text
            
            
            parts = re.split(pattern, plain_text)
               
          
            parts= [item.replace('\n', '') for item in parts]
            parts = [part + " " for part in parts]
            parts = [re.sub(' +', ' ', item) for item in parts][1:]
            
            output = []

            # Initialize an empty string to store the current question or answer block
            current_block = ""

            # Initialize a flag indicating whether the current item is expected to be a question
            is_question = True
           
             
            
            for item in parts:
                if '?' in item[-3:]: 
                    if is_question:
                        current_block += item
                    else:
                        if current_block:
                            output.append(current_block)
                        current_block = item
                        is_question=True
                else:    #if is answer
                    if is_question:
                      current_block += item
                      is_question = False 
                    else:
                      current_block += item
                
        
            cumulative_parts += len(parts)    

            if current_block:
                output.append(current_block)

            parts=output
            
            
            string = "".join(parts)
            
            encoding = tiktoken.get_encoding('cl100k_base')
           
            num_tokens = len(encoding.encode(string))
        
            
            cumulative_tokens += num_tokens
            
            totalLeft = len(parts)
            
            for item in parts:
                embedding = get_embedding(item)
                row = pd.Series({"INTERVIEW_FILE_NAME": filename, "TEXT_SNIPPET": item, "EMBEDDING": embedding})
                dataSnippets = pd.concat([dataSnippets, row.to_frame().T], ignore_index=True)
                totalLeft -= 1
                print("EMBEDDINGS LEFT IN FILE: " + filename +  " ### " + str(totalLeft))
                if (totalLeft % 5 == 0): 
                    print("ESTIMATED TIME LEFT UNTIL END OF FILE: " + " ### " + str(0.27*totalLeft) + " seconds")
            
            
print("TOTAL TOKENS IN ALL FILES: " + str(cumulative_tokens))
print("TOTAL VECTORS: " + str(cumulative_parts))
            
dataSnippets.to_csv('embedded_snippets.csv', index=False)
        




EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 155
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 154
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 153
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 152
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 151
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 150
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 149
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 148
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 147
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 146
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 145
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 144
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 143
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 142
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 141
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 140
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 139
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 138
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 137
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 136
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 135
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 134
EMBEDDINGS LEFT IN FILE: SWW01.rtf ### 133
EMBEDDINGS 

EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 175
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 174
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 173
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 172
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 171
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 170
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 169
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 168
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 167
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 166
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 165
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 164
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 163
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 162
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 161
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 160
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 159
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 158
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 157
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 156
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 155
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 154
EMBEDDINGS LEFT IN FILE: SWW02.rtf ### 153
EMBEDDINGS 

EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 25
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 24
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 23
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 22
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 21
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 20
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 19
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 18
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 17
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 16
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 15
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 14
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 13
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 12
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 11
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 10
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 9
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 8
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 7
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 6
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 5
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 4
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 3
EMBEDDINGS LEFT IN FILE: SWW03.rtf ### 2


EMBEDDINGS LEFT IN FILE: SWW06.rtf ### 5
EMBEDDINGS LEFT IN FILE: SWW06.rtf ### 4
EMBEDDINGS LEFT IN FILE: SWW06.rtf ### 3
EMBEDDINGS LEFT IN FILE: SWW06.rtf ### 2
EMBEDDINGS LEFT IN FILE: SWW06.rtf ### 1
EMBEDDINGS LEFT IN FILE: SWW06.rtf ### 0
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 73
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 72
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 71
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 70
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 69
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 68
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 67
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 66
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 65
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 64
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 63
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 62
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 61
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 60
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 59
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 58
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 57
EMBEDDINGS LEFT IN FILE: SWW07.rtf ### 5

EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 35
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 34
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 33
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 32
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 31
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 30
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 29
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 28
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 27
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 26
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 25
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 24
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 23
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 22
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 21
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 20
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 19
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 18
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 17
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 16
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 15
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 14
EMBEDDINGS LEFT IN FILE: SWW09.rtf ### 13
EMBEDDINGS LEFT IN FILE: SWW09.rtf

EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 69
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 68
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 67
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 66
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 65
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 64
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 63
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 62
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 61
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 60
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 59
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 58
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 57
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 56
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 55
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 54
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 53
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 52
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 51
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 50
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 49
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 48
EMBEDDINGS LEFT IN FILE: SWW10.rtf ### 47
EMBEDDINGS LEFT IN FILE: SWW10.rtf

EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 47
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 46
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 45
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 44
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 43
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 42
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 41
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 40
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 39
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 38
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 37
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 36
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 35
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 34
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 33
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 32
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 31
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 30
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 29
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 28
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 27
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 26
EMBEDDINGS LEFT IN FILE: SWW13.rtf ### 25
EMBEDDINGS LEFT IN FILE: SWW13.rtf

EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 45
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 44
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 43
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 42
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 41
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 40
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 39
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 38
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 37
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 36
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 35
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 34
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 33
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 32
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 31
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 30
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 29
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 28
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 27
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 26
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 25
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 24
EMBEDDINGS LEFT IN FILE: SWW16.rtf ### 23
EMBEDDINGS LEFT IN FILE: SWW16.rtf

EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 12
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 11
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 10
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 9
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 8
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 7
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 6
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 5
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 4
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 3
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 2
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 1
EMBEDDINGS LEFT IN FILE: SWW18.rtf ### 0
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 30
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 29
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 28
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 27
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 26
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 25
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 24
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 23
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 22
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 21
EMBEDDINGS LEFT IN FILE: SWW19.rtf ### 20
EM

EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 22
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 21
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 20
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 19
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 18
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 17
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 16
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 15
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 14
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 13
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 12
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 11
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 10
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 9
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 8
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 7
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 6
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 5
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 4
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 3
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 2
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 1
EMBEDDINGS LEFT IN FILE: SWW23.rtf ### 0
TOTAL TOKENS IN ALL FILES: 234028
TOTAL VECT

# READING THE EMBEDDED FILE

In [160]:
import pandas as pd
df2 = pd.read_csv('embedded_snippets.csv')

df2['EMBEDDING'] = df2['EMBEDDING'].apply(eval).apply(np.array, dtype=float)

df_one = df2[df2['INTERVIEW_FILE_NAME'] == 'SWW01.rtf']

embeddings = np.array(df_one['EMBEDDING'].tolist())

# Testing with first interview transcript


In [161]:
df_one = df2[df2['INTERVIEW_FILE_NAME'] == 'SWW01.rtf']

In [162]:
string = ""
for index, row in df_one.iterrows():
    
    string += row['TEXT_SNIPPET']
    
encoding = tiktoken.get_encoding('cl100k_base')
           
num_tokens = len(encoding.encode(string))

print("Number of tokens in file: " + str(num_tokens))
print(len(string.split(" ")))

Number of tokens in file: 19939
16765


In [163]:
def break_string_into_chunks(text, words_per_chunk):
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)

        if len(current_chunk) >= words_per_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []

    # Add the remaining words as the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Formulate a contextual summary of the first transcript

In [159]:
string = ""
for index, row in df_one.iterrows():
    
    string += row['TEXT_SNIPPET']
    
encoding = tiktoken.get_encoding('cl100k_base')
           
num_tokens = len(encoding.encode(string))

print(num_tokens)
print(len(string.split(" ")))




NameError: name 'df_one' is not defined

In [50]:
context = ""

cum_tokens = 0
encoding = tiktoken.get_encoding('cl100k_base')

for index, chunk in enumerate(break_string_into_chunks(string, 500)):
    messages = []
    systemContent = """
            You are a researcher researching the experiences of women who lived through World War II. 
            You are reading an interview transcript and trying to form a summary of the transcript which 
            encapsulates relevant information into a summative, salient, and essence-capturing summary of the transcript. 
            You are reading the transcript chunk by chunk.
    """
    
    if index == 0:
        
        userPrompt = """
            Now you will receive the first part of the interview transcript. Create a summary to include any relevant information 
            in the segment of the transcript. The new summary should be approximately 500 words long. Here is the transcript:
        """ + chunk

        messages = [
                {'role': 'system', 'content': systemContent},
                {'role': 'user', 'content': userPrompt}
        ]
        
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages
        )
        
        context = completion.choices[0].message.content

        
        num_tokens = len(encoding.encode(systemContent + userPrompt + context))
        cum_tokens += num_tokens
        
        print("TOKENS USED: " + str(num_tokens))
        print("CUMULATIVE TOKENS USED: " + str(cum_tokens))
        

    else:
        if(len(context.split(" ")) > 550):
            systemContentShorten = """
                You are a researcher researching the experiences of women who lived through World War II. 
                You are reading an interview transcript and trying to form a summary of the transcript which 
                encapsulates relevant information into a summative, salient, and essence-capturing summary of the transcript. 
                Your current summary is too long and your task is to shorten it.
            """
            
            userPrompt=f"""
            Edit this summary to be under 500 words long: "{context}". 
            
            It is extremely important that you include as much relevant information as possible from the previous summary in your new summary! 
            In your response, include only the modified new summary with no other text.
            """
            
            messages = [
                {'role': 'system', 'content': systemContentShorten},
                {'role': 'user', 'content': userPrompt}
            ]
   
            
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages
            )
            
            context = completion.choices[0].message.content
            print("Context shortened! ")
            print("New Length: " + str(len(context.split(" "))))
            
            num_tokens = len(encoding.encode(systemContentShorten + userPrompt + context))
            cum_tokens += num_tokens
        
            print("TOKENS USED: " + str(num_tokens))
            print("CUMULATIVE TOKENS USED: " + str(cum_tokens))
        
        userPrompt = f"""
            You have read part of the transcript and summarized the content so far as: "{context}". 
            Read the summary and add to it any relevant information present in the next part of the transcript: "{chunk}"
            
            It is extremely important that you include all relevant information from the previous summary in your new summary! 
            In your response, include only the modified new summary consisting of the previous summary modified to include any relevant 
            information present in the new part of the transcript. Your response should be the new summary with an absolute maximum of 500 words 
            and nothing else, with no words such as "in this part of the transcript If the response is more than 500 words, modify it to be under 500 words long".
        """
        
        messages = [
                {'role': 'system', 'content': systemContent},
                {'role': 'user', 'content': userPrompt}
        ]
   
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages
        )
        
        context = completion.choices[0].message.content
        
        num_tokens = len(encoding.encode(systemContent + userPrompt + context))
        cum_tokens += num_tokens
        
        print("TOKENS USED: " + str(num_tokens))
        print("CUMULATIVE TOKENS USED: " + str(cum_tokens))
    print("#################")
    print(context)
    print("LENGTH: " + str(len(context.split(" "))))

    
    
    


TOKENS USED: 880
CUMULATIVE TOKENS USED: 880
#################
Alice D., a woman from Belfast, shares her experiences growing up with a young widow mother and two siblings after her father's death. She recounts starting a job at Thompsons of Belfast making chocolates, but later moves to the Ormeau Bakery during the war years. Alice speaks fondly of her time at the bakery, describing it as some of her happiest years. She mentions the challenges of getting married during the war due to rationing and job availability. Eventually, she and her husband moved to Canada but returned to Belfast due to discomfort with the heat and humidity. Alice reflects on the war years at the Ormeau Bakery, particularly her role in making chocolates for troops and the bakery's involvement in supplying bread. The details of the bakery's operations and its secretive nature are intriguing to her.
LENGTH: 137
TOKENS USED: 1305
CUMULATIVE TOKENS USED: 2185
#################
Alice D., a woman from Belfast, shares h

Context shortened! 
New Length: 387
TOKENS USED: 1317
CUMULATIVE TOKENS USED: 7067
TOKENS USED: 1953
CUMULATIVE TOKENS USED: 9020
#################
Alice D., a woman from Belfast, shares her experiences growing up with a young widow mother and two siblings after her father's death. She recounts starting a job at Thompsons of Belfast making chocolates, but later moves to the Ormeau Bakery during the war years. Alice speaks fondly of her time at the bakery, describing it as some of her happiest years. She mentions the challenges of getting married during the war due to rationing and job availability. Eventually, she and her husband moved to Canada but returned to Belfast due to discomfort with the heat and humidity. The Ormeau Bakery supplied bread and made chocolates for the troops, despite rationing. The bakery had high standards and only used pure and fresh ingredients. The bakery spread its staff over different departments to avoid layoffs and maintained its high standards, even requ

TOKENS USED: 1651
CUMULATIVE TOKENS USED: 15260
#################
Alice D., a woman from Belfast, shares her experiences growing up with a young widow mother, two siblings, and her brother's trade and employment history. Alice mentions her mother's hard work and the household chores she and her siblings helped with, such as turning the mangle and ironing. She recounts her mother's weekly routine of working in an office, washing on Tuesday nights, and ironing on Thursday nights. Despite the work, Alice reflects that there was no stress or difficulty in handing in their pay packets and being able to buy affordable materials for clothing. She recalls saving up to buy material and having someone make clothes for her. Alice also mentions taking care of children for others and earning a small amount of money. She highlights the low wages in those days but expresses gratitude for the opportunity to work and the ability to save. Overall, Alice emphasizes the work ethic instilled in her by her 

TOKENS USED: 2169
CUMULATIVE TOKENS USED: 22259
#################
Alice D., a woman from Belfast, shares her experiences growing up with a young widow mother, two siblings, and her brother's trade and employment history. Alice mentions her mother's hard work and the household chores she and her siblings helped with, such as turning the mangle and ironing. She recounts her mother's weekly routine of working in an office, washing on Tuesday nights, and ironing on Thursday nights. Despite the work, Alice reflects that there was no stress or difficulty in handing in their pay packets and being able to buy affordable materials for clothing. She recalls saving up to buy material and having someone make clothes for her. Alice also mentions taking care of children for others and earning a small amount of money. She highlights the low wages in those days but expresses gratitude for the opportunity to work and the ability to save. Alice further talks about her experience working at either Thomps

TOKENS USED: 2091
CUMULATIVE TOKENS USED: 30014
#################
Alice D., a woman from Belfast, shares her experiences growing up with a hardworking young widow mother and two siblings. She discusses her mother's weekly routine and reflects on the lack of stress in handling pay packets and acquiring affordable materials. Alice mentions working various jobs while attending school, expressing gratitude for the opportunity to work and save. She later immigrated to Canada with her husband and found jobs to support her mother financially. Alice compares Canada to the UK and explains her decision to stay in Canada. She recounts her time at the Ormeau Bakery, praising the owner and discussing strict appearance policies. Alice also mentions her involvement in trade unions. Miss Grahame, a highly educated and strict woman, interviewed Alice for a position at the bakery. Alice credits her experience working in shop windows as a teenager with giving her confidence and helping her overcome fear.

TOKENS USED: 1893
CUMULATIVE TOKENS USED: 36690
#################
Alice D., a woman from Belfast, shares her experiences growing up with a hardworking young widow mother and two siblings. She discusses her mother's weekly routine and reflects on the lack of stress in handling pay packets and acquiring affordable materials. Alice mentions working various jobs while attending school, expressing gratitude for the opportunity to work and save. She later immigrated to Canada with her husband and found jobs to support her mother financially. Alice compares Canada to the UK and explains her decision to stay in Canada. She recounts her time at the Ormeau Bakery, praising the owner and discussing strict appearance policies. Alice also mentions her involvement in trade unions. Miss Grahame interviewed Alice for a position at the bakery. Alice credits her experience working in shop windows as a teenager with giving her confidence. She reveals that her appearance, work ethic, and capabilities led 

TOKENS USED: 1904
CUMULATIVE TOKENS USED: 44632
#################
Alice D., a woman from Belfast, reflects on her time at the Ormeau Bakery during World War II. She praises the bakery's high standards and hygiene practices, mentioning Miss Grahame as the supervisor. Alice herself monitored the girls in the chocolate room and took responsibility for their performance. She mentions the silver tea service as proof of her high standing in the bakery and recounts receiving a generous donation from management. Alice also describes the presence of an ambulance room with a doctor nearby for medical emergencies. She digresses to share a personal anecdote about a bike accident caused by a landmine in Cromac Square. Despite this, she continues to discuss the war and censorship, as well as the bakery's refusal to compromise on quality, even rejecting deliveries that did not meet their standards. Overall, Alice fondly recalls her experiences at the Ormeau Bakery and considers it a privilege to have

TOKENS USED: 1881
CUMULATIVE TOKENS USED: 51379
#################
Alice D., a woman from Belfast, reflects on her time at the Ormeau Bakery during World War II. She praises the bakery's high standards and hygiene practices, mentioning Miss Grahame as the supervisor. Alice herself monitored the girls in the chocolate room and took responsibility for their performance. Despite a bike accident caused by a landmine, Alice discusses the war, censorship, and the bakery's refusal to compromise on quality. She recalls the presence of an ambulance room and generous support from management during her recovery. The Ormeau Bakery supported the war effort by sending food parcels to enlisted employees and kept jobs open for those who returned. Alice emphasizes the happy times and opportunities to meet lovely people during the war. She fondly recalls her love for Co. Wicklow and cycling trips to Glendalough and Glenmalure. Alice also mentions her father's death from typhoid and her mother's struggle 

TOKENS USED: 2106
CUMULATIVE TOKENS USED: 58670
#################
Alice D., a woman from Belfast, reflects on her time at the Ormeau Bakery during World War II. She praises the bakery's high standards and hygiene practices, mentioning Miss Grahame as the supervisor. Alice discusses the war, censorship, and the bakery's refusal to compromise on quality. She recalls the presence of an ambulance room and the support from management during her recovery from a bike accident caused by a landmine. The Ormeau Bakery supported the war effort by sending food parcels to enlisted employees and kept jobs open for those who returned. Alice fondly recalls her love for Co. Wicklow and cycling trips to Glendalough and Glenmalure. She mentions her father's death from typhoid and her mother's struggle to raise her and her siblings. Alice highlights the support from neighbors in times of tragedy and the family's move to a nicer locality when they were all working and grown up. She briefly mentions her inv

TOKENS USED: 2150
CUMULATIVE TOKENS USED: 65303
#################
Alice D. reflects on her time at the Ormeau Bakery during World War II, praising its high standards and hygiene practices. She discusses the war and censorship, as well as the bakery's refusal to compromise on quality. Alice recalls the support she received from management during her recovery from a bike accident caused by a landmine. The Ormeau Bakery supported the war effort and kept jobs open for enlisted employees. Alice fondly recalls her love for Co. Wicklow and cycling trips to Glendalough and Glenmalure. She mentions her father's death from typhoid and her mother's struggle to raise her and her siblings. Alice highlights the support from neighbors in times of tragedy and the family's move to a nicer locality. She briefly mentions her involvement in the Girl Guides, camping, and guiding. Alice reflects on her love for nature and the guides' influence on her appreciation for the creation. She briefly discusses her 

In [164]:
context = "Alice D. reminisces about her time at Ormeau Bakery during WWII, praising its standards and hygiene. She discusses censorship, the bakery's commitment to quality, and the support she received from management after a bike accident caused by a landmine. The bakery kept jobs open for enlisted employees and supported the war effort. Alice fondly recalls her love for Co. Wicklow and cycling trips. She mentions her father's death, her mother's struggle to raise her and her siblings, and the support they received from neighbors. Alice reflects on her involvement in the Girl Guides, her love for nature, and her romantic relationship. The close neighborhood network and strong community support are emphasized. She comments on the changes in society and the diminished sense of neighborliness. Despite this, Alice acknowledges that overall, quality of life has improved. She remembers the freedom she had as a child, the strict rules her single mother laid down, and her mother's role as both provider and disciplinarian. Alice reflects on her family dynamics and the support they received from her mother's side. She assisted her mother with household chores and they appreciated their close-knit community on Donegall Road. Independence, self-sufficiency, and community support are valued by Alice throughout her life. The war led to the displacement of people and the breaking up of neighborhoods, but it took a long time for them to recover. However, when they did, they became far better places. The introduction of the Northern Ireland Housing Executive led to the construction of lovely estates, including the cozy houses Alice and her husband lived in. The houses were designed with thoughtfulness and had a place for everything. Overall, Alice found the conversation beneficial."

### CONTEXTUAL SUMMARY GENERATED BY AI


In [165]:
from IPython.display import Markdown as md


md(context)

Alice D. reminisces about her time at Ormeau Bakery during WWII, praising its standards and hygiene. She discusses censorship, the bakery's commitment to quality, and the support she received from management after a bike accident caused by a landmine. The bakery kept jobs open for enlisted employees and supported the war effort. Alice fondly recalls her love for Co. Wicklow and cycling trips. She mentions her father's death, her mother's struggle to raise her and her siblings, and the support they received from neighbors. Alice reflects on her involvement in the Girl Guides, her love for nature, and her romantic relationship. The close neighborhood network and strong community support are emphasized. She comments on the changes in society and the diminished sense of neighborliness. Despite this, Alice acknowledges that overall, quality of life has improved. She remembers the freedom she had as a child, the strict rules her single mother laid down, and her mother's role as both provider and disciplinarian. Alice reflects on her family dynamics and the support they received from her mother's side. She assisted her mother with household chores and they appreciated their close-knit community on Donegall Road. Independence, self-sufficiency, and community support are valued by Alice throughout her life. The war led to the displacement of people and the breaking up of neighborhoods, but it took a long time for them to recover. However, when they did, they became far better places. The introduction of the Northern Ireland Housing Executive led to the construction of lovely estates, including the cozy houses Alice and her husband lived in. The houses were designed with thoughtfulness and had a place for everything. Overall, Alice found the conversation beneficial.

# Finding codes

In [169]:
import json

codesMap = {}
cum_tokens = 0


# Store start time
start_time = time.time()


for index, chunk in enumerate(break_string_into_chunks(string, 500)):
    systemContent = """
                    You are a researcher researching the experiences of women who lived through World War II. 
                    You are reading an interview transcript and trying to find thematic codes which encapsulate the 
                    experience of women in a summative, salient, and essence-capturing summary of the transcript. The purpose
                    of the codes is to form the basis of research findings on how the war impacted women.
                """

    assistantContent = f"""You can understand the context of the interview by reading this brief contextual summary: "{context}" """

    questionContent = f"""You will receive a part of the interview transcript with an interviewee who participated in 
    the research. Based on the transcript and contextual knowledge from the summary, identify 10 codes which 
    can be seen as direct quotes in the transcript and which highlight how the war impacted women in a summative, salient, 
    and essence-capturing way. Under each code, include some elaboration of what the code means. Here is the transcript: "{chunk}".
    In your response, include nothing but each code and its description in the following format: Number. Name - Description. Eg. "1. Widow's Struggle - Highlighting the hardships faced by young widows who had to bring up children without any financial support or assistance.""
 """
    
    messages = [
            {'role': 'system', 'content': systemContent},
            {"role": "assistant", "content": assistantContent},
            {'role': 'user', 'content': questionContent}
        ]

    completion = askGPT(messages)
    response = completion['response']
    num_tokens = completion['tokens']
    cum_tokens += num_tokens
        
    print("TOKENS USED: " + str(num_tokens))
    print("CUMULATIVE TOKENS USED: " + str(cum_tokens/ 1000) + "K")
    
    print(response)
    
    codesMap[index] = {
        "text": chunk,
        "codes": response.replace('\n', '     ')
    }


# Specify the text file path
text_file = 'codes.txt'

# Open the text file in write mode
with open(text_file, mode='w') as file:
    json.dump(codesMap, file, ensure_ascii=False)

    
elapsed_time = time.time() - start_time

print(f"Data has been saved to {text_file}.")
print("Total tokens used: " + str(cum_tokens))
print("Total words analyzed: " +str(len(string)))
print("Computation took " + str(elapsed_time / 60) + "minutes" )


TOKENS USED: 1543
CUMULATIVE TOKENS USED: 1.543K
1. Widow's Struggle - Highlighting the hardships faced by young widows who had to bring up children without any financial support or assistance.

2. Chocolates and Rationing - Reflecting on the impact of war rationing and the limitations it imposed on personal choices such as marriage, and emphasizing the significance of the job making chocolates during wartime.

3. Job Stability - Appreciating the stability and security of having a good job during wartime as many people had to work in ammunition places and faced uncertainty.

4. Food Supply and Troop Support - Highlighting the essential role of the bakery in providing food for troops and contributing to the war effort through the production of parcels.

5. Hush Hush Operations - Describing the secretive nature of the bakery's involvement in supplying bread when ships came in during the war, reflecting the atmosphere of censorship and confidentiality.

6. Community Support and Neighborli

TOKENS USED: 1484
CUMULATIVE TOKENS USED: 9.162K
1. Hygiene Standards - Emphasizing the importance of standards and hygiene at Ormeau Bakery during WWII, highlighting the commitment to quality in the workplace.
2. Girl Guides Influence - Acknowledging the positive impact the Girl Guides had on the interviewee's life and how it shaped her character.
3. Improved Salary - Highlighting the significance of the increased wages the interviewee received when she joined Ormeau Bakery, especially during the war years.
4. Managerial Standing - Describing the promotion the interviewee received at the bakery, which not only increased her wages but also gave her a sense of managerial standing.
5. Appreciated Gifts - Reflecting on the value of the special gifts the interviewee received from Ormeau Bakery as tokens of appreciation.
6. Plans to Canada - Explaining the reason for leaving the bakery after getting married, as the interviewee and her husband planned to move to Canada.
7. Limited Opportunit

TOKENS USED: 1423
CUMULATIVE TOKENS USED: 16.647K
1. Ormeau Bakery - Highlighting the positive experience and high standards of working at the bakery during the war.

2. Looking Back - Expressing nostalgia for the Ormeau Bakery and how it set the standard for future workplaces.

3. Mr. Wilson's Presence - Illustrating the personal and attentive management approach of the bakery owner, Mr. Wilson.

4. Hair-Related Censorship - Demonstrating the strict dress code and expectations regarding appearance for employees at the bakery.

5. Diplomatic Correction - Highlighting the considerate and diplomatic approach of Miss Grahame in addressing appearance issues among employees.

6. Hygiene Standards - Emphasizing the importance of hygiene and cleanliness at the bakery, evidenced by the mandatory showers and provided laundry services.

7. Timekeeping - Showing the punctuality expectations and consequences for tardiness.

8. Deductions for Tardiness - Exploring the potential financial consequenc

TOKENS USED: 1669
CUMULATIVE TOKENS USED: 24.365K
1. Enjoyment amidst war - Capturing the idea that women found joy and pleasure in the simple activities and social events that allowed them to escape the challenges of war, such as going to dances and spending time with friends.

2. Resourcefulness of mothers - Illustrating the resilience and ingenuity of mothers who were able to provide for their families despite the limitations imposed by rationing. This code highlights the ability of women to make do with limited resources.

3. Community support during wartime - Highlighting the sense of solidarity and generosity within the community, where neighbors would offer food and hospitality to soldiers and provide relief from the often monotonous and rationed diet of the military barracks.

4. Transportation hardships - Describing the logistical challenges faced by women during the war due to the lack of available transportation, particularly the scarcity of petrol for cars. This code emphas

TOKENS USED: 1526
CUMULATIVE TOKENS USED: 31.939K
1. Support from soldiers - Highlighting the assistance and support provided by soldiers during the war, even in non-combat situations.
2. Importance of the bicycle - Emphasizing the role of bicycles in everyday life during the war, and the impact of a bicycle accident on the interviewee's mobility and independence.
3. Mother's premonition - Describing the intuitive connection between mothers and their children, highlighting the emotional impact of the accident on both the interviewee and her mother.
4. Resilience and determination - Demonstrating the interviewee's resilience and determination to overcome the accident and continue with her daily activities, such as going to work the next morning.
5. Support from employer - Illustrating the supportive nature of the interviewee's workplace, as her manager immediately attended to her well-being and ensured she received medical attention and time off to recover.
6. Financial compensation - N

TOKENS USED: 1534
CUMULATIVE TOKENS USED: 39.604K
1. Strong guide movement - An emphasis on the active participation and commitment to the guide movement, showcasing the strength of the organization and its impact on young women during the war.
2. Voluntary work - Highlighting the willingness of individuals to engage in community service and support during wartime, indicating the sense of duty and dedication among women.
3. Sustained connections - Illustrating the enduring bonds and connections formed through participation in organizations like the guide movement and sea scouts, emphasizing the lasting impact and loyalty to these groups.
4. Musical competitions - Demonstrating the importance of music and cultural activities, which provided opportunities for skill development and camaraderie among young women during the war.
5. Hard life for mothers - Describing the challenging circumstances faced by mothers, particularly those who had to assume family responsibilities at a young age an

TOKENS USED: 1535
CUMULATIVE TOKENS USED: 47.143K
1. Neighborhood Support - Highlighting the strong network of neighbors who provided assistance and care for each other, such as minding children and offering a helping hand.
2. Close Community - Emphasizing the tight-knit and interconnected nature of the community, where neighbors would freely enter each other's houses and support one another.
3. Independence and Freedom - Reflecting the sense of freedom and self-sufficiency experienced by women during the war, such as going out at night and taking moonlit walks.
4. Diminished Neighborliness - Noting the decline in neighborliness and community support in modern times compared to the past, leading to a loss of the comforting sense of knowing and relying on one's neighbors.
5. Improved Material Quality of Life - Acknowledging that while the sense of neighborliness has diminished, the overall material standard of living has improved for everyone.
6. Protective Parenting - Contrasting the n

In [112]:
print(cum_tokens)

7629


In [172]:
print(json_data[key]["codes"])

{'1': {'title': "Widow's Struggle", 'description': 'Highlighting the hardships faced by young widows who had to bring up children without any financial support or assistance', 'quotes': ['My father died when I was only two years old so my very young mother, twenty-three, was a widow. My brother was born after he died and she brought the three of us up. In those days, there was no widow’s pensions or anything like that so it was all hard work.']}, '2': {'title': 'Chocolates and Rationing', 'description': 'Reflecting on the impact of war rationing and the limitations it imposed on personal choices such as marriage, and emphasizing the significance of the job making chocolates during wartime', 'quotes': ['when I started work, I started a very interesting job. I loved it, making chocolates. I started with a small firm, first of all, Thompsons of Belfast, but I moved from there to the Ormeau Bakery in 1939, the war years, you know and I was there from thirty nine to fifty three, when I got 

# Send codes to ChatGPT and ask to find direct quotes

In [171]:
import json

# Open the text file containing the JSON data
response = {}

# Store start time
start_time = time.time()
cum_tokens = 0

with open('codes.txt', 'r+') as file:
    # Read the contents of the file
    data = file.read()

    # Parse the JSON data
    json_data = json.loads(data)   
    counter = 0
    for key in list(json_data.keys())[:1]:

            system = """You are a researcher who has read research interview transcripts and extracted thematic codes from 
            the interviews. Now, you are trying to find quotes to support the validity of your codes
            """
            question = f"""Read the following codes: "{json_data[key]["codes"]}"

            Under each code, find quotes (preferably more than one) from the text: "{json_data[key]['text']}"

            List each code and the quotes under the codes. Your response should include only a json object of form: 

                {{ "1": {{'title': title_of_code, 'quotes': [quote1, quote2, ...]}}, ...}}
            """

            messages = [
                {'role': 'system', 'content': system},
                {'role': 'user', 'content': question}
            ]

            #response = askGPT(messages)

            cum_tokens += response['tokens']
            print("response ready. Tokens used: " + str(cum_tokens))

            text = json.loads(response['response'])
            
            print(text)
    
            counter += 1
            print("DONE: " + str(counter))
            
            
            json_data[key]['codes'] = text
            
    #file.truncate(0)
    #file.seek(0)
    #json.dump(json_data, file, ensure_ascii=False)
    
    
    print("Data saved into codes.txt")
    
    elapsed_time = time.time() - start_time
    print("Computation took " + str(elapsed_time / 60) + "minutes" )
    print("Total tokens used: " + str(cum_tokens))
    
        

response ready. Tokens used: 2081
{'1': {'title': "Widow's Struggle", 'description': 'Highlighting the hardships faced by young widows who had to bring up children without any financial support or assistance', 'quotes': ['My father died when I was only two years old so my very young mother, twenty-three, was a widow. My brother was born after he died and she brought the three of us up. In those days, there was no widow’s pensions or anything like that so it was all hard work.']}, '2': {'title': 'Chocolates and Rationing', 'description': 'Reflecting on the impact of war rationing and the limitations it imposed on personal choices such as marriage, and emphasizing the significance of the job making chocolates during wartime', 'quotes': ['when I started work, I started a very interesting job. I loved it, making chocolates. I started with a small firm, first of all, Thompsons of Belfast, but I moved from there to the Ormeau Bakery in 1939, the war years, you know and I was there from thirt

# Fetch embeddings for code descriptions

In [87]:
def get_embedding(text, model="text-embedding-ada-002"):
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

with open('codes.txt', 'r+') as file:
    data =file.read()
    json_data = json.loads(data)
        
    for snippetId in list(json_data.keys()):
        print("Working on snippet " + snippetId)
        
        for codeId in list(json_data[snippetId]['codes'].keys()):
            print("working on code Id " + codeId)
            description = json_data[snippetId]['codes'][codeId]['description']
            embedding = get_embedding(description)
            json_data[snippetId]['codes'][codeId]['embedding'] = embedding
            print("embedded.")
            
    file.truncate(0)
    file.seek(0)
    json.dump(json_data, file, ensure_ascii=False)
    
    print("Data saved into codes.txt")



Working on snippet 0
working on code Id 1
embedded.
working on code Id 2
embedded.
working on code Id 3
embedded.
working on code Id 4
embedded.
working on code Id 5
embedded.
working on code Id 6
embedded.
working on code Id 7
embedded.
working on code Id 8
embedded.
working on code Id 9
embedded.
working on code Id 10
embedded.
Working on snippet 1
working on code Id 1
embedded.
working on code Id 2
embedded.
working on code Id 3
embedded.
working on code Id 4
embedded.
working on code Id 5
embedded.
working on code Id 6
embedded.
working on code Id 7
embedded.
working on code Id 8
embedded.
working on code Id 9
embedded.
working on code Id 10
embedded.
Working on snippet 2
working on code Id 1
embedded.
working on code Id 2
embedded.
working on code Id 3
embedded.
working on code Id 4
embedded.
working on code Id 5
embedded.
working on code Id 6
embedded.
working on code Id 7
embedded.
working on code Id 8
embedded.
working on code Id 9
embedded.
working on code Id 10
embedded.
Work

# Create a pd dataframe

In [95]:
# [id, title, description, embedding, quotes, text]
import uuid

df = pd.DataFrame(columns=["ID", "TITLE", "DESCRIPTION", "EMBEDDING", "QUOTES", "TEXT"])


textSnippets = {}

with open('textSnippets.txt', 'r') as file: 
    data = file.read()
    textSnippets = json.loads(data)   
    

with open('codes.txt', 'r') as file:
    data =file.read()
    json_data = json.loads(data)
        
    for snippetId in list(json_data.keys()):
        for codeId in list(json_data[snippetId]['codes'].keys()):
            code = json_data[snippetId]['codes'][codeId]
            print(code)
            new_row = {
                "ID": uuid.uuid4().hex,
                "TITLE": code['title'],
                "DESCRIPTION": code['description'],
                "EMBEDDING": code['embedding'],
                "QUOTES": code['quotes'],
                "TEXT": textSnippets[snippetId]
            }
            print(new_row)
            df = df.append(new_row, ignore_index=True)

            
df.to_csv('codes.csv', index=False)          
            
            
           
            
    



{'title': 'Hard Work', 'quotes': ['My father died when I was only two years old so my very young mother, twenty-three, was a widow.', 'In those days, there was no widow’s pensions or anything like that so it was all hard work.'], 'description': "Illustrates the challenging circumstances under which Alice's mother had to raise three young children as a widow without any financial support or assistance after her husband's death.", 'embedding': [0.007753467187285423, -0.010289276018738747, -0.014351880177855492, -0.017511684447526932, 0.0030220600310713053, 0.012201089411973953, -0.012898105196654797, 0.01821533776819706, -0.02934103272855282, -0.051433105021715164, -0.004988639149814844, 0.02539791539311409, 0.007335258182138205, -0.013163634575903416, -0.0048957038670778275, 0.006910410709679127, 0.02495979145169258, 0.007288790307939053, 0.012094877660274506, -0.023791462182998657, -0.032527387142181396, 0.017657725140452385, 0.005400210153311491, -0.0008492801571264863, -0.00296231568

  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.

{'title': 'Job Redistribution', 'quotes': ['they didn’t pay anyone off, they spread the staff over the different departments, of which I received two into the chocolate department.'], 'description': 'Explaining how the bakery avoided layoffs by reallocating staff to different departments, such as the chocolate department, when certain products had to be suspended due to ingredient shortages.', 'embedding': [-0.00733958138152957, -0.010638651438057423, -0.0022787393536418676, 0.002030458766967058, -0.020202554762363434, -0.0017158567206934094, 0.00610498059540987, -0.004628901835530996, -0.0065573276951909065, -0.03360290452837944, 0.023494822904467583, -0.005407754797488451, -0.017019122838974, -0.01537298783659935, -0.01625727489590645, -0.008802056312561035, 0.029031818732619286, -0.0022804399486631155, -0.005261507350951433, -0.033358022570610046, -0.026079662144184113, -0.006057365331798792, 0.009346232749521732, 0.017087144777178764, -0.003809235990047455, -0.012903786264359951, 0

  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)


{'title': 'Walking in the blackout', 'quotes': ['you couldn’t walk in the blackout then, alone, you had to be in twos and threes because again, everything was pitch.', 'So it was pretty weird but there was a lot of fun during the war as well because there was canteens set up in different hotels'], 'description': 'Depicting the challenging conditions faced by women during blackout periods, when they had to walk in pairs or groups in pitch darkness due to safety concerns, reflecting the restrictions and precautions imposed by the war.', 'embedding': [-0.01871650479733944, -0.029546689242124557, -0.001033053849823773, -0.024253133684396744, -0.004945829510688782, 0.0042166151106357574, 0.019783318042755127, -0.009439276531338692, 0.005269924644380808, -0.06730379164218903, 0.00026016245828941464, 0.021336274221539497, 0.006643954198807478, -0.005877603776752949, -0.010600617155432701, 0.018919063732028008, 0.021025683730840683, 0.008190158754587173, -0.002587698632851243, -0.0206475723534

  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [127]:
import uuid

df = pd.read_csv('codes.csv')


title_embeddings = []   
for index, row in df.iterrows():
        title = row ['TITLE']
        title_embedding = get_embedding(title)
        title_embeddings.append(title_embedding)
        print("Embedding done for title: " + str(index))
df['TITLE_EMBEDDINGS'] = title_embeddings

print(df.columns)
            
df.to_csv('codes.csv', index=False)          
            
            
           
     

Embedding done for title: 0
Embedding done for title: 1
Embedding done for title: 2
Embedding done for title: 3
Embedding done for title: 4
Embedding done for title: 5
Embedding done for title: 6
Embedding done for title: 7
Embedding done for title: 8
Embedding done for title: 9
Embedding done for title: 10
Embedding done for title: 11
Embedding done for title: 12
Embedding done for title: 13
Embedding done for title: 14
Embedding done for title: 15
Embedding done for title: 16
Embedding done for title: 17
Embedding done for title: 18
Embedding done for title: 19
Embedding done for title: 20
Embedding done for title: 21
Embedding done for title: 22
Embedding done for title: 23
Embedding done for title: 24
Embedding done for title: 25
Embedding done for title: 26
Embedding done for title: 27
Embedding done for title: 28
Embedding done for title: 29
Embedding done for title: 30
Embedding done for title: 31
Embedding done for title: 32
Embedding done for title: 33
Embedding done for title

# Fetch groups of embeddings to find insights

In [134]:
#test finding similar codes
df = pd.read_csv('codes.csv')

df['EMBEDDING'] = df['EMBEDDING'].apply(eval).apply(np.array, dtype=float)
df['TITLE_EMBEDDINGS'] = df['TITLE_EMBEDDINGS'].apply(eval).apply(np.array, dtype=float)


embeddings = np.array(df['EMBEDDING'].tolist())
title_embeddings = np.array(df['TITLE_EMBEDDINGS'].tolist())

similarities = cosine_similarity([title_embeddings[8]], title_embeddings)
print(df.iloc[8, :])

sorted_indices = np.argsort(similarities[0])[::-1][:20]
print([similarities[0][i] for i in sorted_indices][:20])
top_10_snippets=  df.loc[sorted_indices].iloc[:, :]

titles = np.array(top_10_snippets['TITLE'].tolist())
descriptions = np.array(top_10_snippets['DESCRIPTION'].tolist())
quotes = np.array(top_10_snippets['QUOTES'].tolist())


for i in range(0, 20):
    print(str(i) + ". title: " + titles[i])
    print(str(i) + ". description: " + descriptions[i])
    print(str(i) + ". quotes: " + quotes[i])

"""
codeStack
    function mergeCodes {
        pop next code from stack
        take 10 closest embeddings
        ask GPT to see if they relate to each other with quotes
        return new code and description and codes it was made up of as well as code IDs which did not relate
        append to new codes json
        remove merged codes from codeStack
        mergeCodes
        
        if(stack length > 1) # Stop when we have iterated through all codes in stack =) can disregard this code, as it was not significant 
    }
    

"""

ID                                   856f88518fc146e29debb1a705cd91f6
TITLE                                                       Nostalgia
DESCRIPTION         Expresses Alice's fondness for the war years a...
EMBEDDING           [-0.017168628051877022, -0.030933482572436333,...
QUOTES              ['But the war years in the Ormeau Bakery were ...
TEXT                Well, my name is Alice D. That’s my married na...
TITLE_EMBEDDINGS    [-0.004050612449645996, 0.0023300545290112495,...
Name: 8, dtype: object
[0.9999999999999921, 0.8685426389055131, 0.8615719484024514, 0.8382052617194617, 0.8323593892124725, 0.8079641157746409, 0.8060907935493621, 0.8035842946087346, 0.7990807296021704, 0.792942068919648, 0.7918106620194735, 0.7901234964916938, 0.7835120135732575, 0.7823801351321336, 0.782178219604829, 0.7811838978248615, 0.7791840533022266, 0.7787295847205098, 0.7786666057392941, 0.7780573970172981]
0. title: Nostalgia
0. description: Expresses Alice's fondness for the war years at Orme

'\ncodeStack\n    function mergeCodes {\n        pop next code from stack\n        take 10 closest embeddings\n        ask GPT to see if they relate to each other with quotes\n        return new code and description and codes it was made up of as well as code IDs which did not relate\n        append to new codes json\n        remove merged codes from codeStack\n        mergeCodes\n        \n        if(stack length > 1) # Stop when we have iterated through all codes in stack =) can disregard this code, as it was not significant \n    }\n    \n\n'

# K means grouping

In [157]:

from sklearn.cluster import KMeans
import ast

matrix = np.vstack(df.EMBEDDING.values)
n_clusters = 80

kmeans = KMeans(n_clusters = n_clusters, init='k-means++', random_state=42)
kmeans.fit(matrix)
df['CLUSTER'] = kmeans.labels_

cluster_groups = df.groupby('CLUSTER')

for cluster_label, cluster_data in cluster_groups:
    print(f"Cluster {cluster_label}:")
    for index, row in cluster_data.iterrows():
        print(row['DESCRIPTION'])
        for quote in ast.literal_eval(row['QUOTES']):
            print("\"" + quote + "\"")
    
    print('\n')

Cluster 0:
Explaining the financial constraints faced by the interviewee's family, leading to the decision for her to start working at a young age rather than pursuing further education.
"I’m waiting for her to go to work – that’s just what it was."


Cluster 1:
Mentioning the various departments and tasks within the Ormeau Bakery, including hotplate workers, cake decorators, and biscuit makers, illustrating the diverse roles occupied by women in the bakery during the war.
"I was very fortunate there because the gentleman who was the - what would you call him? Well, he was the boss of the whole show and he made the wedding cakes and everything and he did that kind of work in the chocolate room on lovely steel-topped tables."
"He got promotion and of course, was taken out, and I was very fortunate to have been given then the complete charge of the chocolate room. So I usually had maybe six to eight girls, usually you know, and they’d be there and we were all a very happy bunch."
Highlig

# Hacks to link description to codes

In [93]:
codeTextJson ={}
with open('codes2.txt', 'r') as file: 
    data = file.read()
    json_data = json.loads(data)   
    
    for key in list(json_data.keys()):
        text = json_data[key]["text"]
        
        codeTextJson[key] = text
  
with open('textSnippets.txt', 'w') as file:
    json.dump(codeTextJson, file, ensure_ascii=False)

        

In [83]:
"""with open('codes.txt', 'r+') as file:
    data =file.read()
    json_data = json.loads(data)
    
    print(json_data.keys())
    print(codeDescriptionJson.keys())
    if(json_data.keys() != codeDescriptionJson.keys()): raise ValueError("Snippet keys in jsons dont match")
        
    for snippetId in list(json_data.keys()):
        print("Working on snippet " + snippetId)
        print(json_data[snippetId]['codes'].keys())
        print(codeDescriptionJson[snippetId].keys())
        if(json_data[snippetId]['codes'].keys() != codeDescriptionJson[snippetId].keys()): raise ValueError("Code ids in snippet: " + str(snippetId) + "dont match")
        print("codes keys OK")
        for codeId in list(json_data[snippetId]['codes'].keys()):
            print("working on code Id " + codeId)
            print("title: " + json_data[snippetId]['codes'][codeId]['title'])
            print("title: " + codeDescriptionJson[snippetId][codeId]['title'])
        
            #code ID should match
            if(snippetId != "14" or codeId != "9"):
                if(json_data[snippetId]['codes'][codeId]['title'] != codeDescriptionJson[snippetId][codeId]['title']):
                    raise ValueError("Titles dont match: #Title 1: " + json_data[snippetId]['codes'][codeId]['title'] + "  #Title 2: " + codeDescriptionJson[snippetId][codeId]['title'])
            json_data[snippetId]['codes'][codeId]['description'] = codeDescriptionJson[snippetId][codeId]['description']
            print("success")
    file.truncate(0)
    file.seek(0)
    json.dump(json_data, file, ensure_ascii=False)
    
    print("Data saved into codes.txt")
"""
            

dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14'])
dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14'])
Working on snippet 0
dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
codes keys OK
working on code Id 1
title: Hard Work
title: Hard Work
success
working on code Id 2
title: Independence
title: Independence
success
working on code Id 3
title: War Effort
title: War Effort
success
working on code Id 4
title: Rationing and Coupons
title: Rationing and Coupons
success
working on code Id 5
title: Food Supply
title: Food Supply
success
working on code Id 6
title: Secrecy and Censorship
title: Secrecy and Censorship
success
working on code Id 7
title: Community Support
title: Community Support
success
working on code Id 8
title: Quality of Life
title: Quality of Life
success
working on code Id 9
title: Nostalgia
title: Nostalgia
succes

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


model="text-embedding-ada-002"

def string_from_arr(text):
     text = " ".join(" ".join(text).split()[:100])
     return text.replace("\n", " ")

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

input_sentence = "Prior to the war, opportunities for women were limited to traditional roles like homemaking and low-skilled jobs. However, with the men being drafted into the military, industries were desperate for workers, and I found myself working in a factory assembling aircraft parts. This experience opened doors for me and many other women. After the war, I pursued a career in engineering, a field I never imagined entering before. The war definitely had a profound impact on our subsequent career trajectories."
input_sentence_against = "I lived through World War II, and I must say that the employment patterns and opportunities for women didn't change as dramatically as some claim. Yes, there was a need for women to take up jobs that were traditionally held by men, such as working in factories or offices. But these opportunities were often temporary and limited. Once the war ended and the men returned, society quickly reverted back to its traditional gender roles. Many women, including myself, were expected to leave the workforce and return to our domestic duties. The impact on our career trajectories was minimal, as the prevailing societal norms didn't allow for significant long-term changes."

input_embedding = np.array(get_embedding(input_sentence, model))

input_embedding_against = np.array(get_embedding(input_sentence_against, model))



similarities = cosine_similarity([input_embedding], embeddings)
sorted_indices = np.argsort(similarities[0])[::-1]
print([similarities[0][i] for i in sorted_indices][:10])
top_10_snippets=  df_one.loc[sorted_indices].iloc[:10, :2]


similarities_against = cosine_similarity([input_embedding_against], embeddings)
sorted_indices_against = np.argsort(similarities_against[0])[::-1]
print([similarities_against[0][i] for i in sorted_indices_against][:8])
top_10_snippets_against=  df_one.loc[sorted_indices_against].iloc[:10, :2]



question = """You will receive 20 quotes from an interviewee who participated in the research. 
Regarding the question: What were the employment patterns and opportunities for women during World War II, 
and how did these experiences impact their subsequent career trajectories?, identify 10 thematic codes which emerge 
in the 20 quotes and explain how they emerged in the quotes. Please include the number of quotes which relate to the thematic code."""
messages = [
        {'role': 'system', 'content': 'You are a researcher researching how world war two impacted women. You are trying to find thematic codes in your qualitative research interview data, which illustrate different themes which emerge in the research data'},
        {'role': 'user', 'content': question}
    ]

for index, row in top_10_snippets.iterrows():
        snippet = row['TEXT_SNIPPET']
        print("TEXT SNIPPET FOR: " + snippet)
        messages.extend([{'role': 'user', 'content': snippet}])
for index,row in top_10_snippets_against.iterrows():
    snippet = row['TEXT_SNIPPET']
    print("TEXT SNIPPET AGAINST: " + snippet)
    messages.extend([{'role': 'user', 'content': snippet}])



completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
)
print(completion.choices)
print(completion.choices[0].message.content)

    

    
# df3.to_csv('found_cases.csv', index=False)
    

[0.7651977619475606, 0.7625256318359452, 0.7608468871719692, 0.7582924450213572, 0.7522791839985941, 0.744134256820393, 0.7438057195207606, 0.7437967334630499, 0.7436608022513328, 0.7429334372389791]
[0.7905507629253821, 0.7819074861349609, 0.7798289728055968, 0.7771109057350033, 0.7688069768684549, 0.7674484594562572, 0.7628954981582848, 0.7625488396969623]
TEXT SNIPPET FOR: Was there any division in the sense that - a lot of the women who have spoken to me, in a lot of factories the women did the less skilled work and men got into the better jobs. That doesn’t sound like it was the case for you? No, it wasn’t like that at all. In the Ormeau Baker, women’s work was women’s work and men’s and there was no mixing because the standards were very high because they didn’t only train you specifically in your job. Discipline was very strong and manners were everything. I mean when we met our lady superintendent in any department, if I had’ve said “Good morning” she’d have turned to me and sa

[<OpenAIObject at 0x7fa2dcfbbef0> JSON: {
  "index": 0,
  "message": {
    "role": "assistant",
    "content": "Based on the 20 quotes from the interviewee, 10 thematic codes can be identified:\n\n1. Gender division in the workforce (2 quotes): The interviewee mentions that in many factories, women were assigned less skilled work while men got better jobs. However, in the Ormeau Bakery, there was no such division.\n\n2. Workforce structure (2 quotes): The interviewee describes that the chocolate room in the bakery was predominantly staffed by women, while other areas such as hotplate workers and flour loft had men working.\n\n3. Training and learning on the job (1 quote): The interviewee mentions that they were provided with training in the bakery, indicating that they acquired skills while working.\n\n4. Impact of war on marriage and personal life (2 quotes): The interviewee discusses how the war impacted the ability to get married due to rationing and job uncertainties. They waited u

In [21]:
input_sentence = "You see, there wasn’t the opportunities then there are now.  I mean when I was leaving school the president of our public elementary school came to my mother and said that they would pay half the fees if she could raise the half to allow me to go on to be a teacher. "

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


model="text-embedding-ada-002"

def string_from_arr(text):
     text = " ".join(" ".join(text).split()[:100])
     return text.replace("\n", " ")

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

# input_sentence = "Prior to the war, opportunities for women were limited to traditional roles like homemaking and low-skilled jobs. However, with the men being drafted into the military, industries were desperate for workers, and I found myself working in a factory assembling aircraft parts. This experience opened doors for me and many other women. After the war, I pursued a career in engineering, a field I never imagined entering before. The war definitely had a profound impact on our subsequent career trajectories."

input_embedding = np.array(get_embedding(input_sentence, model))


similarities = cosine_similarity([input_embedding], embeddings)
sorted_indices = np.argsort(similarities[0])[::-1]
print([similarities[0][i] for i in sorted_indices][:20])
top_20_snippets=  df_one.loc[sorted_indices].iloc[:20, :2]


question = """You will receive 20 quotes from an interviewee who participated in the research. 
Regarding the question: What were the employment patterns and opportunities for women during World War II, 
and how did these experiences impact their subsequent career trajectories?, identify 10 thematic codes which emerge 
in the 20 quotes and explain how they emerged in the quotes. Include referencing (index 0 to 19) to the quotes which relate to the thematic code."""
messages = [
        {'role': 'system', 'content': 'You are a researcher researching how world war two impacted women. You are trying to find thematic codes in your qualitative research interview data, which illustrate different themes which emerge in the research data'},
        {'role': 'user', 'content': question}
    ]
i = 0
for index, row in top_20_snippets.iterrows():
        snippet = row['TEXT_SNIPPET']
        print("TEXT SNIPPET FOR INDEX " + str(i) + ": " + snippet)
        messages.extend([{'role': 'user', 'content': snippet}])
        i += 1


completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
)
print(completion.choices)
print(completion.choices[0].message.content)

    

    
# df3.to_csv('found_cases.csv', index=False)
    

[0.7864843988494015, 0.7826415798556978, 0.7813787649539017, 0.7792155847220036, 0.7772567128890373, 0.7735163785323913, 0.7727811540080243, 0.7725788940621541, 0.7722608111979258, 0.7718947222248822, 0.7695942856147424, 0.7653621573622245, 0.7651586525033464, 0.7650677081046316, 0.7650651343069594, 0.7648907854261064, 0.7643269075225867, 0.763390138642617, 0.7632539726648887, 0.7629929642437696]
TEXT SNIPPET FOR INDEX 0: Were you ever active yourself? Well, we had to pay whatever, there was always a certain of payment you know for trade union membership but oh yes, it was all very much above board, indeed very much. Everything was right about the Ormeau Bakery; I just felt, no matter where I ever worked after that, I was always looking back. Many’s the time I used to say, dear me, you wouldn’t get them doing that in the Ormeau Bakery. I used to have to shut up at times. I wouldn’t say it made you very popular. [Both laugh] Oh yes, it was a wonderful place to work and the owner himself

[<OpenAIObject at 0x7fa2dcf420e0> JSON: {
  "index": 0,
  "message": {
    "role": "assistant",
    "content": "Based on the provided quotes, the following thematic codes emerge:\n\n1. Pride and nostalgia for Ormeau Bakery: Quotes 0, 1, 6\nThese quotes demonstrate a deep sense of pride and nostalgia for the Ormeau Bakery. The interviewee expresses admiration for the bakery's integrity, the owner's kindness, and the high standards maintained at the workplace.\n\n2. Limited employment opportunities for women: Quotes 3, 9\nThe interviewee reveals that there were limited employment opportunities for women during World War II. This is evident from her statement about being chosen for a specific job because there were no other suitable candidates.\n\n3. Early entry into workforce: Quote 4\nThe interviewee mentions leaving school at the age of fourteen and starting work at a later age of twenty-six. This highlights the early entry of women into the workforce during this time.\n\n4. Marriage r