In [1]:
%pip install openai

Note: you may need to restart the kernel to use updated packages.




In [2]:
import getpass
import os

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

_set_env("OPENAI_API_KEY")

In [3]:
import pandas as pd

df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vRc1EkKgndtFkwtypr9d6W6qzw2LIX4eRnNbJYYSIG7cjXRgsuTto4Q5HXaeFj-jAn36RYpFwDOVF_N/pub?gid=1495258861&single=true&output=csv')

# create an id which is the index + 1
df['id'] = df.index + 1
df.head()

Unnamed: 0,name,age,content,len,id
0,Michelle,13,I: Let's begin with ‚how I see myself'. Can yo...,4125,1
1,Sapan,13,I: You've just done one collage yes [how I see...,3847,2
2,Jason,13,"I: So Jason, are these two separate collages o...",8701,3
3,Malcolm,13,"I: OK Malcolm, are these two separate collages...",6486,4
4,Joe,14,I: So let's start with ‚how I see myself'. Do\...,5451,5


In [4]:
# return a batch of rows labelled as samples
def prepare_content(batch):
    return "".join([f"## Sample {row['id']}\n---\n{row['content']}\n---\n\n" for _, row in batch.iterrows()])

# display 10 samples
print(prepare_content(df.iloc[:10]))

## Sample 1
---
I: Let's begin with ‚how I see myself'. Can you just go through the pictures that you've 
used and explain what pictures you'
ve used and why you've used them? 
M: Yeah. I've used this woman [Kelly Ro
wland ] b ecau se sh e's quite fashionable and I 
see myself as a fashionable person. I' ve used a picture of a Liverpool football team 

because I see myself as quite a supportive pers
on towards the team. I put a love heart 
because I'm quite a loving person. I also 
put a laptop because I like to explore things 
and try new things. 
I: What about the pictures of Kat, Alfie [
EastEnders
 characters] and, is that Lemar? 
M: Bow Wow. 
I: Let's start with Kat and Alfie, why have you used them? 

M: Because Kat, erm Alfie's quite cheery and he makes people smile so metimes so I just 
like Alfie, and I also have a temper to so I put Kat. 
I: And what about Bow Wow? 
M: I put him because I like him (laughs). 
I: So if you were going to sum up how you see yourself, what words w

In [7]:
from openai import AsyncOpenAI as OpenAI
client = OpenAI()

In [8]:
system_prompt = """Analyze the given text samples (## Sample 1, ## Sample 2, etc) for thematic labels based on similarities and differences.
The response format is JSONL.

**Format:**
```jsonl
{"name": "label name", "samples": ["1", "2", ...]},
{"name": "another label name", "samples": ["2", "3", ...]},
{"name": "unique label name", "samples": ["1"]},
{"name": "descriptive label name", "samples": ["1", "2", ...]},
{"name": "another important label", "samples": ["2", "3", ...]},
{"name": "a interesting unique label", "samples": ["1"]},
// ... more labels as identified
```"""


async def call_openai(batch):
    try:
        documents = prepare_content(batch)
        response = await client.chat.completions.create(
            model="gpt-4o-mini", 
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": documents}
            ],
            temperature=1,
            max_tokens=1000,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Failed to get completion: {e}")
        return None

response = await call_openai(df.iloc[:10])
print(response)

```jsonl
{"name": "Self-Perception", "samples": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]},
{"name": "Family and Relationships", "samples": ["1", "2", "6", "7"]},
{"name": "Cultural Identity", "samples": ["2", "4", "6", "8"]},
{"name": "Emotional Expression", "samples": ["1", "3", "5", "7", "10"]},
{"name": "Hobbies and Interests", "samples": ["1", "2", "3", "5", "4", "7", "8"]},
{"name": "Conflict and Anger", "samples": ["3", "1", "7"]},
{"name": "Political Views", "samples": ["4", "5"]},
{"name": "Religion", "samples": ["2", "6", "8"]},
{"name": "Public Perception", "samples": ["3", "4", "5", "10"]},
{"name": "Visual Learning", "samples": ["5"]},
{"name": "Complex Family Dynamics", "samples": ["10"]},
{"name": "Fashion and Style", "samples": ["1", "2", "6", "7"]},
{"name": "Individual vs Societal Expectations", "samples": ["3", "4", "5", "10"]}
```


In [9]:
import json

def parse_labels(response):
    if response:
        # Load labels from JSON
        label_lines = response.strip().split('\n')[1:-1]

        labels_list = []
        for line in label_lines:
            label_dict = json.loads(line.replace("},", "}"))
            labels_list.append(label_dict)
    else:
        labels_list = []
    return labels_list

parse_labels(response)

[{'name': 'Self-Perception',
  'samples': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']},
 {'name': 'Family and Relationships', 'samples': ['1', '2', '6', '7']},
 {'name': 'Cultural Identity', 'samples': ['2', '4', '6', '8']},
 {'name': 'Emotional Expression', 'samples': ['1', '3', '5', '7', '10']},
 {'name': 'Hobbies and Interests',
  'samples': ['1', '2', '3', '5', '4', '7', '8']},
 {'name': 'Conflict and Anger', 'samples': ['3', '1', '7']},
 {'name': 'Political Views', 'samples': ['4', '5']},
 {'name': 'Religion', 'samples': ['2', '6', '8']},
 {'name': 'Public Perception', 'samples': ['3', '4', '5', '10']},
 {'name': 'Visual Learning', 'samples': ['5']},
 {'name': 'Complex Family Dynamics', 'samples': ['10']},
 {'name': 'Fashion and Style', 'samples': ['1', '2', '6', '7']},
 {'name': 'Individual vs Societal Expectations',
  'samples': ['3', '4', '5', '10']}]

In [10]:
import asyncio
import nest_asyncio

nest_asyncio.apply()

def split_dataframe(df, batch_size):
    # shuffle the dataframe
    df = df.sample(frac=1).reset_index(drop=True)
    return [df.iloc[i:i + batch_size] for i in range(0, len(df), batch_size)]

batches = split_dataframe(df, 10)

tasks = [asyncio.create_task(call_openai(batch)) for batch in batches]

results = await asyncio.gather(*tasks)

In [11]:
labels_list = []

for response in results:
    labels = parse_labels(response)
    for label in labels:
        labels_list.append(label['name'])

labels_list = ", ".join(labels_list)
labels_list

"Self-Perception, Others' Perception, Political Views, Hobbies and Interests, Emotional Expression, Cultural Identity, Youth and Image, Differences in Perception, Animal Representation, Celebrity Influence, Emotional Complexity, Fashion and Trends, Familial Relationships, Self-Identity Conflict, Role Models, Artistic Expression, Mixed Ethnicity Identity, Self-Identity and Expression, Perception by Others, Fashion and Personal Style, Sports Enthusiasm, Cultural and Ethnic Identity, Emotional Self-Perception, Music Influence, Role Models and Inspirations, Complex Self-Perception, Differences between Self and Others' Perception, Personal Interests and Hobbies, Interpersonal Relationships, Conflict with Self-Image, self-perception, public perception, interests and hobbies, cultural identity, use of media figures, contrasting perceptions, emotional expression, music influence, animals and nature, fashion and appearance, confusion about identity, ethnicity discussion, self-perception and ide

In [12]:
async def call_openai(batch, labels=None):
    try:
        documents = prepare_content(batch)
        if labels:
            labels_partial = "YOU MUST ONLY APPLY THE FOLLOWING LABELS:\n"
            labels_partial += labels_list
            documents = labels_partial + documents

        response = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": documents}
            ],
            temperature=1,
            max_tokens=1000,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Failed to get completion: {e}")
        return None

In [13]:
batches = split_dataframe(df, 10)

tasks = [asyncio.create_task(call_openai(batch, labels_list)) for batch in batches]

results = await asyncio.gather(*tasks)

all_labels = []
for response in results:
    labels = parse_labels(response)
    all_labels.extend(labels)

all_labels

[{'name': 'Self-Perception',
  'samples': ['33', '53', '11', '38', '25', '29', '21']},
 {'name': "Others' Perception", 'samples': ['36', '53', '25', '21']},
 {'name': 'Cultural Identity', 'samples': ['33', '53', '11', '38', '14']},
 {'name': 'Fashion and Personal Style', 'samples': ['53', '38', '25', '11']},
 {'name': 'Emotional Expression', 'samples': ['33', '53', '11', '21']},
 {'name': 'Hobbies and Interests',
  'samples': ['33', '53', '11', '38', '25', '29']},
 {'name': 'Celebrity Influence', 'samples': ['33', '21', '38', '14']},
 {'name': 'Familial Relationships', 'samples': ['53', '11']},
 {'name': 'Self-Identity Conflict', 'samples': ['53', '29', '38']},
 {'name': 'Mixed Ethnicity Identity', 'samples': ['38', '29']},
 {'name': 'Emotional Complexity', 'samples': ['53', '11', '29']},
 {'name': 'Perception by Others', 'samples': ['53', '25', '29']},
 {'name': 'Role Models', 'samples': ['11', '38', '25', '21']},
 {'name': 'Interest in Music and Pop Culture', 'samples': ['53', '25', 

In [14]:
label_data = []
for label in all_labels:
    for sample in label['samples']:
        label_data.append((int(sample), label['name']))

labels_df = pd.DataFrame(label_data, columns=['sample', 'label'])

labels_df.head()

# We use a groupby operation to concatenate tags for the same sample
labels_df = labels_df.groupby('sample')['label'].apply(lambda x: ', '.join(x)).reset_index()

# Merge this tags DataFrame with the main DataFrame
# 'df.index' should match the 'sample' identifiers in the tags DataFrame
df_merged = df.merge(labels_df, left_on='id', right_on='sample', how='left')

df_merged.rename(columns={'label': 'labels'}, inplace=True)

df_merged.head()

Unnamed: 0,name,age,content,len,id,sample,labels
0,Michelle,13,I: Let's begin with ‚how I see myself'. Can yo...,4125,1,1.0,"Self-Perception, Others' Perception, Differenc..."
1,Sapan,13,I: You've just done one collage yes [how I see...,3847,2,2.0,"Cultural Identity, Emotional Expression, Famil..."
2,Jason,13,"I: So Jason, are these two separate collages o...",8701,3,3.0,"Self-Perception, Others' Perception, Emotional..."
3,Malcolm,13,"I: OK Malcolm, are these two separate collages...",6486,4,4.0,"Self-Perception, Others' Perception, Differenc..."
4,Joe,14,I: So let's start with ‚how I see myself'. Do\...,5451,5,5.0,"Self-Perception, Others' Perception, Emotional..."


In [15]:
# one hot encode the tags
one_hot = df_merged['labels'].str.get_dummies(sep=', ')
final_df = pd.concat([df_merged, one_hot], axis=1)
final_df.head()

Unnamed: 0,name,age,content,len,id,sample,labels,Animal Representation,Artistic Expression,Celebrity Influence,...,Perception of Differing Sides of Personality,Personal Interests and Hobbies,Political Views,Role Models,Self-Identity Conflict,Self-Identity and Expression,Self-Perception,Sport Enthusiasm,Sports Enthusiasm,Youth and Image
0,Michelle,13,I: Let's begin with ‚how I see myself'. Can yo...,4125,1,1.0,"Self-Perception, Others' Perception, Differenc...",0,0,1,...,0,0,0,0,1,1,1,0,0,0
1,Sapan,13,I: You've just done one collage yes [how I see...,3847,2,2.0,"Cultural Identity, Emotional Expression, Famil...",0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,Jason,13,"I: So Jason, are these two separate collages o...",8701,3,3.0,"Self-Perception, Others' Perception, Emotional...",0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,Malcolm,13,"I: OK Malcolm, are these two separate collages...",6486,4,4.0,"Self-Perception, Others' Perception, Differenc...",0,0,1,...,0,0,0,0,1,1,1,0,1,0
4,Joe,14,I: So let's start with ‚how I see myself'. Do\...,5451,5,5.0,"Self-Perception, Others' Perception, Emotional...",0,0,1,...,0,0,0,1,0,0,1,0,0,0


In [16]:
final_df.shape

(65, 47)

In [17]:
final_df.to_csv('qualitative_analysis.csv', index=False)