# Using Embedding to Study the Latent Space of the Dating Profiles

## Data Preprocessing

In [1]:
import os
import pandas as pd
import numpy as np

# Read the cleaned profile data from the working directory
profiles_path = 'cleaned_okc.csv'
df = pd.read_csv(profiles_path)

# Show the first few rows as a quick sanity check
df.head()

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,"Favorite books, movies, show, music, and food",The six things I could never do without,I spend a lot of time thinking about,On a typical Friday night I am,The most private thing I am willing to admit,You should message me if…,merged_profile,cleaned_religion,seriousness_degree,belief
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,"books: absurdistan, the republic, of mice and ...",food. water. cell phone. shelter.,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...,about me: i would love to think that i was so...,agnosticism,4.0,agnosticism
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories. my b...,,,i am very open and will share just about anyth...,,i am a chef: this is what that means. 1. i am ...,agnosticism,2.0,agnosticism
2,29,single,m,straight,average,mostly anything,socially,,graduated from college/university,white,...,"books: to kill a mockingbird, lord of the ring...","like everyone else, i love my friends and fami...",what my contribution to the world is going to ...,out with my friends!,i cried on my first day at school because a bi...,you're awesome.,"i'm an australian living in san francisco, but...",atheism,0.0,atheism
3,31,single,f,straight,average,mostly anything,socially,never,graduated from college/university,white,...,"i like: alphabetized lists, aquariums, autobio...","friends, family, notebook/pen, books, music, t...",things that amuse and inspire me,out and about or relaxing at home with a good ...,,,"writing. meeting new people, spending time wi...",christianity,0.0,theism
4,24,single,f,straight,,strictly anything,socially,,graduated from college/university,white,...,i am always willing to try new foods and am no...,sports/my softball glove coffee. because nobod...,,"in or out... drinking with friends, maybe a ba...",potential friends/lovers/people who come in co...,http://www.youtube.com/watch?v=4dxbwzuwsxk let...,"oh goodness. at the moment i have 4 jobs, so ...",christianity,2.0,theism


In [2]:
# Keep only the rows whose seriousness_degree equals 4.0
serious_df = df[df.seriousness_degree == 4.0]

# Remove every character that is neither a word character nor whitespace from
# the column names. The pattern is written as a raw string so that \w and \s
# reach the regex engine unchanged instead of being parsed as (invalid)
# string escape sequences, which newer Python versions warn about.
serious_df.columns = serious_df.columns.str.replace(r'[^\w\s]', '', regex=True)

# Count how many of the remaining rows fall under each cleaned_religion value
serious_df.cleaned_religion.value_counts()

cleaned_religion
atheism         554
christianity    543
other           508
agnosticism     311
catholicism      93
buddhism         66
judaism          20
hinduism         14
islam            11
Name: count, dtype: int64

In [3]:
# Normalise the column names in one pass: spaces become underscores,
# then the whole name is lowercased
serious_df.columns = serious_df.columns.str.replace(' ', '_').str.lower()

# Show the first rows with the renamed columns
serious_df.head()

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,favorite_books_movies_show_music_and_food,the_six_things_i_could_never_do_without,i_spend_a_lot_of_time_thinking_about,on_a_typical_friday_night_i_am,the_most_private_thing_i_am_willing_to_admit,you_should_message_me_if,merged_profile,cleaned_religion,seriousness_degree,belief
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,"books: absurdistan, the republic, of mice and ...",food. water. cell phone. shelter.,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...,about me: i would love to think that i was so...,agnosticism,4.0,agnosticism
6,28,seeing someone,m,straight,average,mostly anything,socially,never,graduated from college/university,white,...,books = yes. avid reader. moves = eternal suns...,"guitar - even if i don't play it all the time,...",a little bit of everything. but mostly social ...,hanging out with a small group of friends--sta...,i'm picky when it comes to dating. i know what...,"if you know who you are, who you want, where y...","i was born in wisconsin, grew up in iowa, and ...",christianity,4.0,theism
46,31,single,f,bisexual,average,strictly anything,socially,,,"middle eastern, other",...,"""wings of desire"" (german film by wim wenders)...",i can do without. i value: 1. my hands. 2. ot...,"i rarely think, it's over-rated ; )",the answer to each moment must be yes.,"i am an open book, i do not like secrets. if i...",you feel compelled.,i slept and dreamt that life was joy. i awoke ...,other,4.0,other
48,35,single,m,straight,athletic,mostly anything,socially,sometimes,graduated from space camp,"native american, white",...,,"my camera, aviation, amazing food, touch, lear...","travel, food, photography, sensual fun, aviati...","often working a party. lights, sound, managing...",,you can make me laugh or turn me on. teach me ...,"i'm an adventurer first, i take calculated ris...",agnosticism,4.0,agnosticism
59,29,single,m,straight,fit,mostly anything,socially,sometimes,graduated from college/university,white,...,-books: anything joseph campbell - osho - terr...,invalid question,the world,out.,no,you're curious.,"my names josh, and i create art for a living. ...",agnosticism,4.0,agnosticism


In [4]:
# Answers of this many characters or fewer are treated as too short
# (careless) and are dropped.
MIN_RESPONSE_CHARS = 30

# Drop rows that have no 'you_should_message_me_if' answer at all
serious_df = serious_df[serious_df.you_should_message_me_if.notna()]

# Keep only answers strictly longer than MIN_RESPONSE_CHARS characters
serious_df = serious_df[serious_df.you_should_message_me_if.str.len() > MIN_RESPONSE_CHARS]

In [5]:
from numpy.random import choice

# Religions to sample from; 'other' is deliberately left out
religions = ['atheism', 'christianity', 'agnosticism', 'catholicism', 'buddhism', 'judaism', 'hinduism', 'islam']

# Upper bound on the number of profiles drawn per religion
MAX_PER_RELIGION = 15

# Fixed seed so that re-running the notebook draws the same profiles.
# Without it every run produces a different sample (and different embeddings).
SAMPLE_SEED = 42

# Maps each religion to the DataFrame of profiles sampled for it
sampled_individuals = {}

for religion in religions:
    # Rows of serious_df belonging to the current religion
    religion_data = serious_df[serious_df['cleaned_religion'] == religion]

    # Take MAX_PER_RELIGION rows, or the whole group when it is smaller
    n_samples = min(MAX_PER_RELIGION, len(religion_data))

    # Draw without replacement, reproducibly
    sampled_individuals[religion] = religion_data.sample(
        n=n_samples, replace=False, random_state=SAMPLE_SEED
    )

# Stack the per-religion samples into a single DataFrame
final_sample = pd.concat(sampled_individuals.values())

final_sample

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,favorite_books_movies_show_music_and_food,the_six_things_i_could_never_do_without,i_spend_a_lot_of_time_thinking_about,on_a_typical_friday_night_i_am,the_most_private_thing_i_am_willing_to_admit,you_should_message_me_if,merged_profile,cleaned_religion,seriousness_degree,belief
12045,43,single,f,bisexual,curvy,strictly vegetarian,rarely,,dropped out of college/university,white,...,"authors: nabokov, pratchett, hunter s. thompso...",1. books and kindle 2. camera(s) 3. iphone a...,life. atheism. reasons to live. why i love the...,"reading a book, watching a movie or playing a ...",life is very full of pain and nausea for me. i...,if you want to be my friend who i can spend ti...,june is turning out to be crazy because my new...,atheism,4.0,atheism
11967,25,seeing someone,m,bisexual,fit,mostly anything,socially,sometimes,working on ph.d program,white,...,i don't have the attention span to read books....,1. my laptop. not a day goes by when i don't u...,,possibly going up to san francisco. i wish i l...,i rode the short bus when i was in pre-school.,you feel like it. i promise i won't bite. but ...,"well, i'm in a relationship now. i'm keeping m...",atheism,4.0,atheism
38251,36,single,m,gay,average,,often,never,,white,...,"books: invisible man, native son, harry potter...",,,,,"you are younger, slim to athletic, smooth to s...","i have a tempered, reasonable way of thinking....",atheism,4.0,atheism
13700,27,available,f,bisexual,curvy,anything,socially,sometimes,dropped out of two-year college,"native american, hispanic / latin, white",...,"book: ""unlikely stories, mostly"" by alasdair g...",1. my iphone 2. mascara 3. red lipstick 4. the...,ways to better support my kids and how i can g...,"either at a burlesque show (watching, not perf...",i don't have a lot of privacy. i guess...i dun...,"you're good in the sack, can respect my marria...",i'm an adult actress who has recently started ...,atheism,4.0,atheism
5346,42,available,m,straight,overweight,strictly anything,often,sometimes,working on space camp,white,...,books: i love any novel with a complex plot. t...,i'm going to avoid the cop-out of mentioning f...,how the world works. science fascinates me. po...,on a typical kid night i'm cooking a nice dinn...,"i once caused a cert advisory. also, it's been...",we're a 99% match. i'm willing to listen if we...,i'm a 42-year-old divorced father of two boys ...,atheism,4.0,atheism
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,39,single,m,straight,athletic,strictly anything,,,graduated from space camp,white,...,"holy shit, it's data mining! fuck you, okc! m...",water. a nitrox atmosphere. cerebral cortex. m...,why riot grrls went extinct so quickly.,wishing i had someone as passionate and decade...,"my 92-year-old grandma better like you, becaus...",you've never attended: a) sacred heart cathedr...,"""essay?"" fat fucking chance. playwright. phot...",islam,4.0,theism
29225,27,single,m,straight,overweight,mostly vegetarian,not at all,never,graduated from masters program,asian,...,books the dummy line enders game 3 mistakes o...,,"about my family, friends and how can i make a ...",,i got enrolled on this app :-),if you wanna have a good friend to have some g...,books the dummy line enders game 3 mistak...,islam,4.0,theism
21604,33,single,m,straight,thin,mostly anything,socially,never,graduated from masters program,middle eastern,...,i'm a typical guy when it comes to movies: sca...,1) family and friends 2) music 3) passion (yes...,new business opportunities,out with friends and sadly enough working...bu...,"i smoke....you may not think this is private, ...",you want to get to know me better....talk to y...,ambitious and passionate about life. i like to...,islam,4.0,theism
5473,21,single,m,straight,average,mostly other,socially,often,graduated from high school,"middle eastern, black, white",...,ihave a broad view.big taste.iloike all type o...,computer. (in terms of career).weed. music.cho...,life. simple things. in and outside the box. t...,in da hood trappin or at a party/kickbakk.or m...,my tounge kan work wonders. ;-),"your kinky,wett, and ready. dnt be shy ilike t...",i'm creative.i'm a picses.i'm from east oaklan...,islam,4.0,theism


In [6]:
# Tally the sampled profiles per religion
final_sample['cleaned_religion'].value_counts()

cleaned_religion
atheism         15
christianity    15
agnosticism     15
catholicism     15
buddhism        15
judaism         14
hinduism        10
islam            6
Name: count, dtype: int64

## Apply embedding to the dating profiles

In [7]:
# Look up the OPENAI_API_KEY environment variable (None when it is not set)
api_key = os.environ.get('OPENAI_API_KEY')

In [8]:
from openai import OpenAI
# Client object used for every embedding request below
client = OpenAI()


def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding of ``text`` as produced by ``model``.

    Newline characters in ``text`` are replaced with spaces before the
    request is sent. The returned value is the ``embedding`` attribute of
    the first item in the response's ``data``.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

In [9]:
# Embed each sampled 'you_should_message_me_if' answer (one get_embedding call
# per row) and store the results in a new 'embedding' column
message_texts = final_sample['you_should_message_me_if']
final_sample['embedding'] = message_texts.apply(get_embedding)

In [10]:
# Write the sampled profiles, including the embedding column, to CSV
# without the row index
sample_output_path = 'final_sample.csv'
final_sample.to_csv(sample_output_path, index=False)

## Embed a list of desirable traits

In [12]:
# Desirable partner traits to embed; the order here is the row order of the
# trait table built from this list.
trait_list = [
    'Confidence', 'Kindness and Compassion', 'Sense of Humor',
    'Intelligence and Education', 'Emotional Stability',
    'Physical Attractiveness', 'Ambition and Passion',
    'Compatibility and Shared Values', 'Communication Skills',
    'Generosity and Charity',
    'Spiritual Commitment and Shared Religious Practices',
    'Moral Integrity and Respect for Tradition',
    'Family Values', 'Patience and Forgiveness', 'Humility and Modesty',
    'Openness to Growth', 'Purity and Chastity',
    'High Social Status and Wealth',
    'Masculinity', 'Femininity',
]

In [13]:
# Request one embedding per trait, keeping the order of trait_list
trait_embeddings = list(map(get_embedding, trait_list))

In [14]:
# Pair every trait with its embedding: one row per trait,
# columns 'trait' and 'embedding'
trait_columns = {'trait': trait_list, 'embedding': trait_embeddings}
trait_df = pd.DataFrame(trait_columns)

In [15]:
# Write the trait table to CSV without the row index
trait_output_path = 'trait_df.csv'
trait_df.to_csv(trait_output_path, index=False)