In [1]:
# !pip install openai  --upgrade --quiet

# Initiate

In [66]:
import time
import os
import openai
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy.spatial.distance import cosine, euclidean

openai.api_key = os.environ['OPENAI_API_KEY']
np.set_printoptions(precision=3, suppress=True)

REFRESH = False

def get_embedding(text, sleep=0):
    '''Compute GPT3-embedding for a given text string'''
    time.sleep(sleep)
    result = openai.Embedding.create(input=text, model='text-embedding-ada-002')
    return np.array(result.data[0].embedding)

In [61]:
result = openai.Embedding.create(input='test message', model='text-embedding-ada-002')

In [46]:
people = pd.read_csv('./data/people.csv')
messages = pd.read_csv('./data/messages.csv', sep='John_Cena')
images = pd.read_csv('./data/images.csv', sep='John_Cena')

  messages = pd.read_csv('./data/messages.csv', sep='John_Cena')
  images = pd.read_csv('./data/images.csv', sep='John_Cena')


In [4]:
df_people = pd.DataFrame([{j[0]:j[1] for j in [i.split(': ') for i in person.replace('\r', '').split('\n')]} for person in list(people.items())[0][1]])

In [6]:
# @title adequate notation
# people_list = []
# for person in list(people.items())[0][1]:
#     person = person.replace('\r', '')
#     person_list = [i.split(': ') for i in person.split('\n')]
#     person_dict = {j[0]: j[1] for j in person_list}
#     people_list.append(person_dict)
# df_people = pd.DataFrame(people_list)

In [5]:
df_people['Likes'] = df_people['Likes'].str.split(', ')
df_people['Dislikes'] = df_people['Dislikes'].str.split(', ')

In [6]:
def glue(row):
    replacement_dict = {'Male': 'man', 'Female': 'woman'}
    row['Gender'] = replacement_dict.get(row['Gender'], 'John Cena')
    return ' '.join(row[['Lifestyle', 'Age', 'Race', 'Gender']].values).lower()

In [7]:
df_people['Description'] = df_people.apply(lambda row: glue(row), axis='columns')

In [8]:
df_people['ID'] = range(len(df_people))

In [9]:
df_people

Unnamed: 0,Gender,Age,Race,Lifestyle,Likes,Dislikes,Description,ID
0,man,Young Adult,African American,Moderate,"[Sports, Outdoors, Learning, Exercise]","[Homemaking, Design, Relaxing, Arts and Crafts]",moderate young adult african american man,0
1,woman,Adult,Asian,Active,"[Outdoors, Exercise, Sports, Homemaking]","[Arts and Crafts, Design, Games, Relaxing]",active adult asian woman,1
2,man,Senior,Hispanic,Sedentary,"[Relaxing, Homemaking, Indoors, Arts and Crafts]","[Outdoors, Sports, Exercise, Design]",sedentary senior hispanic man,2
3,woman,Young Adult,Asian,Moderate,"[Learning, Design, Arts and Crafts, Games]","[Sports, Outdoors, Homemaking, Relaxing]",moderate young adult asian woman,3
4,man,Adult,White,Active,"[Sports, Outdoors, Exercise, Games]","[Homemaking, Design, Arts and Crafts, Relaxing]",active adult white man,4
5,woman,Middle-aged,Hispanic,Sedentary,"[Relaxing, Homemaking, Indoors, Arts and Crafts]","[Outdoors, Sports, Exercise, Design]",sedentary middle-aged hispanic woman,5
6,man,Senior,White,Moderate,"[Learning, Design, Arts and Crafts, Relaxing]","[Sports, Outdoors, Homemaking, Exercise]",moderate senior white man,6
7,woman,Young Adult,Asian,Active,"[Sports, Outdoors, Exercise, Homemaking]","[Arts and Crafts, Design, Games, Relaxing]",active young adult asian woman,7
8,man,Adult,African American,Sedentary,"[Relaxing, Homemaking, Indoors, Arts and Crafts]","[Outdoors, Sports, Exercise, Design]",sedentary adult african american man,8
9,woman,Middle-aged,Asian,Moderate,"[Learning, Design, Arts and Crafts, Games]","[Sports, Outdoors, Homemaking, Relaxing]",moderate middle-aged asian woman,9


compose the dictionary with embeddings of all the likes and dislikes

likes = set(np.concatenate(df_people.Likes))
dislikes = set(np.concatenate(df_people.Dislikes))

embeddings_dict = {}
for i in likes | dislikes:
    time.sleep(2)
    embeddings_dict[i] = get_embedding(i)

descriptions_embeddings_dict = {}
for i in df_people.Description:
    time.sleep(2)
    descriptions_embeddings_dict[i] = get_embedding(i)

descriptions_embeddings_dict

In [12]:
def get_df_likes_dislikes(
        attr,
        df_people=df_people,
        refresh=REFRESH
        ):
    if not refresh:
        raise Exception('API call aborted. Were you sure? If yes, change REFRESH to True.')
    df = pd.DataFrame()
    for i in df_people[['ID', 'Description', attr]].values:
        for like in i[2]:
            df = pd.concat(
                [
                    df,
                    pd.DataFrame(
                        [
                            i[0],
                            like.lower(),
                            f'{i[1]} who likes {like.lower()}'
                        ]
                    ).T
                ],
                axis='rows'
            )
    df.reset_index(inplace=True, drop=True)
    df.columns=['ID', 'Activity', 'Description']
    df['Embedding'] = df.Description.apply(lambda x: get_embedding(x))
    return df

In [13]:
def add_activity(df):
    df['Activity'] = np.stack(df.Description.str.split('who likes '))[:, 1]

In [16]:
def normalize(x, overall_min, overall_max):  # currently from -1 to 1 due to negative vectors (we center at 0)
    return 2 * (x - overall_min) / (overall_max - overall_min) - 1

def add_deviation(df):
    # first we find mean vector for an ID
    # only then we find the mean for likes/dislikes
    # thus we prevent individuals having too much likes/dislikes
    # from solely deciding the mean
    mean_vector = df\
    .groupby('ID')\
    .Embedding\
    .apply(lambda x: np.mean(np.stack(x), axis=0))\
    .mean()
    deviation = df.Embedding.apply(lambda x: x - mean_vector)
    df['Deviation'] = deviation
    overall = np.concatenate(df.Deviation)
    overall_min = np.min(overall)
    overall_max = np.max(overall)
    del overall
    norm_dev = df.Deviation.apply(lambda x: normalize(x, overall_min, overall_max))
    df['Norm_Dev'] = norm_dev

In [17]:
preferences_dfs = {}
for likes_dislikes in ['likes', 'dislikes']:
    if REFRESH:
        preferences_dfs[likes_dislikes] = get_df_likes_dislikes(
            likes_dislikes.capitalize(), df_people=df_people
            )
        preferences_dfs[likes_dislikes].to_csv(f'data/{likes_dislikes}.csv', index=False)
        np.save(f'data/{likes_dislikes}', preferences_dfs[likes_dislikes].Embedding)
    else:
        preferences_dfs[likes_dislikes] = pd.read_csv(f'data/{likes_dislikes}.csv')
        preferences_dfs[likes_dislikes].Embedding = np.load(
            f'data/{likes_dislikes}.npy', allow_pickle=True
            )
    add_activity(preferences_dfs[likes_dislikes])
    add_deviation(preferences_dfs[likes_dislikes])

In [72]:
content_dfs = {
    'messages': messages,
    'images': images,
}

In [73]:
for content_type in ['messages', 'images']:
    df = content_dfs[content_type]
    df['Content_ID'] = range(len(df))

    if REFRESH:
        df['Embedding'] = df['content'].apply(get_embedding)
        df.to_csv(f'data/{content_type}_embeddings.csv', index=False)
        np.save(f'data/{content_type}', df.Embedding)
    else:
        df['Embedding'] = np.load(f'data/{content_type}.npy', allow_pickle=True)
    # offset each group by it's center
    emb_center = df['Embedding'].mean()    
    deviation = df.Embedding.apply(lambda emb: emb - emb_center)
    df['Deviation'] = deviation
    
    overall = np.concatenate(df.Deviation)
    overall_min = np.min(overall)
    overall_max = np.max(overall)
    del overall
    norm_dev = df.Deviation.apply(lambda x: normalize(x, overall_min, overall_max))
    df['Norm_Dev'] = norm_dev

In [22]:
euclidean([-1, -1], [1, 1])/(2**0.5)

2.0

In [23]:
cosine([-1, -1], [1, 1])

2.0

In [25]:
def get_cosine(vec_1, vec_2, epsilon=10**-6):
    """
    cosine() is so bad, that it's incapable
    of evaluating [0, 0] to [1, 1] distance
    due to zero division.
    Thus we manually bring it to live here
    """
    vec_1 = [i if i else epsilon for i in vec_1]
    vec_2 = [i if i else epsilon for i in vec_2]
    return cosine(vec_1, vec_2)

In [26]:
def get_distance(vec_1, vec_2):
    """
    intended to work with -1 to 1 normalized vectors
    combines two approaches so that negatives of cosine similarity
    are smoothened as well as the negatives of euclidean alone
    """
    euc = euclidean(vec_1, vec_2)
    cos = get_cosine(vec_1, vec_2)
    # let's scale both distances to [0, 1]
    # since cosine() lies between 0 and 2, halve it    
    cos /= 2
    # now normalize the euclidean distance output
    # larges distances in our case are beteween
    # [-1, -1, ... , -1] and [1, 1, ... , 1]
    # and the value depends on n dimensions
    # so let's consider it
    ones = np.ones(len(vec_1))
    laregest_euc = euclidean(- ones, ones)
    euc /= laregest_euc    
    dist = euc/2 + euc*cos + cos/2
    # scale dist to [0, 1]
    dist /= 2
    # return euc, cos, dist  # an option for testing the function
    return dist

In [80]:
users = set(df_people.ID)
users

In [82]:
user = 0
user_vec = preferences_dfs['likes'].Norm_Dev[user]
user_activity = preferences_dfs['likes'].Activity[user]
user_activity, user_vec

('sports', array([-0.215, -0.147,  0.222, ..., -0.104, -0.1  ,  0.373]))

In [78]:
message_distances = content_dfs['messages'].Norm_Dev.apply(lambda x: get_distance(x, user_vec))
2 * (message_distances - message_distances.min()) / (message_distances.max() - message_distances.min()) - 1

0    -0.578270
1    -0.254970
2     0.654209
3    -0.031063
4    -0.232658
5    -0.836242
6    -0.609635
7    -1.000000
8    -0.682812
9    -0.818118
10   -0.072274
11   -0.346504
12    0.403642
13   -0.131215
14    0.416898
15    0.329650
16   -0.291280
17   -0.448340
18   -0.235248
19   -0.181300
20    0.233747
21    0.288873
22    0.702827
23    0.225809
24   -0.278210
25    0.895140
26    0.907253
27    0.463770
28    0.169184
29   -0.027674
30    0.862179
31    0.230272
32    0.589889
33    0.234934
34    0.386357
35    0.508308
36    0.315224
37    1.000000
38    0.591318
39    0.640380
Name: Norm_Dev, dtype: float64

In [79]:
normalize(
    message_distances,
    message_distances.min(),
    message_distances.max()
)

0    -0.578270
1    -0.254970
2     0.654209
3    -0.031063
4    -0.232658
5    -0.836242
6    -0.609635
7    -1.000000
8    -0.682812
9    -0.818118
10   -0.072274
11   -0.346504
12    0.403642
13   -0.131215
14    0.416898
15    0.329650
16   -0.291280
17   -0.448340
18   -0.235248
19   -0.181300
20    0.233747
21    0.288873
22    0.702827
23    0.225809
24   -0.278210
25    0.895140
26    0.907253
27    0.463770
28    0.169184
29   -0.027674
30    0.862179
31    0.230272
32    0.589889
33    0.234934
34    0.386357
35    0.508308
36    0.315224
37    1.000000
38    0.591318
39    0.640380
Name: Norm_Dev, dtype: float64

In [143]:
user = 10
count = 0
for entry in preferences_dfs['likes'].query(f'ID == {user}').iterrows():
    entry = entry[1]
    if not count:
        count += 1
        print(f'User ID: {user}')
        print(entry.Description.split('who likes')[0])
    message_distances = messages.Norm_Dev.apply(lambda x: get_distance(x, entry.Norm_Dev))
    message_distances_normalized = normalize(
        message_distances,
        message_distances.min(),
        message_distances.max()
    )
    print()
    print(entry.Activity)
    best_fit = message_distances_normalized.argmin()
    worst_fit = message_distances_normalized.argmax()
    print(f'{best_fit=}', messages.loc[best_fit].content)
    print(f'{worst_fit=}', messages.loc[worst_fit].content)


User ID: 10
active senior african american man 

sports
best_fit=9 Grab your clubs and hit the green! Golfing is a fun way to enjoy the outdoors, socialize with friends, and improve your hand-eye coordination and focus. Fore!
worst_fit=37 Put down the phone and pick up some decor! Home decorating is a fun and creative way to express yourself and transform your living space into a cozy and inviting sanctuary. Plus, you'll feel proud of your beautiful and personalized home!

outdoors
best_fit=1 Take a break from your screen and hit the trails! Hiking is a great way to explore nature, get some exercise, and clear your mind. Don't forget to bring some snacks and water!
worst_fit=37 Put down the phone and pick up some decor! Home decorating is a fun and creative way to express yourself and transform your living space into a cozy and inviting sanctuary. Plus, you'll feel proud of your beautiful and personalized home!

exercise
best_fit=4 Step away from your screen and take a walk outside. Wa

In [147]:
entry

ID                                                            10
Description    active senior african american man who likes r...
Embedding      [-0.004802075680345297, -0.013270577415823936,...
Activity                                                relaxing
Deviation      [0.0022611532240262022, -0.008505503841661266,...
Norm_Dev       [0.07308562840649313, -0.17912949317732452, 0....
Name: 43, dtype: object

In [207]:
likes_weight = 1
dislikes_weight = 1
preference_distances_by_user = {}
preference_distances_df = pd.DataFrame()
for user in users:
    preference_distances_by_user[user] = {}
    for content_type in ['messages', 'images']:
        preference_distances_by_user[user][content_type] = {}
        for content_entry in content_dfs[content_type].iterrows():
            content_entry = content_entry[1]
            preference_distances_by_user[user][content_type][content_entry.Content_ID] = {}
            for likes_dislikes in ['likes', 'dislikes']:
                preference_distances_by_user[user][content_type][content_entry.Content_ID][likes_dislikes] = []
                for user_pref_entry in preferences_dfs[likes_dislikes].query(f'ID == {user}').iterrows():
                    user_pref_entry = user_pref_entry[1]
                    preference_distance = get_distance(content_entry.Norm_Dev, user_pref_entry.Norm_Dev)
                    preference_distances_by_user[user][content_type][content_entry.Content_ID][likes_dislikes].append(preference_distance)
                    print(f'{user=} {content_type} Content_ID={content_entry.Content_ID} {likes_dislikes} {preference_distance}')
                    distance_df_entry = {
                        'User_ID': user,
                        'Content_Type': content_type,
                        'Content_ID': content_entry.Content_ID,
                        'Attitude': likes_dislikes,
                        'Activity': user_pref_entry.Activity,
                        'Preference_Distance': preference_distance
                    }
                    preference_distances_df = pd.concat([preference_distances_df, pd.DataFrame(distance_df_entry, index=[0])])
            mean_like_distance = np.mean(preference_distances_by_user[user][content_type][content_entry.Content_ID]['likes'])
            min_dislike_distance =  np.min(preference_distances_by_user[user][content_type][content_entry.Content_ID]['dislikes'])
            preference_distances_by_user[user][content_type][content_entry.Content_ID]['Preference_Distance'] = mean_like_distance - min_dislike_distance
preference_distances_df.reset_index(drop=True, inplace=True)

user=0 messages Content_ID=0 likes 0.19514712389025118
user=0 messages Content_ID=0 likes 0.20915028169111896
user=0 messages Content_ID=0 likes 0.21005833395742193
user=0 messages Content_ID=0 likes 0.18381935756895096
user=0 messages Content_ID=0 dislikes 0.20690674239122583
user=0 messages Content_ID=0 dislikes 0.20602441459965265
user=0 messages Content_ID=0 dislikes 0.18939520869232745
user=0 messages Content_ID=0 dislikes 0.19837772429606754
user=0 messages Content_ID=1 likes 0.20307409336204768
user=0 messages Content_ID=1 likes 0.15655348676665423
user=0 messages Content_ID=1 likes 0.23026654544954359
user=0 messages Content_ID=1 likes 0.18942463047932834
user=0 messages Content_ID=1 dislikes 0.2253180073671604
user=0 messages Content_ID=1 dislikes 0.22341633572015546
user=0 messages Content_ID=1 dislikes 0.1899152612937533
user=0 messages Content_ID=1 dislikes 0.224277985248699
user=0 messages Content_ID=2 likes 0.2253661586607394
user=0 messages Content_ID=2 likes 0.222444827

In [270]:
def print_user_results(
        user: int,
        content_type: str,
        n_items: int = 5
    ):
    """
    user (int): User_ID

    content_type (str): either 'messages' or 'images'

    n_items: top N messages/images,
    set 1 to show the best/worst options,
    default = 5 (for number of fingers in human to count with)
    """
    print(people.loc[user].content)
    dist_data = preference_distances_by_user[user][content_type].items()
    pref_dist_list = [(content_id, i['Preference_Distance']) for (content_id, i) in dist_data]
    sorted_content_ids = sorted(
        pref_dist_list,
        key=lambda x: x[1]
        )

    print(f'\nTop {n_items} {content_type}:')
    for n, (content_id, pref_dist) in enumerate(sorted_content_ids[:n_items]):
        content = content_dfs[content_type]\
            .query(f'Content_ID == {content_id}')\
                .content\
                    .values[0]
        print(f'{n+1}. {content}')

    print(f'\nBottom {n_items} {content_type}:')
    for n, (content_id, pref_dist) in enumerate(sorted_content_ids[-n_items:][::-1]):
        content = content_dfs[content_type]\
            .query(f'Content_ID == {content_id}')\
                .content\
                    .values[0]
        print(f'{n+1}. {content}')

In [277]:
print_user_results(
    user=0,
    content_type='messages',
    n_items=5
    )

Gender: Male
Age: Young Adult
Race: African American
Lifestyle: Moderate
Likes: Sports, Outdoors, Learning, Exercise
Dislikes: Homemaking, Design, Relaxing, Arts and Crafts

Top 5 messages:
1. Hop on your bike and feel the wind in your hair! Biking is a fun and eco-friendly way to explore your surroundings, get some exercise, and enjoy the outdoors. Don't forget your helmet!
2. Grab your clubs and hit the green! Golfing is a fun way to enjoy the outdoors, socialize with friends, and improve your hand-eye coordination and focus. Fore!
3. Pick up some weights and feel the burn! Weightlifting is a great way to build muscle, boost your metabolism, and feel more confident in your body. You got this!
4. Put on your running shoes and hit the pavement! Jogging is a great way to improve your cardiovascular health, burn calories, and reduce stress. Start slow and steady, and work your way up!
5. Strap on your roller skates and roll into some fun! Roller skating is a great way to improve your bal

In [282]:
print_user_results(
    user=0,
    content_type='images',
    n_items=5
    )

Gender: Male
Age: Young Adult
Race: African American
Lifestyle: Moderate
Likes: Sports, Outdoors, Learning, Exercise
Dislikes: Homemaking, Design, Relaxing, Arts and Crafts

Top 5 images:
1. Senior white man wearing a coat and holding a book while walking outdoors.
2. Young adult African American man walking his dog in the park.
3. Senior white woman biking in the park and looking towards the sky.
4. Young adult African American woman looking at the newspaper.
5. Senior hispanic man playing a game of cards in the park.

Bottom 5 images:
1. Young adult white woman dancing and cleaning the house.
2. Woman placing a cutting board into the kitchen cupboard.
3. Adult white woman rolling dough in the kitchen and smiling.
4. Person knitting at home next to a basket of yarn.
5. Adult white woman sitting at a desk and painting.


In [258]:
# # generating the best recommendation for each user via grouped df of distances

# mean_like_dists = preference_distances_df.query("Attitude == 'likes'")\
#     .drop(columns=['Activity', 'Attitude'])\
#         .groupby(
#             [
#                 'User_ID',
#                 'Content_Type',
#                 'Content_ID'
#             ]
#         ).mean()['Preference_Distance']
# min_dislike_dists = preference_distances_df.query("Attitude == 'dislikes'")\
#     .drop(columns=['Activity', 'Attitude'])\
#         .groupby(
#             [
#                 'User_ID',
#                 'Content_Type',
#                 'Content_ID'
#             ]
#         ).min()['Preference_Distance']
# grouped = preference_distances_df\
#     .drop(columns=['Activity', 'Attitude'])\
#         .groupby(
#             [
#                 'User_ID',
#                 'Content_Type',
#                 'Content_ID'
#             ]
#         ).count()
# grouped['Preference_Distance'] = mean_like_dists - min_dislike_dists
# min_dist_indexes = grouped.groupby(['User_ID', 'Content_Type'])['Preference_Distance'].idxmin()
# grouped.loc[min_dist_indexes]

In [None]:
1/0