In [1]:
# !pip install openai  --upgrade --quiet

# Initiate

In [2]:
import time
import os
import openai
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy.spatial.distance import cosine, euclidean

openai.api_key = os.environ['OPENAI_API_KEY']
np.set_printoptions(precision=3, suppress=True)

REFRESH = False

def get_embedding(text, sleep=0):
    '''Compute GPT3-embedding for a given text string'''
    time.sleep(sleep)
    result = openai.Embedding.create(input=text, model='text-embedding-ada-002')
    return np.array(result.data[0].embedding)

In [3]:
people = pd.read_csv('./data/people.csv')
messages = pd.read_csv('./data/messages.csv')
images = pd.read_csv('./data/images.csv')

In [4]:
people

Unnamed: 0,content
0,Gender: Male\r\nAge: Young Adult\r\nRace: Afri...
1,Gender: Female\r\nAge: Adult\r\nRace: Asian\r\...
2,Gender: Male\r\nAge: Senior\r\nRace: Hispanic\...
3,Gender: Female\r\nAge: Young Adult\r\nRace: As...
4,Gender: Male\r\nAge: Adult\r\nRace: White\r\nL...
5,Gender: Female\r\nAge: Middle-aged\r\nRace: Hi...
6,Gender: Male\r\nAge: Senior\r\nRace: White\r\n...
7,Gender: Female\r\nAge: Young Adult\r\nRace: As...
8,Gender: Male\r\nAge: Adult\r\nRace: African Am...
9,Gender: Female\r\nAge: Middle-aged\r\nRace: As...


In [5]:
df_people = pd.DataFrame([{j[0]:j[1] for j in [i.split(': ') for i in person.replace('\r', '').split('\n')]} for person in list(people.items())[0][1]])

In [6]:
# @title adequate notation
# people_list = []
# for person in list(people.items())[0][1]:
#     person = person.replace('\r', '')
#     person_list = [i.split(': ') for i in person.split('\n')]
#     person_dict = {j[0]: j[1] for j in person_list}
#     people_list.append(person_dict)
# df_people = pd.DataFrame(people_list)

In [7]:
df_people['Likes'] = df_people['Likes'].str.split(', ')
df_people['Dislikes'] = df_people['Dislikes'].str.split(', ')

In [8]:
def glue(row):
    replacement_dict = {'Male': 'man', 'Female': 'woman'}
    row['Gender'] = replacement_dict.get(row['Gender'], 'John Cena')
    return ' '.join(row[['Lifestyle', 'Age', 'Race', 'Gender']].values).lower()

In [9]:
df_people['Description'] = df_people.apply(lambda row: glue(row), axis='columns')

In [10]:
df_people['ID'] = range(len(df_people))

In [11]:
df_people

Unnamed: 0,Gender,Age,Race,Lifestyle,Likes,Dislikes,Description,ID
0,man,Young Adult,African American,Moderate,"[Sports, Outdoors, Learning, Exercise]","[Homemaking, Design, Relaxing, Arts and Crafts]",moderate young adult african american man,0
1,woman,Adult,Asian,Active,"[Outdoors, Exercise, Sports, Homemaking]","[Arts and Crafts, Design, Games, Relaxing]",active adult asian woman,1
2,man,Senior,Hispanic,Sedentary,"[Relaxing, Homemaking, Indoors, Arts and Crafts]","[Outdoors, Sports, Exercise, Design]",sedentary senior hispanic man,2
3,woman,Young Adult,Asian,Moderate,"[Learning, Design, Arts and Crafts, Games]","[Sports, Outdoors, Homemaking, Relaxing]",moderate young adult asian woman,3
4,man,Adult,White,Active,"[Sports, Outdoors, Exercise, Games]","[Homemaking, Design, Arts and Crafts, Relaxing]",active adult white man,4
5,woman,Middle-aged,Hispanic,Sedentary,"[Relaxing, Homemaking, Indoors, Arts and Crafts]","[Outdoors, Sports, Exercise, Design]",sedentary middle-aged hispanic woman,5
6,man,Senior,White,Moderate,"[Learning, Design, Arts and Crafts, Relaxing]","[Sports, Outdoors, Homemaking, Exercise]",moderate senior white man,6
7,woman,Young Adult,Asian,Active,"[Sports, Outdoors, Exercise, Homemaking]","[Arts and Crafts, Design, Games, Relaxing]",active young adult asian woman,7
8,man,Adult,African American,Sedentary,"[Relaxing, Homemaking, Indoors, Arts and Crafts]","[Outdoors, Sports, Exercise, Design]",sedentary adult african american man,8
9,woman,Middle-aged,Asian,Moderate,"[Learning, Design, Arts and Crafts, Games]","[Sports, Outdoors, Homemaking, Relaxing]",moderate middle-aged asian woman,9


compose the dictionary with embeddings of all the likes and dislikes

likes = set(np.concatenate(df_people.Likes))
dislikes = set(np.concatenate(df_people.Dislikes))

embeddings_dict = {}
for i in likes | dislikes:
    time.sleep(2)
    embeddings_dict[i] = get_embedding(i)

descriptions_embeddings_dict = {}
for i in df_people.Description:
    time.sleep(2)
    descriptions_embeddings_dict[i] = get_embedding(i)

descriptions_embeddings_dict

In [14]:
def get_df_likes_dislikes(attr, df_people=df_people):
    if not REFRESH:
        raise Exception('API call aborted. Were you sure? If yes, change REFRESH to True.')
    df = pd.DataFrame()
    for i in df_people[['ID', 'Description', attr]].values:
        for like in i[2]:
            df = pd.concat(
                [
                    df,
                    pd.DataFrame(
                        [
                            i[0],
                            f'{i[1]} who likes {like.lower()}'
                        ]
                    ).T
                ],
                axis='rows'
            )
    df.reset_index(inplace=True, drop=True)
    df.columns=['ID', 'Description']
    df['Embedding'] = df.Description.apply(lambda x: get_embedding(x))
    return df

In [15]:
if REFRESH:
    df_likes = get_df_likes_dislikes('Likes', df_people=df_people)
    df_dislikes = get_df_likes_dislikes('Dislikes', df_people=df_people)
    df_likes.to_csv('data/likes.csv', index=False)
    df_dislikes.to_csv('data/dislikes.csv', index=False)
    np.save('data/likes', df_likes.Embedding)
    np.save('data/dislikes', df_dislikes.Embedding)
else:
    df_likes = pd.read_csv('data/likes.csv')
    df_likes.Embedding = np.load('data/likes.npy', allow_pickle=True)
    df_dislikes = pd.read_csv('data/dislikes.csv')
    df_dislikes.Embedding = np.load('data/dislikes.npy', allow_pickle=True)

In [42]:
def normalize(x, overall_min, overall_max):  # currently from -1 to 1 due to negative vectors (we center at 0)
    return 2 * (x - overall_min) / (overall_max - overall_min) - 1

def add_deviation(df):
    # first we find mean vector for an ID
    # only then we find the mean for likes/dislikes
    # thus we prevent individuals having too much likes/dislikes
    # from solely deciding the mean
    mean_vector = df\
    .groupby('ID')\
    .Embedding\
    .apply(lambda x: np.mean(np.stack(x), axis=0))\
    .mean()
    deviation = df.Embedding.apply(lambda x: x - mean_vector)
    df['Deviation'] = deviation
    overall = np.concatenate(df.Deviation)
    overall_min = np.min(overall)
    overall_max = np.max(overall)
    del overall
    norm_dev = df.Deviation.apply(lambda x: normalize(x, overall_min, overall_max))
    df['Norm_Dev'] = norm_dev

In [43]:
add_deviation(df_likes)
add_deviation(df_dislikes)

In [50]:
content_dfs = {
    'messages': messages,
    'images': images,
}

In [46]:
for content_type in ['messages', 'images']:
    df = content_dfs[content_type]
    df['Content_ID'] = range(len(df))

    if REFRESH:
        df['Embedding'] = df['content'].apply(get_embedding)
    else:
        df['Embedding'] = np.load(f'data/{content_type}')
    # offset each group by it's center
    emb_center = df['Embedding'].mean()    
    deviation = df.Embedding.apply(lambda emb: emb - emb_center)
    df['Deviation'] = deviation
    
    overall = np.concatenate(df.Deviation)
    overall_min = np.min(overall)
    overall_max = np.max(overall)
    del overall
    norm_dev = df.Deviation.apply(lambda x: normalize(x, overall_min, overall_max))
    df['Norm_Dev'] = norm_dev

In [49]:
messages.to_csv('data/messages_embeddings.csv')
images.to_csv('data/images_embeddings.csv')
np.save('data/messages', messages.Embedding)
np.save('data/images', images.Embedding)

In [None]:
euclidean([-1, -1], [1, 1])/(2**0.5)

2.0

In [None]:
cosine([-1, -1], [1, 1])

2.0

In [None]:
def get_cosine(vec_1, vec_2, epsilon=10**-6):
    """
    cosine() is so bad, that it's incapable
    of evaluating [0, 0] to [1, 1] distance
    due to zero division.
    Thus we manually bring it to live here
    """
    vec_1 = [i if i else epsilon for i in vec_1]
    vec_2 = [i if i else epsilon for i in vec_2]
    return cosine(vec_1, vec_2)

In [None]:
def get_distance(vec_1, vec_2):
    """
    intended to work with -1 to 1 normalized vectors
    combines two approaches so that negatives of cosine similarity
    are smoothened as well as the negatives of euclidean alone
    """
    euc = euclidean(vec_1, vec_2)
    cos = get_cosine(vec_1, vec_2)
    # let's scale both distances to [0, 1]
    # since cosine() lies between 0 and 2, halve it    
    cos /= 2
    # now normalize the euclidean distance output
    # larges distances in our case are beteween
    # [-1, -1, ... , -1] and [1, 1, ... , 1]
    # and the value depends on n dimensions
    # so let's consider it
    ones = np.ones(len(vec_1))
    laregest_euc = euclidean(- ones, ones)
    euc /= laregest_euc    
    dist = euc/2 + euc*cos + cos/2
    # scale dist to [0, 1]
    dist /= 2
    # return euc, cos, dist  # an option for testing the function
    return dist

In [None]:
user_vec = df_likes.Norm_Dev[0]
user_vec

array([-0.15990006, -0.48826592, -0.47252663, ..., -0.46114113,
       -0.11936103, -0.10757138])

In [None]:
message_distances = messages.Norm_Dev.apply(lambda x: get_distance(x, user_vec))
2 * (message_distances - message_distances.min()) / (message_distances.max() - message_distances.min()) - 1

0   -1.000000
1    0.336279
2    0.950451
3    0.752838
4    0.662796
5    0.738977
6    1.000000
7    0.644908
Name: Norm_Dev, dtype: float64

In [None]:
normalize(
    message_distances,
    message_distances.min(),
    message_distances.max()
)

0   -1.000000
1    0.336279
2    0.950451
3    0.752838
4    0.662796
5    0.738977
6    1.000000
7    0.644908
Name: Norm_Dev, dtype: float64

In [None]:
users = set(df_people.ID)

In [None]:
users

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

In [None]:
user = 0

In [None]:
for dislike in df_dislikes[df_dislikes.ID == user].Norm_Dev:
    message_distances = messages.Norm_Dev.apply(lambda x: get_distance(x, dislike))
    print()
    print(message_distances)


0    0.268303
1    0.277241
2    0.274582
3    0.269100
4    0.281510
5    0.273693
6    0.271558
7    0.262516
Name: Norm_Dev, dtype: float64

0    0.274327
1    0.282516
2    0.284386
3    0.273967
4    0.218520
5    0.267626
6    0.297967
7    0.290717
Name: Norm_Dev, dtype: float64


In [None]:
1/0

### Visualize embeddings

In [None]:
embs = people.embedding.tolist() + messages.embedding.tolist() + images.embedding.tolist()
fig, ax = plt.subplots(1, 2, figsize=(10,4))

# plot embedding difference norm
emb_norm_diff = [[np.linalg.norm(e1-e2) for e2 in embs] for e1 in embs]
im0 = ax[0].imshow(emb_norm_diff, cmap='pink', vmin=0., vmax=1.)
ax[0].set_title('pairwise difference norm')
ax[0].grid(False)
plt.colorbar(im0, ax=ax[0])

# plot embedding cosine similarity
emb_cossim = [[np.dot(e1,e2)/(np.linalg.norm(e1)*np.linalg.norm(e2)) for e2 in embs] for e1 in embs]
im1 = ax[1].imshow(emb_cossim, cmap='viridis', vmin=-1., vmax=1.)
ax[1].set_title('pairwise cosine similarity')
ax[1].grid(False)
plt.colorbar(im1, ax=ax[1])

plt.tight_layout()
plt.show()

# plot embedding pca
emb_pca = PCA(n_components=2).fit_transform(embs)
people_pca, messages_pca, images_pca = np.split(emb_pca, [len(people),-len(images)])
fig, ax = plt.subplots(figsize=(8,5))
# plot people pca
ax.scatter(people_pca[:,0], people_pca[:,1], s=100)
for ind, pnt in enumerate(people_pca):
    ax.annotate(f'  p{ind}', pnt)
# plot messages pca
ax.scatter(messages_pca[:,0], messages_pca[:,1], s=100)
for ind, pnt in enumerate(messages_pca):
    ax.annotate(f'  m{ind}', pnt)
# plot images pca
ax.scatter(images_pca[:,0], images_pca[:,1], s=100)
for ind, pnt in enumerate(images_pca):
    ax.annotate(f'  i{ind}', pnt)
plt.tight_layout()
plt.show()

### Compute recommendations

In [None]:
def softmax(x, temp=1.):
    """Compute row-wise softmax of a matrix x"""
    z = np.array(x)
    z -= z.max(axis=1, keepdims=True)
    z = np.exp(z / temp)
    z /= z.sum(axis=1, keepdims=True)
    return z


def get_recs(df1, df2, temp=.1, k=3):
    """Normalize df1-df2 proximity and return k best/worst recommendations"""
    neg_norms = [[-np.linalg.norm(e1-e2) for e2 in df2.embedding] for e1 in df1.embedding]
    probs = softmax(neg_norms, temp=temp)
    print(f'probability distribution:\n{probs}')
    
    # show k best/worst recommendations for each element
    inds = probs.argsort(axis=1)
    for i in range(len(df1)):
        print('\n========================\n')
        print(df1.loc[i].content)
        
        # k best recommendations
        print()
        for j in range(k):
            l = inds[i][~j]
            print(f'#{l} ({100*probs[i][l]:.0f}%): {df2.loc[l].content}')

        # k worst recommendations
        print()
        for j in range(k):
            l = inds[i][j]
            print(f'#{l} ({100*probs[i][l]:.0f}%): {df2.loc[l].content}')

In [None]:
# people--images
get_recs(people, images)

In [None]:
# messages--images
get_recs(messages, images)

In [None]:
# people--messages
get_recs(people, messages)