In [13]:
# %pip install openai --upgrade --quiet

In [264]:
import time
import os
import openai
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy.spatial.distance import cosine, euclidean

openai.api_key = os.environ['OPENAI_API_KEY']
np.set_printoptions(precision=3, suppress=True)


def get_embedding(text, sleep=0):
    '''Return the embedding of a text string as a numpy array.

    Waits `sleep` seconds (0 by default), then requests the
    'text-embedding-ada-002' embedding of `text` from the OpenAI API
    and returns the embedding of the first item of the response.
    '''
    # optional pause before the request, e.g. to throttle repeated calls
    time.sleep(sleep)
    response = openai.Embedding.create(
        model='text-embedding-ada-002',
        input=text,
    )
    first_item = response.data[0]
    return np.array(first_item.embedding)

In [10]:
people = pd.read_csv('./data/people.csv')
messages = pd.read_csv('./data/messages.csv')
images = pd.read_csv('./data/images.csv')

In [188]:
# Every entry of the first column of `people` is a multi-line record whose
# lines look like "Key: value"; turn each record into a dict (one per person)
# and build the frame from those dicts.
df_people = pd.DataFrame([
    dict(
        (field[0], field[1])
        for field in (line.split(': ') for line in person.split('\n'))
    )
    for person in people.iloc[:, 0]
])

In [57]:
# @title adequate notation
# people_list = []
# for person in list(people.items())[0][1]:
#     person_list = [i.split(': ') for i in person.split('\n')]
#     person_dict = {j[0]: j[1] for j in person_list}
#     people_list.append(person_dict)
# df_people = pd.DataFrame(people_list)

Unnamed: 0,Gender,Age,Race,Lifestyle,Likes,Dislikes
0,Male,Young Adult,African American,Moderate,"Reading, Baking, Watercolor Painting","Weightlifting, Solitaire"
1,Female,Senior,Asian,Sedentary,"Knitting, Jigsaw Puzzles, Crosswords","Walking, Biking, Weightlifting"
2,Male,Adult,Hispanic,Active,"Walking, Biking, Weightlifting","Coloring Books, Solitaire"
3,Female,Middle-aged,White,Moderate,"Reading, Gardening, Baking, Watercolor Painting","Weightlifting, Walking"
4,Male,Young Adult,Asian,Sedentary,"Guitar, Meditating, Solitaire","Biking, Weightlifting, Walking"
5,Female,Adult,Hispanic,Active,"Biking, Weightlifting, Walking","Coloring Books, Jigsaw Puzzles"
6,Male,Middle-aged,African American,Moderate,"Reading, Guitar, Jigsaw Puzzles","Weightlifting, Walking, Biking"
7,Female,Senior,Asian,Sedentary,"Knitting, Coloring Books, Crosswords","Walking, Biking, Weightlifting"
8,Male,Young Adult,Hispanic,Active,"Biking, Weightlifting, Walking, Guitar","Coloring Books, Knitting"
9,Female,Middle-aged,Asian,Moderate,"Reading, Baking, Watercolor Painting, Crosswords","Weightlifting, Walking, Jigsaw Puzzles"


In [189]:
df_people['Likes'] = df_people['Likes'].str.split(', ')
df_people['Dislikes'] = df_people['Dislikes'].str.split(', ')

In [127]:
def glue(row):
    """Build a short lowercase description of a person.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide string values under the keys 'Lifestyle', 'Age',
        'Race' and 'Gender'.

    Returns
    -------
    str
        "<lifestyle> <age> <race> <gender noun>" in lowercase, where the
        gender noun is 'man' for 'Male', 'woman' for 'Female' and the
        fallback 'John Cena' for any other value.

    The input row is left untouched: the gender noun is computed locally
    instead of being written back into `row`.
    """
    replacement_dict = {'Male': 'man', 'Female': 'woman'}
    gender_noun = replacement_dict.get(row['Gender'], 'John Cena')
    parts = [row['Lifestyle'], row['Age'], row['Race'], gender_noun]
    return ' '.join(parts).lower()

In [190]:
# one description string per person, built row by row with glue()
df_people['Description'] = df_people.apply(glue, axis='columns')

In [191]:
df_people['ID'] = range(len(df_people))

In [204]:
df_people

Unnamed: 0,Gender,Age,Race,Lifestyle,Likes,Dislikes,Description,ID
0,Male,Young Adult,African American,Moderate,"[Reading, Baking, Watercolor Painting]","[Weightlifting, Solitaire]",moderate young adult african american man,0
1,Female,Senior,Asian,Sedentary,"[Knitting, Jigsaw Puzzles, Crosswords]","[Walking, Biking, Weightlifting]",sedentary senior asian woman,1
2,Male,Adult,Hispanic,Active,"[Walking, Biking, Weightlifting]","[Coloring Books, Solitaire]",active adult hispanic man,2
3,Female,Middle-aged,White,Moderate,"[Reading, Gardening, Baking, Watercolor Painting]","[Weightlifting, Walking]",moderate middle-aged white woman,3
4,Male,Young Adult,Asian,Sedentary,"[Guitar, Meditating, Solitaire]","[Biking, Weightlifting, Walking]",sedentary young adult asian man,4
5,Female,Adult,Hispanic,Active,"[Biking, Weightlifting, Walking]","[Coloring Books, Jigsaw Puzzles]",active adult hispanic woman,5
6,Male,Middle-aged,African American,Moderate,"[Reading, Guitar, Jigsaw Puzzles]","[Weightlifting, Walking, Biking]",moderate middle-aged african american man,6
7,Female,Senior,Asian,Sedentary,"[Knitting, Coloring Books, Crosswords]","[Walking, Biking, Weightlifting]",sedentary senior asian woman,7
8,Male,Young Adult,Hispanic,Active,"[Biking, Weightlifting, Walking, Guitar]","[Coloring Books, Knitting]",active young adult hispanic man,8
9,Female,Middle-aged,Asian,Moderate,"[Reading, Baking, Watercolor Painting, Crosswo...","[Weightlifting, Walking, Jigsaw Puzzles]",moderate middle-aged asian woman,9


Compose a dictionary with the embeddings of all the likes and dislikes

# unique items over all people
likes = set(np.concatenate(df_people.Likes))
dislikes = set(np.concatenate(df_people.Dislikes))

# item -> embedding; wait 2 seconds before every request
embeddings_dict = {}
for item in likes.union(dislikes):
    embeddings_dict[item] = get_embedding(item, sleep=2)

# person description -> embedding; same 2 second pause per request
descriptions_embeddings_dict = {}
for description in df_people.Description:
    descriptions_embeddings_dict[description] = get_embedding(description, sleep=2)

descriptions_embeddings_dict

In [195]:
def get_df_likes_dislikes(attr, df_people=df_people):
    """Build one described and embedded row per (person, item) pair.

    Parameters
    ----------
    attr : str
        Name of the `df_people` column that holds a list of items per
        person ('Likes' and 'Dislikes' are the columns used in this notebook).
    df_people : pandas.DataFrame
        Frame with 'ID', 'Description' and `attr` columns. The default is
        the `df_people` object that exists when this cell is executed.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID', 'Description' and 'Embedding' with a fresh 0..n-1
        index. 'Embedding' holds the result of `get_embedding` for each
        description (one call per row). An input without any items yields
        an empty frame with the same columns.
    """
    # NOTE(review): the phrase "who likes" is used for every `attr`,
    # including 'Dislikes' - confirm this is intentional before changing it.
    records = [
        (person_id, f'{description} who likes {item.lower()}')
        for person_id, description, items in df_people[['ID', 'Description', attr]].values
        for item in items
    ]
    # Build the frame once from all records instead of growing it with
    # pd.concat inside the loop (which re-copies the frame on every step).
    df = pd.DataFrame(records, columns=['ID', 'Description'])
    df['Embedding'] = df.Description.apply(lambda x: get_embedding(x))
    return df

In [174]:
df_likes = get_df_likes_dislikes('Likes', df_people=df_people)
df_dislikes = get_df_likes_dislikes('Dislikes', df_people=df_people)

In [199]:
df_likes.to_csv('data/likes.csv')
df_dislikes.to_csv('data/dislikes.csv')

In [217]:
df_likes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 3 columns):
ID             33 non-null int64
Description    33 non-null object
Embedding      33 non-null object
dtypes: int64(1), object(2)
memory usage: 920.0+ bytes


In [209]:
for id_, group in df_likes.groupby('ID'):
    print(id_)
    print(group.mean())

0
ID                                                           0
Embedding    [-0.003327933489345014, -0.008358138070131341,...
dtype: object
1
ID                                                           1
Embedding    [-0.007232751542081435, -0.00773633395632108, ...
dtype: object
2
ID                                                           2
Embedding    [-0.015978617127984762, -0.0009856480173766613...
dtype: object
3
ID                                                           3
Embedding    [-0.009486995462793857, -0.0014398788334801793...
dtype: object
4
ID                                                           4
Embedding    [-0.00014099527228002748, -0.00185592759711047...
dtype: object
5
ID                                                           5
Embedding    [-0.0192279857583344, 0.0024953963390241065, 0...
dtype: object
6
ID                                                           6
Embedding    [-0.013960679993033409, -0.00738622946664691, ...
dtype: object
7
ID  

In [317]:
def normalize(x, overall_min, overall_max):
    """Linearly rescale `x` so that `overall_min` maps to -1 and `overall_max` to 1.

    The target range is [-1, 1] rather than [0, 1] because the vectors
    contain negative values and are centered at 0.
    """
    shifted = x - overall_min
    value_range = overall_max - overall_min
    return 2 * shifted / value_range - 1

def add_deviation(df):
    """Add 'Deviation' and 'Norm_Dev' columns to `df` in place.

    `df` needs an 'ID' column and an 'Embedding' column of numpy vectors.
    'Deviation' is each embedding minus a common center vector; 'Norm_Dev'
    is that deviation passed through `normalize` with the smallest and
    largest component found across all deviation vectors. Returns None.
    """
    # The center is a mean of per-ID means: embeddings are averaged within
    # each ID first, so an individual with many likes/dislikes does not
    # dominate the overall mean.
    per_id_centers = df.groupby('ID').Embedding.apply(
        lambda embeddings: np.mean(np.stack(embeddings), axis=0)
    )
    center = per_id_centers.mean()

    df['Deviation'] = df.Embedding.apply(lambda embedding: embedding - center)

    # global extremes over every component of every deviation vector
    flattened = np.concatenate(df.Deviation)
    lowest = np.min(flattened)
    highest = np.max(flattened)
    del flattened

    df['Norm_Dev'] = df.Deviation.apply(
        lambda deviation: normalize(deviation, lowest, highest)
    )

In [318]:
add_deviation(df_likes)
add_deviation(df_dislikes)

In [321]:
np.concatenate(df_likes.Norm_Dev).min()

-1.0

In [382]:
# For each content table: compute an embedding per row, center the embeddings
# on the table's mean embedding and rescale the centered values with normalize().
# The new columns are added to `messages` / `images` themselves.
for df in [messages, images]:
    df['Content_ID'] = range(len(df))
    # one get_embedding call per entry of the 'content' column
    df['Embedding'] = df['content'].apply(get_embedding)

    # offset each group by its center
    emb_center = df['Embedding'].mean()    
    deviation = df.Embedding.apply(lambda emb: emb - emb_center)
    df['Deviation'] = deviation
    
    # smallest / largest component over all deviation vectors of this table
    overall = np.concatenate(df.Deviation)
    overall_min = np.min(overall)
    overall_max = np.max(overall)
    del overall
    norm_dev = df.Deviation.apply(lambda x: normalize(x, overall_min, overall_max))
    df['Norm_Dev'] = norm_dev

In [261]:
messages.to_csv('data/messages_embeddings.csv')
images.to_csv('data/images_embeddings.csv')

In [326]:
euclidean([-1, -1], [1, 1])/(2**0.5)

2.0

In [322]:
cosine([-1, -1], [1, 1])

2.0

In [424]:
def get_cosine(vec_1, vec_2, epsilon=10**-6):
    """
    Cosine distance that tolerates zero components.

    Every falsy (zero) component of either vector is replaced by `epsilon`
    before scipy's cosine() is called, so an all-zero vector such as
    [0, 0] no longer leads to a division by zero.
    """
    def _fill_zeros(vec):
        # keep non-zero components, substitute epsilon for zeros
        return [epsilon if not component else component for component in vec]

    return cosine(_fill_zeros(vec_1), _fill_zeros(vec_2))

In [441]:
def get_distance(vec_1, vec_2):
    """
    Blend euclidean and cosine distance into a single score.

    Intended for vectors normalized to [-1, 1]. Both distances are first
    scaled to [0, 1] and then combined, so that the weaknesses of cosine
    distance alone and of euclidean distance alone are smoothed out.
    For such inputs the result lies in [0, 1].
    """
    euclidean_part = euclidean(vec_1, vec_2)
    # cosine distance lies between 0 and 2, so halve it
    cosine_part = get_cosine(vec_1, vec_2) / 2

    # The largest euclidean distance for [-1, 1] vectors is the one between
    # [-1, -1, ..., -1] and [1, 1, ..., 1]; it depends on the number of
    # dimensions, so compute it for the dimension at hand.
    unit = np.ones(len(vec_1))
    euclidean_part = euclidean_part / euclidean(-unit, unit)

    # the blend is at most 0.5 + 1 + 0.5 = 2, so halve it to land in [0, 1]
    blended = euclidean_part / 2 + euclidean_part * cosine_part + cosine_part / 2
    return blended / 2

In [452]:
user_vec = df_likes.Norm_Dev[0]
user_vec

array([-0.15990006, -0.48826592, -0.47252663, ..., -0.46114113,
       -0.11936103, -0.10757138])

In [454]:
message_distances = messages.Norm_Dev.apply(lambda x: get_distance(x, user_vec))
2 * (message_distances - message_distances.min()) / (message_distances.max() - message_distances.min()) - 1

0   -1.000000
1    0.336279
2    0.950451
3    0.752838
4    0.662796
5    0.738977
6    1.000000
7    0.644908
Name: Norm_Dev, dtype: float64

In [455]:
normalize(
    message_distances,
    message_distances.min(),
    message_distances.max()
)

0   -1.000000
1    0.336279
2    0.950451
3    0.752838
4    0.662796
5    0.738977
6    1.000000
7    0.644908
Name: Norm_Dev, dtype: float64

In [457]:
users = set(df_people.ID)

In [458]:
users

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

In [459]:
user = 0

In [465]:
for dislike in df_dislikes[df_dislikes.ID == user].Norm_Dev:
    message_distances = messages.Norm_Dev.apply(lambda x: get_distance(x, dislike))
    print()
    print(message_distances)


0    0.268303
1    0.277241
2    0.274582
3    0.269100
4    0.281510
5    0.273693
6    0.271558
7    0.262516
Name: Norm_Dev, dtype: float64

0    0.274327
1    0.282516
2    0.284386
3    0.273967
4    0.218520
5    0.267626
6    0.297967
7    0.290717
Name: Norm_Dev, dtype: float64


In [None]:
1/0

### Visualize embeddings

In [None]:
# Collect every embedding in one list, ordered people -> messages -> images.
# NOTE(review): this cell reads a lowercase `embedding` column from all three
# frames, while the visible cells above only create a capitalised `Embedding`
# column on messages/images - confirm where these columns come from.
embs = people.embedding.tolist() + messages.embedding.tolist() + images.embedding.tolist()
fig, ax = plt.subplots(1, 2, figsize=(10,4))

# left panel: heatmap of the norm of the difference between every pair of embeddings
emb_norm_diff = [[np.linalg.norm(e1-e2) for e2 in embs] for e1 in embs]
im0 = ax[0].imshow(emb_norm_diff, cmap='pink', vmin=0., vmax=1.)
ax[0].set_title('pairwise difference norm')
ax[0].grid(False)
plt.colorbar(im0, ax=ax[0])

# right panel: heatmap of the cosine similarity between every pair of embeddings
emb_cossim = [[np.dot(e1,e2)/(np.linalg.norm(e1)*np.linalg.norm(e2)) for e2 in embs] for e1 in embs]
im1 = ax[1].imshow(emb_cossim, cmap='viridis', vmin=-1., vmax=1.)
ax[1].set_title('pairwise cosine similarity')
ax[1].grid(False)
plt.colorbar(im1, ax=ax[1])

plt.tight_layout()
plt.show()

# project all embeddings to 2 dimensions with PCA (fitted on the combined list)
emb_pca = PCA(n_components=2).fit_transform(embs)
# split the projected points back into the three groups, in the order they were concatenated
people_pca, messages_pca, images_pca = np.split(emb_pca, [len(people),-len(images)])
fig, ax = plt.subplots(figsize=(8,5))
# plot people pca, labelled p0, p1, ...
ax.scatter(people_pca[:,0], people_pca[:,1], s=100)
for ind, pnt in enumerate(people_pca):
    ax.annotate(f'  p{ind}', pnt)
# plot messages pca, labelled m0, m1, ...
ax.scatter(messages_pca[:,0], messages_pca[:,1], s=100)
for ind, pnt in enumerate(messages_pca):
    ax.annotate(f'  m{ind}', pnt)
# plot images pca, labelled i0, i1, ...
ax.scatter(images_pca[:,0], images_pca[:,1], s=100)
for ind, pnt in enumerate(images_pca):
    ax.annotate(f'  i{ind}', pnt)
plt.tight_layout()
plt.show()

### Compute recommendations

In [None]:
def softmax(x, temp=1.):
    """Row-wise softmax of a 2-D matrix `x` with temperature `temp`.

    The row maximum is subtracted before exponentiating, so large
    inputs do not overflow. Returns a new array whose rows each sum to 1.
    """
    scores = np.asarray(x)
    shifted = scores - scores.max(axis=1, keepdims=True)
    weights = np.exp(shifted / temp)
    return weights / weights.sum(axis=1, keepdims=True)


def get_recs(df1, df2, temp=.1, k=3):
    """Normalize df1-df2 proximity and print k best/worst recommendations.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with an `embedding` column of numpy vectors and a `content`
        column. For every row of `df1`, rows of `df2` are ranked.
    temp : float
        Softmax temperature applied to the negative embedding distances.
    k : int
        Number of best and of worst recommendations to print per row; it is
        limited to the number of rows in `df2`.

    Returns
    -------
    None. The probability matrix and the recommendations are printed.
    """
    # closer embeddings -> larger (less negative) score -> higher probability
    neg_norms = [[-np.linalg.norm(e1-e2) for e2 in df2.embedding] for e1 in df1.embedding]
    probs = softmax(neg_norms, temp=temp)
    print(f'probability distribution:\n{probs}')

    # never ask for more recommendations than there are candidates
    k = max(0, min(k, len(df2)))

    # column positions of each row sorted by ascending probability
    order = probs.argsort(axis=1)
    for i in range(len(df1)):
        print('\n========================\n')
        # rows and columns of `probs` are positional, so index the frames
        # by position (.iloc) rather than by label (.loc)
        print(df1.iloc[i].content)

        # k best recommendations (highest probability first)
        print()
        for pos in order[i][::-1][:k]:
            print(f'#{pos} ({100*probs[i][pos]:.0f}%): {df2.iloc[pos].content}')

        # k worst recommendations (lowest probability first)
        print()
        for pos in order[i][:k]:
            print(f'#{pos} ({100*probs[i][pos]:.0f}%): {df2.iloc[pos].content}')

In [None]:
# people--images
get_recs(people, images)

In [None]:
# messages--images
get_recs(messages, images)

In [None]:
# people--messages
get_recs(people, messages)