In [13]:
# %pip install openai  -upgrade --quiet

In [9]:
import os
import openai
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

openai.api_key = os.environ['OPENAI_API_KEY']
np.set_printoptions(precision=3, suppress=True)


def get_embedding(text):
    '''Compute GPT3-embedding for a given text string'''
    result = openai.Embedding.create(input=text, model='text-embedding-ada-002')
    return np.array(result.data[0].embedding)

In [10]:
people = pd.read_csv('./data/people.csv')
messages = pd.read_csv('./data/messages.csv')
images = pd.read_csv('./data/images.csv')

In [48]:
{j[0]:j[1] for j in [i.split(': ') for i in list(people.items())[0][1][0].split('\n')]}

{'Gender': 'Male',
 'Age': 'Young Adult',
 'Race': 'African American',
 'Lifestyle': 'Moderate',
 'Likes': 'Reading, Baking, Watercolor Painting',
 'Dislikes': 'Weightlifting, Solitaire'}

In [107]:
people_matter = pd.DataFrame([{j[0]:j[1] for j in [i.split(': ') for i in person.split('\n')]} for person in list(people.items())[0][1]])

In [57]:
# people_list = []
# for person in list(people.items())[0][1]:
#     person_list = [i.split(': ') for i in person.split('\n')]
#     person_dict = {j[0]: j[1] for j in person_list}
#     people_list.append(person_dict)
# people_matter = pd.DataFrame(people_list)

Unnamed: 0,Gender,Age,Race,Lifestyle,Likes,Dislikes
0,Male,Young Adult,African American,Moderate,"Reading, Baking, Watercolor Painting","Weightlifting, Solitaire"
1,Female,Senior,Asian,Sedentary,"Knitting, Jigsaw Puzzles, Crosswords","Walking, Biking, Weightlifting"
2,Male,Adult,Hispanic,Active,"Walking, Biking, Weightlifting","Coloring Books, Solitaire"
3,Female,Middle-aged,White,Moderate,"Reading, Gardening, Baking, Watercolor Painting","Weightlifting, Walking"
4,Male,Young Adult,Asian,Sedentary,"Guitar, Meditating, Solitaire","Biking, Weightlifting, Walking"
5,Female,Adult,Hispanic,Active,"Biking, Weightlifting, Walking","Coloring Books, Jigsaw Puzzles"
6,Male,Middle-aged,African American,Moderate,"Reading, Guitar, Jigsaw Puzzles","Weightlifting, Walking, Biking"
7,Female,Senior,Asian,Sedentary,"Knitting, Coloring Books, Crosswords","Walking, Biking, Weightlifting"
8,Male,Young Adult,Hispanic,Active,"Biking, Weightlifting, Walking, Guitar","Coloring Books, Knitting"
9,Female,Middle-aged,Asian,Moderate,"Reading, Baking, Watercolor Painting, Crosswords","Weightlifting, Walking, Jigsaw Puzzles"


In [108]:
people_matter['Likes'] = people_matter['Likes'].str.split(', ')
people_matter['Dislikes'] = people_matter['Dislikes'].str.split(', ')

In [76]:
row = ' '.join(people_matter.loc[0][['Lifestyle', 'Age', 'Race', 'Gender']].values).lower()

In [None]:
' '.join([replacement_dict.get(word, word) for word in input_sentence.split()])

In [87]:
def glue(row):
    replacement_dict = {'Male': 'man', 'Female': 'woman'}
    row['Gender'] = replacement_dict.get(row['Gender'], 'John Cena')
    return ' '.join(row[['Lifestyle', 'Age', 'Race', 'Gender']].values).lower()

In [89]:
people_matter['Description'] = people_matter.apply(lambda row: glue(row), axis='columns')

In [93]:
people_matter['Like0'], people_matter['Like1'], people_matter['Like2'] = people_matter['Likes'].str.split(', ')

ValueError: too many values to unpack (expected 3)

In [77]:
people_matter

Unnamed: 0,Gender,Age,Race,Lifestyle,Likes,Dislikes
0,Male,Young Adult,African American,Moderate,"Reading, Baking, Watercolor Painting","Weightlifting, Solitaire"
1,Female,Senior,Asian,Sedentary,"Knitting, Jigsaw Puzzles, Crosswords","Walking, Biking, Weightlifting"
2,Male,Adult,Hispanic,Active,"Walking, Biking, Weightlifting","Coloring Books, Solitaire"
3,Female,Middle-aged,White,Moderate,"Reading, Gardening, Baking, Watercolor Painting","Weightlifting, Walking"
4,Male,Young Adult,Asian,Sedentary,"Guitar, Meditating, Solitaire","Biking, Weightlifting, Walking"
5,Female,Adult,Hispanic,Active,"Biking, Weightlifting, Walking","Coloring Books, Jigsaw Puzzles"
6,Male,Middle-aged,African American,Moderate,"Reading, Guitar, Jigsaw Puzzles","Weightlifting, Walking, Biking"
7,Female,Senior,Asian,Sedentary,"Knitting, Coloring Books, Crosswords","Walking, Biking, Weightlifting"
8,Male,Young Adult,Hispanic,Active,"Biking, Weightlifting, Walking, Guitar","Coloring Books, Knitting"
9,Female,Middle-aged,Asian,Moderate,"Reading, Baking, Watercolor Painting, Crosswords","Weightlifting, Walking, Jigsaw Puzzles"


In [None]:
1/0

In [18]:
for df in [people, messages, images]:
    df['embedding'] = df['content'].apply(get_embedding)

    # offset each group by it's center
    emb_center = df['embedding'].mean()
    df['embedding'] = df['embedding'].apply(lambda emb: emb - emb_center)

### Visualize embeddings

In [None]:
embs = people.embedding.tolist() + messages.embedding.tolist() + images.embedding.tolist()
fig, ax = plt.subplots(1, 2, figsize=(10,4))

# plot embedding difference norm
emb_norm_diff = [[np.linalg.norm(e1-e2) for e2 in embs] for e1 in embs]
im0 = ax[0].imshow(emb_norm_diff, cmap='pink', vmin=0., vmax=1.)
ax[0].set_title('pairwise difference norm')
ax[0].grid(False)
plt.colorbar(im0, ax=ax[0])

# plot embedding cosine similarity
emb_cossim = [[np.dot(e1,e2)/(np.linalg.norm(e1)*np.linalg.norm(e2)) for e2 in embs] for e1 in embs]
im1 = ax[1].imshow(emb_cossim, cmap='viridis', vmin=-1., vmax=1.)
ax[1].set_title('pairwise cosine similarity')
ax[1].grid(False)
plt.colorbar(im1, ax=ax[1])

plt.tight_layout()
plt.show()

# plot embedding pca
emb_pca = PCA(n_components=2).fit_transform(embs)
people_pca, messages_pca, images_pca = np.split(emb_pca, [len(people),-len(images)])
fig, ax = plt.subplots(figsize=(8,5))
# plot people pca
ax.scatter(people_pca[:,0], people_pca[:,1], s=100)
for ind, pnt in enumerate(people_pca):
    ax.annotate(f'  p{ind}', pnt)
# plot messages pca
ax.scatter(messages_pca[:,0], messages_pca[:,1], s=100)
for ind, pnt in enumerate(messages_pca):
    ax.annotate(f'  m{ind}', pnt)
# plot images pca
ax.scatter(images_pca[:,0], images_pca[:,1], s=100)
for ind, pnt in enumerate(images_pca):
    ax.annotate(f'  i{ind}', pnt)
plt.tight_layout()
plt.show()

### Compute recommendations

In [None]:
def softmax(x, temp=1.):
    """Compute row-wise softmax of a matrix x"""
    z = np.array(x)
    z -= z.max(axis=1, keepdims=True)
    z = np.exp(z / temp)
    z /= z.sum(axis=1, keepdims=True)
    return z


def get_recs(df1, df2, temp=.1, k=3):
    """Normalize df1-df2 proximity and return k best/worst recommendations"""
    neg_norms = [[-np.linalg.norm(e1-e2) for e2 in df2.embedding] for e1 in df1.embedding]
    probs = softmax(neg_norms, temp=temp)
    print(f'probability distribution:\n{probs}')
    
    # show k best/worst recommendations for each element
    inds = probs.argsort(axis=1)
    for i in range(len(df1)):
        print('\n========================\n')
        print(df1.loc[i].content)
        
        # k best recommendations
        print()
        for j in range(k):
            l = inds[i][~j]
            print(f'#{l} ({100*probs[i][l]:.0f}%): {df2.loc[l].content}')

        # k worst recommendations
        print()
        for j in range(k):
            l = inds[i][j]
            print(f'#{l} ({100*probs[i][l]:.0f}%): {df2.loc[l].content}')

In [None]:
# people--images
get_recs(people, images)

In [None]:
# messages--images
get_recs(messages, images)

In [None]:
# people--messages
get_recs(people, messages)