In [1]:
# !pip install openai  --upgrade --quiet

# Initiate

In [2]:
import time
import os
import openai
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy.spatial.distance import cosine, euclidean

openai.api_key = os.environ['OPENAI_API_KEY']
np.set_printoptions(precision=3, suppress=True)

REFRESH = False

def get_embedding(text, sleep=0):
    '''Compute GPT3-embedding for a given text string'''
    time.sleep(sleep)
    result = openai.Embedding.create(input=text, model='text-embedding-ada-002')
    return np.array(result.data[0].embedding)

In [3]:
def glue(row):
    replacement_dict = {'Male': 'man', 'Female': 'woman'}
    row['Gender'] = replacement_dict.get(row['Gender'], 'John Cena')
    return ' '.join(row[['Lifestyle', 'Age', 'Race', 'Gender']].values).lower()

# Get dfs

In [4]:
people = pd.read_csv('./data/people.csv')
messages = pd.read_csv('./data/messages.csv', sep='John_Cena')
images = pd.read_csv('./data/images.csv', sep='John_Cena')

  messages = pd.read_csv('./data/messages.csv', sep='John_Cena')
  images = pd.read_csv('./data/images.csv', sep='John_Cena')


In [5]:
# @title adequate notation
people_list = []
for person in list(people.items())[0][1]:
    person = person.replace('\r', '')
    person_list = [i.split(': ') for i in person.split('\n')]
    person_dict = {j[0]: j[1] for j in person_list}
    people_list.append(person_dict)
df_people = pd.DataFrame(people_list)

In [6]:
df_people['Likes'] = df_people['Likes'].str.split(', ')
df_people['Dislikes'] = df_people['Dislikes'].str.split(', ')
df_people['Description'] = df_people.apply(lambda row: glue(row), axis='columns')
df_people['ID'] = range(len(df_people))
df_people.head(1)

Unnamed: 0,Gender,Age,Race,Lifestyle,Likes,Dislikes,Description,ID
0,man,Young Adult,African American,Moderate,"[Sports, Outdoors, Learning, Exercise]","[Homemaking, Design, Relaxing, Arts and Crafts]",moderate young adult african american man,0


In [7]:
def get_df_likes_dislikes(
        attr,
        df_people=df_people,
        refresh=REFRESH
        ):
    if not refresh:
        raise Exception('API call aborted. Were you sure? If yes, change REFRESH to True.')
    df = pd.DataFrame()
    for i in df_people[['ID', 'Description', attr]].values:
        for like in i[2]:
            df = pd.concat(
                [
                    df,
                    pd.DataFrame(
                        [
                            i[0],
                            like.lower(),
                            f'{i[1]} who likes {like.lower()}'
                        ]
                    ).T
                ],
                axis='rows'
            )
    df.reset_index(inplace=True, drop=True)
    df.columns=['ID', 'Activity', 'Description']
    df['Embedding'] = df.Description.apply(lambda x: get_embedding(x))
    return df

In [8]:
def add_activity(df):
    df['Activity'] = np.stack(df.Description.str.split('who likes '))[:, 1]

In [9]:
def normalize(x, overall_min, overall_max):  # currently from -1 to 1 due to negative vectors (we center at 0)
    return 2 * (x - overall_min) / (overall_max - overall_min) - 1

def add_deviation(df):
    # first we find mean vector for an ID
    # only then we find the mean for likes/dislikes
    # thus we prevent individuals having too much likes/dislikes
    # from solely deciding the mean
    mean_vector = df\
    .groupby('ID')\
    .Embedding\
    .apply(lambda x: np.mean(np.stack(x), axis=0))\
    .mean()
    deviation = df.Embedding.apply(lambda x: x - mean_vector)
    df['Deviation'] = deviation
    overall = np.concatenate(df.Deviation)
    overall_min = np.min(overall)
    overall_max = np.max(overall)
    del overall
    norm_dev = df.Deviation.apply(lambda x: normalize(x, overall_min, overall_max))
    df['Norm_Dev'] = norm_dev

In [10]:
preferences_dfs = {}
for likes_dislikes in ['likes', 'dislikes']:
    if REFRESH:
        preferences_dfs[likes_dislikes] = get_df_likes_dislikes(
            likes_dislikes.capitalize(), df_people=df_people
            )
        preferences_dfs[likes_dislikes].to_csv(f'data/{likes_dislikes}.csv', index=False)
        np.save(f'data/{likes_dislikes}', preferences_dfs[likes_dislikes].Embedding)
    else:
        preferences_dfs[likes_dislikes] = pd.read_csv(f'data/{likes_dislikes}.csv')
        preferences_dfs[likes_dislikes].Embedding = np.load(
            f'data/{likes_dislikes}.npy', allow_pickle=True
    )
    add_activity(preferences_dfs[likes_dislikes])
    add_deviation(preferences_dfs[likes_dislikes])

In [11]:
content_dfs = {
    'messages': messages,
    'images': images,
}

In [12]:
for content_type in ['messages', 'images']:
    df = content_dfs[content_type]
    df['Content_ID'] = range(len(df))

    if REFRESH:
        df['Embedding'] = df['content'].apply(get_embedding)
        df.to_csv(f'data/{content_type}_embeddings.csv', index=False)
        np.save(f'data/{content_type}', df.Embedding)
    else:
        df['Embedding'] = np.load(f'data/{content_type}.npy', allow_pickle=True)
    # offset each group by it's center
    emb_center = df['Embedding'].mean()    
    deviation = df.Embedding.apply(lambda emb: emb - emb_center)
    df['Deviation'] = deviation
    
    overall = np.concatenate(df.Deviation)
    overall_min = np.min(overall)
    overall_max = np.max(overall)
    del overall
    norm_dev = df.Deviation.apply(lambda x: normalize(x, overall_min, overall_max))
    df['Norm_Dev'] = norm_dev

# Get distances

In [13]:
def get_cosine(vec_1, vec_2, epsilon=10**-6):
    """
    cosine() is so bad, that it's incapable
    of evaluating [0, 0] to [1, 1] distance
    due to zero division.
    Thus we manually bring it to live here
    """
    vec_1 = [i if i else epsilon for i in vec_1]
    vec_2 = [i if i else epsilon for i in vec_2]
    return cosine(vec_1, vec_2)

In [14]:
def get_distance(vec_1, vec_2):
    """
    intended to work with -1 to 1 normalized vectors
    combines two approaches so that negatives of cosine similarity
    are smoothened as well as the negatives of euclidean alone
    """
    euc = euclidean(vec_1, vec_2)
    cos = get_cosine(vec_1, vec_2)
    # let's scale both distances to [0, 1]
    # since cosine() lies between 0 and 2, halve it    
    cos /= 2
    # now normalize the euclidean distance output
    # larges distances in our case are beteween
    # [-1, -1, ... , -1] and [1, 1, ... , 1]
    # and the value depends on n dimensions
    # so let's consider it
    ones = np.ones(len(vec_1))
    largest_euc = euclidean(-ones, ones)
    euc /= largest_euc    
    dist = euc/2 + euc*cos + cos/2
    # scale dist to [0, 1]
    dist /= 2
    # return euc, cos, dist  # an option for testing the function
    return dist

In [15]:
# @title Version with overall distance df
def get_preference_distances_V2_overall():
    likes_weight = 1
    dislikes_weight = 1
    preference_distances_by_user = {}
    preference_distances_df = pd.DataFrame()
    users = sorted(list(set(df_people['ID'])))
    for user in users:
        preference_distances_by_user[user] = {}
        for content_type in ['messages', 'images']:
            preference_distances_by_user[user][content_type] = {}
            for content_entry in content_dfs[content_type].iterrows():
                content_entry = content_entry[1]
                preference_distances_by_user[user][content_type][content_entry.Content_ID] = {}
                for likes_dislikes in ['likes', 'dislikes']:
                    preference_distances_by_user[user][content_type][content_entry.Content_ID][likes_dislikes] = []
                    for user_pref_entry in preferences_dfs[likes_dislikes].query(f'ID == {user}').iterrows():
                        user_pref_entry = user_pref_entry[1]
                        preference_distance = get_distance(content_entry.Norm_Dev, user_pref_entry.Norm_Dev)
                        preference_distances_by_user[user][content_type][content_entry.Content_ID][likes_dislikes].append(preference_distance)
                        # print(f'{user=} {content_type} Content_ID={content_entry.Content_ID} {likes_dislikes} {preference_distance}')
                mean_like_distance = np.mean(preference_distances_by_user[user][content_type][content_entry.Content_ID]['likes'])
                min_dislike_distance =  np.min(preference_distances_by_user[user][content_type][content_entry.Content_ID]['dislikes'])
                overall_preference_distance = mean_like_distance - min_dislike_distance
                preference_distances_by_user[user][content_type][content_entry.Content_ID]['Preference_Distance'] = overall_preference_distance
                distance_df_entry = {
                    'User_ID': user,
                    'Content_Type': content_type,
                    'Content_ID': content_entry.Content_ID,
                    'Preference_Distance': overall_preference_distance
                }
                preference_distances_df = pd.concat([preference_distances_df, pd.DataFrame(distance_df_entry, index=[0])])
    preference_distances_df.reset_index(drop=True, inplace=True)
    return preference_distances_df, preference_distances_by_user

In [16]:
class V2:
    def __init__(self):
        self.preference_distances_df, self.preference_distances_by_user = get_preference_distances_V2_overall()

    def print_user_results(
            self,
            user: int,
            content_type: str,
            n_items: int = 5
        ):
        """
        user (int): User_ID

        content_type (str): either 'messages' or 'images'

        n_items: top N messages/images,
        set 1 to show the best/worst options,
        default = 5 (for number of fingers in human to count with)
        """
        print(people.loc[user].content)
        
        dist_data = self.preference_distances_by_user[user][content_type].items()
        pref_dist_list = [(content_id, i['Preference_Distance']) for (content_id, i) in dist_data]
        sorted_content_ids = sorted(
            pref_dist_list,
            key=lambda x: x[1]
            )

        print(f'\nTop {n_items} {content_type}:')
        for n, (content_id, pref_dist) in enumerate(sorted_content_ids[:n_items]):
            content = content_dfs[content_type]\
                .query(f'Content_ID == {content_id}')\
                    .content\
                        .values[0]
            print(f'{n+1}. {content}')

        print(f'\nBottom {n_items} {content_type}:')
        for n, (content_id, pref_dist) in enumerate(sorted_content_ids[-n_items:][::-1]):
            content = content_dfs[content_type]\
                .query(f'Content_ID == {content_id}')\
                    .content\
                        .values[0]
            print(f'{n+1}. {content}')

In [17]:
V2_instance = V2()

In [18]:
V2_instance.print_user_results(
    user=10,
    content_type='messages',
    n_items=5
    )

Gender: Male
Age: Senior
Race: African American
Lifestyle: Active
Likes: Sports, Outdoors, Exercise, Games
Dislikes: Homemaking, Design, Arts and Crafts, Relaxing

Top 5 messages:
1. Grab your clubs and hit the green! Golfing is a fun way to enjoy the outdoors, socialize with friends, and improve your hand-eye coordination and focus. Fore!
2. Hop on your bike and feel the wind in your hair! Biking is a fun and eco-friendly way to explore your surroundings, get some exercise, and enjoy the outdoors. Don't forget your helmet!
3. Put on your running shoes and hit the pavement! Jogging is a great way to improve your cardiovascular health, burn calories, and reduce stress. Start slow and steady, and work your way up!
4. Pick up some weights and feel the burn! Weightlifting is a great way to build muscle, boost your metabolism, and feel more confident in your body. You got this!
5. Strap on your roller skates and roll into some fun! Roller skating is a great way to improve your balance, coor

In [19]:
V2_instance.print_user_results(
    user=10,
    content_type='images',
    n_items=5
)

Gender: Male
Age: Senior
Race: African American
Lifestyle: Active
Likes: Sports, Outdoors, Exercise, Games
Dislikes: Homemaking, Design, Arts and Crafts, Relaxing

Top 5 images:
1. Senior hispanic man playing a game of cards in the park.
2. Person shuffling a deck of playing cards.
3. Young adult African American man walking his dog in the park.
4. Senior white woman biking in the park and looking towards the sky.
5. Senior white man wearing a coat and holding a book while walking outdoors.

Bottom 5 images:
1. Young adult white woman dancing and cleaning the house.
2. Woman placing a cutting board into the kitchen cupboard.
3. Adult white woman sitting at a desk and painting.
4. Person sketching flowers in a notepad.
5. Adult white woman rolling dough in the kitchen and smiling.


# ________________