In [21]:
import json
import pandas as pd
import os
#!pip install openai -U
from openai import OpenAI
from enum import Enum
from typing import List
from pydantic import BaseModel, Field
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
model = "gpt-4o-2024-08-06"

In [34]:
# Get human emotions    
 
# Define the Pydantic model for the API response
class EmotionsResponse(BaseModel):
    Characteristics: List[str] = Field(None, description="List of non-redundant human emotions.")

def get_emotions(model: str) -> List[str]:
    """Gets a list of 50 non-redundant human emotions using the specified gpt model."""
    
    # Define system and user prompts
    system_prompt = "Find 50 non-redundant and different human emotions. "\
    "For example, from each of these four emotion groups, only select one representative"\
    ": 1: joy, happiness, 2: Shame, Embarrassment, "\
    "3: Envy, Jealousy , 4: Hate, disgust, hatered, Resentment. "\
    "So on and so forth."

    user_prompt = "Select 50 non-redundant and different human emotions."

    try:
        #Call the API to get the completion
        completion = client.beta.chat.completions.parse(
            model= model,
            messages=[
                {"role": "system", "content": "Be a helpful assistant."},
                {"role": "system", "content": system_prompt},
                {"role": "system", "content": "Did you accidentally select more than one emotion from group 1? If so, keep only one and drop the rest."},
                {"role": "system", "content": "Did you accidentally select more than one emotion from group 2? If so, keep only one and drop the rest."},
                {"role": "system", "content": "Did you accidentally select mumore than one emotion from group 3? If so, keep only one and drop the rest."},
                {"role": "system", "content": "Did you accidentally select more than one emotion from group 4? If so, keep only one and drop the rest."},
                {"role": "system", "content": "Check again to remove redundancy. Remember, you must only select emotions describing non-redundant human feelings."},
                {"role": "user", "content": user_prompt}
            ],
            response_format=EmotionsResponse
        )

        #output returns in the defined pydantic style
        output = completion.choices[0].message.parsed
        return output.json()
    
    except Exception as e:
        # Handle exceptions such as API errors, etc
        print(f"An error occurred: {e}")
        return []

# Example usage
emotions = get_emotions(model= model)

In [35]:
# #to check emotion redundancy by looking at example groups 
[i for i in list(json.loads(emotions).values())[0] if i in ['Joy', 'Happiness', 'Shame', 'Embarrassment', 'Envy', 'Jealousy' , 'Hate', 'disgust', 'hatered', 'Resentment']]

['Joy', 'Shame', 'Envy', 'Resentment']

In [36]:
#Get 100 best selling American clothing brands 

# Define the Pydantic model for the API response
class BrandsResponse(BaseModel):
    Brands: List[str] = Field(None, description="Brands as a list of strings.")

def get_brands(model: str) -> List[str]:
    """Get 100 best selling American clothing brands using the specified gpt model."""

    try:
        #Call the API to get the completion
        completion = client.beta.chat.completions.parse(
            model= model,
            messages=[
                {"role": "system", "content": "Be a helpful assistant."},
                {"role": "system", "content": "Find 100 non-redundant best selling American clothing brands."},
                {"role": "system", "content": "DONT MAKE ANY MISTAKES, check if you did any."},
                {"role": "user", "content": "Give me 100 best selling American clothing brands."}
            ],
            response_format=BrandsResponse
        )

        #output returns in the defined pydantic style
        output = completion.choices[0].message.parsed
        return output.json()
    
    except Exception as e:
        # Handle exceptions such as API errors, etc
        print(f"An error occurred: {e}")
        return []

# Example usage
brands = get_brands(model= model)

In [37]:
emotions_ls = list(json.loads(emotions).values())[0]
brands_ls = list(json.loads(brands).values())[0]

In [38]:
# Embedding brand in emotions space: Get association scores between an input and list of emotions

Characteristic = Enum('Characteristic', dict([(emotion, emotion) for emotion in emotions_ls]))

class EmotionalAssociationScore(BaseModel):
    emotion: Characteristic
    score: float

class EmotionalAssociationScores(BaseModel):
    associations: List[EmotionalAssociationScore] = Field(description="A list of emotions and associated scores")

def emotional_association_scores(
        thing, 
        model,
        emotions
    ):
    
    prompt = f"Assign emotional association scores between {0} and {len(emotions)} for the provided thing. "\
    "Assign a score for each of the following emotions. Briefly, explain the reason behind the association score."\
    "Ensure the scores reflect the association strength for the specified thing. "\
    "Thing: "\
    f"{thing}"
            
    completion = client.beta.chat.completions.parse(
        model = model,
        messages=[
            {"role": "system", "content": "Be a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        response_format=EmotionalAssociationScores,
    )
    #output returns in the defined pydantic style
    output = completion.choices[0].message.parsed
    return thing, output.json()

In [None]:
#not using this for the moment
# #Embedding brands in emotions space: 
# # tried nested prompt but decided to go with one prompt and a list comprehension
# emotions= emotions_ls
# associations_brands = [emotional_association_scores(thing, model, emotions) for thing in brands_ls[:3]]


In [73]:
def get_df(thing, model, emotions):
    gpt = emotional_association_scores(thing, model, emotions)
    data = list(json.loads(gpt[1]).values())[0]
    df = pd.DataFrame(data)
    df.rename(columns = {'score': gpt[0]}, inplace=True)
    df.set_index('emotion', inplace=True)
    return df

def get_dfs(things_ls, model, emotions):
    merged_df = pd.DataFrame()
    for thing in things_ls:
        new_df = get_df(thing, model, emotions)
        if merged_df.empty:
            merged_df = new_df
        else:
            merged_df = pd.merge(merged_df, new_df, left_index=True, right_index=True, how='outer')
    return merged_df


things_ls = brands_ls
dfs = get_dfs(things_ls, model, emotions)
# Drop columns with NaN values
dfs_cleaned = dfs.dropna(axis=1)

dfs_cleaned 

Unnamed: 0_level_0,Nike,Adidas,Under Armour,Lululemon,Levi's,Ralph Lauren,Tommy Hilfiger,Calvin Klein,Coach,Michael Kors,...,Band of Outsiders,J Brand,HUF,Obey,OMighty,Shein,Uniqlo,Zara,H&M,Forever 21
emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Admiration,41.0,32.0,30.0,30.0,35.0,40.0,35.0,40.0,40.0,35.0,...,30.0,25.0,20.0,20.0,30.0,15.0,30.0,30.0,20.0,24.0
Anguish,6.0,5.0,5.0,5.0,5.0,5.0,5.0,2.0,10.0,5.0,...,5.0,10.0,20.0,15.0,5.0,20.0,1.0,5.0,3.0,9.0
Anticipation,33.0,33.0,30.0,25.0,20.0,25.0,20.0,30.0,30.0,25.0,...,20.0,33.0,35.0,10.0,30.0,25.0,20.0,35.0,33.0,27.0
Anxiety,15.0,8.0,10.0,10.0,10.0,10.0,10.0,6.0,20.0,10.0,...,10.0,10.0,30.0,35.0,10.0,35.0,3.0,20.0,10.0,13.0
Apathy,10.0,10.0,10.0,5.0,10.0,5.0,10.0,9.0,10.0,5.0,...,5.0,12.0,5.0,10.0,5.0,25.0,5.0,10.0,8.0,12.0
Apprehension,13.0,7.0,5.0,10.0,10.0,10.0,10.0,5.0,20.0,10.0,...,10.0,16.0,35.0,40.0,10.0,35.0,2.0,10.0,10.0,14.0
Boredom,6.0,10.0,10.0,5.0,15.0,10.0,10.0,3.0,5.0,10.0,...,10.0,15.0,10.0,15.0,5.0,20.0,10.0,10.0,15.0,15.0
Calmness,22.0,20.0,20.0,20.0,30.0,30.0,20.0,18.0,20.0,20.0,...,15.0,25.0,10.0,15.0,15.0,15.0,32.0,15.0,18.0,13.0
Confusion,12.0,12.0,10.0,10.0,10.0,10.0,15.0,10.0,10.0,10.0,...,15.0,20.0,35.0,30.0,15.0,25.0,10.0,15.0,15.0,16.0
Contentment,34.0,35.0,30.0,30.0,40.0,35.0,35.0,27.0,30.0,25.0,...,20.0,32.0,15.0,15.0,25.0,20.0,35.0,35.0,28.0,25.0


In [81]:
# # Set pandas to display all rows and columns
# pd.set_option('display.max_rows', None)  # Show all rows
# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.width', None)  # Adjust display width to prevent column cutting
# pd.set_option('display.max_colwidth', None)  # Show full content in columns
# dfs.isna().sum()


Nike                          0
Adidas                        0
Under Armour                  0
Lululemon                     0
Levi's                        0
Ralph Lauren                  0
Tommy Hilfiger                0
Calvin Klein                  0
Coach                         0
Michael Kors                  0
Gap                           0
J.Crew                        0
American Eagle Outfitters     0
Abercrombie & Fitch           0
Old Navy                      0
Brooks Brothers               0
Vineyard Vines                0
Patagonia                     0
North Face                    0
Columbia Sportswear           0
Fox Racing                    0
Carhartt                      0
Express                       0
Banana Republic               0
Hollister                     0
Victoria's Secret             0
Tory Burch                    0
Converse                      0
Polo Ralph Lauren             0
DKNY                          0
Lands' End                    0
New Bala

In [40]:
#Embedding a book in emotions space: 
emotions= emotions_ls
thing = 'Summer Island'
df = get_df(thing, model, emotions)
df


Unnamed: 0_level_0,Summer Island
emotion,Unnamed: 1_level_1
Joy,45.0
Shame,5.0
Envy,20.0
Disgust,5.0
Surprise,30.0
Trust,25.0
Fear,15.0
Hope,35.0
Gratitude,30.0
Satisfaction,40.0


In [84]:
import numpy as np
from itertools import islice
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

def get_norm(series):
    # L2 normalize the vectors
    return normalize(series, norm='l2')


def get_similarity(df, dfs):
    similarities = dict()

    # Reshape Series to 2D array (required by cosine_similarity)
    s1 = df.values.reshape(1, -1)
    #normalize
    s1 = get_norm(s1)

    for col in list(dfs.columns):
        # Reshape
        s2= dfs[col].values.reshape(1, -1)
        # Normalize
        s2 = get_norm(s2)

        cosine_sim = cosine_similarity(s1, s2)
        similarities[col]= cosine_sim[0][0]
        print(f'Cosine Similarity: {cosine_sim[0][0]}')

    sorted_dict = dict(sorted(similarities.items(), key=lambda item: item[1], reverse = True))

    # Get the top 3 (highest similarity)
    top_3 = dict(islice(sorted_dict.items(), 3))

    return top_3

get_similarity(df, dfs_cleaned)


Cosine Similarity: 0.7644874793943254
Cosine Similarity: 0.7915208696554784
Cosine Similarity: 0.7608195743106554
Cosine Similarity: 0.7779319473990269
Cosine Similarity: 0.8037630566805488
Cosine Similarity: 0.7539100225400972
Cosine Similarity: 0.7622481786721752
Cosine Similarity: 0.7489428855722048
Cosine Similarity: 0.78125144794287
Cosine Similarity: 0.7598346046050082
Cosine Similarity: 0.8066318333661667
Cosine Similarity: 0.7812572507767217
Cosine Similarity: 0.812234080487034
Cosine Similarity: 0.8128564804830194
Cosine Similarity: 0.7596362571773432
Cosine Similarity: 0.7471326061541392
Cosine Similarity: 0.7304123939756261
Cosine Similarity: 0.7641130577357115
Cosine Similarity: 0.7808219292677105
Cosine Similarity: 0.7197644069983289
Cosine Similarity: 0.7598707076214085
Cosine Similarity: 0.8264195878019709
Cosine Similarity: 0.7515901586850424
Cosine Similarity: 0.7875971487589849
Cosine Similarity: 0.7896908288587204
Cosine Similarity: 0.7815882209157102
Cosine Similari

{'Carhartt': 0.8264195878019709,
 'J Brand': 0.8190895344480581,
 'Abercrombie & Fitch': 0.8128564804830194}

In [None]:
#The cosine similarity ranges from -1 to 1, where:
#1 indicates identical vectors (i.e., vectors point in the same direction).
#0 indicates orthogonality (i.e., vectors are at a 90-degree angle to each other, no similarity).
#-1 indicates opposite directions (i.e., vectors point in exactly opposite directions).
#represents similarity between feature vectors, quantifying similarity between two vectors based on their direction, 
# irrespective of their magnitude.





In [None]:
# a method
#embedding dimension is emotions
#talk about options
#get the brands, go through 50 emotins at a time
#cosine: normalize first: l2 norm = 1
#give instructions on readme on where key goes 
#first have everything in pandas df, then think about database
# one module or package w 1 .py 
#adaptors that take in pydantic datatypes and will make into sql
