In [None]:
#!pip install openai
#!pip install transformers
#!pip install torch torchvision

import json
import re
from enum import Enum
from io import StringIO
from typing import List

import pandas as pd
from openai import OpenAI
# pydantic is installed as a dependency of the openai v1 client; the structured-output
# models defined further down subclass BaseModel and use Field.
from pydantic import BaseModel, Field


In [None]:
# Read the OpenAI API key from a JSON file kept outside the notebook; the file is
# expected to hold an object with a 'key' entry.
with open('../key/key.json') as key_file:
    key_payload = json.load(key_file)
k = key_payload['key']

In [None]:
# Build the OpenAI API client, authenticating with the key stored in `k`.
client = OpenAI(api_key=k)

In [None]:
# Free-text request: ten clothing brands, each with two one-word shopper traits.
# The reply is plain prose and is parsed line by line in a later cell.
brand_prompt = "Give me a list of 10 top selling both gender clothing brand names in US and for each brand list 2 single word dominant characteristics of the brand shoppers."
chat_completion_b = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": brand_prompt}],
)

In [None]:
# Free-text request: ten song titles, each with two one-word emotions.
# The reply is plain prose and is parsed line by line in a later cell.
song_prompt = "Give me a list of 10 best selling song names and for each song list 2 single word dominant emotions that the song inspires. No singer names."
chat_completion_s = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": song_prompt}],
)

In [None]:
# Parse the free-text brand reply into {brand_name: (trait_1, trait_2)}.
#
# Assumed reply layout (not enforced by the prompt, so verify on the raw text):
# an intro line, then for every brand three non-empty lines -- name, trait, trait --
# and a closing line. The first and last lines are dropped as intro/outro.
brands = chat_completion_b.choices[0].message.content
brands = brands.split('\n')

# Drop empty lines, then strip everything except ASCII letters (numbering, bullets,
# markdown markers and spaces all disappear, so "Old Navy" becomes "OldNavy").
b = [re.sub(r'[^a-zA-Z]','',line) for line in brands[1:-1] if line != '']

# Every third entry is a brand name ...
b_n = b[::3]
# ... and the two entries after it are its traits. Slice by POSITION: filtering traits
# by "value is not a brand name" drops any trait that happens to equal a name and
# shifts every following pair onto the wrong brand.
b_chr = [tuple(b[start + 1:start + 3]) for start in range(0, len(b), 3)]

# A trailing name with fewer than two traits yields a short tuple here; those entries
# are removed by the size filter applied to d_b afterwards.
d_b = dict(zip(b_n,b_chr))

In [None]:
# Parse the free-text song reply into {song_title: (emotion_1, emotion_2)}.
#
# Assumed reply layout (not enforced by the prompt, so verify on the raw text):
# an intro line, then for every song three non-empty lines -- title, emotion, emotion --
# and a closing line. The first and last lines are dropped as intro/outro.
songs = chat_completion_s.choices[0].message.content
songs = songs.split('\n')

# Drop empty lines and keep only the text after the last ':' on each line
# (removes labels such as "Emotions: ...").
s = [line.split(':')[-1] for line in songs[1:-1] if line != '']
# Strip everything except ASCII letters (numbering, bullets, quotes, spaces).
s = [re.sub(r'[^a-zA-Z]','',entry) for entry in s]

# Every third entry is a song title ...
s_n = s[::3]
# ... and the two entries after it are its emotions. Slice by POSITION: filtering by
# "value is not a title" drops an emotion that equals any title (e.g. a song called
# "Happy") and shifts every following pair onto the wrong song.
s_chr = [tuple(s[start + 1:start + 3]) for start in range(0, len(s), 3)]

# A trailing title with fewer than two emotions yields a short tuple here; those
# entries are removed by the size filter applied to d_s afterwards.
d_s = dict(zip(s_n,s_chr))

In [None]:
# Eyeball both parsed mappings: d_b is printed as plain text, d_s is shown as the
# cell's last-expression output.
print(d_b)
d_s

In [None]:
# Discard every brand and song whose descriptor tuple holds fewer than two words
# (i.e. tuples of length 0 or 1), since the comparison below expects a pair.
tuple_size_to_remove = [0,1]
d_b = dict(
    (name, traits) for name, traits in d_b.items()
    if len(traits) not in tuple_size_to_remove
)
d_s = dict(
    (title, emotions) for title, emotions in d_s.items()
    if len(emotions) not in tuple_size_to_remove
)



In [None]:
from transformers import BertTokenizer, BertModel
#import torch

# Load the tokenizer and the encoder from the same pretrained checkpoint so that the
# vocabulary and the weights match.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_embeddings(texts):
    """Embed one or more texts with BERT using attention-masked mean pooling.

    Parameters
    ----------
    texts : str or sequence of str
        Text(s) handed to the module-level ``tokenizer`` as a single batch.

    Returns
    -------
    numpy.ndarray
        One row per input text: the mean of the final-layer hidden states over that
        text's real tokens.
    """
    # Local import keeps this cell self-contained; torch is already required by BertModel.
    import torch

    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # With padding=True shorter texts are padded to the longest one in the batch.
    # A plain .mean(dim=1) would average the [PAD] positions in as well, so weight
    # every position by the attention mask (1 = real token, 0 = padding).
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (token_sum / token_count).numpy()

In [None]:
import numpy as np

def average_embedding(embeddings):
    """Collapse a stack of embedding vectors into their element-wise mean.

    ``embeddings`` is a 2-D array-like with one vector per row; the result is a
    single vector obtained by averaging along the first axis.
    """
    stacked = np.asarray(embeddings)
    return stacked.mean(axis=0)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every brand's trait pair and every song's emotion pair.
# Each record is (brand_traits, song_emotions, score), in brand-major order.
similarities = []

# Embed each song's emotion tuple once up front. Doing it inside the brand loop
# repeats the same BERT forward pass for every brand (B*S passes instead of B+S).
song_vectors = [
    (song_emotions, average_embedding(get_embeddings(song_emotions)))
    for song_emotions in d_s.values()
]

for adj_A in d_b.values():
    avg_embedding_A = average_embedding(get_embeddings(adj_A))
    for adj_B, avg_embedding_B in song_vectors:
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
        similarity_score = cosine_similarity([avg_embedding_A], [avg_embedding_B])[0][0]
        similarities.append((adj_A, adj_B, similarity_score))

In [None]:
# Reverse lookups from descriptor tuple back to brand / song name.
# NOTE: if two brands (or two songs) carry an identical tuple, only the last one
# survives the inversion and all matching rows are labelled with that name.
inverted_d_b = dict(zip(d_b.values(), d_b.keys()))
inverted_d_s = dict(zip(d_s.values(), d_s.keys()))

# One row per (brand traits, song emotions) pair, then attach the names and round
# the score to two decimals for display.
df = pd.DataFrame(similarities, columns = ['b_chr', 's_em', 'cosine'])
df['song'] = df['s_em'].map(inverted_d_s)
df['brand'] = df['b_chr'].map(inverted_d_b)
df['cosine'] = df['cosine'].apply(lambda score: round(score, 2))


In [None]:
# Lift pandas' row-display limit so complete frames are rendered, then put the
# columns in reading order: brand and its traits, song and its emotions, score.
pd.set_option("display.max_rows", None)
column_order = ['brand','b_chr','song','s_em','cosine']
df = df[column_order]

In [None]:
# Within each brand, order songs from most to least similar and keep the best three.
df = (
    df.sort_values(['brand','cosine'], ascending = [True, False])
      .groupby('brand')
      .head(3)
)
df

In [None]:
# Sanity check: shape of the averaged embedding for a two-word tuple.
probe_vectors = get_embeddings(('joy','happy'))
np.mean(probe_vectors, axis=0).shape

In [None]:
#finding an optimal text embedding:
#First embed them since used by most methods
# Here, we do need to know the frequency for each of the emotions
# we have a small dataset



# Euclidean distance between the vectors of the first two sentences.
# NOTE(review): `nlp`, `sentences` and `euclidean_distance` are not defined or imported
# anywhere in the visible notebook, so on a fresh kernel this raises NameError.
# `nlp(sentence).vector` looks like a spaCy pipeline call -- presumably this snippet
# was pasted from an external example; TODO confirm and either define the names or drop it.
embeddings = [nlp(sentence).vector for sentence in sentences]
distance = euclidean_distance(embeddings[0], embeddings[1])
print(distance)

# OUTPUT (pasted from a previous run; the bare number below is a no-op expression)
1.8646982721454675

A = ['satisfied','happy']
B= ['Sadness', 'Love', 'Trust']
def jaccard_similarity(x,y):
    """Return the Jaccard similarity |x ∩ y| / |x ∪ y| of two collections.

    Both arguments are converted to sets, so duplicates and ordering are ignored.
    The comparison is exact: 'Love' and 'love' count as different items.

    Returns a float in [0.0, 1.0]. When both inputs are empty the ratio is undefined
    (0/0); 0.0 is returned in that case rather than raising ZeroDivisionError, so that
    two items without any descriptors are never ranked as a match.
    """
    set_x, set_y = set(x), set(y)
    union = set_x | set_y
    if not union:
        return 0.0
    return len(set_x & set_y) / float(len(union))

print('Jaccard Similarity:', jaccard_similarity(A,B))


#OpenAI example
#define 3 categories to be represented to the user as strings in json
#class Category(Enum):
#    violence = "violence"
#    sexual = "sexual"
#    self_harm = "self_harm"

#defining what my data type looks like so it properly imported and exported as json
#class ContentCompliance(BaseModel): 
#    is_violating: bool  #tell me if there is a violation and if so, classify which one of the category classes
#    category: Optional[Category] #the type of the category could be none or category. If non violating, no need to define category
#    explanation_if_violating: Optional[str]


In [None]:
#Earlier version where I listed the emotions as an enum

# class Char(BaseModel): 
#     is_char: bool
#     characteristic: Optional[Characteristic] ?

#List 10 best selling American clothing brands and, for each brand, list which one of the emotions in the given list are most closely associated with."
# (Re)create the API client so this cell can be run on its own once `k` is loaded.
client = OpenAI(api_key=k)

# Closed vocabulary of ten emotions. Each member's value is the same string as its
# name; the enum is the element type of BrandAttributes.characteristics below.
# NOTE(review): documented with comments rather than a docstring because this class is
# part of the model passed as response_format, and a docstring may end up in the
# generated schema -- confirm before adding one.
class Characteristic(Enum):
    Happiness = 'Happiness'
    Sadness = 'Sadness'
    Anger = 'Anger'
    Fear = 'Fear'
    Surprise = 'Surprise'
    Disgust = 'Disgust'
    Trust = 'Trust'
    Anticipation ='Anticipation'
    Joy ='Joy'
    Love = 'Love'

# One brand together with the emotions attributed to it; element type of BrandChars.a.
# Defined exactly once: a second, identical definition only shadows the first.
# Documented with comments rather than a docstring because the class is part of the
# model passed as response_format (a docstring may end up in the generated schema).
class BrandAttributes(BaseModel):
    # Brand name as a plain string.
    brand_name: str = Field(None, description=" name as str")
    # Emotions associated with the brand, restricted to Characteristic members.
    characteristics: List[Characteristic] = Field(None, description=" list of characteristics as str")

# Top-level response model passed as response_format: a single field `a` holding the
# list of per-brand records.
class BrandChars(BaseModel):
    a: List[BrandAttributes] = Field(None, description="List of BrandAttributes")
    
# Structured-output request: the reply is parsed against the BrandChars model, so each
# brand comes back with a list of emotions drawn from Characteristic.
brand_request_messages = [
    {"role": "system", "content": "Be a helpful assistant."},
    {"role": "user", "content": "List 30 best selling American clothing brands and, for each brand, list which one of the emotions in the given list are most closely associated with."},
]
completion = client.beta.chat.completions.parse(
    response_format=BrandChars,
    messages=brand_request_messages,
    model="gpt-4o-2024-08-06",
)

# Decode the JSON reply, keep the full payload in brand_emotions, and flatten the
# list stored under its first key into a two-column frame.
brand_emotions = json.loads(completion.choices[0].message.content)
out = list(brand_emotions.values())[0]

# Each record's values are taken in key order, i.e. the first value becomes 'Name'
# and the second 'characteristic'.
brand_rows = [record.values() for record in out]
df_brand = pd.DataFrame(data = brand_rows, columns = ['Name','characteristic'])
df_brand

In [None]:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MultiLabelBinarizer

# Build the multi-hot brand/emotion table this cell works on. It has to be derived
# here from df_brand: the only other construction of df_brand_mb in the notebook is
# commented out, which leaves the name undefined on a fresh kernel.
emotion_labels = [member.value for member in Characteristic]
mlb = MultiLabelBinarizer(classes=emotion_labels)
df_brand_mb = pd.DataFrame(
    mlb.fit_transform(df_brand['characteristic']),
    columns=emotion_labels,
    index=df_brand.index,
)
df_brand_mb['Name'] = df_brand['Name']

# Create feature and target arrays: one 0/1 column per emotion, brand name as target.
X = df_brand_mb.drop(['Name'], axis=1)
y = df_brand_mb['Name']

# Split into training and test set (fixed seed for a reproducible split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict on brands the model has not seen. Pass the DataFrame itself (not .values)
# so the feature names match the ones the classifier was fitted with.
print(knn.predict(X_test))

In [None]:
#what i initially used to get brands

# class Chr_answer(BaseModel):
#     Characteristics: List[str] = Field(None, description="Brands as a list of strings")

# #few shot prompt with api
# completion = client.beta.chat.completions.parse(
#     model="gpt-4o-2024-08-06",
#     messages=[
#         {"role": "system", "content": "Be a helpful assistant."},
#         {"role": "system", "content": "Find 10 non-redundant best selling American clothing brands."},
#         {"role": "system", "content": "DONT MAKE ANY MISTAKES, check if you did any."},
#         {"role": "user", "content": "Give me 10 best selling American clothing brands."}
#     ],
#     response_format=Chr_answer,
# )

# #you can also proceed from parsed eg json.loads(completion_song.choices[0].message.parsed.json())
# out = json.loads(completion.choices[0].message.content)
# out = list(out.values())[0]
# brands = out
# print(brands)
# print(len(brands))

In [None]:
# #Goal: Get binary association of 100 brands and 1 book with 50 emotions. Multibinarize and do a correlation. in the example, no correlation with any of the brands
# # create a list of 50 human emotions
# client = OpenAI(api_key=k)
# class Chr_answer(BaseModel):
#     Characteristics: List[str] = Field(None, description="Emotions as a list of strings")

# #few shot prompt with api
# completion = client.beta.chat.completions.parse(
#     model="gpt-4o-2024-08-06",
#     messages=[
#         {"role": "system", "content": "Be a helpful assistant."},
#         {"role": "system", "content": "Find 50 non-redundant human emotions. These emotions should be different from one another, meaning that for example, of the emotions 'joy' and 'happiness', only pick one. Of 'Shame' and 'Embarrassment' and of 'Envy' and 'Jealousy', only pick one. So on and so forth."},
#         {"role": "system", "content": "Stop finding if you've already found 50."},
#         {"role": "system", "content": "DONT MAKE ANY MISTAKES, check if you did any."},
#         {"role": "user", "content": "Give me about 50 emotions addressing non-redundant and different human feelings."}
#     ],
#     response_format=Chr_answer,
# )

# #you can also proceed from parsed eg json.loads(completion_song.choices[0].message.parsed.json())
# out = json.loads(completion.choices[0].message.content)
# out = list(out.values())[0]
# emotions = out
# print(emotions)
# print(len(emotions))

# #List 100 best selling American clothing brands and, for each brand, list which one of the emotions in the given list are most closely associated with (Either associated or not, binary)."
# client = OpenAI(api_key=k)

# Characteristic = Enum('Characteristic', dict([(i, i) for i in emotions])) #MyEnumType = Enum('MyEnumType', myEnumStrings)

# class BrandAttributes(BaseModel):
#     brand_name: str = Field(None, description=" name as str")
#     characteristics: List[Characteristic] = Field(None, description=" list of characteristics as str")

# class BrandChars(BaseModel):
#     a: List[BrandAttributes] = Field(None, description="List of BrandAttributes")
    
# completion = client.beta.chat.completions.parse(
#     model="gpt-4o-2024-08-06",
#     messages=[
#         {"role": "system", "content": "Be a helpful assistant."},
#         {"role": "system", "content": "List 100 best selling American clothing brands and, for each brand, list which one of the emotions in the given list are most closely associated with."},
#         {"role": "user", "content": "List 100 best selling American clothing brands and, for each brand, list which one of the emotions in the given list are most closely associated with."}
#     ],
#     response_format=BrandChars,
# )

# out = json.loads(completion.choices[0].message.content)
# brand_emotions = out
# out = list(out.values())[0]
# df_brand = pd.DataFrame(data = [i.values() for i in out], columns = ['Name','characteristic'])
# df_brand

# #Take the example song, and list which one of the emotions in the given list are most closely associated with the song.")

# client = OpenAI(api_key=k)

# Characteristic = Enum('Characteristic', dict([(i, i) for i in emotions])) 

# class BookAttributes(BaseModel):
#     book_name: str
#     book_writer :str = Field(None, description="writer as str")
#     characteristics: List[Characteristic]

# class BooksChars(BaseModel):
#     a: List[BookAttributes] = Field(None, description="List of book attributes.")

# prompt = """
# As prompt, take the book 'Summer Island', list its writer, and a list of the emotions from the given list that are most closely associated with the book.
# """

# completion = client.beta.chat.completions.parse(
#     model="gpt-4o-2024-08-06",
#     messages=[
#         {"role": "system", "content": "Be a helpful assistant."},
#         {"role": "user", "content": prompt}
#     ],
#     response_format=BooksChars,
# )
# out = json.loads(completion.choices[0].message.content)
# out = list(out.values())[0]
# df_book = pd.DataFrame(data = [i.values() for i in out], columns = ['Name','Writer', 'characteristic'])
# df_book

# #MultilabelBinarize brands and song
# from sklearn.preprocessing import MultiLabelBinarizer

# mlb = MultiLabelBinarizer(classes=emotions)

# df_brand2 = pd.DataFrame(mlb.fit_transform(df_brand['characteristic']), columns = emotions)
# df_brand_mb = pd.merge(df_brand2, df_brand[['Name']], how ='left', left_index=True, right_index=True)
# df_brand_mb.head()

# mlb = MultiLabelBinarizer(classes=emotions)

# df_book2 = pd.DataFrame(mlb.fit_transform(df_book['characteristic']), columns = emotions)
# df_book_mb = pd.merge(df_book2, df_book[['Name']], how ='left', left_index=True, right_index=True)
# df_book_mb.head()

# df_b_T = df_brand_mb.set_index('Name').T
# df_bo_T = df_book_mb.set_index('Name').T

# df_bo_T


# df_b_T.corrwith(df_bo_T, axis = 1)


#---------------
#2nd method I tried: go through every brand (out of 10) and every emotion (out of 50), and assign each pair an association value (0 to 500):

# # Prompt GPT to determine the association between brand and emotion pairs based on semantic or contextual understanding.
  
# # Define the Pydantic models for the input and output
# class ObjectPair(BaseModel):
#     object1: str
#     object2: str

# class RelationRequest(BaseModel):
#     list1: List[str]  # First list of objects
#     list2: List[str]  # Second list of objects
#     model: str = Field(default="gpt-4o-2024-08-06")  # OpenAI model version
#     min_value: float = 0  # Minimum value of the normalized range
#     max_value: float = 500  # Maximum value of the normalized range 

# class RelationResponse(BaseModel):
#     pair: ObjectPair
#     association_score: float  # The score provided by GPT

# class OpenAIRelationQuantifier:
#     @staticmethod
#     def _generate_prompt(pair: ObjectPair, min_value: float, max_value: float) -> str:
#         """Generate the prompt for GPT to evaluate the association between two objects."""
#         return f"On a scale from {min_value} to {max_value}, how strongly are the following two items related?\n\n" \
#                f"Item 1: {pair.object1}\nItem 2: {pair.object2}\n" \
#                f"Please provide a score and a short explanation of their relationship."

#     @staticmethod
#     def quantify_relations(request: RelationRequest) -> List[RelationResponse]:
#         """Quantify the relationships between each pair of objects using GPT."""
#         client = OpenAI(api_key=k)  

#         results = []
#         for object1 in request.list1:
#             for object2 in request.list2:
#                 pair = ObjectPair(object1=object1, object2=object2)
#                 prompt = OpenAIRelationQuantifier._generate_prompt(pair, request.min_value, request.max_value)

#                 # Send the prompt to GPT using beta.chat.completions.parse method
#                 response = client.beta.chat.completions.parse(
#                     model=request.model,
#                     messages=[
#                         {"role": "system", "content": "You are an expert at analyzing relationships between concepts."},
#                         {"role": "user", "content": prompt}
#                     ]
#                 )

#                 # Extract GPT's parsed response
#                 gpt_reply = response.choices[0].message.content
#                 score_line = gpt_reply.splitlines()[0]
#                 score = float(score_line.split()[-1].rstrip('.'))  # Assumes the score is at the end of the first line

#                 # Append the result
#                 results.append(RelationResponse(pair=pair, association_score=score))

#         return results

# # Usage

# # Define two lists of objects to compare
# list1 = brands[:3]
# list2 = emotions[:5]

# # Create the relation request object
# relation_request = RelationRequest(list1=list1, list2=list2, min_value=0, max_value=500)

# # Quantify the relationships between objects in the two lists
# relations = OpenAIRelationQuantifier.quantify_relations(relation_request)

# # Display the results
# for relation in relations:
#     print(f"Pair: ({relation.pair.object1}, {relation.pair.object2}) - Association Score: {relation.association_score:.2f}")
# relations


# df=pd.DataFrame()
# df['brand']= [i.pair.object1 for i in relations]
# df['emotion']= [i.pair.object2 for i in relations]
# df['score']= [i.association_score for i in relations]

# print(df)
# df = df.pivot(index='brand',columns ='emotion', values='score')


In [None]:
# #nested prompt as dict
# from pydantic import BaseModel, Field
# from typing import Tuple
# class RelationRequest(BaseModel):
#     list1: List[str]  # First list of objects (object1)
#     list2: List[str]  # Second list of objects (object2)
#     model: str = Field(default="gpt-4o-2024-08-06")  # Updated OpenAI model version
#     min_value: float = 0  # Minimum value of the normalized range
#     max_value: float = 500  # Maximum value of the normalized range (set to 500)

# class ObjectPair(BaseModel):
#     object1: str
#     obj2: str

# class RelationResponse(BaseModel):
#     pair: ObjectPair
#     association_score: float  # The score provided by GPT


# class OpenAIRelationQuantifier:
#     @staticmethod
#     def _generate_prompt(object1: str, object2_list: List[str], min_value: float, max_value: float) -> str:
#         """Generate the prompt for GPT to evaluate the association between one object and a list of other objects."""
#         prompt = f"For the object '{object1}', please provide a score between {min_value} and {max_value} for its association with each of the following objects:"
        
#         for idx, obj2 in enumerate(object2_list, 1):
#             prompt += f"\n{idx}. {obj2}"
#             print(obj2)

#         prompt += "\n\nReturn the result as 'object1, obj2' pairs and their associated scores"

#         return prompt

#     @staticmethod
#     def quantify_relations(request: RelationRequest, api_key: str) -> List[RelationResponse]:
#         """Quantify the relationships between each object1 and all objects in object2 using GPT."""
#         openai.api_key = api_key  # Set the API key for OpenAI

#         results = []
#         for object1 in request.list1:
#             prompt = OpenAIRelationQuantifier._generate_prompt(object1, request.list2, request.min_value, request.max_value)

#             # Send the prompt to GPT
#             response = client.beta.chat.completions.parse(
#                 model='gpt-4o-2024-08-06',
#                 messages=[
#                     {"role": "system", "content": "You are an expert at analyzing relationships between concepts."},
#                     {"role": "user", "content": prompt}
#                 ],
#                 response_format=RelationResponse
#             )

#             # Extract GPT's parsed response
#             results.append(response.choices[0].message.content)
#         return results


# api_key = k

# # Define two lists of objects to compare
# list1 = brands[:3]
# list2 = emotions[:2]

# # Create the relation request object
# relation_request = RelationRequest(list1=list1, list2=list2, min_value=0, max_value=500)

# # Quantify the relationships between objects in the two lists
# relations = OpenAIRelationQuantifier.quantify_relations(relation_request, api_key=api_key)

# relations

In [None]:
# #nested prompt as joined str in a list
# class RelationRequest(BaseModel):
#     list1: List[str]  # First list of objects (object1)
#     list2: List[str]  # Second list of objects (object2)
#     model: str = Field(default="gpt-4o-2024-08-06")  # Updated OpenAI model version
#     min_value: float = 0  # Minimum value of the normalized range
#     max_value: float = 500  # Maximum value of the normalized range (set to 500)

# class ObjectPair(BaseModel):
#     object1: str
#     obj2: str

# class RelationResponse(BaseModel):
#     pair: ObjectPair
#     association_score: float  # The score provided by GPT

# class OpenAIRelationQuantifier:
#     @staticmethod
#     def _generate_prompt(object1: str, object2_list: List[str], min_value: float, max_value: float) -> str:
#         """Generate the prompt for GPT to evaluate the association between one object and a list of other objects."""
#         object2_str = ', '.join(object2_list)
#         return f"For the following object '{object1}', assign a score between {min_value} and {max_value} to each object in this list: [{object2_str}]. " \
#                f"Please return the result as 'object1 obj2' pairs, and ensure the scores reflect the association strength."

#     @staticmethod
#     def quantify_relations(request: RelationRequest, api_key: str) -> List[RelationResponse]:
#         """Quantify the relationships between each object1 and all objects in object2 using GPT."""
#         openai.api_key = api_key  # Set the API key for OpenAI

#         results = []
#         for object1 in request.list1:
#             prompt = OpenAIRelationQuantifier._generate_prompt(object1, request.list2, request.min_value, request.max_value)

#             response = client.beta.chat.completions.parse(
#                 model=request.model,
#                 messages=[
#                     {"role": "system", "content": "You are an expert at analyzing relationships between concepts."},
#                     {"role": "user", "content": prompt}
#                 ],
#                 response_format=RelationResponse
#             )
            
#             results.append(relations.choices[0].message.content)

#         return results


# api_key = k

# # Define two lists of objects to compare
# list1 = ["Nike", "lululemon"]
# list2 = ["Joy","liberated"]

# # Create the relation request object
# relation_request = RelationRequest(list1=list1, list2=list2, min_value=0, max_value=500)

# # Quantify the relationships between objects in the two lists
# relations = OpenAIRelationQuantifier.quantify_relations(relation_request, api_key=api_key)
# relations
