In [None]:
#!pip install openai
#!pip install transformers
#!pip install torch torchvision

import json
import pandas as pd
from openai import OpenAI
from io import StringIO
import re


In [None]:
# Read the OpenAI API key from a local JSON file; the value is stored under
# the top-level "key" entry.
with open('../key/key.json') as key_file:
    k = json.loads(key_file.read())['key']

In [None]:
# Create the OpenAI API client, authenticating with the key held in `k`.
client = OpenAI(api_key=k)

In [None]:
# Ask gpt-4o-mini for 10 clothing brands, each with two one-word shopper
# characteristics. The request is a single user message.
brand_request = {
    "role": "user",
    "content": "Give me a list of 10 top selling both gender clothing brand names in US and for each brand list 2 single word dominant characteristics of the brand shoppers.",
}
chat_completion_b = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[brand_request],
)

In [None]:
# Ask gpt-4o-mini for 10 song titles, each with two one-word emotions.
# The request is a single user message.
song_request = {
    "role": "user",
    "content": "Give me a list of 10 best selling song names and for each song list 2 single word dominant emotions that the song inspires. No singer names.",
}
chat_completion_s = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[song_request],
)

In [None]:
# Parse the model's reply into d_b = {brand name: (characteristic, characteristic)}.
# NOTE(review): this assumes the reply is an intro line, then for every brand
# one name line followed by two characteristic lines, then a closing line —
# the prompt does not enforce that layout, so confirm against a real reply.
brands = chat_completion_b.choices[0].message.content
brands = brands.split('\n')
# Drop the first and last line, keep letters only, and only THEN discard empty
# strings, so lines that contain no letters (e.g. "---") cannot survive as ''
# and shift the three-line grouping below.
b = [re.sub(r'[^a-zA-Z]', '', i) for i in brands[1:-1]]
b = [i for i in b if i]
# Every third entry is a brand name.
b_n = b[::3]
# Pair characteristics with names by POSITION (the two entries after each
# name). Filtering by value instead would drop any characteristic that happens
# to equal a brand name and misalign every pair after it.
b_chr = [tuple(b[i + 1:i + 3]) for i in range(0, len(b), 3)]
# Only a trailing name with nothing after it can yield an empty tuple; drop it
# so that zip() below leaves that name out.
b_chr = [pair for pair in b_chr if pair]
d_b = dict(zip(b_n, b_chr))

In [None]:
# Parse the model's reply into d_s = {song title: (emotion, emotion)}.
# NOTE(review): this assumes the reply is an intro line, then for every song
# one title line followed by two emotion lines, then a closing line — the
# prompt does not enforce that layout, so confirm against a real reply.
songs = chat_completion_s.choices[0].message.content
songs = songs.split('\n')
# Drop the first and last line, keep only the text after the last ':' on each
# line, then keep letters only.
s = [i.split(':')[-1] for i in songs[1:-1]]
s = [re.sub(r'[^a-zA-Z]', '', i) for i in s]
# Discard empties AFTER the cleanup, so lines that end in ':' or contain no
# letters cannot survive as '' and shift the three-line grouping below.
s = [i for i in s if i]
# Every third entry is a song title.
s_n = s[::3]
# Pair emotions with titles by POSITION (the two entries after each title).
# Filtering by value instead would drop an emotion that equals a song title
# (e.g. a song called "Happy" with the emotion "Happy") and misalign every
# pair after it.
s_chr = [tuple(s[i + 1:i + 3]) for i in range(0, len(s), 3)]
# Only a trailing title with nothing after it can yield an empty tuple; drop
# it so that zip() below leaves that title out.
s_chr = [pair for pair in s_chr if pair]
d_s = dict(zip(s_n, s_chr))

In [None]:
# Inspect the parsed dictionaries: d_b is printed as plain text, and d_s is
# shown as the cell's output because it is the last expression.
print(d_b)
d_s

In [None]:
# Keep only the brands and songs that have at least two descriptive words:
# any entry whose tuple holds 0 or 1 words is discarded.
tuple_size_to_remove = [0, 1]
d_b = dict(
    (name, words) for name, words in d_b.items()
    if len(words) not in tuple_size_to_remove
)
d_s = dict(
    (name, words) for name, words in d_s.items()
    if len(words) not in tuple_size_to_remove
)



In [None]:
from transformers import BertTokenizer, BertModel
#import torch

# Load the tokenizer and the encoder from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_embeddings(texts):
    """Embed each text by mean-pooling the model's last hidden state.

    Args:
        texts: A single string or a sequence (list/tuple) of strings.

    Returns:
        A NumPy array with one row per input text. Each row is the average of
        the last-hidden-state vectors over that text's real (non-padding)
        token positions.
    """
    # Local import: the notebook's top-level `import torch` is commented out.
    import torch

    if not isinstance(texts, str):
        texts = list(texts)
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # With padding=True, shorter texts are padded to the longest one in the
    # batch. Weight by the attention mask so padding positions do not dilute
    # the average.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero for an all-masked row.
    counts = mask.sum(dim=1).clamp(min=1)
    return (summed / counts).detach().numpy()

In [None]:
import numpy as np

def average_embedding(embeddings):
    """Collapse a stack of embedding vectors into their element-wise mean.

    Args:
        embeddings: Array-like whose first axis indexes the individual vectors.

    Returns:
        The mean taken along the first axis.
    """
    mean_vector = np.mean(embeddings, axis=0)
    return mean_vector

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Score every (brand characteristics, song emotions) pair by the cosine
# similarity of their averaged embeddings.
# Each song's averaged embedding does not depend on the brand, so compute it
# once up front instead of once per brand inside the nested loop.
song_embeddings = [
    (adj_B, average_embedding(get_embeddings(adj_B))) for adj_B in d_s.values()
]

# Rows are appended brand by brand, and within each brand in d_s order, as
# (brand words, song words, score).
similarities = []
for adj_A in d_b.values():
    avg_embedding_A = average_embedding(get_embeddings(adj_A))
    for adj_B, avg_embedding_B in song_embeddings:
        # cosine_similarity expects 2-D inputs and returns a 1x1 matrix here.
        similarity_score = cosine_similarity([avg_embedding_A], [avg_embedding_B])[0][0]
        similarities.append((adj_A, adj_B, similarity_score))

In [None]:
# Build the results table: one row per (brand words, song words) pair with its
# cosine similarity, then attach the brand and song names.
df = pd.DataFrame(similarities, columns = ['b_chr', 's_em', 'cosine'])

# Reverse lookups from word tuple to name. These are lossy: when two brands
# (or two songs) share the same word tuple, only the last name survives.
inverted_d_b = {v: k for k, v in d_b.items()}
inverted_d_s = {v: k for k, v in d_s.items()}

# Preferred path: recover names by row position. This is valid when the rows
# are the d_b x d_s product in iteration order, which is verified against the
# word tuples in the table before it is trusted. Unlike the reverse lookup, it
# keeps brands/songs with identical word tuples distinct.
brand_by_row = [brand for brand in d_b for _ in d_s]
song_by_row = [song for _ in d_b for song in d_s]
rows_in_product_order = (
    len(brand_by_row) == len(df)
    and [d_b[brand] for brand in brand_by_row] == df['b_chr'].tolist()
    and [d_s[song] for song in song_by_row] == df['s_em'].tolist()
)
if rows_in_product_order:
    df['song'] = song_by_row
    df['brand'] = brand_by_row
else:
    # Fallback: rows are in some other order, so use the lossy reverse lookup.
    df['song'] = df['s_em'].map(inverted_d_s)
    df['brand'] = df['b_chr'].map(inverted_d_b)
df['cosine'] = df['cosine'].apply(lambda x: round(x, 2))


In [None]:
# Lift the row limit for displayed frames, then put the columns in reading
# order: each name next to its words, with the score last.
pd.set_option("display.max_rows", None)
column_order = ['brand', 'b_chr', 'song', 's_em', 'cosine']
df = df[column_order]

In [None]:
# Order rows by brand (ascending) and similarity (descending), then keep the
# first three rows of every brand and display the result.
df = (
    df.sort_values(by=['brand', 'cosine'], ascending=[True, False])
      .groupby('brand')
      .head(3)
)
df

In [None]:
# Sanity check: shape of the averaged embedding for a two-word tuple.
sample_embeddings = get_embeddings(('joy','happy'))
np.mean(sample_embeddings, axis=0).shape