In [7]:
# Defines similaridade_cosseno: given df_classificacao, df_img_embeddings and the embedding of a query image,
# it joins the two frames on id_produto and returns the result sorted by cosine similarity to the query (most similar first)

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def similaridade_cosseno(df_classificacao, df_img_embeddings, img_embedding):
    """
    Rank catalogue rows by cosine similarity to a single query image embedding.

    ``df_classificacao`` is inner-joined with ``df_img_embeddings`` on
    ``id_produto``; every joined row is then scored against the query and the
    result is returned sorted from most to least similar.

    Parameters
    ----------
    df_classificacao : pandas.DataFrame
        Frame with an ``id_produto`` column. It is not modified (the join
        produces a new frame).
    df_img_embeddings : pandas.DataFrame
        Frame with an ``id_produto`` column plus the embedding columns, i.e.
        every column whose name contains ``'emb_img'``.
    img_embedding : pandas.DataFrame, pandas.Series or array-like
        The query embedding. A DataFrame/Series is indexed by the embedding
        column names; a plain list/array must list its values in the same
        order as the embedding columns of ``df_img_embeddings``. Exactly one
        embedding is accepted.

    Returns
    -------
    pandas.DataFrame
        The joined frame with an extra ``sim`` column, sorted by ``sim`` in
        descending order. When the join yields no rows, an empty frame that
        still carries the ``sim`` column is returned.

    Raises
    ------
    ValueError
        If ``df_img_embeddings`` has no embedding columns, or the query is not
        exactly one vector with one value per embedding column.
    """
    # Embedding columns are recognised by name; non-string labels are skipped
    # so that the substring test cannot raise TypeError.
    emb_cols = [
        col for col in df_img_embeddings.columns
        if isinstance(col, str) and 'emb_img' in col
    ]
    if not emb_cols:
        raise ValueError("df_img_embeddings has no column containing 'emb_img'")

    # Normalise the query to a (1, n_features) array, whatever container it came in.
    if isinstance(img_embedding, (pd.DataFrame, pd.Series)):
        query = img_embedding[emb_cols].to_numpy()
    else:
        query = np.asarray(img_embedding)
    if query.ndim == 1:
        query = query.reshape(1, -1)
    if query.ndim != 2 or query.shape[0] != 1:
        raise ValueError(
            f"img_embedding must hold exactly one embedding, got shape {query.shape}"
        )
    if query.shape[1] != len(emb_cols):
        raise ValueError(
            f"img_embedding has {query.shape[1]} values but there are "
            f"{len(emb_cols)} embedding columns"
        )

    # Bring the stored embeddings alongside the classification rows.
    merged = pd.merge(df_classificacao, df_img_embeddings, on='id_produto', how='inner')

    # cosine_similarity rejects arrays with zero samples; an empty join simply
    # means there is nothing to rank.
    if len(merged) == 0:
        merged['sim'] = np.empty(0, dtype=float)
        return merged

    # cosine_similarity returns shape (n_rows, 1); flatten it so that 'sim'
    # is an ordinary 1-D column.
    sim = cosine_similarity(merged[emb_cols].values, query)
    merged['sim'] = np.asarray(sim).ravel()

    # Most similar rows first.
    return merged.sort_values(by='sim', ascending=False)

In [2]:
# Dummy 512-dimensional embedding (0, 1, ..., 511) used as the test query
img_embedding = list(range(512))

In [4]:
import pandas as pd

# Wrap img_embedding in a single-row DataFrame whose columns are named emb_img_{i}
df_emb = pd.DataFrame(
    data=[img_embedding],
    columns=[f'emb_img_{idx}' for idx, _ in enumerate(img_embedding)],
)

In [5]:
# Peek at one randomly drawn row of the query frame
df_emb.sample(n=1)

Unnamed: 0,emb_img_0,emb_img_1,emb_img_2,emb_img_3,emb_img_4,emb_img_5,emb_img_6,emb_img_7,emb_img_8,emb_img_9,...,emb_img_502,emb_img_503,emb_img_504,emb_img_505,emb_img_506,emb_img_507,emb_img_508,emb_img_509,emb_img_510,emb_img_511
0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511


In [8]:
# Read df_classificacao.parquet and img_embeddings.parquet from ./datasets
df_classificacao = pd.read_parquet(path='./datasets/df_classificacao.parquet')
df_img_embeddings = pd.read_parquet(path='./datasets/img_embeddings.parquet')

# Alternative query for a quick test: a random row of the embeddings frame
# img_embedding = df_img_embeddings.sample(1)

# Score every joined row against the query embedding held in df_emb
df_sim = similaridade_cosseno(
    df_classificacao=df_classificacao,
    df_img_embeddings=df_img_embeddings,
    img_embedding=df_emb,
)

In [10]:
# Display the query embedding frame passed to similaridade_cosseno (bare expression -> rich cell output)
df_emb

Unnamed: 0,emb_img_0,emb_img_1,emb_img_2,emb_img_3,emb_img_4,emb_img_5,emb_img_6,emb_img_7,emb_img_8,emb_img_9,...,emb_img_502,emb_img_503,emb_img_504,emb_img_505,emb_img_506,emb_img_507,emb_img_508,emb_img_509,emb_img_510,emb_img_511
0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511


In [9]:
# Top five rows of the ranked result (df_sim is already sorted by 'sim', descending)
df_sim.head(n=5)

Unnamed: 0,id_produto,grupo_produto,blusa_comprimento_blusa,blusa_comprimento_manga,blusa_decote,calca_comprimento_calca,calca_modelagem_calca,cor,estampa,saia_comprimento_saia,...,emb_img_503,emb_img_504,emb_img_505,emb_img_506,emb_img_507,emb_img_508,emb_img_509,emb_img_510,emb_img_511,sim
590,07-20-6828-09812,vestido,,,,,,BRANCO,abstrato,,...,-0.143804,0.35219,0.136577,-0.109828,-0.142583,0.235666,0.853869,0.07322,0.068405,0.099166
676,07-20-6582-09249,vestido,,,,,,LARANJA,liso,,...,-0.132052,0.465839,-0.039068,0.10739,-0.043994,-0.101816,0.259647,-0.193711,0.123518,0.092636
572,07-20-6475-09259,vestido,,,,,,ROSA,liso,,...,-0.48583,0.15531,0.176762,-0.271943,-0.038973,0.088913,1.366884,-0.208256,0.388112,0.092468
543,07-20-7070-8399,vestido,,,,,,BEGE,liso,,...,-0.460659,0.320019,0.150448,-0.003568,-0.065661,0.297573,0.883042,-0.165814,0.226913,0.091883
294,52-13-5276-8259,blusa,CURTA,ALCA,FRENTE_UNICA,,,VERMELHO,liso,,...,-0.170874,0.172805,0.127845,0.190867,-0.14446,0.047882,0.454087,-0.107933,0.067576,0.090358
