In [1]:
!pip install torch transformers pandas --quiet
!pip install -U scikit-learn --quiet

In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import numpy as np

# Configuração do modelo e tokenizer
model_name = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Função para calcular importância das palavras-chave (tokens) com base nos pesos de atenção
def calculate_keyword_importance(attentions, tokens):
    token_importance = attentions[-1].mean(dim=1).squeeze().tolist()
    keywords = [(tokenizer.decode([token]), importance) for token, importance in zip(tokens, token_importance)]
    keywords_sorted = sorted(keywords, key=lambda x: x[1], reverse=True)
    return keywords_sorted[:5]  # Top 5 palavras-chave mais importantes

# Função principal para gerar o dataframe de explicabilidade
def generate_explainability_df(text_1, text_2):
    # Tokenizar as entradas e gerar embeddings com atenção
    inputs_1 = tokenizer(text_1, return_tensors="pt", padding=True, truncation=True)
    inputs_2 = tokenizer(text_2, return_tensors="pt", padding=True, truncation=True)

    # Embeddings e pesos de atenção
    outputs_1 = model(**inputs_1, output_attentions=True)
    outputs_2 = model(**inputs_2, output_attentions=True)

    # Similaridade de cosseno entre os embeddings finais
    embeddings_1 = outputs_1.last_hidden_state[:, 0, :].detach().numpy()
    embeddings_2 = outputs_2.last_hidden_state[:, 0, :].detach().numpy()
    similarity = cosine_similarity(embeddings_1, embeddings_2)[0][0]

    # Extração e cálculo de pesos de atenção
    attention_weights_1 = [layer.detach().numpy() for layer in outputs_1.attentions]
    attention_weights_2 = [layer.detach().numpy() for layer in outputs_2.attentions]

    # Palavras-chave mais importantes
    keywords_1 = calculate_keyword_importance(outputs_1.attentions, inputs_1["input_ids"].squeeze())
    keywords_2 = calculate_keyword_importance(outputs_2.attentions, inputs_2["input_ids"].squeeze())

    # Confiança na Similaridade
    confidence_score = np.std(np.mean(attention_weights_1[-1], axis=1).squeeze().tolist())
    
    # Justificativa
    justification_summary = "Similaridade baseada em termos como " + ", ".join([kw[0] for kw in keywords_1[:3]]) \
                            + " no texto 1 e " + ", ".join([kw[0] for kw in keywords_2[:3]]) + " no texto 2."

    # Informação de Tamanho
    input_length_1 = len(inputs_1["input_ids"].squeeze())
    input_length_2 = len(inputs_2["input_ids"].squeeze())

    # Tokens ignorados
    ignored_tokens_1 = [tokenizer.decode([token]) for token in inputs_1["input_ids"].squeeze().tolist() if token not in tokenizer.all_special_ids]
    ignored_tokens_2 = [tokenizer.decode([token]) for token in inputs_2["input_ids"].squeeze().tolist() if token not in tokenizer.all_special_ids]

    # Clusterização e Distância para Outliers (Exemplo com uma métrica simples)
    embedding_cluster_positions_1 = embeddings_1.tolist()
    embedding_cluster_positions_2 = embeddings_2.tolist()
    outlier_distance = np.linalg.norm(embeddings_1 - embeddings_2)  # Distância entre os embeddings

    # Criar o DataFrame com as informações explicáveis
    df = pd.DataFrame({
        "input_text_1": [text_1],
        "input_text_2": [text_2],
        "token_ids_1": [inputs_1["input_ids"].detach().numpy()],
        "token_ids_2": [inputs_2["input_ids"].detach().numpy()],
        "attention_weights_1": [attention_weights_1],
        "attention_weights_2": [attention_weights_2],
        "similarity_score": [similarity],
        "embedding_vectors_1": [embeddings_1],
        "embedding_vectors_2": [embeddings_2],
        "timestamp": [datetime.now()],
        "model_version": [model_name],
        "keyword_importance_1": [keywords_1],
        "keyword_importance_2": [keywords_2],
        "similarity_confidence": [confidence_score],
        "justification_summary": [justification_summary],
        "input_length_1": [input_length_1],
        "input_length_2": [input_length_2],
        "ignored_tokens_1": [ignored_tokens_1],
        "ignored_tokens_2": [ignored_tokens_2],
        "embedding_cluster_positions_1": [embedding_cluster_positions_1],
        "embedding_cluster_positions_2": [embedding_cluster_positions_2],
        "outlier_distance": [outlier_distance]
    })
    
    return df

# Exemplo de uso
text_1 = "Descrição de uma vaga de cientista de dados"
text_2 = "Perfil de um candidato com experiência em análise de dados"
df_explainability = generate_explainability_df(text_1, text_2)

  from .autonotebook import tqdm as notebook_tqdm
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [3]:
df_explainability

Unnamed: 0,input_text_1,input_text_2,token_ids_1,token_ids_2,attention_weights_1,attention_weights_2,similarity_score,embedding_vectors_1,embedding_vectors_2,timestamp,...,keyword_importance_2,similarity_confidence,justification_summary,input_length_1,input_length_2,ignored_tokens_1,ignored_tokens_2,embedding_cluster_positions_1,embedding_cluster_positions_2,outlier_distance
0,Descrição de uma vaga de cientista de dados,Perfil de um candidato com experiência em anál...,"[[101, 1305, 10950, 125, 230, 5926, 125, 10989...","[[101, 1740, 3252, 125, 222, 4931, 170, 4040, ...",[[[[[0.8595248 0.01187342 0.00428111 0.011538...,[[[[[0.83092827 0.0191794 0.01446818 0.011154...,0.771395,"[[0.10359708, -0.48410273, 1.0135684, -0.19191...","[[0.116354845, -0.41202834, 0.5305816, -0.0884...",2024-11-12 11:40:30.366627,...,"[(um, [0.3526493012905121, 0.03881805762648582...",0.121607,"Similaridade baseada em termos como de, uma, d...",11,13,"[Des, ##crição, de, uma, vaga, de, cientista, ...","[Per, ##fil, de, um, candidato, com, experiênc...","[[0.10359708219766617, -0.484102725982666, 1.0...","[[0.11635484546422958, -0.41202834248542786, 0...",7.38531
