In [1]:
import torch
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import spacy
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline

In [2]:
from transformers import AutoTokenizer, AutoModel

# One name for the checkpoint id so the tokenizer and the encoder are always
# loaded from the same pretrained model.
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [3]:
def get_embedding(label):
    """Return a mean-pooled embedding for ``label``.

    The text is tokenized with the module-level ``tokenizer``, run through the
    module-level ``model``, and the last hidden state is averaged over the
    token dimension (``dim=1``).

    Args:
        label: Text to embed; passed unchanged to ``tokenizer``.

    Returns:
        torch.Tensor: ``last_hidden_state`` averaged over ``dim=1``
        (presumably shape ``(1, hidden_size)`` for a single string -- confirm
        against the model in use). The tensor carries no autograd history.
    """
    # Tokenize the label into PyTorch tensors.
    inputs = tokenizer(label, return_tensors='pt')
    # Inference only: without no_grad every returned embedding would keep the
    # full forward graph (and its activations) alive for as long as the caller
    # holds on to it.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average the last hidden state over the token dimension.
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

In [4]:
#torch.nn.functional.cosine_similarity(get_embedding("door"), get_embedding("window"))

In [5]:
def calculate_similarity(labels, threshold=0.6, embed_fn=None):
    """Build a ``{secondary_label: primary_label}`` map of similar labels.

    Every distinct label is embedded once, then labels are compared pairwise
    with cosine similarity. When a pair scores strictly above ``threshold`` the
    label that occurs less often in ``labels`` becomes the secondary and is
    mapped onto the other one (ties keep the label currently being scanned as
    the primary). A label that has been mapped away is never used again, so
    the returned map contains no chains.

    Args:
        labels: Sequence of hashable labels; duplicates are allowed and only
            influence which label of a similar pair is kept.
        threshold: Cosine similarity that a pair must exceed to be merged.
            Defaults to 0.6.
        embed_fn: Callable turning one label into an embedding tensor accepted
            by ``torch.nn.functional.cosine_similarity`` (a single similarity
            value per pair is expected). Defaults to ``get_embedding``.

    Returns:
        dict: secondary label -> primary label. Empty when nothing is merged.
    """
    if embed_fn is None:
        embed_fn = get_embedding

    # Count occurrences once instead of calling labels.count() inside the
    # pairwise loop. Dict insertion order also yields the distinct labels in
    # first-seen order.
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    unique_labels = list(counts)

    # Embeddings are only compared, never trained on.
    with torch.no_grad():
        embeddings = {label: embed_fn(label) for label in unique_labels}

    synonym_map = {}
    visited = set()  # labels that have already been merged into another one

    for label1 in unique_labels:
        if label1 in visited:
            continue
        for label2 in unique_labels:
            if label1 == label2 or label2 in visited:
                continue
            # Compute cosine similarity between embeddings
            cos_sim = torch.nn.functional.cosine_similarity(embeddings[label1], embeddings[label2])
            # Positive comparison on purpose: a NaN similarity must not merge.
            if cos_sim.item() > threshold:
                if counts[label1] >= counts[label2]:
                    synonym_map[label2] = label1
                    visited.add(label2)
                else:
                    synonym_map[label1] = label2
                    visited.add(label1)
                    # label1 has just been merged away; letting it absorb
                    # further labels would create chains such as
                    # {label1: label2, other: label1}.
                    break

    return synonym_map

In [6]:
#labels = ["forgiving", "humble", "friendly", "decent", "evil", "cruel"]
#synonym_map = calculate_similarity(labels)
#print(synonym_map)

In [7]:
def merge_labels(df, synonym_map):
    """Collapse synonym labels onto their primary label and sum their counts.

    Args:
        df: DataFrame with an ``'Extracted_Label'`` column and a ``'count'``
            column. It is not modified.
        synonym_map: Mapping of secondary label -> primary label. Labels that
            are not keys of the mapping are kept as they are.

    Returns:
        pandas.DataFrame: a new frame with columns ``'Extracted_Label'`` (the
        primary labels) and ``'count'`` (summed per primary label).
    """
    # assign() works on a copy, so the caller's frame does not silently gain
    # a helper column.
    relabeled = df.assign(
        Extracted_Label=df['Extracted_Label'].apply(lambda x: synonym_map.get(x, x))
    )
    return relabeled.groupby('Extracted_Label', as_index=False).agg({'count': 'sum'})

In [8]:
# Read the extracted labels, keeping only the two columns used by the merge
# steps: the label text and its occurrence count.
data = pd.read_csv(
    "path to your file with extracted labels",
    usecols=['Extracted_Label', 'count'],
)

In [9]:
# Loop state. `merge_history` receives one synonym map per pass that merged
# something; `changes` turns False as soon as a pass finds nothing to merge.
max_iterations = 3
merge_history = []
iterations = 0
changes = True

# Re-run the similarity pass on the merged labels until it stops finding
# synonyms or the iteration cap is reached.
while iterations < max_iterations and changes:
    print(f"Total labels: {len(data)}")
    print(f"Iteration {iterations + 1} started")

    unique_labels = data['Extracted_Label'].unique().tolist()
    synonyms = calculate_similarity(unique_labels)

    changes = bool(synonyms)
    if changes:
        # Remember what was merged, then fold the counts together.
        merge_history.append(synonyms)
        data = merge_labels(data, synonyms)
        print(f"Synonyms merged in iteration {iterations + 1}: {synonyms}")
    else:
        print(f"No merges found after iteration {iterations}")

    iterations += 1
    print(f"Iteration {iterations} completed")

# Most frequent labels first, then persist the consolidated table.
data = data.sort_values(by='count', ascending=False)
data.to_csv("path to save your file", index=False)

print(f"Data consolidated and synonyms merged after {iterations} iterations.")

Total labels: 530
Iteration 1 started
Synonyms merged in iteration 1: {' unfriendly': ' friendly', ' friendliness': ' friendly', ' generosity': ' generous', ' generous ': ' generous', ' selfless': ' selfish', ' compassionate': ' selfish', ' greedy': ' selfish', ' altruistic': ' selfish', ' unselfish': ' selfish', ' selfish ': ' selfish', ' selfishness': ' selfish', ' greedy ': ' selfish', ' empathy': ' caring', ' considerate': ' caring', ' kindness': ' caring', ' compassion': ' caring', ' unforgiving': ' forgiving', ' impatient': ' patient', ' patient ': ' patient', ' inpatient ': ' patient', ' helpfulness': ' helpful', ' courageous': ' heroic', ' heroic ': ' heroic', ' loyal': ' reliable', ' trustworthy': ' reliable', ' unreliable': ' reliable', ' trustful': ' reliable', ' dependable': ' reliable', ' reliabilty': ' reliable', ' extroverted': ' sociable', ' chatty': ' sociable', ' introverted': ' sociable', ' amiable': ' personable', ' hospitable ': ' personable', ' laziness': ' lazy',

In [10]:
# Consolidate merge history into a structured format:
# primary label -> every label merged into it, across all iterations.
structured_merge_history = {}
for iteration in merge_history:
    for word, primary_word in iteration.items():
        structured_merge_history.setdefault(primary_word, []).append(word)

# Format each entry once so the printed report and the saved file cannot
# drift apart.
history_lines = [
    f"{primary_word}: {', '.join(merged_words)}"
    for primary_word, merged_words in structured_merge_history.items()
]

# Print structured merge history
print("Structured Merge History:")
for line in history_lines:
    print(line)

save_path = "path to save your history file"
# Explicit UTF-8 so the file does not depend on the platform's default
# encoding when labels contain non-ASCII characters.
with open(save_path, 'w', encoding='utf-8') as file:
    for line in history_lines:
        file.write(f"{line}\n")


Structured Merge History:
 friendly:  unfriendly,  friendliness
 generous:  generosity,  generous 
 selfish:  selfless,  compassionate,  greedy,  altruistic,  unselfish,  selfish ,  selfishness,  greedy 
 caring:  empathy,  considerate,  kindness,  compassion
 forgiving:  unforgiving
 patient:  impatient,  patient ,  inpatient 
 helpful:  helpfulness
 heroic:  courageous,  heroic 
 reliable:  loyal,  trustworthy,  unreliable,  trustful,  dependable,  reliabilty
 sociable:  extroverted,  chatty,  introverted
 personable:  amiable,  hospitable 
 lazy:  laziness,  tired
 truthful:  honest,  dishonest,  honesty,  truthful 
 angry:  furious,  resentful,  irritable
 modest:  humble
 empathetic:  apathetic,  emotional
 callous:  hateful
 ambition:  ambitious,  humbleness,  arrogance,  curiosity,  persistence,  greed
 boastful:  prideful
 good temper:  ill temper,  ill-tempered,  good temper  
 benevolent:  charitable
 respectful:  rude,  courteous,  disrespectful,  polite,  respect,  respecte