In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd

# Instantiate the SentenceTransformer identified by 'all-mpnet-base-v2'.
# NOTE(review): `model` is not referenced by any later cell visible in this view.
model = SentenceTransformer('all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm


Retrieve Google's GoEmotions dataset

In [2]:
# Import GoEmotions
# The path is relative, so it is resolved against the kernel's current working
# directory; the notebook must be run from the folder it was authored in.
df1 = pd.read_csv("../Data_Storage/DescriptiveInference/goemotions_Google/fulldataset/goemotions/goemotions_1.csv")
# Show the first rows as the cell output.
df1.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Get all emotions
titles = df1.columns.to_list()
# Resolve the bookkeeping columns by name instead of by position, so that an
# extra leading column (for example a saved index such as "Unnamed: 0") cannot
# silently shift which columns are treated as emotions.
text = "text"
# get whether text is unclear
unclear = "example_very_unclear"
_missing_columns = [name for name in (text, unclear) if name not in titles]
if _missing_columns:
    raise KeyError(f"df1 is missing expected column(s): {_missing_columns}")
# The emotion label columns are every column after the "unclear" flag
# (admiration ... neutral in the preview above).
emotions = titles[titles.index(unclear) + 1:]

In [4]:
def normalize_list(lst):
    """Convert ``lst`` to a NumPy array and divide it by its norm.

    The norm is ``np.linalg.norm`` with default arguments. A zero-norm input
    is not special-cased, so the division is performed as-is.
    """
    vector = np.array(lst)
    magnitude = np.linalg.norm(vector)
    return vector / magnitude

# Note: normalizing the embeddings appears to change nothing, which suggests the embeddings are already normalized.
def column_mean(list_of_lists, normalize=False):
    """Average the rows of ``list_of_lists`` column by column.

    Args:
        list_of_lists: Sequence of equal-length rows.
        normalize: When true, every row is first passed through
            ``normalize_list`` before averaging.

    Returns:
        The per-column mean (``np.mean`` along axis 0) converted with
        ``.tolist()``.
    """
    if normalize:
        rows = [normalize_list(row) for row in list_of_lists]
    else:
        rows = list_of_lists
    matrix = np.array(rows)
    return np.mean(matrix, axis=0).tolist()


In [5]:
def retrieve_embeddings(df: pd.DataFrame):
    """Turn the columns of ``df`` into the rows of a float array.

    Columns are looked up by the integer labels ``0 .. n_columns - 1``, so the
    frame must use those labels. Row ``k`` of the result holds the values of
    column ``k``.
    """
    column_values = []
    for position in range(df.shape[1]):
        column_values.append(df[position].to_list())
    return np.array(column_values).astype(float)

# Read the stored embeddings from JSON (path relative to the working directory)
# and convert them so that each column of the frame becomes one array row.
# NOTE(review): later cells index this array with df1 row labels, so the JSON
# columns are presumably in df1 row order; confirm against how the file was written.
df_embeddings = pd.read_json("./unprocessed_embeddings.json")
unprocessed_embeddings = retrieve_embeddings(df_embeddings)

70000

In [10]:


# Get emotions
def get_mean_embedding(emotion_title: str, df: pd.DataFrame, embeddings):
    """Mean embedding of the rows of ``df`` whose ``emotion_title`` column is 1.

    The index labels of the matching rows are used directly as positions into
    ``embeddings``, so ``df`` is expected to carry labels that are valid
    positions there.
    """
    matching_labels = df.loc[df[emotion_title] == 1].index.tolist()
    selected = []
    for row_label in matching_labels:
        selected.append(embeddings[row_label])
    return np.mean(np.array(selected), axis=0)

# Note: normalizing the embeddings appears to change nothing, which suggests the embeddings are already normalized.
def column_mean(list_of_lists, normalize=False):
    """Average the rows of ``list_of_lists`` column by column.

    This redefinition replaces the ``column_mean`` defined earlier in the
    notebook, so it keeps that definition's optional ``normalize`` keyword;
    otherwise ``column_mean(x, normalize=True)`` would raise ``TypeError``
    once this cell has run.

    Args:
        list_of_lists: Sequence of equal-length rows.
        normalize: When true, each row is divided by its own norm
            (``np.linalg.norm`` along the last axis) before averaging.
            Defaults to False, which is the plain column mean.

    Returns:
        The per-column mean (``np.mean`` along axis 0) converted with
        ``.tolist()``.
    """
    data = np.asarray(list_of_lists)
    if normalize:
        # keepdims=True keeps one norm per row so the division broadcasts row-wise.
        data = data / np.linalg.norm(data, axis=-1, keepdims=True)
    return np.mean(data, axis=0).tolist()

def normalize_embedding(embed):
    """Divide ``embed`` by its ``np.linalg.norm`` and return the result."""
    length = np.linalg.norm(embed)
    return embed / length

def cos_sim(embed1, embed2):
    """Return entry ``[0][0]`` of ``util.cos_sim(embed1, embed2)`` as a Python float."""
    similarity = util.cos_sim(embed1, embed2)
    first_entry = similarity[0][0]
    return float(first_entry)

def get_max_sim(embeds, focused_embeds):
    """Score every embedding against every entry of ``focused_embeds``.

    Despite the name, no maximum is taken here. The result is a list with one
    dict per item of ``embeds``, mapping each key of ``focused_embeds`` to
    ``cos_sim(item, value)``. Picking the best key is left to the caller.
    """
    return [
        {label: cos_sim(vector, reference) for label, reference in focused_embeds.items()}
        for vector in embeds
    ]

# Scale every embedding with normalize_embedding (division by its np.linalg.norm).
unprocessed_embeddings_norm = np.array([normalize_embedding(embed) for embed in unprocessed_embeddings])
# One mean embedding per emotion column, built from the normalized embeddings
# of the df1 rows where that emotion column equals 1.
focused_embeds = {title: get_mean_embedding(title, df1, unprocessed_embeddings_norm) for title in emotions}


In [12]:
# Position into `emotions`; the recorded output of the following cell shows
# that this selects "remorse".
index = -4
# Row labels of df1 where the selected emotion column equals 1.
emotion_0_indices = df1[df1[emotions[index]] == 1].index.tolist()
# Embeddings of those rows.
# NOTE(review): these come from unprocessed_embeddings, not
# unprocessed_embeddings_norm; presumably harmless because cosine similarity
# ignores vector length, but confirm.
emotion_0_embeds = np.array([unprocessed_embeddings[i] for i in emotion_0_indices])
# Mean embedding of the selected emotion (not used by the cells visible here).
embed_0 = focused_embeds[emotions[index]]

# For each selected embedding: similarity to every emotion's mean embedding.
results = get_max_sim(emotion_0_embeds, focused_embeds)

In [13]:
# Print, for every text labelled with the selected emotion, the emotion whose
# mean embedding is most similar to that text's embedding.
print(emotions[index])
test = df1[df1[emotions[index]] == 1].reset_index()
# `results` must come from the same selection; a mismatch means a stale cell.
assert len(test) == len(results), "results is out of sync with the selected rows"
# The loop variable is deliberately not called `text`: that global holds the
# text column's name (set in an earlier cell) and must not be overwritten.
for sample_text, res in zip(test["text"], results):
    # Key with the highest similarity score (first one wins on ties).
    val = max(res, key=res.get)
    print(f"Text: {sample_text}\nSentiment: {val}\n")

remorse
Text: Na. Rediting my post. Sorry for the ignorance.
Sentiment: remorse

Text: Pity. I had some decent lunches there, but never went there at night.
Sentiment: relief

Text: Then I’m sorry but this game really isn’t for you. Progression like that would ruin the base of what this game is.
Sentiment: disapproval

Text: *and, sorry for the spelling mistake
Sentiment: remorse

Text: Sorry just seemed like you were trying to make it a [NAME] thing
Sentiment: remorse

Text: y i k e s
Sentiment: neutral

Text: No biggie, dude, sorry I got all triggered, I just reeeally love [NAME] 
Sentiment: remorse

Text: My mistake. I just glanced over your last post.
Sentiment: remorse

Text: I've been watching too much my 600lb life...I instantly hear dr. [NAME]
Sentiment: surprise

Text: I cringed so much just reading that tweet, I didn't even make it to the comments. That's my fucking President. :(
Sentiment: sadness

Text: You'll miss a begging old man asking for a spare coin. RIP
Sentimen