In [1]:
import pandas as pd
import itertools
from sklearn.metrics.pairwise import cosine_similarity
import os

In [None]:
# Load the tab-separated dataset.
df_wrime = pd.read_table('wrime-ver1.tsv')

# Plutchik's eight primary emotions. The order of this list fixes the order of
# every intensity vector built below.
emotion_names = [
    'Joy', 'Sadness', 'Anticipation', 'Surprise',
    'Anger', 'Fear', 'Disgust', 'Trust',
]


def _reader_intensities(row):
    """Return the row's 'Avg. Readers_<emotion>' values, ordered like emotion_names."""
    return [row['Avg. Readers_' + name] for name in emotion_names]


df_wrime['readers_emotion_intensities'] = df_wrime.apply(_reader_intensities, axis=1)

# Keep only rows whose strongest reader intensity is at least 2, then renumber
# the surviving rows from 0 so that row labels and row positions coincide.
is_target = df_wrime['readers_emotion_intensities'].map(
    lambda intensities: max(intensities) >= 2
)
df_wrime_target = df_wrime[is_target].reset_index(drop=True)

# Sentences of the kept rows with the literal two-character sequence
# backslash + 'n' and the token 'x000D' each replaced by a single space.
sentences = [
    text.replace('\\n', ' ').replace('x000D', ' ')
    for text in df_wrime_target['Sentence'].tolist()
]

In [2]:
# Build one boolean mask per emotion (same order as emotion_names). A row is
# "hard labeled" for an emotion when that emotion's intensity is 3 and the
# intensities sum to 3.
emotion_flags = [
    df_wrime_target['readers_emotion_intensities'].apply(
        # Bind the position as a default argument so each mask tests its own emotion.
        lambda intensities, position=position: (
            intensities[position] == 3 and sum(intensities) == 3
        )
    )
    for position in range(len(emotion_names))
]

In [None]:
# Load the sentence embeddings: a tab-separated file without a header row, so
# the columns get integer labels.
# NOTE(review): later cells index this frame with row positions taken from
# df_wrime_target, which presumably means one embedding row per kept sentence
# in the same order — confirm against the script that produced the file.
df_embeddings = pd.read_csv(
    'embeddings/fine-tuned_embeddings.tsv',
    sep='\t',
    header=None,
)

In [None]:
# For every pair of emotions, take each combination of one hard-labeled
# sentence per emotion, average their two embeddings, and write the TOP_K
# sentences whose embeddings are most cosine-similar to that mean vector into
# 'mean_similar/<emotion1>_<emotion2>.txt'.

# Number of most-similar sentences reported per sentence pair.
TOP_K = 10

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs('mean_similar', exist_ok=True)

# Loop-invariant lookups, computed once instead of re-filtering the frame
# inside the innermost loop.
embedding_matrix = df_embeddings.to_numpy()
intensities_by_row = df_wrime_target['readers_emotion_intensities']
raw_sentences = df_wrime_target['Sentence']
# Row labels of the hard-labeled sentences, one Index per emotion. The frame
# was re-indexed from 0, so these labels are also valid row positions in
# `sentences` and `embedding_matrix`.
hard_label_rows = [
    df_wrime_target.index[flag.to_numpy(dtype=bool)] for flag in emotion_flags
]

for first, second in itertools.combinations(range(len(emotion_flags)), 2):
    emotion_name1 = emotion_names[first]
    emotion_name2 = emotion_names[second]

    # Write with an explicit UTF-8 encoding: the sentences may contain
    # non-ASCII text, and the platform default encoding could fail on it or
    # garble it.
    with open(f'mean_similar/{emotion_name1}_{emotion_name2}.txt', 'w', encoding='utf-8') as file:
        for row1 in hard_label_rows[first]:
            for row2 in hard_label_rows[second]:
                mean_vector = (embedding_matrix[row1] + embedding_matrix[row2]) / 2

                # Similarity of the mean vector to every embedding, kept in a
                # standalone Series so the shared frame is not mutated on each
                # iteration.
                similarity = pd.Series(
                    cosine_similarity(mean_vector.reshape(1, -1), embedding_matrix)[0],
                    index=df_wrime_target.index,
                )
                most_similar = similarity.sort_values(ascending=False).head(TOP_K)

                file.write('\n')
                file.write(f'target sentence 1: {sentences[row1]} {intensities_by_row.loc[row1]}\n')
                file.write(f'target sentence 2: {sentences[row2]} {intensities_by_row.loc[row2]}\n')
                file.write('---\n')

                # NOTE(review): the neighbours are written from the raw
                # 'Sentence' column, while the two target sentences above use
                # the cleaned `sentences` list — confirm this is intended.
                for row, score in most_similar.items():
                    file.write(f'sentence: {raw_sentences.loc[row]}\n')
                    file.write(f'intensities: {intensities_by_row.loc[row]}\n')
                    file.write(f'similarity: {score}\n')
                    file.write('---\n')