In [1]:
import pandas as pd

# Load the emotion vector table. In the cells visible here it is only
# referenced by a commented-out label selection, but it is kept so that
# option keeps working.
sent_scores = pd.read_csv("static/emo_vectores_602_loc_md.csv")

# Load the lemmatized stories; the "lemmas" column holds whitespace-separated lemmas.
lemmas_per_story_df = pd.read_csv("data/lemmatized_stories.csv")

# Tokenize every story exactly once and reuse the tokens for both the
# per-story word count and the StoryID -> lemmas lookup.
story_tokens = lemmas_per_story_df["lemmas"].map(str.split)
lemmas_per_story_df["n_words"] = story_tokens.map(len)

# StoryID -> list of lemmas (if a StoryID repeats, the last row wins).
lemmas_per_story_dict = dict(zip(lemmas_per_story_df["StoryID"], story_tokens))

# Make a flat list of all words (lemmas) in the corpus, in story order.
all_lemmas = [
    lemma
    for story_lemmas in lemmas_per_story_dict.values()
    for lemma in story_lemmas
]

# List of types (unique lemmas). Sorted so the order is identical on every
# run: iteration order of a set of strings changes between interpreter
# sessions (hash randomization), which would otherwise make anything derived
# from this list vary from one kernel restart to the next.
unique_lemmas = sorted(set(all_lemmas))

print(f"The number of words in the corpus: {len(all_lemmas)}")
print(f"The number of unique lemmas in the corpus: {len(unique_lemmas)}")

The number of words in the corpus: 37567
The number of unique lemmas in the corpus: 7075


In [2]:
# Emotion labels to explore. Currently a hand-picked subset; to use every
# emotion column of the vector table instead, switch to the line below
# (expected labels per the original note — verify against the CSV header):
# labels = list(sent_scores.columns)[1:]  # ["happy", "pleas", "cont", "pride", "relief", "amuse", "surp", "sad", "fear", "disgust", "shame", "anger"]
labels = [
    "happy",
    "pleas",
    "sad",
]

In [3]:
# TODO: read from files

from emo_functions import get_emo_words

# Per the original author's note: this takes some time, going through all the vector data.
emo_word_dict = get_emo_words(unique_lemmas, labels)

Creating list of happy words
Creating list of pleas words
Creating list of sad words


In [4]:
# For each emotion, gather the words, frequencies and scores of the emotional
# words found in the corpus, based on the lists from get_emo_words().
from emo_functions import get_scores_and_freqs

emo_words_data = get_scores_and_freqs(emo_word_dict, all_lemmas)

# Write one lexicon CSV per emotion.
for emotion, word_records in emo_words_data.items():
    lexicon_df = pd.DataFrame(word_records)
    lexicon_df.to_csv(f"results/lexicons/{emotion}_lexicon.csv", index=False)


In [5]:
# Count the total number of emotional words in the corpus for each emotion.
from emo_functions import count_emo_words_total_freq

count_data = count_emo_words_total_freq(emo_words_data)

# Persist the totals as a CSV table.
pd.DataFrame(count_data).to_csv("results/emo_words_count.csv", index=False)

{'happy': [{'lemma': 'упоение', 'freq': 2, 'score': 0.4116482}, {'lemma': 'счастие', 'freq': 32, 'score': 0.8145565}, {'lemma': 'красота', 'freq': 19, 'score': 0.40538603}, {'lemma': 'прелесть', 'freq': 1, 'score': 0.43166476}, {'lemma': 'радоваться', 'freq': 3, 'score': 0.43464622}, {'lemma': 'осчастливленная', 'freq': 1, 'score': 0.40145236}, {'lemma': 'радостный', 'freq': 4, 'score': 0.42917}, {'lemma': 'жизнь', 'freq': 96, 'score': 0.41600302}, {'lemma': 'бедняжка', 'freq': 1, 'score': 0.4208307}, {'lemma': 'спасение', 'freq': 1, 'score': 0.40139747}, {'lemma': 'успех', 'freq': 4, 'score': 0.42934263}, {'lemma': 'разочарование', 'freq': 4, 'score': 0.40357244}, {'lemma': 'потому', 'freq': 12, 'score': 0.4315129}, {'lemma': 'радость', 'freq': 16, 'score': 0.6290357}, {'lemma': 'утешение', 'freq': 1, 'score': 0.46744418}, {'lemma': 'горе', 'freq': 7, 'score': 0.49296218}, {'lemma': 'грезиться', 'freq': 2, 'score': 0.40239233}, {'lemma': 'случайность', 'freq': 2, 'score': 0.41162673},

In [6]:
# Compute an emotion score for every story from the values of its words,
# normalized by story length (per the helper's description), then save one
# CSV per emotion with the story's file name attached.
from emo_functions import get_score_per_story

data_per_stories = get_score_per_story(lemmas_per_story_dict, emo_words_data)

# StoryID -> file name lookup; the same for every emotion, so build it once.
story_file_names = lemmas_per_story_df[["StoryID", "FILE NAME"]]

for emotion, story_rows in data_per_stories.items():
    df = pd.DataFrame(story_rows)
    df["story_emo_score"] = df["story_emo_score"].round(3)
    # Left join keeps every scored story even if its file name is missing.
    merged_df = df.merge(story_file_names, on="StoryID", how="left")

    merged_df.to_csv(f"results/scores_per_story/{emotion}_per_story.csv", index=False)


HAPPY
1 / 10 S115
2 / 10 S058
3 / 10 S071
4 / 10 S028
5 / 10 S083
6 / 10 S102
7 / 10 S016
8 / 10 S004
9 / 10 S043
10 / 10 S093
PLEAS
1 / 10 S115
2 / 10 S058
3 / 10 S071
4 / 10 S028
5 / 10 S083
6 / 10 S102
7 / 10 S016
8 / 10 S004
9 / 10 S043
10 / 10 S093
SAD
1 / 10 S115
2 / 10 S058
3 / 10 S071
4 / 10 S028
5 / 10 S083
6 / 10 S102
7 / 10 S016
8 / 10 S004
9 / 10 S043
10 / 10 S093


In [8]:
# Sanity check: list the emotion keys present in the per-story results.
for emotion in data_per_stories.keys():
    print(emotion)

happy
