In [13]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer

In [14]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif treebank_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN
    
    
def lemmatize_word(word, lemmatizer):
    pos_tag = nltk.pos_tag([word])[0][1]
    wordnet_pos = get_wordnet_pos(pos_tag)
    return lemmatizer.lemmatize(word, pos=wordnet_pos)

In [None]:
def get_loughran_mcdonald_sets(file_path="./data/Loughran-McDonald_MasterDictionary_1993-2024.xlsx"):

    lm_df = pd.read_excel(file_path)

    lm_df.columns = [col.lower() for col in lm_df.columns]
    lm_df['word'] = lm_df['word'].str.lower()

    sentiments = ["positive", "negative", "uncertainty"]
    results = []
    lemmatizer = WordNetLemmatizer()

    for sentiment in sentiments:
        words_list = lm_df[lm_df[sentiment] > 0]["word"].dropna().tolist()
        words_list_lemm = [lemmatize_word(word, lemmatizer) for word in words_list]
        results.append(set(words_list_lemm))
    
    #"positive", "negative", "uncertainty"
    return results[0], results[1], results[2]


In [16]:
positive_words, negative_words, uncertain_words = get_loughran_mcdonald_sets()

In [None]:
def analyze_text_sentiments(lemm_text, positive_set, negative_set, uncertain_set):

    negation_set = ['no', 'not', 'never', 'none', 'neither', 'nor', 'without']

    if not isinstance(lemm_text, str):
        return 0, 0, 0, 0
        
    words = nltk.word_tokenize(lemm_text.lower())
    
    total_words = len(words)
    positive_count = 0
    negative_count = 0
    
    uncertainty_count = sum(1 for word in words if word in uncertain_set)

    for i, word in enumerate(words):
        is_negated = False

        if i > 0:
            if words[i-1] in negation_set:
                is_negated = True

        if word in positive_set:
            if is_negated:
                negative_count += 1
            else:
                positive_count += 1

        elif word in negative_set:
            if is_negated:
                positive_count += 1 
            else:
                negative_count += 1
    
    return total_words, positive_count, negative_count, uncertainty_count

In [18]:
lda_results = pd.read_excel("./data/processed/lda_results.xlsx")

In [19]:
list_total_words = []
list_positive_count = []
list_negative_count = []
list_uncertainty_count = []

for index, row in lda_results.iterrows():
    lemm_text = row["original_text"]
    total_words, positive_count, negative_count, uncertainty_count = analyze_text_sentiments(lemm_text, positive_words, negative_words, uncertain_words)

    list_total_words.append(total_words)
    list_positive_count.append(positive_count)
    list_negative_count.append(negative_count)
    list_uncertainty_count.append(uncertainty_count)

In [20]:
sentiments_results = lda_results.copy()

sentiments_results['total_words'] = list_total_words
sentiments_results['positive_count'] = list_positive_count
sentiments_results['negative_count'] = list_negative_count
sentiments_results['uncertainty_count'] = list_uncertainty_count

In [21]:
sentiments_results.describe().round(2)

Unnamed: 0,dominant_topic,total_words,positive_count,negative_count,uncertainty_count
count,11103.0,11103.0,11103.0,11103.0,11103.0
mean,3.2,50.23,1.35,1.12,1.16
std,2.23,31.46,1.97,1.69,1.76
min,0.0,6.0,0.0,0.0,0.0
25%,1.0,27.0,0.0,0.0,0.0
50%,3.0,43.0,1.0,0.0,0.0
75%,5.0,65.0,2.0,2.0,2.0
max,7.0,351.0,18.0,25.0,17.0


In [22]:
sentiments_results.to_excel('./data/processed/sentiments_results.xlsx', index=False)