In [1]:
import pandas as pd

In [2]:
# Load the dataset from CSV into a DataFrame
# NOTE(review): the recorded cell output echoes this call, which looks like a
# pandas warning raised while parsing the file — confirm and address if needed
data = pd.read_csv("data/final_data.csv")

#Import moral words separately
from liwc_moral_words import liwc_moral


  data = pd.read_csv("data/final_data.csv")


Functions to get morality words

In [3]:
def get_moral_words(text, word_list):
    '''
    Tokenizes text into list of words and keeps only those from the
    dictionary

    Tokens come from str.split(), so matching is exact and case-sensitive.
    A word that appears several times in the text is returned once per
    occurrence, in the order it appears.

    Inputs:
      - text (str): cleaned text from subreddit. Non-string values (e.g. the
        NaN pandas uses for empty CSV cells) produce an empty list.
      - word_list (collection of strings): words to assess

    Outputs:
      (list): list of morality words
    '''
    # Missing text reaches us as float NaN (or None), which has no .split()
    if not isinstance(text, str):
        return []

    # Hash-based lookup makes each membership test O(1); a set passed by the
    # caller is reused as-is so the conversion is not repeated per document
    if isinstance(word_list, (set, frozenset)):
        vocabulary = word_list
    else:
        vocabulary = set(word_list)

    return [word for word in text.split() if word in vocabulary]

def count_words(data, word_list):
    '''
    Create a dictionary that maps moral words with their frequency on the entire
    corpus

    Inputs:
      - data (series): cleaned_text column
      - word_list (collection of strings): moral words to count

    Outputs:
      - (dict): dictionary mapping words with their frequencies, ordered from
        most to least frequent (words with equal counts keep the order in
        which they were first seen)

    '''
    # Local import keeps the function independent of the notebook's import cell
    from collections import Counter

    # Build the lookup set once for the whole corpus instead of scanning a
    # list for every token of every document
    vocabulary = frozenset(word_list)

    moral_words_counts = Counter()
    for entry in data:
        moral_words_counts.update(get_moral_words(entry, vocabulary))

    # most_common() sorts by count, descending, and keeps first-seen order
    # among ties, which matches a stable sort with reverse=True
    return dict(moral_words_counts.most_common())

### Get LIWC moral word frequencies for each topic

In [4]:
# Build one moral-word frequency dictionary per dominant topic, keyed by the
# topic label, from that topic's cleaned_text entries
topic_dictionaries_liwc = {
    topic: count_words(group['cleaned_text'], liwc_moral)
    for topic, group in data.groupby('dominant_topic')
}

In [5]:
# Inspect the word -> frequency dictionary stored under the 'Topic 9' key
topic_dictionaries_liwc['Topic 9']

{'wrong': 106,
 'honest': 33,
 'fault': 24,
 'useful': 24,
 'lazy': 22,
 'deserve': 21,
 'shame': 21,
 'judge': 14,
 'blame': 11,
 'laziness': 10,
 'excuse': 9,
 'judging': 9,
 'judged': 8,
 'decent': 7,
 'conscience': 7,
 'honesty': 6,
 'blaming': 6,
 'competent': 5,
 'forgive': 5,
 'fake': 5,
 'ignorant': 5,
 'brave': 5,
 'useless': 5,
 'selfish': 4,
 'blamed': 4,
 'dumb': 3,
 'disrespect': 3,
 'shaming': 3,
 'worthwhile': 3,
 'commend': 3,
 'cheated': 3,
 'courage': 3,
 'faults': 2,
 'offensive': 2,
 'ridiculously': 2,
 'worthy': 2,
 'evil': 2,
 'inappropriate': 2,
 'righteous': 2,
 'excuses': 2,
 'forgiving': 2,
 'braver': 2,
 'ethical': 2,
 'deserves': 2,
 'honor': 2,
 'moral': 2,
 'cheating': 2,
 'ridiculous': 2,
 'pussies': 1,
 'deserved': 1,
 'unfair': 1,
 'unreasonable': 1,
 'overbearing': 1,
 'disrespectful': 1,
 'decently': 1,
 'injustice': 1,
 'reprehensible': 1,
 'truthfully': 1,
 'treacherous': 1,
 'nerd': 1,
 'mistreated': 1,
 'wronged': 1,
 'trustworthy': 1,
 'disapprov

## Get average positive emotion and negative emotion scores for top 10 words in each topic

First, create a column with the LIWC moral words for each document

In [7]:
# Attach to every document the list of LIWC moral words found in its
# cleaned_text (one entry per occurrence)
data["moral_words"] = data["cleaned_text"].apply(get_moral_words, args=(liwc_moral,))

In [10]:
# List the DataFrame's columns (the moral_words column added above should appear)
data.columns

Index(['id', 'created', 'author', 'score', 'num_comments', 'link',
       'cleaned_text', 'word_count', 'type', 'link_id', 'year', 'month',
       'Segment_1', 'emo_pos', 'emo_neg', 'emo_anx', 'emo_anger', 'emo_sad',
       'moral', 'Segment', 'Care_Virtue', 'Care_Vice', 'Fairness_Virtue',
       'Fairness_Vice', 'Loyalty_Virtue', 'Loyalty_Vice', 'Authority_Virtue',
       'Authority_Vice', 'Sanctity_Virtue', 'Sanctity_Vice',
       'topic_distribution', 'dominant_topic', 'prob_topic', 'Care_total',
       'Fairness_total', 'Loyalty_total', 'Authority_total', 'Sanctity_total',
       'Virtue_total', 'Vice_total', 'Foundations_total_score', 'moral_words'],
      dtype='object')

In [19]:
def get_avg_emotion(df, topic,  dict):
    '''
    Average the positive and negative emotion scores of the documents in which
    each moral word appears, for one topic.

    Every occurrence of a word adds its document's emo_pos / emo_neg score to
    that word's running total; each total is then divided by the word's
    frequency taken from the topic's frequency dictionary.

    Inputs:
      - df (DataFrame): must contain the columns dominant_topic, moral_words,
        emo_pos and emo_neg
      - topic: value of dominant_topic to keep; also the key looked up in dict
      - dict (dict): maps each topic to a {word: frequency} dictionary. The
        name shadows the builtin and is kept only so existing callers keep
        working.

    Outputs:
      - (tuple of dicts): (avg_pos_emotion, avg_neg_emotion), each mapping a
        word to its average score, in order of first appearance

    Raises:
      KeyError if topic, or a word found in moral_words, is missing from dict
    '''
    # Keep only the documents assigned to the requested topic
    topic_df = df[df.dominant_topic == topic]

    # Word frequencies used as denominators
    topic_dict = dict[topic]

    pos_totals = {}
    neg_totals = {}

    # Walk just the three columns that are needed; iterrows() would build a
    # full row Series for every document
    rows = zip(topic_df["moral_words"], topic_df["emo_pos"], topic_df["emo_neg"])
    for words, emo_pos, emo_neg in rows:
        for word in words:
            pos_totals[word] = pos_totals.get(word, 0) + emo_pos
            neg_totals[word] = neg_totals.get(word, 0) + emo_neg

    avg_pos_emotion = {
        word: total / topic_dict[word] for word, total in pos_totals.items()
    }
    avg_neg_emotion = {
        word: total / topic_dict[word] for word, total in neg_totals.items()
    }

    return avg_pos_emotion, avg_neg_emotion

In [22]:
# Compute the per-word average positive / negative emotion dictionaries for
# Topic 1 through Topic 9, one cell per topic; the resulting topicN_words_*
# names are read again further down when the per-topic frames are built.
# NOTE(review): the recorded execution counts are not sequential (the Topic 8
# cell shows In [44]) — re-run top to bottom to confirm the stored results.
topic1_words_emopos, topic1_words_emoneg = get_avg_emotion(data, "Topic 1", topic_dictionaries_liwc)

In [23]:
topic2_words_emopos, topic2_words_emoneg = get_avg_emotion(data, "Topic 2", topic_dictionaries_liwc)

In [24]:
topic3_words_emopos, topic3_words_emoneg = get_avg_emotion(data, "Topic 3", topic_dictionaries_liwc)


In [25]:
topic4_words_emopos, topic4_words_emoneg = get_avg_emotion(data, "Topic 4", topic_dictionaries_liwc)


In [26]:
topic5_words_emopos, topic5_words_emoneg = get_avg_emotion(data, "Topic 5", topic_dictionaries_liwc)


In [27]:
topic6_words_emopos, topic6_words_emoneg = get_avg_emotion(data, "Topic 6", topic_dictionaries_liwc)


In [28]:
topic7_words_emopos, topic7_words_emoneg = get_avg_emotion(data, "Topic 7", topic_dictionaries_liwc)


In [44]:
topic8_words_emopos, topic8_words_emoneg = get_avg_emotion(data, "Topic 8", topic_dictionaries_liwc)

In [30]:
topic9_words_emopos, topic9_words_emoneg = get_avg_emotion(data, "Topic 9", topic_dictionaries_liwc)

### Store in a dataframe

First, create a dataframe for each topic

In [33]:
def create_emotion_df(avg_pos_emotion, avg_neg_emotion, moral_dict):
    '''
    Build a table with one row per word holding its count and its average
    positive and negative emotion scores.

    Rows follow the key order of avg_pos_emotion. Every column is looked up
    by word, so the scores stay aligned even when the two score dictionaries
    are ordered differently or do not share exactly the same keys.

    Inputs:
      - avg_pos_emotion (dict): word -> average positive emotion score;
        defines which words appear and in which order
      - avg_neg_emotion (dict): word -> average negative emotion score;
        words missing here get 0
      - moral_dict (dict): word -> frequency; words missing here get 0

    Outputs:
      - (DataFrame): columns word, count, pos_score, neg_score
    '''
    # Single source of truth for row order, shared by all four columns
    words = list(avg_pos_emotion)

    columns = {
        'word': words,
        'count': [moral_dict.get(word, 0) for word in words],
        'pos_score': [avg_pos_emotion[word] for word in words],
        'neg_score': [avg_neg_emotion.get(word, 0) for word in words],
    }

    return pd.DataFrame(columns)

In [46]:
# Build one emotion table per topic from that topic's score dictionaries and
# word frequencies, then tag each table with its numeric topic id so the rows
# remain identifiable after the tables are combined.
topic1 = create_emotion_df(topic1_words_emopos, topic1_words_emoneg, topic_dictionaries_liwc["Topic 1"])
topic1["topic"] = 1

In [47]:
topic2 = create_emotion_df(topic2_words_emopos, topic2_words_emoneg, topic_dictionaries_liwc["Topic 2"])
topic2["topic"] = 2

In [48]:
topic3 = create_emotion_df(topic3_words_emopos, topic3_words_emoneg, topic_dictionaries_liwc["Topic 3"])
topic3["topic"] = 3

In [49]:
topic4 = create_emotion_df(topic4_words_emopos, topic4_words_emoneg, topic_dictionaries_liwc["Topic 4"])
topic4["topic"] = 4

In [50]:
topic5 = create_emotion_df(topic5_words_emopos, topic5_words_emoneg, topic_dictionaries_liwc["Topic 5"])
topic5["topic"] = 5

In [51]:
topic6 = create_emotion_df(topic6_words_emopos, topic6_words_emoneg, topic_dictionaries_liwc["Topic 6"])
topic6["topic"] = 6

In [52]:
topic7 = create_emotion_df(topic7_words_emopos, topic7_words_emoneg, topic_dictionaries_liwc["Topic 7"])
topic7["topic"] = 7

In [53]:
topic8 = create_emotion_df(topic8_words_emopos, topic8_words_emoneg, topic_dictionaries_liwc["Topic 8"])
topic8["topic"] = 8

In [54]:
topic9 = create_emotion_df(topic9_words_emopos, topic9_words_emoneg, topic_dictionaries_liwc["Topic 9"])
topic9["topic"] = 9

In [61]:
# Stack the nine per-topic tables into one frame with a fresh 0..n-1 index
combined_df = pd.concat(
    [topic1, topic2, topic3, topic4, topic5, topic6, topic7, topic8, topic9],
    ignore_index=True,
)

# Keep only the words that occur at least 100 times within their topic
filtered_df = combined_df.loc[combined_df["count"] >= 100]
filtered_df


Unnamed: 0,word,count,pos_score,neg_score,topic
1,excuses,310,0.695710,0.538129,1
2,excuse,189,0.558307,0.584021,1
3,ideal,249,0.720281,0.461084,1
6,honest,256,0.758008,0.594766,1
9,wrong,604,0.729553,0.685248,1
...,...,...,...,...,...
1018,cheat,103,0.979612,0.977476,6
1020,fault,136,0.955221,0.917279,6
1255,wrong,266,0.764624,0.965150,7
1272,decent,111,0.737568,0.636937,7


In [None]:
# Persist the filtered table to CSV. No index argument is passed, so pandas'
# default applies and the row index is written as an unnamed first column.
filtered_df.to_csv("data/moralwords_bytopic.csv")