This notebook prepares the data for Tableau exploratory data analysis from results of LIWC software.

In [1]:
import pandas as pd
import pickle

In [2]:
selfimprovement = pd.read_csv("data/liwc+mfd2-results.csv")
investing = pd.read_csv("data_to_compare/liwc/investing-liwc+mfd2.csv")
homeowners = pd.read_csv("data_to_compare/liwc/homeowners-liwc+mfd2.csv")

  selfimprovement = pd.read_csv("data/liwc+mfd2-results.csv")
  investing = pd.read_csv("data_to_compare/liwc/investing-liwc+mfd2.csv")
  homeowners = pd.read_csv("data_to_compare/liwc/homeowners-liwc+mfd2.csv")


### Observe proportions of moral language across subreddits

Get total sizes

In [12]:
#Call shape method to get shape of dataframe and select the number of rows
total_si = selfimprovement.shape[0]
total_i = investing.shape[0]
total_h = homeowners.shape[0]

Get number of posts with moralization score higher than 0.25 (the LIWC's reported mean for their corpus of Reddit + other sources)

In [17]:
moral_si = selfimprovement[selfimprovement.moral > 0.25]
moral_i = investing[investing.moral > 0.25]
moral_h = homeowners[homeowners.moral > 0.25]

#Repear workflow to get number of rows
moral_si_n = moral_si.shape[0]
moral_i_n = moral_i.shape[0]
moral_h_n = moral_h.shape[0]

Calculate proportion of moralization language across the three subreddits

In [18]:
percentage_si = (moral_si_n / total_si) * 100
percentage_i = (moral_i_n / total_i) * 100
percentage_h = (moral_h_n / total_h) * 100

In [21]:
print(f"Proportion of moralized posts in the r/selfimprovement subreddit: {percentage_si:.2f}%")
print(f"Proportion of moralized posts in the r/investing subreddit:{percentage_i:.2f}%")
print(f"Proportion of moralized posts in the r/homeowners subreddit:{percentage_h:.2f}%")

Proportion of moralized posts in the r/selfimprovement subreddit: 27.36%
Proportion of moralized posts in the r/investing subreddit:15.55%
Proportion of moralized posts in the r/homeowners subreddit:12.67%


## Feature engineering with foundations scores

Some feature engineering to create combined scores for each foundation, and combined scores for virtue and vice

In [3]:
def feature_engineering(df):

    # Calculate total for each foundation
    df["Care_total"] = df["Care_Virtue"] + df["Care_Vice"]
    df["Fairness_total"] = df["Fairness_Virtue"] + df["Fairness_Vice"]
    df["Loyalty_total"] = df["Loyalty_Virtue"] + df["Loyalty_Vice"]
    df["Authority_total"] = df["Authority_Virtue"] + df["Authority_Vice"]
    df["Sanctity_total"] = df["Sanctity_Virtue"] + df["Sanctity_Vice"]

    # Vice and virtue scores
    df["Virtue_total"] = (df["Care_Virtue"] + df["Fairness_Virtue"] 
                          + df["Loyalty_Virtue"] + df["Authority_Virtue"] 
                          + df["Sanctity_Virtue"])
    
    df["Vice_total"] = (df["Care_Vice"] + df["Fairness_Vice"] 
                    + df["Loyalty_Vice"] + df["Authority_Vice"] 
                    + df["Sanctity_Vice"])
    
    # Overall total score across all foundations
    df["Foundations_total_score"] = (
        df["Care_total"] + df["Fairness_total"] + df["Loyalty_total"] +
        df["Authority_total"] + df["Sanctity_total"])
    
    return df

In [4]:
selfimprovement2 = feature_engineering(selfimprovement)
investing2 = feature_engineering(investing)
homeowners2 = feature_engineering(homeowners)

Create one single df to use in Tableau

In [5]:
selfimprovement2["Subreddit"] = "selfimprovement"
investing2["Subreddit"] = "investing"
homeowners2["Subreddit"] = "homeowners"

In [6]:
all_reddits = pd.concat([selfimprovement2, investing2, homeowners2], ignore_index=True)

In [7]:
all_reddits

Unnamed: 0.1,Unnamed: 0,id,created,author,score,num_comments,link,cleaned_text,word_count,type,...,Sanctity_Vice,Care_total,Fairness_total,Loyalty_total,Authority_total,Sanctity_total,Virtue_total,Vice_total,Foundations_total_score,Subreddit
0,0,hk5r2,2011-05-25 17:27,u/[deleted],1,3.0,https://www.reddit.com/r/selfimprovement/comme...,i had an appointment today with the dentist ov...,65,submission,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,selfimprovement
1,1,iqimz,2011-07-15 11:15,u/dustinsmusings,3,0.0,https://www.reddit.com/r/selfimprovement/comme...,i created this site several months ago and i s...,116,submission,...,0.00,2.59,0.00,0.86,0.00,0.00,3.45,0.00,3.45,selfimprovement
2,2,pfzt5,2012-02-08 01:40,u/aeoz,6,4.0,https://www.reddit.com/r/selfimprovement/comme...,hello everyone i have recently took over this...,194,submission,...,0.00,2.06,0.00,0.00,0.00,0.00,2.06,0.00,2.06,selfimprovement
3,3,pk714,2012-02-10 19:16,u/[deleted],1,0.0,https://www.reddit.com/r/selfimprovement/comme...,i grew up with body dysmorphia eating disorder...,583,submission,...,0.17,2.23,0.17,0.34,0.00,0.68,2.74,0.68,3.42,selfimprovement
4,4,q0q8x,2012-02-22 03:24,u/[deleted],1,0.0,https://www.reddit.com/r/selfimprovement/comme...,i have to ask when do you get to a point where...,558,submission,...,0.54,1.44,0.00,0.00,0.00,1.08,0.90,1.62,2.52,selfimprovement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1514236,502528,kfrkhs5,2023-12-31 17:29,u/Earl_your_friend,1,,https://www.reddit.com/r/homeowners/comments/1...,i lived next to a guy who sold and bought scap...,198,comment,...,0.51,0.51,0.51,0.00,1.52,0.51,2.03,1.02,3.05,homeowners
1514237,502529,kfrl16r,2023-12-31 17:33,u/UntypicalCouple,8,,https://www.reddit.com/r/homeowners/comments/1...,you do realize that not all businesses can be ...,62,comment,...,0.00,0.00,0.00,0.00,3.23,1.61,4.84,0.00,4.84,homeowners
1514238,502530,kfrm79i,2023-12-31 17:41,u/blockneighborradio,2,,https://www.reddit.com/r/homeowners/comments/1...,the neighbor isnt going to do anything stupid ...,51,comment,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,homeowners
1514239,502531,kfrmlea,2023-12-31 17:43,u/chof2018,2,,https://www.reddit.com/r/homeowners/comments/1...,i was this guy running a landscaping business ...,137,comment,...,0.00,0.00,0.73,0.00,1.46,0.00,2.19,0.00,2.19,homeowners


In [8]:
all_reddits.to_csv("datavis/engineered_data.csv")

Ensure correct parsing

In [9]:
saved_csv = pd.read_csv("datavis/engineered_data.csv")

print(all_reddits.shape)
print(saved_csv.shape)

  saved_csv = pd.read_csv("datavis/engineered_data.csv")


(1514241, 40)
(1514241, 41)


## Explore morality words frequence LIWC

Create list with LIWC moral words

In [17]:
liwc_moral = [
    "absurd", "absurdity", "absurdities", "accusation", "accusations", "accusative", "accuse",
    "accuses", "accusing", "admirable", "admonish", "admonished", "admonishing", "admonishes",
    "admonishment", "adulterate", "adulterated", "adulterating", "adulterates", "adulteration",
    "adulterer", "adulterers", "adulteress", "adulteresses", "adulteries", "adulterous",
    "adultery", "amoral", "amorality", "arrogant", "betray", "betrayed", "betraying",
    "betrays", "betrayal", "betrayer", "bigot", "bigots", "bigoted", "bigotry", "blame",
    "blames", "blamed", "blaming", "brave", "bravely", "braver", "bravest", "buffoon",
    "buffoons", "buffoonish", "careless", "carelessness", "carpetbag", "carpetbags",
    "censure", "censured", "censures", "censuring", "chastise", "chastised", "chastises",
    "chastising", "chauvinism", "chauvinist", "chauvinistic", "cheat", "cheats", "cheated",
    "cheating", "commend", "commended", "commends", "commending", "competence",
    "competent", "conceit", "conceited", "connive", "connived", "connives", "conniving",
    "conscience", "contemptible", "contemptibly", "corrupt", "corrupted", "corrupting",
    "corruption", "courage", "courageous", "craven", "criminal", "criminals", "crook",
    "crooks", "cruel", "crueler", "cruelest", "crueller", "cruellest", "cruelly",
    "cruelties", "cruelty", "debauch", "debauched", "debauches", "debauching", "decadence",
    "decadent", "deceive", "deceived", "deceives", "deceiving", "decency", "decent",
    "decently", "deceptive", "deceptively", "delinquent", "delinquency", "deprave",
    "depraved", "depraves", "depraving", "deserve", "deserved", "deserves", "deserving",
    "despicable", "deviant", "deviants", "dignified", "dignity", "disapprove",
    "disapproved", "disapproves", "disapproving", "disgrace", "disgraced", "disgraces",
    "disgracing", "dishonest", "dishonesty", "dishonor", "dishonored", "dishonorable",
    "dishonourable", "disloyal", "disrespect", "disrespected", "disrespecting",
    "disrespectful", "diss", "dissed", "dissing", "dumb", "dutiful", "duty", "elitism",
    "elitist", "elitists", "equality", "equitable", "ethic", "ethical", "ethics", "evil",
    "evildoer", "evildoers", "excuse", "excuses", "excused", "excusing", "fairness",
    "faithful", "faithless", "fake", "fakes", "faking", "fatass", "fatso", "fatties",
    "forgive", "forgiven", "forgives", "forgiving", "foul", "fouled", "fouling", "fraud",
    "frauds", "fraudulent", "generosity", "generous", "glutton", "gluttony", "godless",
    "godlessness", "grandiose", "greed", "greedy", "hateful", "haters", "heathen",
    "heathens", "hero", "heroes", "heroic", "heroine", "heroines", "hideous", "hideously",
    "homily", "fault", "faults", "faulted", "faulting", "honest", "honesty", "honor",
    "honored", "honoring", "honorable", "honour", "horrid", "horridly", "humane",
    "humanitarian", "hypocrisy", "hypocrite", "hypocrites", "ideal", "ideals", "ideologue",
    "ignoble", "ignorant", "immodest", "immoral", "immorality", "inappropriate",
    "inconsiderate", "incorruptible", "indecency", "indecent", "indignantly", "inequity",
    "infallible", "infidel", "infidels", "infidelity", "inhumane", "iniquity", "injustice",
    "innocence", "innocent", "innocently", "irresponsible", "judge", "judged", "judges",
    "judging", "judgy", "justice", "justness", "kosher", "laughingstock", "lawless",
    "lawlessness", "lazier", "laziest", "laziness", "lazy", "lecherous", "lewd", "liar",
    "liars", "lousy", "loyal", "magnanimity", "magnanimous", "mansplain", "misbehave",
    "misbehaved", "misbehaving", "misconduct", "miser", "miserly", "misogynist",
    "misogynistic", "mistreat", "mistreated", "mistreating", "misuse", "misused",
    "misuses", "misusing", "molest", "molested", "molesting", "moral", "morality",
    "nefarious", "nerd", "nerds", "nerdy", "noble", "obstinate", "offensive",
    "opinionated", "outlaw", "outlawed", "outlawing", "outrageous", "overbearing",
    "overconfident", "pariah", "patriot", "patriots", "pedophile", "penance", "penitent",
    "perv", "pervert", "perverted", "perverts", "pervy", "pettier", "pettiest", "pettily",
    "pettiness", "petty", "phony", "pitiful", "pitifully", "plagiarize", "prejudice",
    "principled", "promiscuity", "promiscuous", "prude", "prudish", "psycho", "puny",
    "pussies", "racist", "rapist", "rectitude", "redneck", "reprehensible", "repulsive",
    "revolting", "revoltingly", "ridicule", "ridiculous", "ridiculously", "righteous",
    "righteously", "ruthless", "scandal", "scandals", "scruples", "scrupulous", "scum",
    "selfish", "selflessness", "sexism", "sexist", "shame", "shamed", "shaming", "sin",
    "sincere", "sincerity", "sinful", "sinfully", "sinister", "sinned", "sinner", "sinners",
    "sins", "sissies", "sissy", "skank", "slander", "slandered", "slandering", "slimy",
    "slothful", "slut", "sluts", "slutty", "smug", "sneakily", "sneaky", "snide",
    "snidely", "snob", "snobs", "spineless", "thief", "thieves", "traitor", "transgress",
    "treacherous", "treason", "trustworthy", "trusty", "truthful", "truthfully",
    "unacceptable", "unethical", "unfair", "unfaithful", "ungodly", "ungracious", "unjust",
    "unloyal", "unpatriotic", "unprincipled", "unqualified", "unreasonable", "unsavory",
    "unscrupulous", "unselfish", "untrustworthy", "unvirtuous", "unworthy", "upstanding",
    "useful", "useless", "vain", "vainly", "vanity", "vengeance", "vile", "vilify",
    "vindicate", "virtue", "virtuous", "wanton", "wicked", "worthless", "worthwhile",
    "worthy", "wrong", "wrongdoing", "wronged", "wrongful", "wrongly", "zealot"]

In [23]:
def get_moral_words(text):
    '''
    Tokenizes text into list of words and keeps only those from the LIWC
    dictionary

    Inputs:
      - text (str): cleaned text from subreddit

    Outputs:
      (list): list of LIWC words
    '''
    words_liwc = []

    words = text.split()

    for word in words:
        if word in liwc_moral:
            words_liwc.append(word)

    return words_liwc

def count_words(data):
    '''
    Create a dictionary that maps moral words with their frequency on the entire
    corpus

    Inputs:
      - data (series): cleaned_text column

    Outputs:
      - (dict): dictionary mapping words with their frequencies

    '''

    moral_words_counts = {}

    for entry in data:
        row_words = get_moral_words(entry)

        for word in row_words:
            moral_words_counts[word] = moral_words_counts.get(word, 0) + 1

    #I asked ChatGPT how to sort by value

    sorted_dict= dict(sorted(moral_words_counts.items(), key=lambda item: item[1],
                                                                  reverse=True))
    return sorted_dict

In [21]:
moral_dict = count_words(selfimprovement["cleaned_text"])

In [25]:
selfimprovement2["moral_words"] = selfimprovement2["cleaned_text"].apply(lambda x: get_moral_words(x))

## Get average positive emotion and negative emotion scores for top 15 words

In [40]:
def get_avg_emotion(df, moral_dict):
    avg_pos_emotion = {}
    avg_neg_emotion = {}

    for idx, row in df.iterrows():
        for word in row["moral_words"]:
            avg_pos_emotion[word] = avg_pos_emotion.get(word, 0) + row["emo_pos"]
            avg_neg_emotion[word] = avg_neg_emotion.get(word, 0) + row["emo_neg"]

    for word, value in avg_pos_emotion.items():
        avg_pos_emotion[word] = value / moral_dict[word]

    for word, value in avg_neg_emotion.items():
        avg_neg_emotion[word] = value / moral_dict[word]

    return avg_pos_emotion, avg_neg_emotion

In [67]:
avg_pos, avg_neg = get_avg_emotion(selfimprovement2, moral_dict)

In [51]:
def create_emotion_df(avg_pos_emotion, avg_neg_emotion, moral_dict):
    # Create a dictionary with words as keys and their corresponding pos, neg scores and counts as values
    data = {
        'word': list(avg_pos_emotion.keys()),
        'count': [moral_dict.get(word, 0) for word in avg_pos_emotion],  # Use moral_dict to get word count
        'pos_score': [avg_pos_emotion.get(word, 0) for word in avg_pos_emotion],
        'neg_score': [avg_neg_emotion.get(word, 0) for word in avg_neg_emotion]
    }

    # Convert to DataFrame
    df = pd.DataFrame(data)

    return df


In [68]:
df_moral_words = create_emotion_df(avg_pos, avg_neg, moral_dict)
df_moral_words

Unnamed: 0,word,count,pos_score,neg_score
0,forgive,6151,1.089117,1.488629
1,useful,7870,0.985366,0.772639
2,judging,3278,1.027117,1.155918
3,generous,604,1.464520,0.804023
4,loyal,523,1.380096,1.049235
...,...,...,...,...
422,depraves,1,0.000000,1.350000
423,deprave,1,0.980000,1.950000
424,cruellest,1,0.820000,1.640000
425,adulterer,1,2.060000,1.030000


Create a combined emotional score by substracting the negative score from the positive score

In [74]:
df_moral_words["overall_emo"] = df_moral_words["pos_score"] - df_moral_words["neg_score"]

In [75]:
df_moral_words.to_csv("datavis/moral_words.csv")

Ensure correct parsing

In [4]:
saved_csv = pd.read_csv("datavis/moral_words.csv")

print(df_moral_words.shape)
print(saved_csv.shape)

NameError: name 'df_moral_words' is not defined