This notebook prepares the data for Tableau exploratory data analysis from results of LIWC software.

In [1]:
import pandas as pd


In [2]:
#Previous data
#selfimprovement = pd.read_csv("data/selfimpr_liwc.csv")
#investing = pd.read_csv("data/investing_liwc.csv")
#homeowners = pd.read_csv("data/homeowners_liwc.csv")
#Sanity check data size April 6
selfimprovement = pd.read_csv("data/new_sanitycheck/selfimpr_liwc.csv")
investing = pd.read_csv("data/new_sanitycheck/investing_liwc.csv")
homeowners = pd.read_csv("data/new_sanitycheck/homeowners_liwc.csv")

  selfimprovement = pd.read_csv("data/new_sanitycheck/selfimpr_liwc.csv")
  investing = pd.read_csv("data/new_sanitycheck/investing_liwc.csv")
  homeowners = pd.read_csv("data/new_sanitycheck/homeowners_liwc.csv")


### Basic descriptive statistics

In [3]:
selfimprovement.moral.describe()

count    506574.000000
mean          0.330980
std           0.700334
min           0.000000
25%           0.000000
50%           0.000000
75%           0.410000
max          14.810000
Name: moral, dtype: float64

In [4]:
investing.moral.describe()

count    503158.000000
mean          0.193725
std           0.550578
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          13.210000
Name: moral, dtype: float64

In [5]:
homeowners.moral.describe()

count    498733.000000
mean          0.154216
std           0.493689
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          13.890000
Name: moral, dtype: float64

### Observe proportions of moral language across subreddits

Get total sizes

In [6]:
#Call shape method to get shape of dataframe and select the number of rows
total_si = selfimprovement.shape[0]
total_i = investing.shape[0]
total_h = homeowners.shape[0]

Get number of posts with moralization score higher than 0.25 (the LIWC's reported mean for their corpus of Reddit + other sources)

In [7]:
moral_si = selfimprovement[selfimprovement.moral > 0.25]
moral_i = investing[investing.moral > 0.25]
moral_h = homeowners[homeowners.moral > 0.25]

#Repear workflow to get number of rows
moral_si_n = moral_si.shape[0]
moral_i_n = moral_i.shape[0]
moral_h_n = moral_h.shape[0]

Calculate proportion of moralization language across the three subreddits

In [8]:
percentage_si = (moral_si_n / total_si) * 100
percentage_i = (moral_i_n / total_i) * 100
percentage_h = (moral_h_n / total_h) * 100

In [9]:
print(f"Proportion of moralized posts in the r/selfimprovement subreddit: {percentage_si:.2f}%")
print(f"Proportion of moralized posts in the r/investing subreddit:{percentage_i:.2f}%")
print(f"Proportion of moralized posts in the r/homeowners subreddit:{percentage_h:.2f}%")

Proportion of moralized posts in the r/selfimprovement subreddit: 28.08%
Proportion of moralized posts in the r/investing subreddit:15.58%
Proportion of moralized posts in the r/homeowners subreddit:12.64%


## Feature engineering with foundations scores

Some feature engineering to create combined scores for each foundation, and combined scores for virtue and vice

In [10]:
def feature_engineering(df):

    # Calculate total for each foundation
    df["Care_total"] = df["Care_Virtue"] + df["Care_Vice"]
    df["Fairness_total"] = df["Fairness_Virtue"] + df["Fairness_Vice"]
    df["Loyalty_total"] = df["Loyalty_Virtue"] + df["Loyalty_Vice"]
    df["Authority_total"] = df["Authority_Virtue"] + df["Authority_Vice"]
    df["Sanctity_total"] = df["Sanctity_Virtue"] + df["Sanctity_Vice"]

    # Vice and virtue scores
    df["Virtue_total"] = (df["Care_Virtue"] + df["Fairness_Virtue"] 
                          + df["Loyalty_Virtue"] + df["Authority_Virtue"] 
                          + df["Sanctity_Virtue"])
    
    df["Vice_total"] = (df["Care_Vice"] + df["Fairness_Vice"] 
                    + df["Loyalty_Vice"] + df["Authority_Vice"] 
                    + df["Sanctity_Vice"])
    
    # Overall total score across all foundations
    df["Foundations_total_score"] = (
        df["Care_total"] + df["Fairness_total"] + df["Loyalty_total"] +
        df["Authority_total"] + df["Sanctity_total"])
    
    return df

In [11]:
selfimprovement2 = feature_engineering(selfimprovement)
investing2 = feature_engineering(investing)
homeowners2 = feature_engineering(homeowners)

Create one single df to use in Tableau

In [12]:
selfimprovement2["Subreddit"] = "selfimprovement"
investing2["Subreddit"] = "investing"
homeowners2["Subreddit"] = "homeowners"

In [13]:
all_reddits = pd.concat([selfimprovement2, investing2, homeowners2], ignore_index=True)

In [14]:
all_reddits

Unnamed: 0.1,Unnamed: 0,id,created,author,score,num_comments,link,cleaned_text,word_count,type,...,Sanctity_Vice,Care_total,Fairness_total,Loyalty_total,Authority_total,Sanctity_total,Virtue_total,Vice_total,Foundations_total_score,Subreddit
0,0,hk5r2,2011-05-25 17:27,u/[deleted],1,3.0,https://www.reddit.com/r/selfimprovement/comme...,i had an appointment today with the dentist ov...,64,submission,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,selfimprovement
1,1,iqimz,2011-07-15 11:15,u/dustinsmusings,3,0.0,https://www.reddit.com/r/selfimprovement/comme...,i created this site several months ago and i s...,116,submission,...,0.00,2.59,0.00,0.86,0.00,0.00,3.45,0.00,3.45,selfimprovement
2,2,pfzt5,2012-02-08 01:40,u/aeoz,6,4.0,https://www.reddit.com/r/selfimprovement/comme...,hello everyone i have recently took over this ...,195,submission,...,0.00,2.05,0.00,0.00,0.00,0.00,2.05,0.00,2.05,selfimprovement
3,3,pk714,2012-02-10 19:16,u/[deleted],1,0.0,https://www.reddit.com/r/selfimprovement/comme...,i grew up with body dysmorphia eating disorder...,582,submission,...,0.17,2.24,0.17,0.34,0.00,0.69,2.75,0.69,3.44,selfimprovement
4,4,q0q8x,2012-02-22 03:24,u/[deleted],1,0.0,https://www.reddit.com/r/selfimprovement/comme...,i have to ask when do you get to a point where...,561,submission,...,0.53,1.43,0.00,0.00,0.00,1.06,0.89,1.60,2.49,selfimprovement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1508460,498728,kfrkhs5,2023-12-31 17:29,u/Earl_your_friend,1,,https://www.reddit.com/r/homeowners/comments/1...,i lived next to a guy who sold and bought scap...,194,comment,...,0.52,0.52,0.52,0.00,1.55,0.52,2.07,1.04,3.11,homeowners
1508461,498729,kfrl16r,2023-12-31 17:33,u/UntypicalCouple,8,,https://www.reddit.com/r/homeowners/comments/1...,you do realize that not all businesses can be ...,63,comment,...,0.00,0.00,0.00,0.00,3.17,1.59,4.76,0.00,4.76,homeowners
1508462,498730,kfrm79i,2023-12-31 17:41,u/blockneighborradio,2,,https://www.reddit.com/r/homeowners/comments/1...,the neighbor isnt going to do anything stupid ...,50,comment,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,homeowners
1508463,498731,kfrmlea,2023-12-31 17:43,u/chof2018,2,,https://www.reddit.com/r/homeowners/comments/1...,i was this guy running a landscaping business ...,137,comment,...,0.00,0.00,0.73,0.00,1.46,0.00,2.19,0.00,2.19,homeowners


In [15]:
all_reddits.to_csv("data/engineered_morality.csv")

Ensure correct parsing

In [16]:
saved_csv = pd.read_csv("data/engineered_morality.csv")

print(all_reddits.shape)
print(saved_csv.shape)

  saved_csv = pd.read_csv("data/engineered_morality.csv")


(1508465, 44)
(1508465, 45)
