In [1]:
import os
import pandas as pd
from textblob import TextBlob

In [2]:
# Load the merged comment dump into a DataFrame (path is relative to the notebook's working directory).
data = pd.read_json(path_or_buf="./data/merged_data.json")

In [3]:
# Show every column label of the loaded frame as a plain Python list.
data.columns.tolist()

['index',
 'total_awards_received',
 'approved_at_utc',
 'ups',
 'awarders',
 'mod_reason_by',
 'banned_by',
 'author_flair_type',
 'removal_reason',
 'link_id',
 'author_flair_template_id',
 'likes',
 'user_reports',
 'saved',
 'id',
 'banned_at_utc',
 'mod_reason_title',
 'gilded',
 'archived',
 'no_follow',
 'author',
 'can_mod_post',
 'send_replies',
 'parent_id',
 'score',
 'author_fullname',
 'report_reasons',
 'approved_by',
 'all_awardings',
 'subreddit_id',
 'body',
 'edited',
 'author_flair_css_class',
 'steward_reports',
 'is_submitter',
 'downs',
 'author_flair_richtext',
 'author_patreon_flair',
 'body_html',
 'gildings',
 'collapsed_reason',
 'associated_award',
 'stickied',
 'subreddit_type',
 'can_gild',
 'subreddit',
 'author_flair_text_color',
 'score_hidden',
 'permalink',
 'num_reports',
 'locked',
 'name',
 'created',
 'author_flair_text',
 'collapsed',
 'created_utc',
 'subreddit_name_prefixed',
 'controversiality',
 'depth',
 'author_flair_background_color',
 'mo

## Perform Sentiment Analysis

In [4]:
def calc_sentiment(x):
    """Score the sentiment of a single comment row with TextBlob.

    Intended to be used with ``DataFrame.apply(calc_sentiment, axis=1)``.

    Parameters
    ----------
    x : pd.Series or other mapping
        One row of the comment table; the comment text is read from its
        ``body`` entry.

    Returns
    -------
    pd.Series
        Two positional values: the polarity first, the subjectivity second.
        Both are NaN when ``body`` is not a string (e.g. None or NaN).
    """
    body = x["body"]
    if not isinstance(body, str):
        # TextBlob rejects non-string input with a TypeError, which would
        # abort the whole row-wise apply; mark such rows as "no score" instead.
        return pd.Series([float("nan"), float("nan")])

    # Look the sentiment up once and read both fields from the same result.
    sentiment = TextBlob(body).sentiment
    return pd.Series([sentiment.polarity, sentiment.subjectivity])

In [5]:
# Score each row, give the two positional result columns readable names,
# then attach them to the original frame by index.
newcols = data.apply(calc_sentiment, axis="columns").rename(
    columns={0: 'sent_polarity', 1: 'sent_subjectivity'}
)
df = data.join(newcols)

In [6]:
# Preview the comment text next to its two sentiment scores.
df.loc[:, ['body', 'sent_polarity', 'sent_subjectivity']]

Unnamed: 0,body,sent_polarity,sent_subjectivity
0,Is murder illegal?\n\nThat's the point.,-0.500000,0.500000
1,But the question posed is why most Republicans...,0.154167,0.775000
2,Murder gets the ultimate punishment. That's h...,-0.247917,0.645833
3,So murder is okay if it deemed necessary by a ...,0.166667,0.500000
4,"Executing a murderer is not murder, it's the t...",0.000000,0.000000
...,...,...,...
49090,Pure waste and expansion. The current building...,0.087608,0.371591
49091,And where are they going to get the billions n...,0.136364,0.454545
49092,"Yeah, how are they going to pay for this?",0.000000,0.000000
49093,>where are they going to get the billions need...,0.187500,0.533333


## What should we do with rows whose polarity or subjectivity is 0?

### I would drop them

In [7]:
# Rows whose polarity score is exactly zero, shown with their text.
df.loc[df['sent_polarity'].eq(0.00), ['body', 'sent_polarity']]

Unnamed: 0,body,sent_polarity
4,"Executing a murderer is not murder, it's the t...",0.0
5,Why?,0.0
13,It’s illegitimate should be one,0.0
17,Such a cheating crook.,0.0
31,But Putin told him it exists! Putin would neve...,0.0
...,...,...
49077,that this sub has been overrun by lefties that...,0.0
49081,/r/goldandblack,0.0
49085,TTNP,0.0
49089,Redistributing wealth via the federal governme...,0.0


In [8]:
# Rows whose subjectivity score is exactly zero, shown with their text.
df.loc[df['sent_subjectivity'].eq(0.00), ['body', 'sent_subjectivity']]

Unnamed: 0,body,sent_subjectivity
4,"Executing a murderer is not murder, it's the t...",0.0
5,Why?,0.0
13,It’s illegitimate should be one,0.0
31,But Putin told him it exists! Putin would neve...,0.0
32,Maybe he should pull his ace team of investiga...,0.0
...,...,...
49079,There has been no attempt to curate this forum...,0.0
49081,/r/goldandblack,0.0
49085,TTNP,0.0
49089,Redistributing wealth via the federal governme...,0.0


In [9]:
# Persist the frame, including the two sentiment columns, as JSON.
df.to_json(path_or_buf='./data/merged_data_SA.json')