In [1]:
import json, re
from tqdm import tqdm
from collections import Counter, defaultdict
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
from glob import glob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

from scipy import stats

In [2]:
analyzer = SentimentIntensityAnalyzer()

In [3]:
subs = '''Conservative Liberal Republican democrats hillaryclinton The_Donald news worldnews politics'''.split()
print(subs)

['Conservative', 'Liberal', 'Republican', 'democrats', 'hillaryclinton', 'The_Donald', 'news', 'worldnews', 'politics']


In [4]:
filelist =  glob('sentiment/RC/RC*')#[i for i in glob('./data/*') if i.split('_')[-1] in subs]
filelist

['sentiment/RC\\RC_Conservative',
 'sentiment/RC\\RC_democrats',
 'sentiment/RC\\RC_hillaryclinton',
 'sentiment/RC\\RC_Liberal',
 'sentiment/RC\\RC_news',
 'sentiment/RC\\RC_politics',
 'sentiment/RC\\RC_politics_sample',
 'sentiment/RC\\RC_Republican',
 'sentiment/RC\\RC_SandersForPresident',
 'sentiment/RC\\RC_The_Donald',
 'sentiment/RC\\RC_worldnews']

In [5]:
data = defaultdict(list)
failed = []


for file in filelist:
    if 'RC_politics_sample' not in file: continue
    with open(file) as f:
        print(file)
        for i, line in enumerate(tqdm(f)):
            post = json.loads(line)
            try:
                data[post['subreddit']].append((post['body'], post['author']))
            except Exception as e:
                failed.append((i, e))

for i in data:
    print(i, '\t', len(data[i]))

sentiment/RC\RC_politics_sample


3000000it [00:28, 103808.26it/s]


politics 	 3000000


In [6]:
sentiment = defaultdict(lambda: defaultdict(list))

In [7]:
keywords = ['donald', 'trump', 'hillary', 'clinton', 'republican', 'democrat', 'conservative', 'liberal']

In [9]:
for i in data:
    for idx, comment in enumerate(tqdm(data[i])):
        for kw in keywords:
            if kw in comment[0].lower():
                sentiment[i][kw].append((TextBlob(comment[0]).sentiment, analyzer.polarity_scores(comment[0])))

  5%|███▎                                                                    | 140377/3000000 [02:03<49:11, 968.97it/s]

KeyboardInterrupt: 

In [None]:
data['worldnews'][0][0]

In [None]:
sentiment['worldnews']['trump'][0]

In [None]:
lst = [i[1]['pos'] for i in sentiment['worldnews']['trump']]; print(len(lst), np.mean(lst))
lst = [i[1]['neg'] for i in sentiment['worldnews']['trump']]; print(len(lst), np.mean(lst))
lst = [i[1]['neu'] for i in sentiment['worldnews']['trump']]; print(len(lst), np.mean(lst))

In [None]:
lst = [i[1]['pos'] for i in sentiment['worldnews']['hillary']]; print(len(lst), np.mean(lst))
lst = [i[1]['neg'] for i in sentiment['worldnews']['hillary']]; print(len(lst), np.mean(lst))
lst = [i[1]['neu'] for i in sentiment['worldnews']['hillary']]; print(len(lst), np.mean(lst))

In [None]:
results_dict = {}
means = []
errors = []
for i in sentiment:
    for kw in keywords:
        vpos = [i[1]['pos'] for i in sentiment[i][kw]]
        vneg = [i[1]['neg'] for i in sentiment[i][kw]]
        vneu = [i[1]['neu'] for i in sentiment[i][kw]]
        vcom = [i[1]['compound'] for i in sentiment[i][kw]]
        tpol = [i[0].polarity for i in sentiment[i][kw]]
        tsub = [i[0].subjectivity for i in sentiment[i][kw]]
        
        results_dict.update({kw:(len(sentiment[i][kw]),np.mean(vcom), stats.sem(vcom))})
        means.append(np.mean(vcom))
        errors.append(stats.sem(vcom))
    
        print(kw)
        print("Number of comments: ", len(sentiment[i][kw]))
#         print('pos:', np.mean(vpos))
#         print('neg:', np.mean(vneg))
#         print('neu:', np.mean(vneu))
#         print('pol:', np.mean(tpol))
#         print('sub:', np.mean(tsub))
#         print('sen:', np.mean(vpos)-np.mean(vneg))
        print('sen:', np.mean(vcom))
        print('pol:', np.mean(tpol))
        print("error: ", stats.sem(vcom))
        print()

In [None]:
# keywords, means, errors as above

keys_left = ["hillary", "clinton", "democrat", "liberal"]
keys_right = ["donald","trump", "republican", "conservative"]
x_pos = np.arange(len(keywords))
fig, ax = plt.subplots(figsize=(10, 12))
ax.bar(x_pos, means, yerr=errors, align='center', alpha=0.5, ecolor='black', capsize=10)
ax.set_ylabel('Senitment')
ax.set_xticks(x_pos)
ax.set_xticklabels(keywords)
ax.set_title('Sentiment of comments for keywords')
ax.yaxis.grid(True)
# plt.ylim((-0.1,0.1))