In [22]:
import pandas as pd
import numpy as np
import pickle
import glob
import matplotlib.pyplot as plt
import json
from tqdm import tqdm_notebook
import os

In [2]:
full_df = pd.concat([pd.read_csv(f, compression='gzip') for f in glob.glob("../data/full-data/*.gzip")], ignore_index=True)
full_df

Unnamed: 0,GLOBALEVENTID,MentionSourceName,DocumentIdentifier,DATE,V2Tone
0,714712065,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
1,714694876,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
2,714914448,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
3,714914794,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
4,714913892,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
5,714694328,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
6,721613923,npr.org,https://www.npr.org/series/473636949/schoolmoney,20180111203000,"0,3.18471337579618,3.18471337579618,6.36942675..."
7,771525591,npr.org,https://www.npr.org/podcasts/381444767/u-w-m-t...,20180712174500,"-1.00090991810737,1.45586897179254,2.456778889..."
8,771526954,npr.org,https://www.npr.org/podcasts/381444767/u-w-m-t...,20180712174500,"-1.00090991810737,1.45586897179254,2.456778889..."
9,733479049,npr.org,https://www.npr.org/series/347174484/jazz-nigh...,20180223211500,"1.26742712294043,3.80228136882129,2.5348542458..."


In [3]:
sources = full_df["MentionSourceName"].unique()

In [29]:
full_df["avg_tone"] = full_df.V2Tone.map(lambda l : str(l).split(",")[0]).astype(float)

In [4]:
full_df["month"] = pd.to_datetime(full_df.DATE, format='%Y%m%d%H%M%S').dt.strftime("%b-%y")

In [5]:
MONTH_LIST = list(set(full_df["month"].unique())-{'Dec-18'}|{'full_year'})
THEME_LIST = [ "all", "social", "conflict", "env", "health", "eco" ]

In [6]:
ind = {}
ind['all'] = full_df.index.values
for theme in THEME_LIST[1:]:
    with open("../data/indices/"+theme+".pkl", "rb") as f:
        ind[theme] = pickle.load(f)

In [18]:
nb_mentions_by_sources = full_df.groupby("MentionSourceName").size()

In [23]:
theme_ratio_by_source = pd.concat([ full_df.loc[ind[theme]].groupby("MentionSourceName").size() / nb_mentions_by_sources for theme in theme_list ], axis=1)

In [7]:
theme_count_by_source = pd.concat([ full_df.loc[ind[theme]].groupby("MentionSourceName").size() for theme in theme_list ], axis=1)

In [8]:
theme_count_by_source.rename(dict(zip(range(len(theme_list)),theme_list)), axis=1, inplace=True)

In [39]:
idx = pd.MultiIndex.from_product([MONTH_LIST, THEME_LIST[1:]],
                                 names=['Month', 'Theme'])
col = sorted(full_df["MentionSourceName"].unique())

count_histograms = pd.DataFrame('-', idx, col)
tone_histograms = pd.DataFrame('-', idx, col)

In [47]:
pd.DataFrame('-', idx, col)

Unnamed: 0_level_0,Unnamed: 1_level_0,abc13.com,abc7news.com,autonews.com,boston.com,bostonherald.com,breitbart.com,cbsnews.com,chicagotribune.com,denverpost.com,fox2now.com,...,seattlepi.com,seattletimes.com,stltoday.com,theonion.com,twincities.com,usatoday.com,villagevoice.com,westword.com,wgntv.com,wtop.com
Month,Theme,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Jan-18,social,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
Jan-18,conflict,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
Jan-18,env,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
Jan-18,health,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
Jan-18,eco,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
Jul-18,social,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
Jul-18,conflict,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
Jul-18,env,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
Jul-18,health,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
Jul-18,eco,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [40]:
for theme in tqdm_notebook(THEME_LIST[1:]):
    theme_df = full_df.loc[ind[theme]]
    for m in MONTH_LIST:
        if m == 'full_year':
            df = theme_df.copy()
        else:
            df = theme_df[theme_df.month==m].copy()
                
        count_histograms.loc[m,theme] = df.groupby("MentionSourceName").size()
        tone_histograms.loc[m,theme] = df.groupby("MentionSourceName").mean()["avg_tone"]

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [46]:
tone_histograms['abc13.com']

Month      Theme   
Jan-18     social     -3.28837
           conflict   -3.32274
           env          -3.356
           health     -3.29691
           eco        -3.16733
Jul-18     social     -3.27027
           conflict   -3.28681
           env        -3.23819
           health     -3.27341
           eco        -3.21248
Mar-18     social     -3.06384
           conflict   -3.04496
           env        -3.12063
           health     -3.03101
           eco        -2.93312
Nov-18     social     -3.29792
           conflict   -3.29746
           env        -3.34633
           health     -3.31227
           eco        -3.19119
May-18     social     -3.44319
           conflict   -3.42491
           env        -3.46296
           health     -3.43676
           eco        -3.42048
Feb-18     social     -3.64064
           conflict   -3.59594
           env        -3.61532
           health     -3.56332
           eco        -3.62453
                        ...   
Oct-18     social  

In [48]:
for m in MONTH_LIST:
    os.makedirs("../data/histograms/"+m)

In [52]:
for s in sources:
    for x,y in zip(count_histograms[s].unstack().iterrows(), tone_histograms[s].unstack().iterrows()):
        pd.concat([x[1],y[1]],axis=1).to_csv("../data/histograms/"+x[0]+"/"+s+".csv", header=["count", "avg_tone"])

In [45]:
for s in sources:
    for r in tone_histograms[s].unstack().iterrows():
        r[1].to_csv("../data/histograms/"+s+"/"+r[0]+"/tone.csv", header=["Tone"])