In [1]:
import pandas as pd
import numpy as np
import pickle
import glob
import matplotlib.pyplot as plt
import json
from tqdm import tqdm_notebook

In [2]:
full_df = pd.concat([pd.read_csv(f, compression='gzip') for f in glob.glob("../data/full-data/*.gzip")], ignore_index=True)
full_df

Unnamed: 0,GLOBALEVENTID,MentionSourceName,DocumentIdentifier,DATE,V2Tone
0,714712065,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
1,714694876,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
2,714914448,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
3,714914794,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
4,714913892,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
5,714694328,npr.org,https://www.npr.org/podcasts/510318/up-first,20171213163000,"-1.51745068285281,1.97268588770865,3.490136570..."
6,721613923,npr.org,https://www.npr.org/series/473636949/schoolmoney,20180111203000,"0,3.18471337579618,3.18471337579618,6.36942675..."
7,771525591,npr.org,https://www.npr.org/podcasts/381444767/u-w-m-t...,20180712174500,"-1.00090991810737,1.45586897179254,2.456778889..."
8,771526954,npr.org,https://www.npr.org/podcasts/381444767/u-w-m-t...,20180712174500,"-1.00090991810737,1.45586897179254,2.456778889..."
9,733479049,npr.org,https://www.npr.org/series/347174484/jazz-nigh...,20180223211500,"1.26742712294043,3.80228136882129,2.5348542458..."


In [3]:
THEME_LIST = [ "all", "social", "conflict", "env", "health", "eco" ]

In [4]:
ind = {}
ind['all'] = np.arange(full_df.shape[0])
for theme in THEME_LIST[1:]:
    with open("../data/indices/"+theme+".pkl", "rb") as f:
        ind[theme] = pickle.load(f)

In [5]:
ind

{'all': array([      0,       1,       2, ..., 5912887, 5912888, 5912889]),
 'social': array([      0,       1,       2, ..., 5876196, 5876197, 5876198]),
 'conflict': array([      0,       1,       2, ..., 5876197, 5876198, 5876199]),
 'env': array([      3,       5,       8, ..., 5876183, 5876191, 5876197]),
 'health': array([      1,       2,       3, ..., 5876195, 5876197, 5876198]),
 'eco': array([     17,      22,      27, ..., 5876196, 5876197, 5876199])}

In [None]:
full_df["month"] = pd.to_datetime(full_df.DATE, format='%Y%m%d%H%M%S').dt.strftime("%b-%y")

In [18]:
nb_mentions_by_sources = full_df.groupby("MentionSourceName").size()

In [23]:
theme_ratio_by_source = pd.concat([ full_df.loc[ind[theme]].groupby("MentionSourceName").size() / nb_mentions_by_sources for theme in theme_list ], axis=1)

In [7]:
theme_count_by_source = pd.concat([ full_df.loc[ind[theme]].groupby("MentionSourceName").size() for theme in theme_list ], axis=1)

In [8]:
theme_count_by_source.rename(dict(zip(range(len(theme_list)),theme_list)), axis=1, inplace=True)

In [51]:
idx = pd.MultiIndex.from_product([MONTH_LIST, THEME_LIST],
                                 names=['Month', 'Theme'])
col = sorted(full_df["MentionSourceName"].unique())

histograms = pd.DataFrame('-', idx, col)

In [64]:
for theme in tqdm_notebook(theme_list):
    theme_df = full_df.loc[ind[theme]]
    for m in range(13):
        if m == 0:
            df = theme_df.copy()
        else:
            df = theme_df[theme_df.month==m].copy()

        count = df.groupby("MentionSourceName").size()

        histograms.loc[m,theme] = count

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [101]:
month_indices = np.arange(1,13)

In [103]:
np.roll(month_indices, 1)

array([12,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [99]:
for r in histograms.transpose().iterrows():
    print(r["1"])

TypeError: tuple indices must be integers or slices, not str

In [76]:
for f in histograms.unstack(0).unstack():
    print(f)
    print("===")

91645
===
75267
===
25327
===
20242
===
38175
===
50064
===
6637.0
===
5461.0
===
1913.0
===
1478.0
===
2855.0
===
3553.0
===
7062.0
===
5801.0
===
1956.0
===
1563.0
===
2967.0
===
3871.0
===
8160.0
===
6740.0
===
2279.0
===
1833.0
===
3457.0
===
4467.0
===
8685
===
7123
===
2389
===
1978
===
3632
===
4822
===
9343
===
7681
===
2532
===
2015
===
3871
===
5080
===
8325
===
6811
===
2324
===
1790
===
3450
===
4534
===
8088
===
6643
===
2213
===
1727
===
3329
===
4419
===
7339.0
===
6040.0
===
2004.0
===
1635.0
===
3076.0
===
3932.0
===
7209.0
===
5933.0
===
1954.0
===
1592.0
===
2931.0
===
3943.0
===
7229.0
===
5901.0
===
1971.0
===
1567.0
===
2982.0
===
3983.0
===
7366.0
===
6036.0
===
2059.0
===
1682.0
===
3097.0
===
4021.0
===
6202
===
5097
===
1733
===
1382
===
2528
===
3439
===
83612
===
68568
===
22981
===
18494
===
34955
===
45306
===
5547.0
===
4498.0
===
1501.0
===
1254.0
===
2242.0
===
2955.0
===
5624.0
===
4643.0
===
1547.0
===
1255.0
===
2398.0
===
3054.0
===
7718.0
===
6394.

58.0
===
110.0
===
128.0
===
175.0
===
147.0
===
51.0
===
30.0
===
76.0
===
93.0
===
175.0
===
148.0
===
50.0
===
38.0
===
76.0
===
87.0
===
188
===
151
===
56
===
37
===
81
===
112
===
383
===
306
===
113
===
113
===
151
===
231
===
275
===
238
===
72
===
48
===
122
===
153
===
130
===
112
===
33
===
22
===
60
===
71
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
nan
===
159
===
135
===
40
===
37
===
59
===
82
===
53289
===
43859
===
14614
===
11649
===
22243
===
29104
===
6808.0
===
5638.0
===
1805.0
===
1434.0
===
2831.0
===
3757.0
===
5951.0
===
4874.0
===
1662.0
===
1308.0
===
2530.0
===
3282.0
===
8131.0
===
6732.0
===
2247.0
===
1775.0
===
3415.0
===
4514.0
===
7728
===
6329
===
2117
===
1744
===
3266
===
4226
===
8991
===
7404
===
2541
===
1949
===
3743
===
4843
===
7145
===
5872
===
1942
===
1510
===
2872
===
3895
===
2482
===
2076
===
65

===
5312.0
===
8878.0
===
7242.0
===
2467.0
===
1938.0
===
3711.0
===
4840.0
===
10921.0
===
8969.0
===
3013.0
===
2360.0
===
4600.0
===
6008.0
===
10785
===
8813
===
3009
===
2324
===
4522
===
5824
===
12377
===
10173
===
3357
===
2676
===
5183
===
6855
===
10632
===
8787
===
2818
===
2333
===
4494
===
5812
===
9051
===
7433
===
2527
===
2026
===
3839
===
4898
===
7793.0
===
6397.0
===
2206.0
===
1740.0
===
3293.0
===
4268.0
===
7590.0
===
6243.0
===
2047.0
===
1617.0
===
3238.0
===
4197.0
===
7287.0
===
5976.0
===
1984.0
===
1592.0
===
3062.0
===
4013.0
===
7759.0
===
6318.0
===
2183.0
===
1674.0
===
3257.0
===
4164.0
===
9970
===
8168
===
2727
===
2231
===
4177
===
5538
===
297310
===
243962
===
81619
===
64943
===
124108
===
161543
===
23141.0
===
18981.0
===
6467.0
===
5097.0
===
9697.0
===
12440.0
===
20888.0
===
17178.0
===
5767.0
===
4688.0
===
8728.0
===
11462.0
===
31428.0
===
25718.0
===
8537.0
===
6862.0
===
13114.0
===
17015.0
===
28752
===
23703
===
7815
===
6126
===
1187

In [None]:
[
    {
        "month": "January", 
        "values": [
            {
                "value": 0, 
                "theme": "Conflict"
            }, 
            {
                "value": 4, 
                "rate": "Not very much"
            }, 
            {
                "value": 12, 
                "rate": "Medium"
            }, 
            {
                "value": 6, 
                "rate": "Very much"
            }, 
            {
                "value": 0, 
                "rate": "Tremendously"
            }
        ]
    }, 