In [1]:
from codingtools import from_json
import pandas as pd
from collections import Counter
from datetime import date

# Input locations: two gzipped JSON article dumps and the coding-tool export.
data = 'D:/Thesis/Actors/totaal.json.gz'
data_trouw = 'D:/Thesis/Actors/trouw.json.gz'
codes = 'D:/Thesis/Actors/tool.json'

# Read the coding-tool export inside a context manager so the file handle is
# closed as soon as the actor -> code mapping has been extracted.
with open(codes,'r') as codes_file:
    coded = from_json(codes_file).coded

# NOTE(review): concat keeps each frame's own index (ignore_index is not set),
# so the combined frame may contain repeated index labels.
df = pd.concat([pd.read_json(data), pd.read_json(data_trouw)])

# DATE_dt is divided by 1000 before conversion (presumably epoch milliseconds
# - TODO confirm). date.fromtimestamp interprets the value in the local time
# zone of the machine running the notebook.
df['DATE_dt'] = df['DATE_dt'].apply(lambda x: date.fromtimestamp(x/1000))

#Set the unique cases:
# The EU itself is coded 'not political'; the four persons below get their own
# code so that their category can be decided per article date.
coded['http://nl.dbpedia.org/resource/Europese_Unie'] = 'not political'
coded['http://nl.dbpedia.org/resource/Frans_Timmermans_(politicus)'] = 'FT'
coded['http://nl.dbpedia.org/resource/Frits_Bolkestein'] = 'FB'
coded['http://nl.dbpedia.org/resource/Donald_Tusk'] = 'DT'
coded['http://nl.dbpedia.org/resource/Romano_Prodi'] = 'RP'

# Each case is (code, interval start, interval end); isInInterval compares the
# article date against the two bounds with strict inequalities.
# date.today() makes the open-ended intervals depend on the day of execution.
cases = [('FT', date(year = 2014, month = 8, day = 1), date.today()),
         ('FB', date(year = 1999, month = 6, day = 17), date(year = 2004, month = 12, day = 31)),
         ('DT', date(year = 2014, month = 12, day = 1), date.today()),
         ('RP', date(year = 1999, month = 6, day = 16), date(year = 2004, month = 12, day = 31))]

#Count occurrences of unique actors
def count(x):
    """Tally the codes of the actors that key the mapping ``x``.

    Every key of ``x`` is looked up in the module-level ``coded`` mapping via
    ``coded.get``; keys without an entry therefore contribute to the ``None``
    bucket. Returns a ``collections.Counter`` keyed by code.
    """
    return Counter(coded.get(actor) for actor in x.keys())

#Is actor in case and the date in the relevant interval?
def isInInterval(row, case):
    """Classify one article row against one special case.

    ``case`` is indexed as (code, start, end). Returns
      -1 when the code is not a key of ``row['counters']``,
       1 when it is and ``start < row['DATE_dt'] < end`` (both bounds exclusive),
       0 when it is but the date falls outside that open interval.
    """
    # Guard clause: nothing to decide when the actor is not mentioned.
    if case[0] not in row['counters']:
        return -1
    return 1 if case[1] < row['DATE_dt'] < case[2] else 0

#Run the counters
# One tally per article, then one integer column per category of interest.
df['counters'] = df['actors'].apply(count)
for column, label in (('Dutch_count', 'Dutch'),
                      ('European_count', 'European'),
                      ('Other_national_count', 'other national')):
    # label is bound as a default argument so each lambda reads its own key.
    df[column] = df['counters'].apply(lambda tally, label=label: tally[label])

#Correct for special cases
# A special-case actor counts as European when the article date lies inside
# the case interval (isInInterval == 1); otherwise (== 0) the mention is
# credited to the column below. A result of -1 means the actor is not mentioned.
fallback_column = {'FT': 'Dutch_count',
                   'FB': 'Dutch_count',
                   'DT': 'Other_national_count',
                   'RP': 'Other_national_count'}

# Increments are collected by row POSITION and written back as whole columns.
# The previous df[col].at[index] += 1 was chained assignment (it does not
# update df under pandas Copy-on-Write) and addressed rows by label, which is
# ambiguous when the concatenated frame carries repeated index labels.
increments = {column: [0] * len(df)
              for column in ('Dutch_count', 'European_count', 'Other_national_count')}

for position, (counters, article_date) in enumerate(zip(df['counters'], df['DATE_dt'])):
    row = {'counters': counters, 'DATE_dt': article_date}
    for case in cases:
        verdict = isInInterval(row, case)
        if verdict == 1:
            increments['European_count'][position] += 1
        elif verdict == 0:
            increments[fallback_column[case[0]]][position] += 1

for column, extra in increments.items():
    # Plain arrays on both sides: the addition is positional, no index alignment.
    df[column] = df[column].values + pd.Series(extra, dtype='int64').values

# The per-article tallies are no longer needed once the count columns exist.
df.drop(columns='counters', inplace=True)

#Code
# Classify every article by which of the Dutch / European counts are non-zero;
# articles with neither keep the placeholder string 'None'.
no_dutch = df['Dutch_count'] == 0
some_dutch = df['Dutch_count'] > 0
no_european = df['European_count'] == 0
some_european = df['European_count'] > 0

df['type'] = 'None'
df.loc[no_dutch & some_european, 'type'] = 'Fully European'
df.loc[some_dutch & no_european, 'type'] = 'Fully national'
df.loc[some_dutch & some_european, 'type'] = 'Mixed'

In [42]:
#Export for ggplot
# Percentage of each article type per year (articles typed 'None' excluded),
# written next to the yearly number of typed articles.
grouped = df[df['type']!='None'].groupby(['YEAR'])
yearly_totals = grouped['type'].count()
a = 100*grouped['type'].value_counts()/yearly_totals
a.unstack().join(yearly_totals).to_csv('D:/Thesis/Actors/counts.csv')

# Newspaper title -> 'Tabloid' / 'Quality'. Titles are listed in the same order
# as before; the three titles in tabloid_titles are the tabloids.
all_titles = ('Algemeen Dagblad',
              'De Telegraaf',
              'Het Financieele Dagblad',
              'Metro',
              'NRC Handelsblad',
              'NRC.NEXT',
              'Nederlands Dagblad',
              'Reformatorisch Dagblad',
              'Trouw',
              'de Volkskrant')
tabloid_titles = {'Algemeen Dagblad', 'De Telegraaf', 'Metro'}

tabloidMap = {title: ('Tabloid' if title in tabloid_titles else 'Quality')
              for title in all_titles}

# Series.map leaves NaN for any MEDIUM value that is not a key of tabloidMap.
df['Paper_type'] = df['MEDIUM'].map(tabloidMap)

# Same percentages as the yearly export, now per (year, paper type) group.
grouped = df[df['type']!='None'].groupby(['YEAR','Paper_type'])
group_totals = grouped['type'].count()
a = grouped['type'].value_counts().unstack()
for col in a:
    # Turn the raw count of each type into a percentage of its group total.
    a[col] = 100*a[col]/group_totals
a.join(group_totals).to_csv('D:/Thesis/Actors/papertype_counts.csv')

In [10]:
#Table
# Percentage of each article type per newspaper (articles typed 'None' excluded).
grouped = df[df['type']!='None'].groupby(['MEDIUM'])
type_counts = grouped['type'].value_counts()
medium_totals = grouped['type'].count()
(100*type_counts/medium_totals).unstack().to_csv('D:/Thesis/Actors/newspaper_bar.csv')

In [13]:
import datetime
# Fixed seed so the same 25 articles are drawn on every run.
random_state = 1530100473 #int(datetime.datetime.now().timestamp())

test = df.sample(25,random_state=random_state)
# Each TEXT value is joined with single spaces into one string for the CSV.
test['TEXT'] = test['TEXT'].apply(' '.join)
test[['DATE_dt','HEADLINE','TEXT','actors']].to_csv('D:/Thesis/Actors/testsample.csv')

In [8]:
def lenMinusEu(actors):
    """Return the number of entries in ``actors``, not counting the EU resource.

    One is subtracted from ``len(actors)`` when the Dutch DBpedia URI of the
    European Union is contained in ``actors``; otherwise the length is
    returned unchanged.
    """
    eu_present = 'http://nl.dbpedia.org/resource/Europese_Unie' in actors
    return len(actors) - int(eu_present)

    
# Distribution of the number of non-EU actors per article.
# NOTE: this is not the last expression of the cell, so its result is not displayed.
df['actors'].apply(lenMinusEu).value_counts()

# Number of articles with no Dutch and no European actors but at least one
# other-national actor.
only_other_national = ((df['Dutch_count'] == 0)
                       & (df['European_count'] == 0 )
                       & (df['Other_national_count'] > 0))
df.loc[only_other_national,'type'].count()

13256