# Context-enhanced dictionary approach: Base dictionaries

In [None]:
import pandas as pd
import numpy as np
import os
from urllib.parse import urlencode
from urllib.parse import quote
from datetime import datetime, timedelta

In [None]:
import re
import joblib
import nltk
# Fetch the NLTK 'stopwords' and 'punkt' resources
# (presumably needed by the stopword list and word_tokenize used below).
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the imported corpus object to
# the Dutch stopword collection; remove_stopwords() reads this global.
stopwords = stopwords.words('dutch') # is this the best stopword removal approach?
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
# Dutch Snowball stemmer shared by the stemming cell further down.
stemmer = SnowballStemmer('dutch')

In [None]:
## preprocessing functions

def transform_lowercase(x):
    """Return `x` converted to lowercase via its ``lower()`` method."""
    lowered = x.lower()
    return lowered

def remove_punctuation(x):
    """Strip punctuation from `x`.

    Every character that is neither a regex word character nor whitespace is
    deleted, and underscores are deleted as well.
    """
    punctuation = re.compile(r'[^\w\s]|_')
    return punctuation.sub('', x)

def remove_numbers(x):
    """Delete every run of digits from `x`, including digits inside words."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', x)

def remove_links(x):
    """Delete each 'http' occurrence together with the non-whitespace run that follows it."""
    link = re.compile(r'http\S+')
    return link.sub('', x)

def remove_linebreaks(x):
    """Replace newlines in `x` with spaces and trim leading/trailing whitespace.

    Only the ends are trimmed: runs of spaces inside the string (including
    those created by replacing a newline next to a space) are left as they are.
    """
    flattened = x.replace('\n', ' ')
    return flattened.strip()

def remove_stopwords(x):
    """Drop stopwords and empty tokens from a space-separated string.

    `x` is split on single spaces; tokens that are empty (produced by
    consecutive spaces) or that occur in the module-level `stopwords`
    collection are discarded, and the rest are re-joined with single spaces.
    """
    # Logical `and` (not bitwise `&`); the cheap empty-token test runs first.
    kept = [w for w in x.split(" ") if w != "" and w not in stopwords]
    return " ".join(kept)

def list_of_words(x):
    """Split `x` on single spaces and return the resulting list.

    Consecutive spaces yield empty-string entries in the result.
    """
    words = x.split(" ")
    return words

def remove_numbers2(string):
    """Remove standalone numbers from `string`.

    A standalone number is a run of digits delimited by word boundaries on
    both sides, so digits embedded in a token (e.g. 'covid19', '3fm') are
    kept. Surrounding whitespace is left untouched.

    The substitution is done directly on the word-boundary pattern. Finding
    the matches first and then calling ``str.replace`` on each would also
    delete the same digits wherever they occur inside other tokens.
    """
    return re.sub(r'\b\d+\b', '', string)

In [None]:
# Base dictionaries: news outlets, parties, politicians (MPs / cabinet members)
# and the Dutch policy-agendas dictionary.
# NOTE: read_pickle unpickles arbitrary objects; only load trusted files.
news, parties, pol, pa = (
    pd.read_pickle(path)
    for path in (
        "data/dictionaries/newsoutlets_100.pkl",
        "data/dictionaries/parties.pkl",
        "data/dictionaries/tweedekamerleden_kabinetsleden.pkl",
        "data/dictionaries/policy_agendas_dutch.pkl",
    )
)

In [None]:
# NOTE(review): assumes both pickles hold plain lists, so `+` concatenates them — TODO confirm.
polent = parties + pol # combine parties and politicians into political entities list

In [None]:
# Drop entries that break the matching. '50+' is reduced to an empty string by
# the punctuation/number stripping applied later, and an empty alternative in
# check_overlap2's pattern matches every term. 'GO' and 'GOUD' are presumably
# too ambiguous once lowercased — TODO confirm.
polent = list(filter(lambda w: w not in ['50+', "GO", "GOUD"], polent))

In [None]:
# Report the number of entries in each base dictionary on a single line.
print(*(
    part
    for label, entries in (
        ('Policy Agendas:', pa),
        ('News:', news),
        ('Parties:', parties),
        ('Politicians:', pol),
    )
    for part in (label, len(entries))
))

In [None]:
# Load the file with the manual labels and display its shape.
labels = pd.read_csv(filepath_or_buffer="data/annotations.csv")
labels.shape

In [None]:
# Search terms to classify: the `q` column of the labels file as a plain list.
terms = labels["q"].values.tolist()

## preprocessing

In [None]:
# Normalise the search terms step by step; each intermediate list is kept.
terms_lc = list(map(transform_lowercase, terms))  # lowercased
terms_lc_punc = list(map(remove_punctuation, terms_lc))  # punctuation removed, numbers still present
terms_lc_num_punc = list(map(remove_numbers2, terms_lc_punc))  # lowercase, without punctuation and standalone numbers
terms_tokenized = list(map(word_tokenize, terms_lc_num_punc))  # one token list per term

In [None]:
# Policy-agendas entries, normalised like the search terms:
# lowercase -> strip punctuation -> strip standalone numbers.
pa_lc_num_punc = [
    remove_numbers2(remove_punctuation(transform_lowercase(entry)))
    for entry in pa
]

In [None]:
# News-outlet entries, normalised in the SAME order as the search terms and the
# other dictionaries: lowercase -> strip punctuation -> strip standalone numbers.
# Stripping numbers before punctuation gives a different result whenever a
# number touches punctuation (e.g. 'rtl-4'), so such entries would no longer
# line up with the identically written search term.
news_lc_num_punc = [
    remove_numbers2(remove_punctuation(transform_lowercase(entry)))
    for entry in news
]

In [None]:
# Political-entity entries (parties + politicians), normalised like the search
# terms: lowercase -> strip punctuation -> strip standalone numbers.
polent_lc_num_punc = [
    remove_numbers2(remove_punctuation(transform_lowercase(entry)))
    for entry in polent
]

In [None]:
# Stemmed versions: every token of every search term, and every
# policy-agendas entry (each entry is stemmed as a single string).
# NOTE(review): pa_stemmed is not referenced by the overlap cell visible in
# this notebook, which passes pa_lc_num_punc instead — confirm this is intended.
terms_stemmed = [
    [stemmer.stem(token) for token in tokens]
    for tokens in terms_tokenized
]
pa_stemmed = list(map(stemmer.stem, pa_lc_num_punc))

## overlap with dictionaries

In [None]:
def check_overlap(search_terms, dictionary):
    """Flag which tokenised search terms contain a dictionary entry.

    `search_terms` is an iterable of token lists. For each token list the
    result holds 1 if at least one token satisfies ``token in dictionary``,
    otherwise 0. The output has one flag per input term, in order.
    """
    return [
        1 if any(token in dictionary for token in tokens) else 0
        for tokens in search_terms
    ]

In [None]:
def check_overlap2(search_terms, dictionary):
    r'''Flag which search-term strings contain a dictionary entry as a whole word.

    Each dictionary entry is matched literally and must be delimited by word
    boundaries (\b) on both sides, so partial-word hits do not count.

    Parameters:
        search_terms: iterable of strings to search in.
        dictionary: iterable of strings to search for (entries may contain spaces).

    Returns:
        A list with one flag per search term, in order: 1 if any entry was
        found, otherwise 0.
    '''
    # Escape entries so regex metacharacters are matched literally. Skip
    # empty / whitespace-only entries: an empty alternative would match at any
    # word boundary and flag every term.
    entries = [re.escape(entry) for entry in dictionary if entry.strip()]
    if not entries:
        # Nothing to look for: no term can match.
        return [0 for _ in search_terms]
    pattern = re.compile(r'\b(?:' + '|'.join(entries) + r')\b')
    return [1 if pattern.search(term) else 0 for term in search_terms]

In [None]:
# Token-level overlap with the policy-agendas dictionary.
# The original note says the pa dictionary is already stemmed; NOTE(review): confirm.
pa_only_stemmed = check_overlap(search_terms=terms_stemmed, dictionary=pa_lc_num_punc)  # stemmed terms
pa_only_notstemmed = check_overlap(search_terms=terms_tokenized, dictionary=pa_lc_num_punc)  # unstemmed terms
# Whole-word overlap on the fully processed term strings.
news_only = check_overlap2(search_terms=terms_lc_num_punc, dictionary=news_lc_num_punc)
polent_only = check_overlap2(search_terms=terms_lc_num_punc, dictionary=polent_lc_num_punc)
# Terms containing the literal token 'nieuws'.
news_word = check_overlap(search_terms=terms_tokenized, dictionary=['nieuws'])

In [None]:
# One row per search term: the base dictionary flags next to the raw,
# tokenised and stemmed versions of the term.
df = pd.DataFrame(dict(
    pa_only_notstemmed=pa_only_notstemmed,
    pa_only_stemmed=pa_only_stemmed,
    news_only=news_only,
    news_word=news_word,
    polent_only=polent_only,
    search_terms=terms,
    terms_processed=terms_tokenized,
    terms_stemmed=terms_stemmed,
))

In [None]:
# News flag: a news-outlet match or the literal word 'nieuws'.
df['news_only2'] = ((df.news_only==1) | (df.news_word==1)).astype(int)

# Combinations of dictionaries: each flag is 1 when any of its parts is 1.
df['polent_news'] = ((df.news_only2==1) | (df.polent_only==1)).astype(int)

df['pa_notstemmed_news'] = ((df.news_only2==1) | (df.pa_only_notstemmed==1)).astype(int)

df['pa_stemmed_news'] = ((df.news_only2==1) | (df.pa_only_stemmed==1)).astype(int)

df['pa_notstemmed_polent'] = ((df.polent_only==1) | (df.pa_only_notstemmed==1)).astype(int)

df['pa_stemmed_polent'] = ((df.polent_only==1) | (df.pa_only_stemmed==1)).astype(int)

df['pa_notstemmed_polent_news'] = (
    (df.polent_only==1) | (df.pa_only_notstemmed==1) | (df.news_only2==1)
).astype(int)

df['pa_stemmed_polent_news'] = (
    (df.polent_only==1) | (df.pa_only_stemmed==1) | (df.news_only2==1)
).astype(int)

In [None]:
# Attach the dictionary flags to the manual labels by matching the raw term text.
# NOTE(review): if `q` contains duplicate terms this join multiplies the
# duplicated rows — verify that the terms are unique.
df = labels.merge(df, left_on='q', right_on='search_terms')
df.head()

### Evaluation

In [None]:
# Test dataset, loaded as plain Python lists.
# NOTE: allow_pickle=True unpickles objects on load; only use with trusted files.
# NOTE(review): y_test is not referenced again in the visible code, which
# evaluates against the `Q1_checked` column — confirm this is intended.
X_test, y_test = (
    np.load(path, allow_pickle=True).tolist()
    for path in ("data/train_test/X_test.npy", "data/train_test/y_test.npy")
)

In [None]:
def preprocess2(x):
    """Build the key used for matching: `x` without punctuation and underscores, trimmed.

    Characters that are neither regex word characters nor whitespace are
    deleted, underscores are deleted, and leading/trailing whitespace is
    stripped. Case and inner whitespace are left unchanged.
    """
    without_punctuation = re.compile(r'[^\w\s]|_').sub('', x)
    return without_punctuation.strip()

In [None]:
# Clean both sides with preprocess2 so the test queries can be joined to the
# labelled terms on identical keys.
X_test = list(map(preprocess2, X_test))
df['q_clean'] = df['q'].astype(str).apply(preprocess2)
df2 = pd.DataFrame(dict(test_set=X_test))

In [None]:
# Left-join the test queries onto the labelled/flagged terms; every test query
# is kept even when no cleaned term matches it.
# NOTE(review): unmatched queries get NaN in the label and flag columns —
# check for them before computing the reports below.
merged = df2.merge(df, left_on='test_set', right_on='q_clean', how='left')
merged.shape

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
def make_table(df, cols):
    '''
    Build one summary table with a row per prediction column.

    For every column name in `cols`, a classification report is computed with
    df['Q1_checked'] as the true labels and df[column] as the predictions.
    Only the report entry stored under the key '1' is kept (presumably the
    metrics of the positive class), and the column name is added as 'model'.
    Returns a DataFrame of these entries indexed by 'model'.
    '''
    rows = []
    for column in cols:
        full_report = classification_report(df['Q1_checked'], df[column], output_dict=True)
        rows.append({**full_report['1'], 'model': column})
    return pd.DataFrame(rows).set_index('model')

In [None]:
# Prediction columns to evaluate: single dictionaries first, then combinations.
cols = [
    'polent_only',
    'news_only2',
    'pa_only_notstemmed',
    'pa_only_stemmed',
    'polent_news',
    'pa_notstemmed_news',
    'pa_stemmed_news',
    'pa_notstemmed_polent',
    'pa_stemmed_polent',
    'pa_notstemmed_polent_news',
    'pa_stemmed_polent_news',
]

In [None]:
# Summary table with one row per dictionary (combination), displayed below.
report = make_table(df=merged, cols=cols)
report

In [None]:
# Full classification report per prediction column: the column name, then the
# report, then blank lines before the next one.
for c in cols:
    print(c, classification_report(merged['Q1_checked'], merged[c]), '\n', sep='\n')