# Context-enhanced dictionaries: Rolling dictionaries
Note that news content cannot be shared in this repo but is available publicly (see paper for source). 

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from urllib.parse import urlencode
from urllib.parse import quote
from datetime import datetime, timedelta
import re
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('dutch') 
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
stemmer = SnowballStemmer('dutch')
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load("nl_core_news_sm")
from fuzzywuzzy import fuzz

In [None]:
## preprocessing functions
# Small Dutch spaCy pipeline used by the entity / noun extractors defined in this cell.
# NOTE(review): the imports cell already assigns `nlp` from the same model, so this
# second load looks redundant — confirm before removing.
nlp = spacy.load("nl_core_news_sm")

def transform_lowercase(x):
    """Return a lower-cased copy of the string ``x``."""
    lowered = x.lower()
    return lowered

def remove_punctuation(x):
    """Delete every character that is neither a word character nor whitespace.

    Underscores count as word characters for the regex engine, so the pattern
    lists them separately to have them removed as well.
    """
    punctuation = re.compile(r'[^\w\s]|_')
    return punctuation.sub('', x)

def remove_numbers(x):
    """Delete every run of digits from ``x``, including digits inside words."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', x)

def remove_numbers2(string): # avoid removing numbers when they are part of a word (e.g., D66)
    """Remove stand-alone numbers from ``string`` and strip the result.

    Only digit runs delimited by word boundaries on both sides are deleted, so
    digits that belong to a word (``D66``, ``covid19``) are left intact.

    The substitution is done in a single regex pass. Replacing each match with
    ``str.replace`` would also delete the same digits wherever they occur
    inside other words (``"D66 66"`` would lose the ``66`` of ``D66``).

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        ``string`` without stand-alone numbers and without leading/trailing
        whitespace. Inner whitespace around a removed number is kept.
    """
    return re.sub(r'\b\d+\b', '', string).strip()

def remove_links(x):
    """Delete every token that starts with ``http`` up to the next whitespace."""
    link = re.compile(r'http\S+')
    return link.sub('', x)

def remove_linebreaks(x):
    """Turn each newline into a single space and strip leading/trailing whitespace.

    Consecutive spaces inside the text are not collapsed.
    """
    single_line = ' '.join(x.split('\n'))
    return single_line.strip()

def remove_stopwords(x):
    """Drop stop words and empty tokens from a space-separated string.

    The text is split on single spaces, filtered against the module-level
    ``stopwords`` list, and re-joined with single spaces.
    """
    kept = []
    for token in x.split(" "):
        if token != "" and token not in stopwords:
            kept.append(token)
    return " ".join(kept)

def list_of_words(x):
    """Split ``x`` on single spaces; consecutive spaces yield empty strings."""
    words = x.split(" ")
    return words

def tokenize(text):
    """Lower-case ``text``, tokenize it with NLTK and return the stemmed tokens.

    Stemming uses the module-level Dutch Snowball ``stemmer``.
    """
    return [stemmer.stem(token) for token in word_tokenize(text.lower())]

def extract_named_entities_nouns(text):
    """Return the unique named entities and nouns that spaCy finds in ``text``.

    Entities come from ``doc.ents``; nouns are the tokens tagged ``NOUN``.
    Duplicates are removed through a set, so the order of the returned list is
    arbitrary.
    """
    parsed = nlp(text)
    entity_texts = [ent.text for ent in parsed.ents]
    noun_texts = [token.text for token in parsed if token.pos_ == 'NOUN']
    unique_terms = set(entity_texts + noun_texts)
    return list(unique_terms)

def extract_named_entities(text):
    """Return the unique named entities that spaCy finds in ``text``.

    Duplicates are removed through a set, so the order of the returned list is
    arbitrary.
    """
    unique_entities = {ent.text for ent in nlp(text).ents}
    return list(unique_entities)



def stem_list_of_words(x):
    """Stem every word in the iterable ``x`` with the module-level ``stemmer``."""
    return list(map(stemmer.stem, x))

In [None]:
## for matching purposes
def preprocess(x):
    """Normalise a query for exact matching.

    Lower-cases ``x``, removes punctuation and underscores, and strips
    leading/trailing whitespace.
    """
    normalised = x.lower()
    normalised = re.sub(r'[^\w\s]|_', '', normalised)
    return normalised.strip()

# Dictionaries

In [None]:
# Load the four term dictionaries from data/dictionaries/.
# NOTE: unpickling can execute arbitrary code — only load these files from a trusted source.
news = pd.read_pickle("data/dictionaries/newsoutlets_100.pkl")
# The path previously started with "Pdata/" (stray character); all sibling
# dictionaries are read from "data/dictionaries/".
parties = pd.read_pickle("data/dictionaries/parties.pkl")
pol = pd.read_pickle("data/dictionaries/tweedekamerleden_kabinetsleden.pkl")
pa = pd.read_pickle("data/dictionaries/policy_agendas_dutch.pkl")

In [None]:
# assumes both objects are plain Python lists, so `+` concatenates them — TODO confirm
polent = parties + pol # combine parties and politicians into political entities list

In [None]:
# Drop three names from the political-entity list; per the original note they
# "mess things up" (presumably too ambiguous to match reliably — confirm).
polent = [name for name in polent if name not in ('50+', "GO", "GOUD")]

In [None]:
print('Policy Agendas:',len(pa),
      'News:', len(news),
      'Parties:',len(parties),
      'Politicians:', len(pol))

In [None]:
### PREPROCESSING

In [None]:
## Normalise the dictionaries: outlet and political-entity names are only
## lower-cased (their punctuation is kept); policy-agenda terms are lower-cased
## and additionally stripped of punctuation.
news = [outlet.lower() for outlet in news]
polent = [entity.lower() for entity in polent]
pa = [remove_punctuation(term.lower()) for term in pa]

In [None]:
# add the generic Dutch word "nieuws" ("news") to the outlet terms
news = news + ["nieuws"]

In [None]:
# Political-entity and news-outlet terms; used as the default `exclusion_list`
# argument of the preprocess_analysis_* functions defined further down.
exclusion_list=polent+news

# News article data

In [None]:
news_data = pd.read_csv("news_03012021-06302022.csv")
news_data.shape

In [None]:
news_data.head()

In [None]:
# Convert the date column to a datetime format
news_data['date'] = pd.to_datetime(news_data['date'])
# Concatenate all titles published on the same calendar day into one string.
# Missing titles are dropped and the rest cast to str first: ' '.join() raises
# TypeError as soon as a group contains a NaN (float) title.
news_per_day = (
    news_data.groupby(pd.Grouper(key='date', freq='D'))['title']
    .apply(lambda titles: ' '.join(titles.dropna().astype(str)))
    .reset_index()
)

### Preprocessing

In [None]:
# with nouns
def preprocess_analysis_news(x, exclusion_list=exclusion_list):
    """Extract and clean the named entities and nouns of a news text.

    Terms that appear in ``exclusion_list`` (party, politician and news-outlet
    names) are kept verbatim; every other term loses its punctuation and then
    its digits. Previously the ``if w not in exclusion_list`` filter inside the
    comprehensions removed those names from the result altogether, which
    contradicted the stated intent of only sparing them from cleaning.

    Parameters
    ----------
    x : str
        Raw news text.
    exclusion_list : list of str
        Lower-cased names that must not be cleaned. Defaults to the
        module-level ``exclusion_list`` as it exists when this function is
        defined.

    Returns
    -------
    list of str
        Lower-cased, cleaned terms without stop words or empty strings.
    """
    protected = set(exclusion_list)  # O(1) membership tests

    def _clean(term):
        # Membership is checked before each cleaning step, mirroring the
        # two-pass structure of the original code: a term that only becomes a
        # protected name after punctuation removal keeps its digits.
        if term in protected:
            return term
        term = remove_punctuation(term)
        if term in protected:
            return term
        return remove_numbers(term)

    terms = [_clean(term.lower()) for term in extract_named_entities_nouns(x)]
    terms = [remove_linebreaks(remove_links(term)).strip() for term in terms]
    return [term for term in terms if term != "" and term not in stopwords]

#without nouns
def preprocess_analysis_news2(x, exclusion_list=exclusion_list):
    """Extract and clean the named entities (no nouns) of a news text.

    Terms that appear in ``exclusion_list`` (party, politician and news-outlet
    names) are kept verbatim; every other term loses its punctuation and then
    its digits. Previously the ``if w not in exclusion_list`` filter inside the
    comprehensions removed those names from the result altogether, which
    contradicted the stated intent of only sparing them from cleaning.

    Parameters
    ----------
    x : str
        Raw news text.
    exclusion_list : list of str
        Lower-cased names that must not be cleaned. Defaults to the
        module-level ``exclusion_list`` as it exists when this function is
        defined.

    Returns
    -------
    list of str
        Lower-cased, cleaned entities without stop words or empty strings.
    """
    protected = set(exclusion_list)  # O(1) membership tests

    def _clean(term):
        # Membership is checked before each cleaning step, mirroring the
        # two-pass structure of the original code: a term that only becomes a
        # protected name after punctuation removal keeps its digits.
        if term in protected:
            return term
        term = remove_punctuation(term)
        if term in protected:
            return term
        return remove_numbers(term)

    terms = [_clean(term.lower()) for term in extract_named_entities(x)]
    terms = [remove_linebreaks(remove_links(term)).strip() for term in terms]
    return [term for term in terms if term != "" and term not in stopwords]

In [None]:
# Per-day term lists: title_c holds entities + nouns, title_c2 entities only.
news_per_day['title_c'] = news_per_day['title'].apply(preprocess_analysis_news)
news_per_day['title_c2'] = news_per_day['title'].apply(preprocess_analysis_news2)

# Search query data + labels

In [None]:
# test dataset
# allow_pickle=True lets NumPy unpickle object arrays — only load files from a trusted source.
X_test = np.load("data/train_test/X_test.npy", allow_pickle=True).tolist()
y_test = np.load("data/train_test/y_test.npy", allow_pickle=True).tolist()
print(len(y_test))

In [None]:
df = pd.DataFrame({"X_test":X_test, "y_test":y_test})
print(df.shape)
df.head()

In [None]:
# browsing data
br = pd.read_csv("browser2022.csv")
br.shape

In [None]:
# search only
br = br[br['text_search']==1].copy()
br.shape

In [None]:
br['date_easy'] = br.date_dt.astype(str).str[0:10]

In [None]:
# make matching column
# Both sides go through preprocess() (lower-case, punctuation/underscore removal,
# strip) so labelled queries and browser queries can be joined on identical text.
df['q_match'] = df['X_test'].astype(str).apply(preprocess)
br['q_match'] = br['q'].astype(str).apply(preprocess)

In [None]:
# Left join keeps every labelled query: a query that occurs several times in `br`
# yields one row per occurrence, and a query without a match gets NaN in the
# browser columns.
df = pd.merge(df,br,on='q_match', how='left')
df.shape

In [None]:
# 22,178 searches with labels.
df.y_test.isna().value_counts()

In [None]:
df.X_test.nunique()

In [None]:
# Keep one row per (query, day). `.copy()` makes df_u an independent frame, so
# adding columns to it later does not raise pandas' SettingWithCopyWarning.
df_u = df.drop_duplicates(subset=['X_test', 'date_easy']).copy()
print(df.shape, df_u.shape)

### Preprocessing

In [None]:
def preprocess_analysis_sq(x, exclusion_list=exclusion_list):
    """Split a search query into cleaned words.

    Words that appear in ``exclusion_list`` (party, politician and news-outlet
    names) are kept verbatim; every other word loses its punctuation and then
    its digits. Previously the ``if w not in exclusion_list`` filter inside the
    comprehensions removed those names from the query altogether, which
    contradicted the stated intent of only sparing them from cleaning.

    Parameters
    ----------
    x : str
        Raw search query.
    exclusion_list : list of str
        Lower-cased names that must not be cleaned. Defaults to the
        module-level ``exclusion_list`` as it exists when this function is
        defined.

    Returns
    -------
    list of str
        Lower-cased, cleaned words without stop words or empty strings.
    """
    protected = set(exclusion_list)  # O(1) membership tests

    def _clean(word):
        # Membership is checked before each cleaning step, mirroring the
        # two-pass structure of the original code: a word that only becomes a
        # protected name after punctuation removal keeps its digits.
        if word in protected:
            return word
        word = remove_punctuation(word)
        if word in protected:
            return word
        return remove_numbers(word)

    words = [_clean(word).strip() for word in list_of_words(transform_lowercase(x))]
    return [word for word in words if word != "" and word not in stopwords]

In [None]:
# Clean every query into a list of words; these lists are later fuzzy-matched
# against the per-day news terms.
df_u['X_test_c'] = df_u.X_test.apply(preprocess_analysis_sq)

# News content overlap

In [None]:
def create_timeframe(given_date, days_before, days_after):
    """Return the days in a window around ``given_date``, oldest first.

    Parameters
    ----------
    given_date : str
        Centre of the window, formatted as ``YYYY-MM-DD``.
    days_before : int
        Number of days to include before the given date.
    days_after : int
        Number of days to include after the given date.

    Returns
    -------
    list of datetime
        The given date plus the requested neighbouring days, sorted ascending.
    """
    centre = datetime.strptime(given_date, '%Y-%m-%d')
    earlier = [centre - timedelta(days=offset) for offset in range(1, days_before + 1)]
    later = [centre + timedelta(days=offset) for offset in range(1, days_after + 1)]
    return sorted(earlier + [centre] + later)

In [None]:
def fuzzymatching_news(query, query_date, news_df, text_col, days_before=0, days_after=1, threshold=80):
    """Flag whether a query fuzzily matches any news term around its date.

    Parameters
    ----------
    query : str or list of str
        The search query; a list is joined with spaces into one string.
    query_date : str
        Date of the query, formatted as ``YYYY-MM-DD``.
    news_df : DataFrame
        News table with a ``date`` column and the column named by ``text_col``.
    text_col : str
        Column whose cells are iterated term by term (each cell is flattened
        into the candidate list).
    days_before, days_after : int
        Size of the date window around ``query_date``.
    threshold : int
        Minimum ``fuzz.token_set_ratio`` score that counts as a match.

    Returns
    -------
    int
        1 if at least one term in the window reaches the threshold, else 0.
    """
    window = create_timeframe(query_date, days_before, days_after)
    in_window = news_df['date'].isin(window)

    # Flatten the per-row term collections of the selected days into one list.
    candidate_terms = [
        term
        for row_terms in news_df.loc[in_window, text_col].tolist()
        for term in row_terms
    ]

    if isinstance(query, list):
        query = " ".join(query) # make one string

    # any() stops at the first term that reaches the threshold.
    matched = any(fuzz.token_set_ratio(query, term) >= threshold for term in candidate_terms)
    return int(matched)


In [None]:
q_lst = df_u.X_test_c.tolist()
d_lst = df_u.date_easy.tolist()
l_lst = df_u.y_test.tolist()

# One entry per result column; the match columns are added in the loop below.
res = {
    "query": q_lst,
    "X_test": df_u.X_test.tolist(),
    "date": d_lst,
    'label': l_lst,
}

# Result-column prefix -> news term column it is matched against.
news_columns = {'entities': 'title_c2', 'entities_nouns': 'title_c'}

# For every threshold and term column, match each query against the news of
# the day before, the same day and the day after.
for t in [80, 85, 90, 95]:
    for prefix, col in news_columns.items():
        res[prefix + "_" + str(t)] = [
            fuzzymatching_news(query, query_date, news_per_day, col, days_before=1, days_after=1, threshold=t)
            for query, query_date in tqdm(zip(q_lst, d_lst))
        ]

In [None]:
res = pd.DataFrame(res)
res

In [None]:
## higher thresholds do not make a substantial difference
# Print one classification report per entities+nouns column, each followed by a blank line.
cols_en = [c for c in res.columns if c.startswith("entities_nouns")]
for c in cols_en:
    print(c, classification_report(res['label'], res[c]), "", sep="\n")

In [None]:
# Entities-only columns: everything prefixed "entities_" that is not an
# entities+nouns column. One report per column, each followed by a blank line.
cols = []
for c in res.columns:
    if c.startswith("entities_") and c not in cols_en:
        cols.append(c)
for c in cols:
    print(c, classification_report(res['label'], res[c]), "", sep="\n")

In [None]:
print(classification_report(res['label'], res['entities_80']))

In [None]:
print(classification_report(res['label'], res['entities_nouns_80']))

In [None]:
# Entities-only matching gives the better overall F1; entities + nouns gives the better recall.

## Combine with dictionary

In [None]:
dictionary_preds = pd.read_csv("dictionary_preds.csv")
dictionary_preds.shape

In [None]:
res2 = pd.merge(res, dictionary_preds[['X_test', 'pa_stemmed_polent_news_dict']], how='left', on='X_test')
res2.shape

In [None]:
res2.pa_stemmed_polent_news_dict.value_counts(dropna=False)

In [None]:
res2.entities_80.value_counts(dropna=False)

In [None]:
res2.entities_nouns_80.value_counts(dropna=False)

In [None]:
# A query is flagged when either the static dictionary or the news overlap flags it (element-wise OR).
# NOTE(review): the left merge that builds res2 can leave NaN in pa_stemmed_polent_news_dict
# for unmatched queries; `|` presumably needs NaN-free integer/boolean columns — confirm
# with the value_counts(dropna=False) output.
res2['dict_news_e'] = (res2['pa_stemmed_polent_news_dict'] | res2['entities_80']).astype(int)
res2['dict_news_e_n'] = (res2['pa_stemmed_polent_news_dict'] | res2['entities_nouns_80']).astype(int)

In [None]:
res2.dict_news_e.value_counts(dropna=False)

In [None]:
res2.dict_news_e_n.value_counts(dropna=False)

In [None]:
def make_table(df, cols):
    '''
    Build one comparison table from the class-'1' metrics of several prediction columns.

    For every column name in ``cols`` a classification report of ``df['label']``
    against that column is computed, and only the entry for class ``'1'`` is kept.

    Returns a DataFrame indexed by ``model`` (the column name), one row per column.
    '''
    rows = [
        {**classification_report(df['label'], df[c], output_dict=True)['1'], 'model': c}
        for c in cols
    ]
    return pd.DataFrame(rows).set_index('model')

In [None]:
# Compare the two news-overlap variants (threshold 80) with their dictionary-combined counterparts.
cols = ["entities_80", "entities_nouns_80", 'dict_news_e', 'dict_news_e_n']
report = make_table(res2, cols)
report