# Find mentions of the EU in articles

This notebook searches the news articles for paragraphs that mention the EU (matching the terms 'Europese Unie' and 'EU'), collects those paragraphs in a pandas DataFrame, and saves the result as gzip-compressed JSON.

In [1]:
import pandas as pd
import json
import re

#Location of the JSON settings file; its "settings" entry supplies the input and output paths used below
settings_file = 'D:/thesis/settings - nl.json'

In [2]:
#Preparation

#Read settings
#The context manager closes the file handle as soon as the JSON has been parsed
with open(settings_file) as settings_handle:
    settings = json.load(settings_handle)["settings"]

#Read data
#The articles are stored as gzip-compressed JSON at the path given by the 'data_json' setting
df = pd.read_json(settings['data_json'], compression = 'gzip')
#Order the articles by index so that they are processed in a stable order
df.sort_index(inplace = True)

In [15]:
#Choose search terms to denote EU
terms = ['Europese Unie','EU']
#Sort by length (longest first): regex alternation picks the first alternative that matches,
#so this keeps a longer term from being shadowed by a shorter term that overlaps with it.
#Not strictly necessary here, but to register the exact matches this is a nice trick.
terms = sorted(terms, key = len, reverse = True)
#Escape the terms so that they are always matched literally
terms = [re.escape(word) for word in terms]

#Matches a bare search term, regardless of what surrounds it
EU_terms = re.compile('|'.join(terms))

#Matches a search term only when it is not glued to other (ASCII) letters, so 'EU' is not
#found inside words such as 'NEUTRAAL'. Zero-width lookarounds are used instead of
#consuming the neighbouring characters: this way a term at the very start or end of a
#text is found, and two terms separated by a single character are both found.
terms = [r'(?<![a-zA-Z])' + word + r'(?![a-zA-Z])' for word in terms]
EU_terms_pattern = re.compile('|'.join(terms))


In [63]:
#Create df with relevant pieces of text
#One record is collected per paragraph that mentions at least one of the search terms
#NOTE(review): assumes every 'TEXT' cell holds a sequence of paragraph strings - confirm
snippet_columns = ['TEXT', 'PARAGRAPH_NO', 'MEDIUM', 'HEADLINE', 'DATE_dt', 'MATCHES']
EU_snippets = []

for row in df.index:
    #Paragraph numbers are zero-based!
    for paragraph_no, text in enumerate(df.loc[row,'TEXT']):
        #Scan the paragraph once; an empty list means that no search term occurs in it
        raw_matches = EU_terms_pattern.findall(text)
        if not raw_matches:
            continue

        #Reduce every raw match to the bare search term, dropping any surrounding characters
        term_hits = (EU_terms.search(raw_match) for raw_match in raw_matches)
        EU_snippets.append({
            'TEXT' : text,
            'PARAGRAPH_NO' : paragraph_no,
            'MEDIUM' : df.loc[row,'MEDIUM'],
            'HEADLINE' : df.loc[row,'HEADLINE'],
            'DATE_dt' : df.loc[row,'DATE_dt'],
            'MATCHES' : [hit[0] for hit in term_hits if hit is not None]
        })

#Passing the columns explicitly keeps the schema intact even when nothing was found
EU_snippets = pd.DataFrame(EU_snippets, columns = snippet_columns)

In [72]:
#Save the extracted snippets as gzip-compressed JSON at the path given by the 'mentions_json' setting
EU_snippets.to_json(path_or_buf = settings['mentions_json'], compression = 'gzip')