<a href="https://colab.research.google.com/github/lucacerab/DSEthesis-transformers/blob/main/ECB_NER%2BTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting structured information from an unstructured set of textual documents

## - EDA

In [None]:
# run this to avoid bugs with latest px version 
!pip install PyYAML==5.4.1 #restart notebook 

Collecting PyYAML==5.4.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[?25l[K     |▌                               | 10 kB 29.8 MB/s eta 0:00:01[K     |█                               | 20 kB 31.0 MB/s eta 0:00:01[K     |█▌                              | 30 kB 35.0 MB/s eta 0:00:01[K     |██                              | 40 kB 25.9 MB/s eta 0:00:01[K     |██▋                             | 51 kB 15.5 MB/s eta 0:00:01[K     |███                             | 61 kB 16.3 MB/s eta 0:00:01[K     |███▋                            | 71 kB 12.1 MB/s eta 0:00:01[K     |████▏                           | 81 kB 12.7 MB/s eta 0:00:01[K     |████▋                           | 92 kB 12.3 MB/s eta 0:00:01[K     |█████▏                          | 102 kB 12.8 MB/s eta 0:00:01[K     |█████▋                          | 112 kB 12.8 MB/s eta 0:00:01[K     |██████▏                         | 122 kB 12.8 MB/s eta 0:00:01[K     |██████▊                         | 1

In [None]:
%%capture
import pandas as pd
import re
import plotly.express as px
import plotly.graph_objs as go
from functools import reduce
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
!pip install langdetect
from langdetect import detect
!pip install spacy-transformers
!python -m spacy download en_core_web_trf
import spacy 
from spacy import displacy
!pip install bertopic

In [None]:
df = pd.read_csv('https://www.ecb.europa.eu/press/key/shared/data/all_ECB_speeches.csv?34c1d2c3450cc09ed620ef318601dc78', sep='|', encoding='utf8').dropna().reset_index(drop=True)

In [None]:
df.head()

Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-07-14,Isabel Schnabel,A new strategy for a changing world,"Speech by Isabel Schnabel, Member of the Execu...",SPEECH A new strategy for a changing world...
1,2021-07-11,Christine Lagarde,"Climate Change and Central Banks: Analysing, A...","Speech by Christine Lagarde, President of the ...",SPEECH Climate Change and Central Banks: A...
2,2021-07-08,"Christine Lagarde,Luis de Guindos",Opening remarks to the press conference (with ...,"Christine Lagarde, President of the ECB, Luis ...",OPENING REMARKS PRESS CONFERENCE Christi...
3,2021-07-03,Isabel Schnabel,Escaping low inflation?,"Speech by Isabel Schnabel, Member of the Execu...",SPEECH Escaping low inflation? Speech by...
4,2021-06-29,Christine Lagarde,Financing a green and digital recovery,"Speech by Christine Lagarde, President of the ...",SPEECH Financing a green and digital recov...


In [None]:
df.speakers.unique()

array(['Isabel Schnabel', 'Christine Lagarde',
       'Christine Lagarde,Luis de Guindos', 'Luis de Guindos',
       'Fabio Panetta', 'Frank Elderson', 'Philip R. Lane', 'Yves Mersch',
       'Benoît Cœuré', 'Sabine Lautenschläger', 'Mario Draghi',
       'Peter Praet', 'Vítor Constâncio', 'Jörg Asmussen',
       'José Manuel González-Páramo', 'Lorenzo Bini Smaghi',
       'Jürgen Stark', 'Jean-Claude Trichet', 'Gertrude Tumpel-Gugerell',
       'Lucas Papademos', 'Otmar Issing', 'Tommaso Padoa-Schioppa',
       'Eugenio Domingo Solans', 'Willem F. Duisenberg',
       'Sirkka Hämäläinen', 'Christian Noyer',
       'Willem F. Duisenberg,Eugenio Domingo Solans',
       'Alexandre Lamfalussy'], dtype=object)

In [None]:
df.isna().sum()

date        0
speakers    0
title       0
subtitle    0
contents    0
dtype: int64

In [None]:
df.shape

(2473, 5)

In [None]:
df[["year", "month", "day"]] = df["date"].str.split("-", expand = True)

In [None]:
df.head(3)

Unnamed: 0,date,speakers,title,subtitle,contents,year,month,day
0,2021-07-14,Isabel Schnabel,A new strategy for a changing world,"Speech by Isabel Schnabel, Member of the Execu...",SPEECH A new strategy for a changing world...,2021,7,14
1,2021-07-11,Christine Lagarde,"Climate Change and Central Banks: Analysing, A...","Speech by Christine Lagarde, President of the ...",SPEECH Climate Change and Central Banks: A...,2021,7,11
2,2021-07-08,"Christine Lagarde,Luis de Guindos",Opening remarks to the press conference (with ...,"Christine Lagarde, President of the ECB, Luis ...",OPENING REMARKS PRESS CONFERENCE Christi...,2021,7,8


Number of speeches per year

In [None]:
count_year = df.groupby(['year'])['title'].count().reset_index().rename(columns={'title':'number'})

In [None]:
fig = px.bar(count_year, y='number', x='year', text='number')
fig.update_traces(textposition='outside', marker_color='rgb(55, 83, 109)')
#fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)', 'paper_bgcolor': 'rgba(0, 0, 0, 0)',}) # remove background color
fig.update_layout(title_text="Number of speeches per year", 
                  xaxis_title="Year",
                  yaxis_title="Number of speeches",
                  width=1280, height=720)
fig.show(renderer='colab')

Number of speeches per month

In [None]:
count_month = df.groupby(['month'])['title'].count().reset_index().rename(columns={'title':'number'})

In [None]:
fig = px.bar(count_month, y='number', x='month', text='number')
fig.update_traces(textposition='outside', marker_color='rgb(55, 83, 109)')
#fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)', 'paper_bgcolor': 'rgba(0, 0, 0, 0)',}) # remove background color
fig.update_layout(title_text="Number of speeches per month", 
                  xaxis_title="Month",
                  yaxis_title="Number of speeches",
                  width=1280, height=720)
fig.show(renderer='colab')

Presidents' speeches as a share of the total per year

In [None]:
pres_list = ['Mario Draghi', 'Christine Lagarde', 'Willem F. Duisenberg', 'Jean-Claude Trichet']

In [None]:
grouped = df.groupby(['year', 'speakers'])['title'].count().reset_index().rename(columns={'title':'number'})

In [None]:
# `speakers` stores joint appearances as one comma-separated string
# (e.g. 'Christine Lagarde,Luis de Guindos'), so an exact `isin` match on the
# whole string would drop every speech a president gave with a co-speaker.
# Split the string and keep the row when any listed speaker is a president.
has_president = grouped['speakers'].str.split(',').apply(
    lambda names: any(name.strip() in pres_list for name in names)
).astype(bool)
pres_numbers = grouped[has_president]

In [None]:
pres_numbers = pres_numbers.groupby(['year'])['number'].sum().reset_index()

In [None]:
merged = pd.merge(pres_numbers, count_year, on='year').rename(columns={'number_x': 'pres_speeches', 'number_y': 'total_speeches'})

In [None]:
merged['others'] = merged['total_speeches'] - merged['pres_speeches']

In [None]:
merged['percent'] = round((merged['pres_speeches'] / merged['total_speeches'])*100).astype(int)

In [None]:
fig = go.Figure(data=[
    go.Bar(name='President speeches', x=merged.year, y=merged.pres_speeches, marker_color='rgb(26, 118, 255)', text=merged.percent, 
           texttemplate="%{text}%", textposition="inside"),
    go.Bar(name='Total speeches', x=merged.year, y=merged.others, marker_color='rgb(55, 83, 109)', 
           text=merged.total_speeches, textposition='outside')
])
# Change the bar mode
fig.update_layout(barmode='stack',
                  title_text="Number of speeches per year and percentage of president speeches", 
                  xaxis_title="Year",
                  yaxis_title="Number of speeches",
                  width=1280, height=720)
#fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)', 'paper_bgcolor': 'rgba(0, 0, 0, 0)',}) # remove background color
fig.show()

Statistics per speaker

In [None]:
# number of speeches per speaker
num = df.groupby(['speakers'])['title'].count().reset_index().rename(columns={'title':'number'})

In [None]:
# first speech per speaker
date_first = df.groupby(['speakers'])['date'].min().reset_index().rename(columns={'date':'First'})

In [None]:
# last speech per speaker
# (`date` is a 'YYYY-MM-DD' string here, so the string max is also the latest date)
date_last = df.groupby(['speakers'])['date'].max().reset_index().rename(columns={'date':'Last'})

In [None]:
# merge the per-speaker count, first-date and last-date frames on 'speakers'
stats = reduce(lambda left,right: pd.merge(left,right,on='speakers'), [num, date_first, date_last])

In [None]:
# order speakers from most to fewest speeches
stats = stats.sort_values(by=['number'], ascending=False)

In [None]:
stats

Unnamed: 0,speakers,number,First,Last
10,Jean-Claude Trichet,329,2003-11-20,2011-10-24
1,Benoît Cœuré,190,2012-02-06,2019-12-18
17,Mario Draghi,186,2011-11-18,2019-10-28
27,Yves Mersch,161,2013-02-27,2020-11-04
8,Gertrude Tumpel-Gugerell,158,2003-06-30,2011-05-23
25,Willem F. Duisenberg,157,1997-06-30,2003-10-29
24,Vítor Constâncio,126,2010-07-09,2018-05-17
19,Peter Praet,126,2011-06-16,2019-05-15
11,José Manuel González-Páramo,119,2004-11-11,2012-05-18
14,Lorenzo Bini Smaghi,119,2005-10-10,2011-12-16


Speeches length

In [None]:
# keep only english speeches
def detect_en(text):
    """Return True when `detect` classifies `text` as English ('en').

    Any exception raised by `detect` is treated as "not English", so a single
    problematic row cannot abort the whole filtering pass.
    """
    try:
        return detect(text) == 'en'
    # `Exception` rather than a bare `except`, so KeyboardInterrupt/SystemExit
    # still stop a long-running `apply` instead of being silently swallowed.
    except Exception:
        return False

In [None]:
df = df[df['contents'].apply(detect_en)].reset_index(drop=True)
df.head()

Unnamed: 0,date,speakers,title,subtitle,contents,year,month,day
0,2021-07-14,Isabel Schnabel,A new strategy for a changing world,"Speech by Isabel Schnabel, Member of the Execu...",SPEECH A new strategy for a changing world...,2021,7,14
1,2021-07-11,Christine Lagarde,"Climate Change and Central Banks: Analysing, A...","Speech by Christine Lagarde, President of the ...",SPEECH Climate Change and Central Banks: A...,2021,7,11
2,2021-07-08,"Christine Lagarde,Luis de Guindos",Opening remarks to the press conference (with ...,"Christine Lagarde, President of the ECB, Luis ...",OPENING REMARKS PRESS CONFERENCE Christi...,2021,7,8
3,2021-07-03,Isabel Schnabel,Escaping low inflation?,"Speech by Isabel Schnabel, Member of the Execu...",SPEECH Escaping low inflation? Speech by...,2021,7,3
4,2021-06-29,Christine Lagarde,Financing a green and digital recovery,"Speech by Christine Lagarde, President of the ...",SPEECH Financing a green and digital recov...,2021,6,29


In [None]:
df.shape

(2314, 8)

In [None]:
# remove every literal 'SPEECH' marker from the contents column, then trim surrounding whitespace
df['contents'] = df['contents'].str.replace('SPEECH', '', regex=False).str.strip()

In [None]:
df['contents_tokenized'] = df['contents'].apply(word_tokenize)

In [None]:
# count only words in tokenized text
def cnt(li):
  """Return how many tokens in `li` are purely alphabetic (per `str.isalpha`)."""
  return sum(1 for token in li if token.isalpha())

In [None]:
df['token_number'] = df['contents_tokenized'].apply(cnt)

In [None]:
# keep only the rows whose text contains at least one alphabetic token
df = df[df.token_number != 0].reset_index(drop=True)

In [None]:
df.token_number.describe()

count     2314.000000
mean      2958.838807
std       1786.465976
min         37.000000
25%       1825.500000
50%       2657.500000
75%       3741.000000
max      19214.000000
Name: token_number, dtype: float64

In [None]:
avg_words_year = df.groupby('year')['token_number'].mean().astype(int).reset_index()

In [None]:
# assuming 140 words per minute we can compute the avg speech time per year dividing token number per 140
avg_words_year['minutes'] = avg_words_year['token_number'].div(140).astype(int)

In [None]:
import plotly.express as px

fig = px.bar(avg_words_year, y='minutes', x='year', text='minutes')
fig.update_traces(textposition='outside', marker_color='rgb(55, 83, 109)')
#fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)', 'paper_bgcolor': 'rgba(0, 0, 0, 0)',}) # remove background color
fig.update_layout(title_text="Avg speech time per year", 
                  xaxis_title="Year",
                  yaxis_title="Avg time (minutes)",
                  width=1280, height=720)
fig.show(renderer='colab')

In [None]:
frequency_table = df.groupby('token_number')['title'].count().reset_index().rename(columns={'title':'freq'})
frequency_table['minutes'] = frequency_table['token_number'].div(140).astype(int)

In [None]:
# `frequency_table` has one row per distinct token count, with `freq` speeches
# behind each row. The average speech time must therefore be weighted by `freq`;
# a plain `minutes.mean()` averages over distinct lengths, not over speeches.
total_speeches = frequency_table['freq'].sum()
avg_minutes = int((frequency_table['minutes'] * frequency_table['freq']).sum() / total_speeches) if total_speeches else 0

fig = px.histogram(frequency_table, x="minutes", y='freq', nbins=50)
fig.update_traces(marker_color='rgb(55, 83, 109)')
#fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)', 'paper_bgcolor': 'rgba(0, 0, 0, 0)',}) # remove background color
fig.update_layout(bargap=0.2,
                  title_text="Speeches length distribution (140wpm)<br><sup>avg speech time = {} minutes</sup>".format(avg_minutes),
                  xaxis_title="Minutes",
                  yaxis_title="Frequency",
                  width=1280, height=720)
fig.show()

## - NER to associate each speech with a set of keywords (speaker, date, location, conference...)

In [None]:
trf = spacy.load('en_core_web_trf')

In [None]:
def ner(st):
  """Run the `trf` pipeline on `st` and group its entities by label.

  Returns a dict mapping each entity label to the texts of all entities with
  that label, joined by ',' in order of appearance. Labels appear in the
  dict in the order they are first encountered.
  """
  texts_by_label = {}
  for entity in trf(st).ents:
    texts_by_label.setdefault(entity.label_, []).append(entity.text)
  return {label: ','.join(texts) for label, texts in texts_by_label.items()}

In [None]:
df['tags'] = df['subtitle'].apply(ner)

In [None]:
df.head(1)

Unnamed: 0,date,speakers,title,subtitle,contents,year,month,day,contents_tokenized,token_number,tags
0,2021-07-14,Isabel Schnabel,A new strategy for a changing world,"Speech by Isabel Schnabel, Member of the Execu...",A new strategy for a changing world Speech b...,2021,7,14,"[A, new, strategy, for, a, changing, world, Sp...",3829,"{'PERSON': 'Isabel Schnabel', 'ORG': 'the Exec..."


## - Topic Modeling

In [None]:
time = df.date.to_list()
speeches = df.contents.to_list()

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model = BERTopic(language='english', nr_topics='auto', 
                       vectorizer_model=vectorizer_model, verbose=True)

topics, _ = topic_model.fit_transform(speeches)

Batches:   0%|          | 0/73 [00:00<?, ?it/s]

2021-10-15 15:44:12,323 - BERTopic - Transformed documents to Embeddings
2021-10-15 15:44:25,590 - BERTopic - Reduced dimensionality with UMAP
2021-10-15 15:44:25,729 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-10-15 15:45:29,357 - BERTopic - Reduced number of topics from 44 to 28


In [None]:
freq = topic_model.get_topic_info()
print(len(freq), 'topics found')

28 topics found


In [None]:
freq.head(10)

Unnamed: 0,Topic,Count,Name
0,-1,807,-1_monetary_euro_euro area_monetary policy
1,0,434,0_euro_euro area_monetary_monetary policy
2,1,207,1_financial_markets_financial integration_banks
3,2,126,2_monetary_monetary policy_inflation_bank
4,3,120,3_banks_banking_supervisory_bank
5,4,117,4_euro_euro area_countries_european
6,5,57,5_pandemic_2020_inflation_pepp
7,6,45,6_macroprudential_financial_systemic risk_macr...
8,7,42,7_data_ecb_financial_accounts
9,8,40,8_payments_payment_services_retail payments


In [None]:
topics_over_time = topic_model.topics_over_time(docs=speeches, 
                                                topics=topics, 
                                                timestamps=time, 
                                                global_tuning=True, 
                                                evolution_tuning=True, 
                                                nr_bins=20)

20it [11:00, 33.01s/it]


In [None]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=8)

In [None]:
df['topic'] = topics

In [None]:
# filter only monetary policy speeches (topic #2)
# NOTE(review): the id 2 is read off the `get_topic_info()` output of one particular fit;
# no random seed is passed to BERTopic in this notebook, so presumably topic ids can
# change between runs — confirm the topic's name before reusing this filter.
mp = df[df['topic'] == 2]

In [None]:
final_df = mp[['contents', 'tags', 'topic']].reset_index(drop=True)

In [None]:
final_df.head()

Unnamed: 0,contents,tags,topic
0,"Speech New York, 21 February 2020 The monetar...","{'PERSON': 'Philip R. Lane', 'ORG': 'the Execu...",2
1,"Frankfurt am Main, 24 September 2019 Measures...","{'PERSON': 'Luis de Guindos', 'ORG': 'ECB,the ...",2
2,"Communication, expectations and monetary polic...","{'PERSON': 'Luis de Guindos', 'ORG': 'ECB,ECB'...",2
3,The effects of APP reinvestments on euro area ...,"{'PERSON': 'Benoît Cœuré', 'ORG': 'the Executi...",2
4,Providing monetary policy stimulus after the n...,"{'PERSON': 'Peter Praet', 'ORG': 'the Executiv...",2


## Example: select speeches to be fed to the argumentation mining pipeline (ECBspeech_AM.ipynb)

In [None]:
# print the row index of every speech whose subtitle tags mention Mario Draghi
for index, row in final_df.iterrows():
  # `tags` only has a 'PERSON' key when NER found a person in the subtitle;
  # default to '' so the membership test does not raise TypeError on None.
  if 'Mario Draghi' in row['tags'].get('PERSON', ''):
    print(index)

32
43
44
51
62


In [None]:
final_df.contents[32][:189]

'Researching the Macroeconomic Policies of the Future   Welcome address by Mario Draghi, President of the ECB,at the first ECB Annual Research Conference,Frankfurt am Main, 28 September 2016'