In [1]:
import re
from huggingface_hub import hf_hub_download
import pandas as pd
from datasets import load_dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Data Cleaning

In [2]:
# Loading dataset with FED and ECB speeches from Hugging Face
ds = load_dataset("istat-ai/ECB-FED-speeches")
df = ds['train'].to_pandas()

In [3]:
# Separating speech dates by country
us_dates = df[df['country'] == 'United States']['date']
ea_dates = df[df['country'] == 'Euro area']['date']

In [4]:
# Filtering speeches from 1999 onwards
df = df[df['date'] >= pd.to_datetime('1999-01-01')]

In [5]:
# Loading historical FED interest rates dataset
fed_df = pd.read_csv("data/FEDFUNDS.csv", parse_dates=['observation_date'])
fed_df = fed_df.rename(columns={
    'observation_date': 'rate_date',
    'FEDFUNDS': 'rate'
})
fed_df = fed_df.sort_values('rate_date')

# Loading historical ECB interest rates dataset + dropping unnecessary coloumns
ecb_df = pd.read_csv("data/ECB_data.csv", parse_dates=['DATE'])
ecb_df = ecb_df.rename(columns={
    'DATE': 'rate_date',
    'Main refinancing operations - fixed rate tenders (fixed rate) (date of changes) - Level (FM.B.U2.EUR.4F.KR.MRR_FR.LEV)': 'rate'
})
ecb_df = ecb_df.drop(columns=['TIME PERIOD'])
ecb_df['rate_date'] = ecb_df['rate_date']
ecb_df = ecb_df.sort_values('rate_date')

In [6]:
# Filter rows where the rate difference is not 0
fed_df = fed_df[fed_df['rate'].diff().fillna(0) != 0]
ecb_df = ecb_df[ecb_df['rate'].diff().fillna(0) != 0]

In [7]:
# Manuelles Hinzufügen eines Werts für den 1. Januar 1999
ecb_df.loc[len(ecb_df)] = [pd.Timestamp('1999-01-01'), 3.0]
fed_df.loc[len(fed_df)] = [pd.Timestamp('1999-01-01'), 4.63]

In [8]:
# Dropping irrelevant columns from speech dataset
df = df.drop(columns=['mistral_ocr', 'url'])

In [9]:
def clean_ecb_speech(text):
    """
    Clean ECB speech text extracted via web scraping.
    Removes headers, bracketed references, slide annotations, and trims at 'Thank you' or fallback end markers.
    If no ending markers are found, the full text is retained.
    """

    if pd.isna(text):
        return ""

    # Step 1: Remove all-caps 'SPEECH'
    text = re.sub(r'\bSPEECH\b', '', text)

    # Step 2: Remove citation brackets like [10]
    text = re.sub(r'\[\d+\]', '', text)

    # Step 3: Remove (Slide ...) type annotations
    text = re.sub(r'\([^)]*slide[^)]*\)', '', text, flags=re.IGNORECASE)

    # Step 4: Truncate after "Thank you." or "Thank you for your attention."
    thank_you_match = re.search(r'(.*?\bThank you(?: for your attention)?\.\s*)', text, flags=re.IGNORECASE | re.DOTALL)
    if thank_you_match:
        text = thank_you_match.group(1).strip()
    else:
        # Step 5: Fallback endings – if no Thank you, truncate before known post-speech sections
        fallback_endings = [
            r'\bI thank .{5,100}? for their contributions',  # Matches names
            r'\bI am grateful .{5,100}? for their contributions',
            r'\bThe views expressed in this speech are personal',
            r'\bIndex based on',
        ]
        found_fallback = False
        for pattern in fallback_endings:
            fallback_match = re.search(pattern, text, flags=re.IGNORECASE)
            if fallback_match:
                text = text[:fallback_match.start()].strip()  # Keep everything before the fallback phrase
                found_fallback = True
                break
        
        if not found_fallback:
            # If no "Thank you" or fallback phrases found, keep the full text
            pass  # No truncation, keep full text

    # Step 6: Replace long dashes with spaces
    text = re.sub(r'[–—]', ' ', text)

    # Step 7: Normalize punctuation and symbols
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,;:!?%$€£¥\'\"()-]', '', text)
    text = re.sub(r'[“”]', '"', text)
    text = re.sub(r"[‘’]", "'", text)

    return text.strip()

In [10]:
def clean_fed_speech(text):
    """
    Clean Federal Reserve speech text extracted via web scraping.
    Removes boilerplate, footers, and extraneous navigation/junk content while preserving true speech content.
    """

    if pd.isna(text):
        return ""

    # Step 1: Remove preamble: from 'Skip to main content' to 'PDF' inclusive
    text = re.sub(r'Skip to main content.*?PDF', '', text, flags=re.IGNORECASE | re.DOTALL)

    # Step 2: Remove footers and post-speech disclaimers
    # Remove text after specific known ending patterns
    end_markers = [
        r'Last Update:.*$',                     # Final update marker
        r'1\. The views expressed.*$',          # Disclaimer ending
        r'Here are the slides.*$',              # Slide deck notice
        r'1\. Thank you.*$',                    # Soft ending
        r'References\s*\n.*$',                  # References followed by newline
    ]
    for pattern in end_markers:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)

    # Step 3: Remove in-text junk like 'Share Watch Live' or isolated 'Share'
    text = re.sub(r'Share\s*Watch Live', '', text, flags=re.IGNORECASE)
    text = re.sub(r'^\s*Share\s*$', '', text, flags=re.IGNORECASE | re.MULTILINE)

    # Step 4: Replace dashes with spaces to avoid word merging
    text = re.sub(r'[–—]', ' ', text)

    # Step 5: Normalize punctuation and remove other irrelevant symbols
    text = re.sub(r'\[\d+\]', '', text)  # Remove citation numbers like [1]
    text = re.sub(r'\s+', ' ', text)  # Normalize all whitespace
    text = re.sub(r'[^\w\s.,;:!?%$€£¥\'\"()-]', '', text)  # Remove unwanted symbols
    text = re.sub(r'[“”]', '"', text)  # Normalize quotes
    text = re.sub(r"[‘’]", "'", text)

    # Final clean-up
    return text.strip()

In [11]:
# Loading additional ECB speeches of 2025 obtained from web scraping
ecb = pd.read_csv("data/all_ECB_speeches.csv", sep='|')
ecb = ecb.rename(columns={
    'speakers': 'author',
    'subtitle': 'description',
    'contents': 'text'
})
ecb['country'] = 'Euro area'
ecb['clean_text'] = ''
ecb['date'] = pd.to_datetime(ecb['date'])

# Delete rows where there are slides and not text
ecb = ecb.dropna(subset=['text']).reset_index(drop=True)

# Apply text cleaning to new ECB speeches
filtered_ecb = ecb[ecb['date'] > pd.to_datetime('2024-12-18')].copy()
filtered_ecb['clean_text'] = filtered_ecb['text'].apply(clean_ecb_speech)

In [12]:
# Loading additional FED speeches of 2025 obtained from web scraping
fed = pd.read_csv("data/federal_reserve_speeches.csv")
fed = fed.drop(columns=['year', 'link'])
fed = fed.rename(columns={
    'speaker': 'author'})
fed['country'] = 'United States'
fed['clean_text'] = ''
fed['description'] = ''
fed['date'] = pd.to_datetime(fed['date'])

# Apply text cleaning to new FED speeches
filtered_fed = fed[fed['date'] > pd.to_datetime('2024-12-18')].copy()
filtered_fed['clean_text'] = filtered_fed['text'].apply(clean_fed_speech) 

In [13]:
# Merging recent ECB and FED speeches
merged_df = pd.concat([filtered_fed, filtered_ecb], ignore_index=True)
merged_df = merged_df.sort_values(by='date')

In [14]:
# Concatenating historical and recent speech datasets
dff = pd.concat([df, merged_df], ignore_index=True)
dff = dff.sort_values(by='date')

In [15]:
dff.loc[[334, 1711], 'description'] = ''

In [16]:
ecb_df = ecb_df.sort_values('rate_date').reset_index(drop=True)
fed_df = fed_df.sort_values('rate_date').reset_index(drop=True)

In [17]:
# Ensuring date fields are properly formatted
dff['date'] = pd.to_datetime(dff['date'])
ecb_df['rate_date'] = pd.to_datetime(ecb_df['rate_date'])
fed_df['rate_date'] = pd.to_datetime(fed_df['rate_date'])

# Function to find current and next interest rate at the time of a speech
def find_rates(date, rate_df):
    current_rates = rate_df[rate_df['rate_date'] <= date]
    if current_rates.empty:
        # No previous rates available
        current_rate = np.nan
        next_rate_date = np.nan
        next_rate = np.nan
    else:
        # Latest rate before or at speech date
        current_rate = current_rates.iloc[-1]['rate']
        future_rates = rate_df[rate_df['rate_date'] > date]
        if future_rates.empty:
            # No future rate available
            next_rate_date = np.nan
            next_rate = np.nan
        else:
            # Next rate after speech date
            next_rate_date = future_rates.iloc[0]['rate_date']
            next_rate = future_rates.iloc[0]['rate']
    
    return current_rate, next_rate, next_rate_date

current_rates = []
next_rates = []
next_rate_dates = []
rate_differences = []

for idx, row in dff.iterrows():
    if row['country'] == 'United States':
        rate_df = fed_df
    elif row['country'] == 'Euro area':
        rate_df = ecb_df
    else:
        current_rates.append(np.nan)
        next_rates.append(np.nan)
        next_rate_dates.append(np.nan)
        rate_differences.append(np.nan)
        continue

    # Finding rates
    current_rate, next_rate, next_rate_date = find_rates(row['date'], rate_df)
    
    current_rates.append(current_rate)
    next_rates.append(next_rate)
    next_rate_dates.append(next_rate_date)
    
    # Calculating rate difference if possible
    if pd.notna(current_rate) and pd.notna(next_rate):
        rate_differences.append(next_rate - current_rate)
    else:
        rate_differences.append(np.nan)

dff['current_rate'] = current_rates
dff['next_rate'] = next_rates
dff['next_rate_date'] = next_rate_dates
dff['rate_difference'] = rate_differences
dff['previous_rate'] = dff.groupby('country')['current_rate'].shift(1)

# Adding rate_change_direction column: Increase, Decrease, No Change
dff['rate_change_direction'] = 'no change'
dff.loc[dff['current_rate'] > dff['previous_rate'], 'rate_change_direction'] = 'increase'
dff.loc[dff['current_rate'] < dff['previous_rate'], 'rate_change_direction'] = 'decrease'

# Adding days_to_next_decision column
dff['days_to_next_decision'] = dff.groupby('country')['next_rate_date'].shift(-1) - dff['date']
dff['days_to_next_decision'] = dff['days_to_next_decision'].dt.days  # Convert to number of days

# Extracting quarter and year for temporal analysis
dff['quarter'] = dff['date'].dt.quarter
dff['year'] = dff['date'].dt.year

In [18]:
# Define a function to extract speaker role from the author or description columns
def extract_speaker_role(text):
    if pd.isna(text):  # Check if the text is NaN or None
        return 'Unknown'
    
    roles = ['Chair', 'President', 'Governor', 'Board Member', 'Vice President', 'Director']  # Add any other roles as needed
    for role in roles:
        if role.lower() in text.lower():
            return role
    return 'Unknown'  # Return 'Unknown' if no match is found

# Apply the function to extract the speaker's role from the 'author' column
dff['speaker_role'] = dff['author'].apply(extract_speaker_role)


In [19]:
# Saving the final merged and enriched dataset
dff.to_csv("data/final.csv", index=False)
dff.tail()

Unnamed: 0,date,title,description,text,author,country,clean_text,current_rate,next_rate,next_rate_date,rate_difference,previous_rate,rate_change_direction,days_to_next_decision,quarter,year,speaker_role
4813,2025-04-03,U.S. Economic Outlook and Central Bank Communi...,,Skip to main content\nStay Connected \nRecent ...,Vice Chair Philip N. Jefferson,United States,"April 03, 2025 U.S. Economic Outlook and Centr...",4.33,,NaT,,4.33,no change,,2,2025,Chair
4814,2025-04-03,The Economic Outlook and Path of Policy,,Skip to main content\nStay Connected \nRecent ...,Governor Lisa D. Cook,United States,"April 03, 2025 The Economic Outlook and Path o...",4.33,,NaT,,4.33,no change,,2,2025,Governor
4816,2025-04-04,"AI, Fintechs, and Banks",,Skip to main content\nStay Connected \nRecent ...,Governor Michael S. Barr,United States,"April 04, 2025 AI, Fintechs, and Banks Governo...",4.33,,NaT,,4.33,no change,,2,2025,Governor
4815,2025-04-04,Economic Outlook,,Skip to main content\nStay Connected \nRecent ...,Chair Jerome H. Powell,United States,"April 04, 2025 Economic Outlook Chair Jerome H...",4.33,,NaT,,4.33,no change,,2,2025,Chair
4817,2025-04-07,Inflation Dynamics and the Phillips Curve,,Skip to main content\nStay Connected \nRecent ...,Governor Adriana D. Kugler,United States,"April 07, 2025 Inflation Dynamics and the Phil...",4.33,,NaT,,4.33,no change,,2,2025,Governor


In [20]:
import numpy as np
import pandas as pd

# Replace empty strings with NaN for easier handling
dff['description'].replace('', np.nan, inplace=True)

# Define a function to extract the first sentence or line from clean_text
def extract_description(text):
    if pd.isna(text):
        return np.nan
    # Split by period, newline, or other common sentence end markers
    sentences = text.strip().split('.')
    if sentences:
        return sentences[0].strip() + '.'  # Add period back
    return text.strip()

# Fill missing descriptions
dff['description'] = dff.apply(
    lambda row: extract_description(row['clean_text']) if pd.isna(row['description']) else row['description'],
    axis=1
)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dff['description'].replace('', np.nan, inplace=True)


In [21]:
dff.loc[3567, 'author'] = 'Jerome Powell'
dff.loc[3620, 'author'] = 'Benoît Cœuré'

In [22]:
import re
import pandas as pd

def extract_role(description, speaker):
    if pd.isna(description) or pd.isna(speaker):
        return None

    description = description.strip()

    # 1. "by the [role], [speaker]"
    match1 = re.search(r'by\s+(the\s+.*?),\s*' + re.escape(speaker), description, re.IGNORECASE)
    if match1:
        return match1.group(1).strip()

    # 2. "[speaker], [role]"
    match2 = re.search(re.escape(speaker) + r',\s*(.*?)(?:\sat\s|\sto\s|\son\s|\.|$)', description)
    if match2:
        return match2.group(1).strip()

    # 3. "[Role] [Speaker]"
    match3 = re.search(r'([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)\s+' + re.escape(speaker), description)
    if match3:
        return match3.group(1).strip()

    # 4. "[Speaker title], [Role], ..."
    match4 = re.search(r'by\s+[^,]+,\s*([^,]+?)(?:,|\sat|\son|\sto|\.|$)', description, re.IGNORECASE)
    if match4:
        return match4.group(1).strip()

    # 5. "a/the [role] of ..."
    match5 = re.search(r'((?:a|the)\s+.*?\s+of\s+.*?)(?:\sat\s|\sto\s|\son\s|\.|,|$)', description, re.IGNORECASE)
    if match5:
        return match5.group(1).strip()

    return None

# Apply function
dff['role'] = dff.apply(lambda row: extract_role(row['description'], row['author']), axis=1)

In [23]:
# Update the 'role' column with 'speaker_role' where 'role' is NaN and 'speaker_role' is not 'Unknown'
dff['role'] = dff.apply(lambda row: row['speaker_role'] if pd.isna(row['role']) and row['speaker_role'] != 'Unknown' else row['role'], axis=1)

In [24]:
dff = dff.drop(columns=['speaker_role'])

In [25]:
dff['role'].value_counts()

role
Member of the Executive Board of the European Central Bank,                                                                                    797
President of the European Central Bank,                                                                                                        534
Member of the Board of Governors of the Federal Reserve System,                                                                                463
Vice-President of the European Central Bank,                                                                                                   260
Member of the Board of Governors of the US Federal Reserve System,                                                                             247
                                                                                                                                              ... 
President of the European Central Bank, with the Wall Street Journal, published                                  

In [26]:
def simplify_role(role):
    if pd.isna(role):
        return None

    # Normalize whitespace and lowercase for uniform matching
    role = role.strip().lower()

    # Look for common titles in the role string
    titles = ['president', 'vice president', 'governor', 'member', 'chair', 'chairman', 'vice chair', 'executive']
    for title in titles:
        if title in role:
            return title

    return 'other'
dff['simplified_role'] = dff['role'].apply(simplify_role)


In [27]:
dff['simplified_role'].value_counts()

simplified_role
president    1864
governor     1696
member       1170
other          52
chair          30
executive       6
Name: count, dtype: int64

In [28]:
dff.loc[dff['simplified_role'] == 'other', 'role'] = None

In [29]:
import pandas as pd
import re


# Define function to extract standardized role
def extract_role(desc):
    # Convert to lowercase for easier matching
    desc_lower = desc.lower()

    # Define role mapping with priority (longest to shortest to avoid substring mismatches)
    roles = {
        "vice chair": "Vice Chair",
        "chair": "Chair",
        "president": "President",
        "governor": "Governor",
        "member": " Member",
        "board member": "Member",
        "executive": "Executive",
    }

    for pattern, standard in roles.items():
        if pattern in desc_lower:
            return standard
    return "Unknown"

# Apply function to extract roles
dff['role'] = dff['description'].apply(extract_role)


In [30]:
dff.loc[dff['simplified_role'] == 'other', 'simplified_role'] = None

In [31]:
def simplify_role(role):
    if pd.isna(role):
        return None

    # Normalize whitespace and lowercase for uniform matching
    role = role.strip().lower()

    # Look for common titles in the role string
    titles = ['president', 'vice president', 'governor', 'member', 'chair', 'chairman', 'vice chair', 'executive']
    for title in titles:
        if title in role:
            return title

    return 'other'

dff.loc[dff['simplified_role'].isna(), 'simplified_role'] = dff.loc[dff['simplified_role'].isna(), 'role'].apply(simplify_role)

In [32]:
dff.loc[dff['simplified_role'] == 'member', 'simplified_role'] = 'board member'

In [33]:
dff['simplified_role'].value_counts()

simplified_role
president       1888
governor        1703
board member    1171
chair             50
executive          6
Name: count, dtype: int64

In [34]:
dff = dff.drop(columns=['role'])
dff = dff.rename(columns={'simplified_role': 'role'})

In [35]:
# Save the final DataFrame to a CSV file
dff.to_csv("data/final_v2.csv", index=False)

In [42]:
dff.head(20)

Unnamed: 0,date,title,description,text,author,country,clean_text,current_rate,next_rate,next_rate_date,rate_difference,previous_rate,rate_change_direction,days_to_next_decision,quarter,year,role
0,1999-01-03,Mr Ferguson reviews last year's economic perfo...,"Remarks by Mr Roger W. Ferguson, Jr., a member...",Mr Ferguson reviews last year's economic perfo...,Roger W Ferguson,United States,"Remarks by Mr Roger W. Ferguson, Jr., a member...",4.63,4.76,1999-02-01,0.13,,no change,29.0,1,1999,governor
1,1999-01-07,Mr Duisenberg's opening statement at the press...,Introductory statement by the President of the...,Mr Duisenberg's opening statement at the press...,Willem F Duisenberg,Euro area,Introductory statement by the President of the...,3.0,2.5,1999-04-09,-0.5,,no change,92.0,1,1999,president
2,1999-01-14,Mr Duisenberg discusses the arrival of the eur...,Speech by the President of the European Centra...,Mr Duisenberg discusses the arrival of the eur...,Willem F Duisenberg,Euro area,Speech by the President of the European Centra...,3.0,2.5,1999-04-09,-0.5,3.0,no change,85.0,1,1999,president
3,1999-01-15,Mr Ferguson expresses his views on monetary po...,"Remarks by Mr Roger W. Ferguson, Jr., a member...",Mr Ferguson expresses his views on monetary po...,Roger W Ferguson,United States,"Remarks by Mr Roger W. Ferguson, Jr., a member...",4.63,4.76,1999-02-01,0.13,4.63,no change,17.0,1,1999,governor
4,1999-01-18,Mr Duisenberg's opening statement at the Europ...,Introductory statement by the President of the...,Mr Duisenberg's opening statement at the Europ...,Willem F Duisenberg,Euro area,Introductory statement by the President of the...,3.0,2.5,1999-04-09,-0.5,3.0,no change,81.0,1,1999,president
5,1999-01-20,Mr Greenspan testifies on the state of the US ...,Testimony of the Chairman of the Board of Gove...,Mr Greenspan testifies on the state of the US ...,Alan Greenspan,United States,Testimony of the Chairman of the Board of Gove...,4.63,4.76,1999-02-01,0.13,4.63,no change,12.0,1,1999,chair
6,1999-01-21,Mr McDonough focuses on the importance of risk...,Remarks by the President of the Federal Reserv...,Mr McDonough focuses on the importance of risk...,William J McDonough,United States,Remarks by the President of the Federal Reserv...,4.63,4.76,1999-02-01,0.13,4.63,no change,11.0,1,1999,president
7,1999-01-21,Mr Ferguson remarks on the international mille...,"Remarks by Mr Roger W Ferguson, Jr, a member o...",Mr Ferguson remarks on the international mille...,Roger W Ferguson,United States,"Remarks by Mr Roger W Ferguson, Jr, a member o...",4.63,4.76,1999-02-01,0.13,4.63,no change,11.0,1,1999,governor
8,1999-01-25,Mr Duisenberg reports on monetary policy in th...,Speech by the President of the European Centra...,Mr Duisenberg reports on monetary policy in th...,Willem F Duisenberg,Euro area,Speech by the President of the European Centra...,3.0,2.5,1999-04-09,-0.5,3.0,no change,74.0,1,1999,president
9,1999-01-26,Mr Lopes' speech before the Economic Affairs C...,Summary of a speech by the new President of th...,Mr Lopes' speech before the Economic Affairs C...,Francisco L. Lopes,United States,Summary of a speech by the new President of th...,4.63,4.76,1999-02-01,0.13,4.63,no change,34.0,1,1999,president
