## Initial Preprocessing

In [3]:
import pandas as pd
import numpy as np
import re
import unicodedata
from dateutil import parser

BDM Corpus 2

Bom Dia Mercado (BDM) → xlsx file with BDM articles and more → preprocessing to CSV → export to repository → final dataset

In [None]:
# Read the raw BDM corpus workbook into a DataFrame
news_df = pd.read_excel(io="../data/raw/bdm-corpus-2.xlsx")

# Normalize individual DATE and TIME cells
def parse_datetime_components(date_cell, time_cell):
    """Combine one DATE cell and one TIME cell into an ISO 8601 string.

    Both cells are coerced to text, stripped, joined with a single space and
    parsed with ``parser.parse(..., dayfirst=True)``.

    Parameters
    ----------
    date_cell, time_cell : scalar
        Raw cell values taken from the DATE and TIME columns.

    Returns
    -------
    str or NaT
        ``isoformat()`` of the parsed datetime, or ``pd.NaT`` when either cell
        is missing or the combined text cannot be parsed.
    """
    # A missing cell can never yield a valid timestamp; return NaT explicitly
    # instead of relying on the text "nan" failing to parse.
    if pd.isna(date_cell) or pd.isna(time_cell):
        return pd.NaT

    # NOTE(review): if read_excel delivers DATE as datetime objects, str()
    # yields ISO text and dayfirst=True may swap day and month; confirm the
    # cells arrive as day-first text.
    combined = f"{str(date_cell).strip()} {str(time_cell).strip()}"
    try:
        return parser.parse(combined, dayfirst=True).isoformat()
    except (ValueError, OverflowError):
        # Only genuine parse failures mark the row as invalid; unrelated
        # errors (for example a missing import) are allowed to surface.
        return pd.NaT

# Create ISO 8601 Timestamp column (rows that fail to parse become NaT)
news_df['Timestamp'] = news_df.apply(lambda row: parse_datetime_components(row['DATE'], row['TIME']), axis=1)
news_df['Timestamp'] = pd.to_datetime(news_df['Timestamp'], errors='coerce')

# Drop old columns
news_df = news_df.drop(columns=['DATE', 'TIME', 'Index', 'DIRECTION', 'BRER', 'LABEL'])

# Collapse embedded line breaks in HEADING, ARTICLE CONTENT and COMMENTS.
# Missing cells stay NaN: a plain astype(str) would turn them into the literal
# text "nan", which only reads back as missing after a CSV round trip.
for col in ['HEADING', 'ARTICLE CONTENT', 'COMMENTS']:
    if col in news_df.columns:
        raw_text = news_df[col]
        cleaned_text = raw_text.astype(str).str.replace(r'[\r\n]+', ' ', regex=True).str.strip()
        news_df[col] = cleaned_text.where(raw_text.notna())

# Reorder columns so that Timestamp comes first
news_df = news_df[['Timestamp'] + [col for col in news_df.columns if col != 'Timestamp']]

# Rename "HEADING" to "Headline", "ARTICLE CONTENT" to "Article" and "COMMENTS" to "Comments"
news_df = news_df.rename(columns={
    'HEADING': 'Headline',
    'ARTICLE CONTENT': 'Article',
    'COMMENTS': 'Comments'
})

# save
news_df.to_csv("../data/interim/bdm-corpus-2/stage-0.csv", index=False, encoding='utf-8-sig')

### Check for invalid rows (rows with no headlines) and drop them

In [None]:
# Re-read stage 0 from disk (it was written as UTF-8 with BOM)
news_df = pd.read_csv("../data/interim/bdm-corpus-2/stage-0.csv", encoding='utf-8-sig')

# Rows whose Headline is missing are treated as invalid
invalid_rows = news_df.loc[news_df['Headline'].isna()]
print(f"{len(invalid_rows)} invalid rows found in 'Headline' column.")
display(invalid_rows)

In [None]:
# Keep only the rows that have a headline, then overwrite the stage-0 file
news_df = news_df.loc[news_df['Headline'].notna()]
news_df.to_csv("../data/interim/bdm-corpus-2/stage-0.csv", encoding='utf-8-sig', index=False)

## Exchange Rate Preprocessing

Bloomberg → Download USD/BRL exchange rates as excel file → preprocess to CSV → export to repository → final dataset

In [None]:
# Step 0: Load the raw exchange-rate workbook
df_usd_brl = pd.read_excel("../data/raw/usd-brl.xlsx")

# Step 1: Clean column names (only string headers can be stripped)
df_usd_brl.columns = [col.strip() if isinstance(col, str) else col for col in df_usd_brl.columns]
df_usd_brl = df_usd_brl.rename(columns={"Date": "Raw Timestamp", "Último preço": "USD/BRL"})

# Step 2: Parse "Raw Timestamp" directly into pandas datetime (no ISO string conversion)
df_usd_brl["Timestamp"] = pd.to_datetime(df_usd_brl["Raw Timestamp"], errors="coerce")

# Step 3: Rows whose timestamp could not be parsed were coerced to NaT above.
# Drop them here (and say so) instead of writing them to the interim file.
unparsed_rows = df_usd_brl["Timestamp"].isna()
if unparsed_rows.any():
    print(f"Dropping {int(unparsed_rows.sum())} rows with unparseable timestamps.")
    df_usd_brl = df_usd_brl[~unparsed_rows]

# Step 4: Keep only the two output columns, Timestamp first
# (this also discards "Raw Timestamp")
df_usd_brl = df_usd_brl[["Timestamp", "USD/BRL"]]

# Step 5: Save
df_usd_brl.to_csv("../data/interim/usd-brl.csv", index=False, encoding="utf-8-sig")

Clean up the interim stage of the exchange rate file

In [None]:
import pandas as pd
# Load the interim FX file and put the rows in chronological order
df = (
    pd.read_csv("../data/interim/usd-brl.csv")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .sort_values('Timestamp')
    .reset_index(drop=True)
)

# Report every row that shares its timestamp with another row
duplicate_rows = df[df.duplicated(subset=['Timestamp'], keep=False)]
if duplicate_rows.empty:
    print("No duplicate timestamps found.")
else:
    print("Duplicate timestamps found:")
    print(duplicate_rows)

# Keep only the first row seen for each timestamp
df = df[~df.duplicated(subset=['Timestamp'], keep='first')].reset_index(drop=True)

In [None]:
# Time elapsed since the previous row
df['TimeDiff'] = df['Timestamp'].diff()

# An intraday gap is a jump of more than one minute between two consecutive
# rows that fall on the same calendar date
same_day_as_previous = df['Timestamp'].dt.date == df['Timestamp'].shift().dt.date
more_than_one_minute = df['TimeDiff'] > pd.Timedelta(minutes=1)
intraday_gaps = df[more_than_one_minute & same_day_as_previous]

# Collect (previous time, current time, gap in minutes) tuples per date.
# idx - 1 addresses the preceding row by index label.
intraday_gap_summary = {}
for idx, curr_time, elapsed in zip(
    intraday_gaps.index, intraday_gaps['Timestamp'], intraday_gaps['TimeDiff']
):
    prev_time = df.loc[idx - 1, 'Timestamp']
    gap_minutes = int(elapsed.total_seconds() // 60)
    day = curr_time.date()
    if day not in intraday_gap_summary:
        intraday_gap_summary[day] = []
    intraday_gap_summary[day].append((prev_time.time(), curr_time.time(), gap_minutes))

if not intraday_gap_summary:
    print("\nNo intraday gaps found.")
else:
    # Show what was found
    print("\nIntraday gaps found:")
    for day, entries in intraday_gap_summary.items():
        print(f"\nDate: {day}")
        for start_t, end_t, minutes in entries:
            print(f"  {start_t} → {end_t}  ({minutes} min gap)")

    # Discard every row belonging to a date that has at least one gap
    gap_dates = set(intraday_gap_summary.keys())
    on_gap_day = df['Timestamp'].dt.date.isin(gap_dates)
    df = df.loc[~on_gap_day].reset_index(drop=True)
    print(f"\nRemoved all rows from days with intraday gaps: {gap_dates}")

# The helper column is no longer needed
df = df.drop('TimeDiff', axis=1)

In [None]:
# Write the cleaned exchange-rate table to the processed folder
df.to_csv(path_or_buf="../data/processed/usd-brl.csv", index=False)

## Stage 1
  - remove all rows following the last timestamp in the interim/bdm-corpus-2/stage-0.csv at 2024-12-30 17:32:00
  - remove article and comments columns from interim/bdm-corpus-2/stage-0.csv
  - merge the USD/BRL values from processed/usd-brl.csv into interim/bdm-corpus-2/stage-0.csv by matching timestamps
  - compute forward returns as the price at t+X minutes (X = 1 to 20) minus the price at the timestamp of the news
    - positive returns will map to +1, negative returns will map to -1, and no change will map to 0
    - each computed forward return will be stored in a new column named "Forward Return t+X" where X is the number of minutes ahead

    - incorporate a stability threshold when mapping "no change":
      - unchanged is defined as an absolute log return less than or equal to the Nth percentile of all 1-minute absolute log returns in the dataset (default: 60th percentile)
      - this replaces the exact-zero check for unchanged, allowing small price movements to be considered stable


In [1]:
# INCLUDE stable metric

import pandas as pd
import numpy as np

# Read both inputs with Timestamp parsed as datetime
fx_df = pd.read_csv("../data/processed/usd-brl.csv", parse_dates=["Timestamp"])
news_df = pd.read_csv("../data/interim/bdm-corpus-2/stage-0.csv", parse_dates=["Timestamp"])

# Keep only news strictly before the cutoff, and only the two columns used later.
# NOTE(review): the Stage 1 notes mention 2024-12-30 17:32:00 as the cutoff,
# while this cell uses 17:58:00; confirm which one is intended.
last_timestamp = pd.Timestamp("2024-12-30 17:58:00")
news_df = news_df.loc[news_df["Timestamp"] < last_timestamp, ["Timestamp", "Headline"]]

def compute_forward_returns(df, horizon_minutes=20, time_col="Timestamp", price_col="USD/BRL"):
    """Compute forward returns with day-end invalidation.

    Adds one column ``"Forward Return t+i"`` for every ``i`` in
    ``1..horizon_minutes`` to a copy of ``df``. Each value is the raw price
    change ``price(t + i minutes) - price(t)``.

    The future price is looked up by timestamp rather than by row offset, so
    a missing bar yields NaN instead of silently using the price of a later
    minute, and the input does not have to be sorted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_col`` (datetime) and ``price_col`` (numeric).
    horizon_minutes : int, default 20
        Largest horizon, in minutes, for which a column is created.
    time_col : str, default "Timestamp"
        Name of the datetime column.
    price_col : str, default "USD/BRL"
        Name of the price column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the forward-return columns appended. A value is
        NaN when ``t + i`` lies after the last timestamp of the same calendar
        day, or when no row exists at exactly ``t + i``.
    """
    out = df.copy()
    timestamps = out[time_col]
    prices = out[price_col]

    # Price keyed by timestamp. Missing timestamps are skipped and, if a
    # timestamp is repeated, its first row is used.
    usable = timestamps.notna() & ~timestamps.duplicated(keep="first")
    price_at = pd.Series(prices[usable].to_numpy(), index=pd.DatetimeIndex(timestamps[usable]))

    # Last timestamp of each calendar day, broadcast back to every row
    day_end = timestamps.groupby(timestamps.dt.normalize()).transform("max")

    for i in range(1, horizon_minutes + 1):
        target = timestamps + pd.Timedelta(minutes=i)
        future_price = pd.Series(
            price_at.reindex(pd.DatetimeIndex(target)).to_numpy(), index=out.index
        )
        # actual price change, invalidated when the horizon runs past day end
        out[f"Forward Return t+{i}"] = (future_price - prices).where(target <= day_end)

    return out

# Attach the forward-return columns to the FX table
fx_df = compute_forward_returns(fx_df)

# Left-join so that every news row is kept, matched on Timestamp
merged_df = news_df.merge(fx_df, how="left", on="Timestamp")

# Sanity checks: row count, matched rows, and missing values at selected horizons
print(f"Total news rows before cutoff: {len(news_df)}")
print(f"News timestamps matching FX bars: {merged_df['USD/BRL'].notna().sum()}")
for minutes_ahead in (1, 5, 10, 20):
    label = f"Forward Return t+{minutes_ahead}"
    n_missing = merged_df[label].isna().sum()
    print(f"Invalid {label} (NaN): {n_missing}")

Total news rows before cutoff: 3523
News timestamps matching FX bars: 3506
Invalid Forward Return t+1 (NaN): 20
Invalid Forward Return t+5 (NaN): 25
Invalid Forward Return t+10 (NaN): 35
Invalid Forward Return t+20 (NaN): 108


In [2]:
# Missing values are written as "NA"; this marker is used later to identify
# which forward return horizons can't be used when assigning ground truth
merged_df.to_csv("../data/interim/bdm-corpus-2/stage-1.csv", na_rep="NA", encoding="utf-8-sig", index=False)

## Stage 2
Preparing for language model inference
- DO NOT remove stopwords
- DO NOT lemmatize or stem
- DO NOT lowercase
- DO NOT translate or normalize to English
- Preserve accents, diacritics, and original formatting


In [4]:
# Load the stage-1 output
df = pd.read_csv(filepath_or_buffer="../data/interim/bdm-corpus-2/stage-1.csv")

In [5]:
# === Define robust headline cleaner ===
def clean_headline(text: str) -> str:
    """Return a tidied copy of a headline.

    The input is converted with ``str()`` and trimmed, then processed in this
    order: curly quotes become straight quotes; runs of quotes collapse to
    one; quotes at either end are removed; runs of symbol noise become a
    space and lone symbol characters are deleted; the text is NFKC-normalized;
    every character outside printable ASCII, ``À-ÿ`` and a short list of
    extra symbols becomes a space; whitespace runs collapse to one space.
    """
    # (pattern, replacement) pairs applied before Unicode normalization
    before_normalization = (
        (r"[“”]", '"'),                        # smart double quotes
        (r"[‘’]", "'"),                        # smart single quotes / apostrophes
        (r'"{2,}', '"'),                       # repeated double quotes
        (r"'{2,}", "'"),                       # repeated apostrophes
        (r'^(["\']+)', ''),                    # leading quotes
        (r'(["\']+)$', ''),                    # trailing quotes
        (r"[_•√×+÷=<>^~|#*@¬]{2,}", " "),      # runs of noisy symbols
        (r"[_•√×+÷=<>^~|#*@¬]", ""),           # single noisy symbols
    )
    # (pattern, replacement) pairs applied after Unicode normalization
    after_normalization = (
        (r"[^\x20-\x7EÀ-ÿ°€¢£¥‰–—…]", " "),    # characters outside the kept set
        (r"\s+", " "),                         # whitespace runs
    )

    result = str(text).strip()
    for pattern, replacement in before_normalization:
        result = re.sub(pattern, replacement, result)

    result = unicodedata.normalize("NFKC", result)

    for pattern, replacement in after_normalization:
        result = re.sub(pattern, replacement, result)

    return result.strip()

In [6]:
# Clean every headline, then write stage 2 (missing values written as "NA")
df["Headline"] = df["Headline"].map(clean_headline)
df.to_csv("../data/interim/bdm-corpus-2/stage-2.csv", na_rep="NA", index=False)

## Stage 3
- remove duplicate rows (e.g., row_a timestamp/headline == row_b timestamp/headline)
    - note: some timestamps contain multiple headlines. They are all different, no need to fret over this
- removed all rows with a duplicate "Headline" column value and kept the first occurrence
- chronologically create train/test set for temporal evaluation
- keep test set slightly imbalanced, as it already is, to mimic real world scenario
- 3465 headlines -> 3457 headlines

In [7]:
import pandas as pd

df = pd.read_csv("../data/interim/bdm-corpus-2/stage-2.csv")
print("Total rows before removing duplicates:", len(df))

# Pass 1: rows that repeat the same (Timestamp, Headline) pair; keep the first of each
pair_key = ["Timestamp", "Headline"]
dupes = df[df.duplicated(subset=pair_key, keep=False)]
print("Duplicate rows:\n", dupes)
df = df[~df.duplicated(subset=pair_key, keep="first")]

# Pass 2: rows that repeat a headline at any timestamp; keep the first of each
# (model shouldn't be trained on multiple instances of a headline, for semantic reasons)
dupe_heads = df[df.duplicated(subset="Headline", keep=False)]
print("Duplicate Headline count:", len(dupe_heads))
df = df[~df.duplicated(subset="Headline", keep="first")]

print(f"\nTotal rows after removing duplicates: {len(df)}")

Total rows before removing duplicates: 3523
Duplicate rows:
                 Timestamp                                           Headline  \
239   2024-11-26 10:02:00                                  Reação ao IPCA-15   
242   2024-11-26 10:02:00                                  Reação ao IPCA-15   
489   2024-11-28 09:02:00  AOVIVO/Haddad sobre IR: Nosso objetivo é que e...   
490   2024-11-28 09:02:00  Ela irá beneficiar todo mundo que ganha até 5 ...   
491   2024-11-28 09:02:00  Com essa fórmula de cálculo, a suposta renúnci...   
494   2024-11-28 09:02:00  AOVIVO/Haddad sobre IR: Nosso objetivo é que e...   
495   2024-11-28 09:02:00  Ela irá beneficiar todo mundo que ganha até 5 ...   
496   2024-11-28 09:02:00  Com essa fórmula de cálculo, a suposta renúnci...   
600   2024-11-28 11:54:00  MERCADOS: Sob pressão do fiscal, Ibovespa amea...   
601   2024-11-28 11:54:00  MERCADOS: Sob pressão do fiscal, Ibovespa amea...   
857   2024-12-02 10:56:00  AOVIVO/Galípolo diz que a questã

In [8]:
# Write stage 3 (missing values written as "NA")
df.to_csv("../data/interim/bdm-corpus-2/stage-3.csv", na_rep="NA", index=False)

## Stage 4 (Temporary) - filtering out noisy headlines
REMOVED:
- Removed via code:
    - Anything that starts with "Reação", since these headlines lack context and their articles are too noisy
    - "Inicialmente, dado seria publicado à tarde"
    - "Mensagem apagada"
    - "Mídia oculta"

Important terms:
- losses: perda, queda, recuo
- fiscal, abaixo (fell under), Galípolo (prez of central bank of BR)

Terms to Ignore: Ibovespa, Stoxx600, Dow, S&P, Nasdaq (stock market terms)

Change labels to: em alta/baixa or valorização/desvalorização

Create datasets both with the neutral class and without it (filtered) for binary classification

In [None]:
import pandas as pd
df = pd.read_csv("../data/interim/bdm-corpus-2/stage-3.csv")

# Set aside headlines that begin with "Reação" (missing headlines count as no match)
starts_with_reacao = df['Headline'].str.startswith("Reação", na=False)
removed = df[starts_with_reacao]
df = df[~starts_with_reacao]

# Set aside headlines that exactly equal one of these texts
bad_texts = [
    "Inicialmente, dado seria publicado à tarde",
    "Mensagem apagada",
    "Mídia oculta"
]
is_bad_text = df['Headline'].isin(bad_texts)
extra_removed = df[is_bad_text]
df = df[~is_bad_text]

# Show what was filtered out
display(removed)
display(extra_removed)

df.to_csv("../data/interim/bdm-corpus-2/stage-4.csv", na_rep="NA", index=False)

Unnamed: 0,Timestamp,Headline,USD/BRL,Forward Return t+1,Forward Return t+2,Forward Return t+3,Forward Return t+4,Forward Return t+5,Forward Return t+6,Forward Return t+7,...,Forward Return t+11,Forward Return t+12,Forward Return t+13,Forward Return t+14,Forward Return t+15,Forward Return t+16,Forward Return t+17,Forward Return t+18,Forward Return t+19,Forward Return t+20
46,2024-11-22 09:00:00,Reação aos PMIs da Zona do euro,5.8238,-0.0005,-0.005,-0.0118,-0.015,-0.0205,-0.0207,-0.02,...,-0.0232,-0.024,-0.0283,-0.0295,-0.0321,-0.0321,-0.0252,-0.0261,-0.0273,-0.0305
159,2024-11-25 12:36:00,Reação a Conta Corrente e IDP,5.7953,0.0016,0.003,0.0042,0.006,0.0066,0.0072,0.0116,...,0.012,0.0137,0.0176,0.0203,0.0197,0.0195,0.018,0.017,0.0163,0.0187
232,2024-11-26 10:00:00,Reação ao IPCA-15,5.7902,-0.0038,-0.0015,0.0009,-0.001,0.0046,0.006,0.0075,...,0.0089,0.0104,0.0105,0.0107,0.0128,0.0119,0.0112,0.0122,0.0129,0.011
445,2024-11-27 16:07:00,Reação ao Caged,5.919,-0.0017,-0.0026,-0.0004,-0.0028,-0.0005,-0.0013,-0.0022,...,-0.0003,0.001,0.0042,0.0022,0.0017,-0.0009,-0.0005,-0.0027,-0.0006,-0.002
538,2024-11-28 10:44:00,Reação ao pacote fiscal,5.982,0.0015,0.0006,-0.0002,-0.0003,0.0009,0.0003,-0.0028,...,-0.0053,-0.0012,0.0014,0.0032,0.0051,0.0067,0.0067,0.0063,0.007,0.0039
544,2024-11-28 10:57:00,Reação ao pacote/Bradesco Asset,5.9834,0.0018,0.0037,0.0053,0.0053,0.0049,0.0056,0.0025,...,0.0024,0.0017,0.0004,0.0026,0.0049,0.0041,0.0054,0.0063,0.0084,0.009
578,2024-11-28 12:08:00,Reação ao pacote,5.9892,-0.0016,-0.0031,-0.002,-0.0026,-0.0031,-0.0055,-0.0063,...,0.0001,0.0019,0.0006,0.0005,0.0004,0.0,0.0006,0.0006,0.0026,0.0044
642,2024-11-29 10:01:00,Reação à Pnad,6.1028,-0.0058,-0.0123,-0.0044,-0.0088,-0.0109,-0.0123,-0.02,...,-0.0064,-0.0064,0.0015,0.0115,0.0047,0.0048,0.0012,-0.002,-0.003,-0.0044
746,2024-11-29 16:12:00,Reação à bandeira verde,6.0125,-0.004,-0.0068,-0.007,-0.0085,-0.0037,-0.0069,-0.0059,...,-0.0059,-0.0066,-0.0047,-0.0061,-0.0067,-0.0063,-0.0081,-0.0075,-0.0087,-0.0072
791,2024-12-02 10:25:00,Reação ao PMI da zona do euro,6.0431,0.0011,-0.0019,-0.0023,-0.0009,-0.0001,-0.0001,0.0032,...,0.0077,0.0081,0.0108,0.0164,0.0146,0.0159,0.0137,0.0117,0.0099,0.0125


Unnamed: 0,Timestamp,Headline,USD/BRL,Forward Return t+1,Forward Return t+2,Forward Return t+3,Forward Return t+4,Forward Return t+5,Forward Return t+6,Forward Return t+7,...,Forward Return t+11,Forward Return t+12,Forward Return t+13,Forward Return t+14,Forward Return t+15,Forward Return t+16,Forward Return t+17,Forward Return t+18,Forward Return t+19,Forward Return t+20
573,2024-11-28 11:25:00,Mensagem apagada,5.9858,0.0039,0.0054,0.0074,0.0059,0.0029,-0.0001,-0.001,...,0.0064,0.0088,0.0069,0.0039,0.0045,0.0034,0.0053,0.0044,0.0051,0.0014
583,2024-11-28 12:44:00,Mídia oculta,5.9866,0.0006,0.0005,-0.0003,0.0,0.0003,0.0003,0.002,...,0.002,0.002,0.0022,0.0032,0.0072,0.0072,0.0082,0.0092,0.0082,0.0089
3150,2024-12-26 13:29:00,"Inicialmente, dado seria publicado à tarde",6.1526,0.0031,0.006,0.0057,0.0057,0.0044,0.0046,0.0033,...,0.0065,0.007,0.0073,0.008,0.0072,0.0069,0.0074,0.0073,0.0049,0.0053


## Stage ?

Step 1: Create functionality that creates dataframes from our processed data that converts the actual forward return to increase/decrease/stable based on movement exceeding 5 pips of pos/neg change
- Must be able to do for any desired time horizon
- Must be able to do with and without stable pip thing
- Must be created for experiments
- probably implemented via a script with functionality (calling the function)

Step 2: 
- Run experiments for and measure against every time horizon. Save results in a CSV with the accuracy for each time horizon
- possibly make a function that can save the results properly
- Function to create confusion matrices, classification reports, and profitability reports (total the profit/loss over the test set results based on ground truth - if increase/decrease was predicted, treat as the pip change)