# Preprocessing

In [1]:
import pandas as pd
import numpy as np
from dateutil import parser

## News Data Preprocessing

Bom Dia Mercado (BDM) → xlsx file with BDM articles and more → preprocessing to CSV → export to repository → final dataset

In [2]:
# Read the raw BDM workbook into a DataFrame
news_df = pd.read_excel(io="../data/raw/bdm-corpus-2.xlsx")

# Normalize individual DATE and TIME cells
def parse_datetime_components(date_cell, time_cell):
    """Combine one DATE cell and one TIME cell into an ISO 8601 string.

    Parameters
    ----------
    date_cell : str, datetime.date, datetime.datetime or pandas.Timestamp
        The date part. Text is parsed day-first (e.g. "10/01/2025" is
        10 January). Typed values are used as-is and never re-parsed.
    time_cell : str or datetime.time
        The time-of-day part.

    Returns
    -------
    str or pandas.NaT
        ``datetime.isoformat()`` of the combined value, or ``pd.NaT`` when
        either cell is missing or the text cannot be parsed.
    """
    from datetime import date, datetime, time

    def _is_missing(cell):
        # None, NaT and float NaN are how an empty spreadsheet cell shows up
        return cell is None or cell is pd.NaT or (isinstance(cell, float) and cell != cell)

    if _is_missing(date_cell) or _is_missing(time_cell):
        return pd.NaT  # mark invalid rows

    # datetime.datetime and pd.Timestamp are subclasses of datetime.date
    typed_date = isinstance(date_cell, date)

    if typed_date and isinstance(time_cell, time):
        # Both parts are already typed: combine them directly, no parsing needed
        day = date(date_cell.year, date_cell.month, date_cell.day)
        return datetime.combine(day, time_cell).isoformat()

    try:
        if typed_date:
            # str() of a typed date is year-first text ("2025-01-10"), which
            # dayfirst=True can read as year-day-month. Render it day-first so
            # the parse below cannot swap day and month.
            date_str = f"{date_cell.day:02d}/{date_cell.month:02d}/{date_cell.year:04d}"
        else:
            date_str = str(date_cell).strip()
        time_str = str(time_cell).strip()

        # Combine and parse flexibly
        dt = parser.parse(f"{date_str} {time_str}", dayfirst=True)
        return dt.isoformat()
    except (ValueError, OverflowError, TypeError):
        # Unparseable text (dateutil's ParserError is a ValueError) or an
        # out-of-range value: mark the row invalid instead of failing the run
        return pd.NaT

# Create the Timestamp column: one ISO 8601 string (or NaT) per row, then
# convert the whole column to pandas datetimes
news_df['Timestamp'] = news_df.apply(lambda row: parse_datetime_components(row['DATE'], row['TIME']), axis=1)
news_df['Timestamp'] = pd.to_datetime(news_df['Timestamp'], errors='coerce')

# Drop the source columns that are no longer needed
news_df = news_df.drop(columns=['DATE', 'TIME', 'Index', 'DIRECTION', 'BRER', 'LABEL'])

# Collapse line breaks in HEADING, ARTICLE CONTENT and COMMENTS into single spaces
for col in ['HEADING', 'ARTICLE CONTENT', 'COMMENTS']:
    if col in news_df.columns:
        cleaned = news_df[col].astype(str).str.replace(r'[\r\n]+', ' ', regex=True).str.strip()
        # astype(str) turns an empty cell into the literal text "nan";
        # restore those cells to missing so isna()/dropna() work on this frame
        news_df[col] = cleaned.where(news_df[col].notna())

# Reorder columns so Timestamp comes first
news_df = news_df[['Timestamp'] + [col for col in news_df.columns if col != 'Timestamp']]

# Rename "HEADING" to "Headline", "ARTICLE CONTENT" to "Article" and "COMMENTS" to "Comments"
news_df = news_df.rename(columns={
    'HEADING': 'Headline',
    'ARTICLE CONTENT': 'Article',
    'COMMENTS': 'Comments'
})

# Save to CSV
news_df.to_csv("../data/interim/bdm-corpus-2.csv", index=False, encoding='utf-8-sig')

### Check for invalid rows (rows with no headlines) and drop them

In [3]:
# Re-read the interim CSV. read_csv parses empty fields and NA-like tokens as
# missing values, so rows without a headline show up as NaN here.
news_df = pd.read_csv("../data/interim/bdm-corpus-2.csv", encoding='utf-8-sig')

headline_missing = news_df['Headline'].isna()
invalid_rows = news_df.loc[headline_missing]
print(f"{len(invalid_rows)} invalid rows found in 'Headline' column.")
display(invalid_rows)

6 invalid rows found in 'Headline' column.


Unnamed: 0,Timestamp,Headline,Article,Comments
2332,2024-12-16 09:02:00,,,
4299,2025-01-10 12:14:00,,,
4302,2025-01-10 12:14:00,,,
4306,2025-01-10 12:18:00,,,
4367,2025-01-10 15:58:00,,,
4372,2025-01-10 15:59:00,,,


In [4]:
# Keep only the rows that have a headline, then overwrite the interim CSV
has_headline = news_df['Headline'].notna()
news_df = news_df.loc[has_headline]
news_df.to_csv("../data/interim/bdm-corpus-2.csv", index=False, encoding='utf-8-sig')

## Exchange Rate Preprocessing

Bloomberg → download USD/BRL exchange rates as an Excel file → preprocess to CSV → export to repository → final dataset

In [5]:
# Load the raw workbook, tidy its header, and reduce it to a parsed
# "Timestamp" column plus the "USD/BRL" rate, in that order
df_usd_brl = (
    pd.read_excel("../data/raw/usd-brl.xlsx")
    # strip surrounding whitespace from every column name
    .rename(columns=str.strip)
    .rename(columns={"Date": "Raw Timestamp", "Último preço": "USD/BRL"})
    # parse straight into pandas datetimes; unparseable values become NaT
    .assign(Timestamp=lambda frame: pd.to_datetime(frame["Raw Timestamp"], errors="coerce"))
    # selecting the two output columns also discards "Raw Timestamp"
    .loc[:, ["Timestamp", "USD/BRL"]]
)

# Write the interim CSV
df_usd_brl.to_csv("../data/interim/usd-brl.csv", index=False, encoding="utf-8-sig")

In [6]:
# Reload both interim CSVs with their 'Timestamp' column parsed as datetimes
bdm_df, fx_df = (
    pd.read_csv(csv_path, parse_dates=['Timestamp'])
    for csv_path in ("../data/interim/bdm-corpus-2.csv", "../data/interim/usd-brl.csv")
)

In [7]:
# Flag every row whose timestamp repeats an earlier row, and report the count
repeated_stamp = fx_df.duplicated(subset="Timestamp")
num_duplicates = repeated_stamp.sum()
print(f"Found {num_duplicates} duplicate timestamps.")

# Keep only the first row seen for each timestamp
fx_df = fx_df.loc[~repeated_stamp]
print("Removed duplicate timestamps.")

# Index the series by timestamp, in chronological order
fx_df = fx_df.set_index("Timestamp")
fx_df = fx_df.sort_index()
print("Set 'Timestamp' as index and sorted chronologically.")

# Resample onto a 1-minute grid, carrying the last observed value forward
# into every minute that has no observation of its own
fx_df = fx_df.resample("1min").ffill()
print("Forward-filled missing minute-level timestamps.")

# Compare the index against a full minute-by-minute range over the same span
expected_index = pd.date_range(fx_df.index.min(), fx_df.index.max(), freq="1min")
missing_timestamps = expected_index.difference(fx_df.index)

if len(missing_timestamps) > 0:
    print(f"{len(missing_timestamps)} missing timestamps still remain:")
    print(missing_timestamps[:10])  # preview first 10 missing
else:
    print("Timestamps are now continuous and minute-by-minute. No gaps remain.")

Found 5929 duplicate timestamps.
Removed duplicate timestamps.
Set 'Timestamp' as index and sorted chronologically.
Forward-filled missing minute-level timestamps.
Timestamps are now continuous and minute-by-minute. No gaps remain.


In [None]:
# Write the continuous series to CSV; the index is written out as a column
# (to_csv's default, which the original spelled as index=True)
fx_df.to_csv("../data/interim/usd-brl-continuous.csv", encoding='utf-8-sig')

## Final Processing

In [None]:
'''
Functions needed for experimental dataset creation:
- Abilities:
  - dataset with +1, -1 for binary forward returns
  - dataset with +1, -1, and 0 by forward return threshold of choice
  - dataset with forward returns as percentage changes
  - dataset with forward returns as absolute changes
  - dataset with only timestamps and headlines
  - dataset with only timestamps, headlines, and articles
  - load in original datasets
  - choose prediction horizon by choice of minutes (eg. t+1, t+5, t+15, etc.)
  - map forward returns to +1 for an increase and -1 for a decrease to measure directional accuracy
  - map forward returns +1, -1, 0 for thresholded returns to measure directional accuracy

I will develop functionality that will prepare a single experimental dataset:
  - remove all rows of interim/bdm-corpus-2.csv that follow the last retained timestamp, 2024-12-30 17:32:00
  - remove article and comments columns from interim/bdm-corpus-2.csv
  - merge the USD/BRL values from interim/usd-brl-continuous.csv into interim/bdm-corpus-2.csv by matching timestamps
  - compute forward returns based on the prediction horizon t+1 to t+20 minutes
    - positive returns will map to +1, negative returns will map to -1, and no change will map to 0
    - each computed forward return will be stored in a new column named "Forward Return t+X" where X is the number of minutes ahead
  - the function will return a new DataFrame with the merged data and forward returns
  - save the new DataFrame to a CSV file named "experimental_dataset.csv"
'''



In [2]:
# Reload the interim outputs of the two preprocessing stages
fx_df = pd.read_csv("../data/interim/usd-brl-continuous.csv", parse_dates=['Timestamp'])
news_df = pd.read_csv("../data/interim/bdm-corpus-2.csv", parse_dates=['Timestamp'])

# Keep only news stamped at or before the hard-coded cutoff 2024-12-30 17:32:00,
# and only the Timestamp and Headline columns of interim/bdm-corpus-2.csv
last_timestamp = pd.Timestamp("2024-12-30 17:32:00")
on_or_before_cutoff = news_df['Timestamp'] <= last_timestamp
news_df = news_df.loc[on_or_before_cutoff, ['Timestamp', 'Headline']]

# Attach the USD/BRL value with the same timestamp to every headline;
# a left join keeps headlines that have no matching rate
merged_df = news_df.merge(fx_df, how='left', on='Timestamp')

# Report the size of the merged dataset
num_rows = merged_df.shape[0]
print(f"Number of rows in merged dataset: {num_rows}")

Number of rows in merged dataset: 3519


In [3]:
def compute_forward_returns(df, horizon_minutes=20):
    """Add directional forward-return labels for horizons t+1 .. t+horizon_minutes.

    For every step ``i`` a column ``'Forward Return t+{i}'`` is added to ``df``
    holding the sign of ``USD/BRL[row + i] - USD/BRL[row]``:
    +1 for a positive change, -1 for a negative change, 0 for no change.
    Rows whose change cannot be computed (the last ``i`` rows, or rows with a
    missing price) are left missing (``<NA>``) rather than labelled 0.

    The look-ahead is positional: "t+i" means *i rows further down*. That
    equals i minutes only when ``df`` holds exactly one row per minute in
    chronological order. If ``df`` has a ``'Timestamp'`` column with any
    other spacing, a ``UserWarning`` is emitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``'USD/BRL'`` column. Modified in place.
    horizon_minutes : int, default 20
        Largest look-ahead step; one column is added per step from 1 upward.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the label columns added (nullable
        ``Int64`` dtype).
    """
    import warnings

    if 'Timestamp' in df.columns and len(df) > 1:
        gaps = pd.to_datetime(df['Timestamp'], errors='coerce').diff().dropna()
        if not gaps.eq(pd.Timedelta(minutes=1)).all():
            warnings.warn(
                "compute_forward_returns expects one row per minute; rows are not "
                "spaced exactly one minute apart, so 't+k' columns measure k rows "
                "ahead, not k minutes ahead.",
                stacklevel=2,
            )

    price = df['USD/BRL']
    for i in range(1, horizon_minutes + 1):
        # price i rows ahead minus the current price; NaN where either is unknown
        change = price.shift(-i) - price

        # map to +1, -1, 0 for the directional accuracy (DA) metric; np.sign
        # keeps NaN as NaN, and the nullable Int64 dtype stores it as <NA>
        # while the labels stay integers
        df[f'Forward Return t+{i}'] = np.sign(change).astype('Int64')

    return df

In [4]:
# compute_forward_returns looks i ROWS ahead. merged_df holds one row per
# headline, so applying it there would compare each headline with the i-th
# next headline rather than with the price i minutes later. Compute the labels
# on the continuous minute-level FX series (one row per minute) and only then
# join them onto the headlines by timestamp.
fx_with_returns = compute_forward_returns(
    fx_df.sort_values('Timestamp').reset_index(drop=True),  # new frame: fx_df itself is left untouched
    horizon_minutes=20,
)
merged_df = news_df.merge(fx_with_returns, on='Timestamp', how='left')
merged_df.to_csv("../data/processed/experimental_dataset.csv", index=False, encoding='utf-8-sig')