### Preprocessing

In [None]:
import pandas as pd
from dateutil import parser

Bom Dia Mercado (BDM) → xlsx file with the BDM articles (plus extra columns that are dropped below) → preprocess to CSV (`data/interim/bdm-corpus-2.csv`) → export to repository → final dataset

In [None]:
# Step 0: Read the raw BDM workbook into a DataFrame
df = pd.read_excel(io="../data/raw/bdm-corpus-2.xlsx")

# Step 1–3: Normalize individual DATE and TIME cells
def parse_datetime_components(date_cell, time_cell):
    """Combine one DATE cell and one TIME cell into an ISO 8601 timestamp string.

    Cells may be text or native date/time objects (Excel readers can hand back
    either). Native objects are combined directly instead of being turned into
    text first: a native date stringifies year-first (``2023-05-01``), and
    parsing that text with ``dayfirst=True`` can make dateutil read it as
    year-day-month, silently swapping day and month.

    Parameters
    ----------
    date_cell : str, datetime.date, datetime.datetime or pandas.Timestamp
        The calendar date. Text is parsed day-first. For datetime-like values
        only the calendar date is used; their clock part is ignored.
    time_cell : str, datetime.time or datetime.datetime
        The time of day. For datetime-like values only the clock part is used.

    Returns
    -------
    str or pandas.NaT
        ``datetime.isoformat()`` of the combined value, or ``pd.NaT`` when
        either cell is missing or the pair cannot be parsed.
    """
    # Function-scope import so this definition does not depend on the import cell.
    from datetime import date, datetime, time

    try:
        # Missing cells (None / NaN / NaT) are invalid rows. This check must come
        # before the isinstance tests below because pd.NaT is itself a datetime.
        if pd.isna(date_cell) or pd.isna(time_cell):
            return pd.NaT

        # A time cell that arrives as a full datetime only contributes its clock part.
        if isinstance(time_cell, datetime):
            time_cell = time_cell.time()

        if isinstance(date_cell, date):  # also matches datetime and pd.Timestamp
            day = date_cell.date() if isinstance(date_cell, datetime) else date_cell
            if isinstance(time_cell, time):
                # Both parts are already typed: no text parsing, no ambiguity.
                return datetime.combine(day, time_cell).isoformat()
            # Render the date day-first so the dayfirst=True parse below reads it back correctly.
            date_str = day.strftime("%d/%m/%Y")
        else:
            date_str = str(date_cell).strip()

        time_str = str(time_cell).strip()

        # Combine and parse flexibly; ambiguous dd/mm text is read day-first.
        dt = parser.parse(f"{date_str} {time_str}", dayfirst=True)
        return dt.isoformat()
    except Exception:
        # Deliberate best-effort: any unparseable pair is marked invalid, not raised.
        return pd.NaT

# Step 4: Create ISO 8601 Timestamp column
# Row-wise apply: each row's DATE and TIME cells are combined by parse_datetime_components.
df['Timestamp'] = df.apply(lambda row: parse_datetime_components(row['DATE'], row['TIME']), axis=1)

# Step 5: Drop old columns (reassign instead of mutating in place)
df = df.drop(columns=['DATE', 'TIME', 'Index', 'DIRECTION', 'BRER', 'LABEL'])

# Step 6: Clean newlines in HEADING, ARTICLE CONTENT and COMMENTS
for col in ['HEADING', 'ARTICLE CONTENT', 'COMMENTS']:
    if col in df.columns:
        df[col] = (
            df[col]
            .astype(str)
            .str.replace(r'[\r\n]+', ' ', regex=True)  # collapse each run of line breaks into one space
            .str.strip()
            # astype(str) turns missing cells into the literal text "nan";
            # put the missing marker back so the CSV gets an empty field instead.
            .where(df[col].notna())
        )

# Step 7: Reorder columns so Timestamp comes first, the rest keep their order
df = df[['Timestamp'] + [col for col in df.columns if col != 'Timestamp']]

# Step 8: Save as CSV (utf-8-sig writes a BOM at the start of the file)
df.to_csv("../data/interim/bdm-corpus-2.csv", index=False, encoding='utf-8-sig')

# Step 9: Check for invalid rows, i.e. rows whose Timestamp could not be parsed
invalid_rows = df[df['Timestamp'].isna()]
print(f"{len(invalid_rows)} invalid rows found.")
print(invalid_rows)


0 invalid rows found.
Empty DataFrame
Columns: [Timestamp, HEADING, ARTICLE CONTENT, COMMENTS]
Index: []


Bloomberg → download USD/BRL exchange rates as an Excel file → preprocess to CSV (`data/processed/usd-brl.csv`) → export to repository → final dataset

In [None]:
# Step 0: Read the raw USD/BRL workbook
df = pd.read_excel("../data/raw/usd-brl.xlsx")

# Step 1: Tidy the header: trim stray whitespace from every column name,
# then give the date and price columns the names the following steps use.
df.columns = [header.strip() for header in df.columns]
df = df.rename(columns={"Date": "Raw Timestamp", "Último preço": "USD/BRL"})

# Step 2: Parse "Raw Timestamp" into ISO 8601 format
def parse_iso8601(raw):
    """Return ``raw`` as an ISO 8601 string, or ``pd.NaT`` if it cannot be parsed.

    The value is converted to text, stripped of surrounding whitespace and
    handed to dateutil's flexible parser; any failure along the way yields
    ``pd.NaT`` instead of raising.
    """
    try:
        text = str(raw).strip()
        moment = parser.parse(text)
        result = moment.isoformat()
    except Exception:
        # Best-effort: unparseable values are marked invalid rather than raised.
        result = pd.NaT
    return result

# Convert every raw timestamp; values that cannot be parsed become pd.NaT
df["Timestamp"] = df["Raw Timestamp"].apply(parse_iso8601)

# Steps 3-4: Keep only the two output columns, Timestamp first.
# This selection both discards "Raw Timestamp" and fixes the column order.
df = df.loc[:, ["Timestamp", "USD/BRL"]]

# Step 5: Write the result to CSV (utf-8-sig writes a BOM at the start of the file)
df.to_csv("../data/processed/usd-brl.csv", encoding="utf-8-sig", index=False)

# Step 6: Report rows whose timestamp could not be parsed (if any)
invalid_rows = df.loc[df["Timestamp"].isna()]
print(f"{len(invalid_rows)} invalid rows found.")
print(invalid_rows)

0 invalid rows found.
Empty DataFrame
Columns: [Timestamp, USD/BRL]
Index: []


`data/interim/bdm-corpus-2.csv` + `data/processed/usd-brl.csv` → left merge on `Timestamp` → `data/processed/bdm-corpus-2.csv`

In [None]:
'''
THE ABOVE VERSIONS WENT INTO INTERIM

Below we will process a universal dataset that contains both the Allen corpus and the USD/BRL exchange rate.
This dataset can be used for any experiment, including the FINBERT baseline model.

Step 1: Load interim data
Step 2: Match exchange rates to each row in the Allen corpus by matching timestamps
Step 3: Save the combined dataset as a new CSV file into data/processed as the universal dataset
Step 4: Refactor the code to a reusable script that can be run from the command line
'''

# Step 1: Read both CSVs back in, letting pandas parse the Timestamp column of each.
# NOTE: the corpus comes from data/interim, the exchange rates from data/processed.
bdm_df = pd.read_csv("../data/interim/bdm-corpus-2.csv", parse_dates=['Timestamp'])
usd_brl_df = pd.read_csv("../data/processed/usd-brl.csv", parse_dates=['Timestamp'])

# Step 2: Left join on Timestamp. Every corpus row is kept; a row with no
# exactly equal exchange-rate timestamp gets missing values in the rate columns.
merged_df = bdm_df.merge(usd_brl_df, how='left', on='Timestamp', suffixes=('', '_USD_BRL'))

# Step 3: Write the combined dataset
merged_df.to_csv("../data/processed/bdm-corpus-2.csv", encoding='utf-8-sig', index=False)
