### Setup

In [None]:
import pandas as pd
from pathlib import Path
import os
import re
import numpy as np

### Initial preprocessing: raw -> interim

In [None]:
def load_corpus(input, output):
    """Convert the raw Excel corpus into a two-column interim CSV.

    Reads the 'date', 'time' and 'HEADING' columns from the workbook at
    ``input``, merges date and time into a single timestamp string formatted
    as ``YYYY-MM-DD HH:MM:SS``, and writes the columns ``date`` and
    ``headline`` to ``output`` as UTF-8 CSV. Rows whose timestamp cannot be
    parsed are dropped. Parent directories of ``output`` are created if
    they are missing.

    Parameters
    ----------
    input : path to the Excel workbook (read with the openpyxl engine).
    output : path of the CSV file to write.
    """
    raw = pd.read_excel(input, usecols=['date', 'time', 'HEADING'], engine="openpyxl")

    date_text = raw['date'].astype(str).str.strip()
    time_text = raw['time'].astype(str).str.strip()

    # The 'date' cell may carry its own trailing time component; drop it so
    # that the dedicated 'time' column is the only source of time-of-day.
    date_only = date_text.str.replace(r'([ T]\d{1,2}:\d{2}(:\d{2})?)$', '', regex=True)
    stamp_text = date_only + " " + time_text

    # First pass reads ambiguous dates as day-first; anything that fails is
    # given a second chance with month-first parsing.
    stamps = pd.to_datetime(stamp_text, errors='coerce', dayfirst=True)
    unparsed = stamps.isna()
    if unparsed.any():
        stamps.loc[unparsed] = pd.to_datetime(stamp_text[unparsed], errors='coerce', dayfirst=False)

    tidy = (
        raw.assign(date=stamps.dt.strftime("%Y-%m-%d %H:%M:%S"))  # one consistent format
        .loc[:, ['date', 'HEADING']]
        .rename(columns={'HEADING': 'headline'})
        .dropna(subset=['date'])  # unparseable timestamps end up missing here
    )
    # Collapse embedded line breaks so every headline occupies a single CSV line.
    tidy['headline'] = tidy['headline'].astype(str).str.replace(r'[\r\n]+', ' ', regex=True)

    destination = Path(output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    tidy.to_csv(output, index=False, encoding='utf-8')

# Run the raw -> interim conversion for the project corpus.
load_corpus(
    input="../data/raw/corpus.xlsx",
    output="../data/interim/corpus.csv",
)

### Final preprocessing: interim -> processed

In [11]:
# Keep only headlines published before August 1, 2025 (that's when we stop
# trading); everything from that day onward is discarded.
df = pd.read_csv("../data/interim/corpus.csv", parse_dates=["date"])
# Force a datetime dtype even if parse_dates left the column as text;
# unparseable values become NaT and are filtered out by the comparison below.
df['date'] = pd.to_datetime(df['date'], errors="coerce")
df = df.rename(columns={"date": "timestamp"})
# `cutoff` is the last calendar day to keep. It parses as midnight at the
# START of July 31, so comparing full timestamps against it would drop almost
# all of that day. Compare on the calendar day instead.
cutoff = pd.to_datetime("2025-07-31")
df = df[df["timestamp"].dt.normalize() <= cutoff]

In [12]:
# Report every row that shares its (timestamp, headline) pair with another
# row, then keep only the first occurrence of each pair.
key_cols = ["timestamp", "headline"]
dup = df.loc[df.duplicated(subset=key_cols, keep=False)]
print("Duplicates:", dup, sep="\n")
df = df.drop_duplicates(subset=key_cols, keep="first")

Duplicates:
                timestamp                                           headline
239   2024-11-26 10:02:00                                  Reação ao IPCA-15
242   2024-11-26 10:02:00                                  Reação ao IPCA-15
488   2024-11-28 09:02:00  AOVIVO/Haddad sobre IR: Nosso objetivo é que e...
489   2024-11-28 09:02:00  Ela irá beneficiar todo mundo que ganha até 5 ...
490   2024-11-28 09:02:00  Com essa fórmula de cálculo, a suposta renúnci...
493   2024-11-28 09:02:00  AOVIVO/Haddad sobre IR: Nosso objetivo é que e...
494   2024-11-28 09:02:00  Ela irá beneficiar todo mundo que ganha até 5 ...
495   2024-11-28 09:02:00  Com essa fórmula de cálculo, a suposta renúnci...
599   2024-11-28 11:54:00  MERCADOS:  Sob pressão do fiscal, Ibovespa ame...
600   2024-11-28 11:54:00  MERCADOS:  Sob pressão do fiscal, Ibovespa ame...
856   2024-12-02 10:56:00  AOVIVO/Galípolo diz que a questão da meta é pá...
857   2024-12-02 10:56:00  AOVIVO/Galípolo diz que a questão da 

In [13]:
MOJIBAKE_MAP = {
    "√°": "á", "√£": "ã", "√©": "é", "√º": "ú", "√≥": "ó", "√±": "ñ",
    "√§": "ç", "√¶": "õ", "√•": "í", "√∫": "ú", "√™": "’",
    "ß": "ç", "‚Äò": "’", "‚Äú": "“", "‚Äù": "”", "‚Äì": "–",
    "´": "’",
}

def clean_headline(text: str, min_chars: int = 40) -> str | None:
    if not isinstance(text, str):
        return None

    # fix mojibake
    for bad, good in MOJIBAKE_MAP.items():
        text = text.replace(bad, good)

    # strip newswire prefixes and junk symbols
    text = re.sub(r"^(\*|BC:|[A-Za-zÀ-ÿ]+/|[A-Za-zÀ-ÿ]+:)\s*", "", text)

    # normalize characters/spaces, strip trailing colon
    text = re.sub(r"[^\w\s.,;:!?-ÁÉÍÓÚÂÊÔÃÕáéíóúâêôãõçÇñÑ]+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r":$", "", text).strip()

    # drop if too short in characters
    return text if len(text) >= min_chars else None

In [None]:
# Apply cleaning but keep both original and cleaned values
df["cleaned_headline"] = df["headline"].apply(clean_headline)

# Separate valid and removed rows: a missing cleaned value means
# clean_headline rejected the row.
removed_df = df[df["cleaned_headline"].isna()].copy().drop(columns=["cleaned_headline"])
cleaned_df = df[df["cleaned_headline"].notna()].copy()

# Replace headline with cleaned version for the kept rows
cleaned_df = cleaned_df.drop(columns=["headline"])
cleaned_df = cleaned_df.rename(columns={"cleaned_headline": "headline"})

# Save outputs. Create the target directories first so a fresh checkout
# (where ../data/processed may not exist yet) does not fail on write.
processed_path = Path("../data/processed/corpus.csv")
removed_path = Path("../data/interim/corpus-removed.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
removed_path.parent.mkdir(parents=True, exist_ok=True)
cleaned_df.to_csv(processed_path, index=False, encoding="utf-8")
removed_df.to_csv(removed_path, index=False, encoding="utf-8")