# Exploratory Data Analysis

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np
import datetime

In [None]:
# Read the two interim CSVs: the news corpus (stage 0) and the USD/BRL price series.
news_df = pd.read_csv('../data/interim/bdm-corpus-2/stage-0.csv')
fx_df = pd.read_csv('../data/interim/usd-brl.csv')

# Show the first rows of each frame under a short label.
for label, frame in (('News dataset:', news_df), ('Exchange rate dataset:', fx_df)):
    print(label)
    display(frame.head())

News dataset:


Unnamed: 0,Timestamp,HEADING,ARTICLE CONTENT,COMMENTS,USD/BRL
0,2024-11-21 14:34:00,BC: Fluxo cambial total na semana passada (11 ...,,Fluxo cambial negativo significa menos reserva...,5.8177
1,2024-11-21 14:34:00,"Fluxo total em novembro, até dia 14, está nega...",,Fluxo cambial negativo significa menos reserva...,5.8177
2,2024-11-21 14:35:00,Itaú BBA mantém recomendação de compra para Di...,,Notícias do mercado de ações não afetam a taxa...,5.8206
3,2024-11-21 14:35:00,Banco destaca que desempenho da empresa segue ...,,Notícias do mercado de ações não afetam a taxa...,5.8206
4,2024-11-21 14:36:00,Fed/Goolsbee: Juros devem se aproximar do níve...,,O FED anuncia que a taxa de equilíbrio da econ...,5.8192


Exchange rate dataset:


Unnamed: 0,Timestamp,USD/BRL
0,2024-09-02T09:00:00,5.6379
1,2024-09-02T09:01:00,5.634
2,2024-09-02T09:02:00,5.6277
3,2024-09-02T09:03:00,5.6238
4,2024-09-02T09:04:00,5.6189


## Headline EDA
### Figures

In [None]:
# Figure: This bar chart displays the 20 most frequent tokens found in the headlines, with common stopwords removed.
from collections import Counter
import re

# Headlines may mix languages, so filter against both the Portuguese and English NLTK stopword lists.
stop_words = set(stopwords.words('portuguese')).union(stopwords.words('english'))

def tokenize(text):
    """Split ``text`` into lowercase word tokens, dropping stopwords and 1-character tokens.

    The input is coerced with ``str()`` first, so non-string values are tokenized
    by their string form. Punctuation is discarded because only ``\\w+`` runs are kept.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    lowered = str(text).lower()
    kept = []
    for word in re.findall(r'\b\w+\b', lowered):
        # Skip single-character tokens and anything on the stopword list.
        if len(word) <= 1 or word in stop_words:
            continue
        kept.append(word)
    return kept

# Flatten the tokens of every non-missing headline into one list, then count them.
all_tokens = [tok for headline in news_df['HEADING'].dropna() for tok in tokenize(headline)]
token_counts = Counter(all_tokens)
top_tokens = token_counts.most_common(20)

# Split the (token, count) pairs into two parallel tuples for plotting.
tokens, counts = zip(*top_tokens)

fig, ax = plt.subplots(figsize=(12,6))
sns.barplot(x=list(tokens), y=list(counts), palette='viridis', ax=ax)
ax.set_title('Top 20 Most Common Tokens in Headlines')
ax.set_xlabel('Token')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)  # tilt the token labels so they do not overlap
fig.tight_layout()
plt.show()

In [None]:
# Figure: histogram of how many tokens each non-missing headline yields after tokenize()
# (i.e. after stopwords and single-character tokens have been removed).
headline_lengths = list(map(len, map(tokenize, news_df['HEADING'].dropna())))

fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(headline_lengths, bins=30, ax=ax)
ax.set_title('Distribution of Headline Lengths (in Tokens)')
ax.set_xlabel('Number of Tokens')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

### Tables

In [None]:
# Table: the first 10 headlines with their timestamps, to illustrate the kind of
# financial news covered. The source outlet is shown too, but only when the frame
# actually carries a SOURCE column.
optional_cols = ['SOURCE'] if 'SOURCE' in news_df.columns else []
example_cols = ['Timestamp', 'HEADING'] + optional_cols
display(news_df.loc[:, example_cols].head(10))

In [None]:
# Table: mean, median and standard deviation of the per-headline token counts.
import statistics

# Recomputed here so the cell does not depend on the histogram cell having run.
headline_lengths = list(map(len, map(tokenize, news_df['HEADING'].dropna())))

# np.std is called with its defaults, i.e. the population standard deviation (ddof=0).
summary = {
    label: reducer(headline_lengths)
    for label, reducer in (('Mean', np.mean), ('Median', np.median), ('Std', np.std))
}
display(pd.DataFrame([summary]))

In [None]:
# Table: number of headlines and total token count per day, useful for computing
# daily lexical density and monitoring coverage gaps.
news_df['date'] = pd.to_datetime(news_df['Timestamp']).dt.date

# count() ignores missing headings, so headline_count covers non-null headlines only.
daily_headlines = news_df.groupby('date')['HEADING'].count()

# Drop missing headings before tokenizing: tokenize() coerces its input with str(),
# so a NaN would otherwise become the token "nan" and add one spurious token per
# missing headline, making token_count inconsistent with headline_count.
daily_tokens = news_df.groupby('date')['HEADING'].apply(
    lambda headings: sum(len(tokenize(h)) for h in headings.dropna())
)

daily_stats = pd.DataFrame({'headline_count': daily_headlines, 'token_count': daily_tokens})
display(daily_stats)

In [None]:
# Table: news density by hour of day - the average number of headlines published in
# each clock hour, i.e. total headlines in that hour divided by the number of distinct
# calendar days present in the dataset.
news_ts = pd.to_datetime(news_df['Timestamp'])

# Use the hour component (0-23). The previous version stored .dt.date here, which
# grouped by calendar date and produced a per-day table rather than a per-hour one.
news_df['hour_of_day'] = news_ts.dt.hour

# Derive the day count from the timestamps directly so this cell does not depend on
# a 'date' column created by another cell.
n_days = news_ts.dt.date.nunique()

hourly_density = news_df.groupby('hour_of_day').size().div(n_days)
display(hourly_density)

In [None]:
# Table: the 20 calendar days with the largest number of rows (headlines) in the corpus.
news_df['date'] = pd.to_datetime(news_df['Timestamp']).dt.date

# Rows per day, busiest first.
daily_counts = news_df.groupby('date').size().sort_values(ascending=False)

# Keep the top 20 and label the two resulting columns for display.
top20_days = (
    daily_counts
    .head(20)
    .rename_axis('Date')
    .reset_index(name='Headline Count')
)
display(top20_days)

## Article Content EDA

In [None]:
# Table: mean / median / std of the token count of full-text articles, computed only
# over rows that actually have article content. Prints a message instead when the
# column is absent or entirely empty.
if 'ARTICLE CONTENT' not in news_df.columns:
    print('ARTICLE CONTENT column not found.')
else:
    article_lengths = [len(tokenize(body)) for body in news_df['ARTICLE CONTENT'].dropna()]
    if not article_lengths:
        print('No article content available.')
    else:
        summary = {
            label: reducer(article_lengths)
            for label, reducer in (('Mean', np.mean), ('Median', np.median), ('Std', np.std))
        }
        display(pd.DataFrame([summary]))

## Exchange Rate Data EDA

In [None]:
# Figure: daily exchange rate volatility.
# This line chart shows how the volatility of the USD/BRL exchange rate changes over
# time, computed as the standard deviation of the returns between consecutive
# observations within each day.
fx_df['Timestamp'] = pd.to_datetime(fx_df['Timestamp'])
fx_df = fx_df.sort_values('Timestamp')
fx_df['date'] = fx_df['Timestamp'].dt.date

# Compute returns per day rather than over the whole series. A plain pct_change()
# makes the first observation of each day a return against the previous day's last
# price (an overnight move, not an intraday one), which inflates that day's std.
# With the grouped version the first observation of each day is NaN and is ignored
# by std().
# NOTE(review): consecutive rows are assumed to be one minute apart; gaps inside a
# day are not handled here - confirm against the data.
fx_df['return'] = fx_df.groupby('date')['USD/BRL'].pct_change()
volatility = fx_df.groupby('date')['return'].std()

plt.figure(figsize=(12,6))
volatility.plot()
plt.title('Daily Average Exchange Rate Volatility (USD/BRL)')
plt.xlabel('Date')
plt.ylabel('Volatility (Std of 1-min returns)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Figure: daily mean USD/BRL exchange rate across the full time span of the series.
fx_df['date'] = pd.to_datetime(fx_df['Timestamp']).dt.date

plt.figure(figsize=(12,6))
# Average all observations of a day into a single point per date.
fx_df.groupby('date')['USD/BRL'].mean().plot()
plt.title('USD/BRL Exchange Rate Over Time')
plt.xlabel('Date')
plt.ylabel('Exchange Rate')
plt.xticks(rotation=45)
plt.tight_layout()
# Render explicitly, as every other figure cell in this notebook does.
plt.show()

In [None]:
# Table: the 20 days with the highest FX volatility, where volatility is the standard
# deviation of the returns between consecutive observations within a day.
# Parse and sort here so the cell gives the same result regardless of which other
# cells have already run.
fx_df['Timestamp'] = pd.to_datetime(fx_df['Timestamp'])
fx_df = fx_df.sort_values('Timestamp')
fx_df['date'] = fx_df['Timestamp'].dt.date

# Returns are computed per day so that the first observation of a day is NaN instead
# of an overnight return against the previous day's last price, which would otherwise
# distort the ranking.
fx_df['return'] = fx_df.groupby('date')['USD/BRL'].pct_change()
volatility_by_day = fx_df.groupby('date')['return'].std()

top20_vol_days = volatility_by_day.sort_values(ascending=False).head(20).reset_index()
top20_vol_days.columns = ['Date', 'Volatility']
display(top20_vol_days)

## Forward Returns

In [None]:
# Table: for each forward-return horizon, how many rows are labelled -1, 0 and 1.
df = pd.read_csv("../data/interim/bdm-corpus-2/stage-1.csv")

# One label column per horizon, t+1 through t+20.
# NOTE(review): the saved output of this cell lists only t+1..t+10 - confirm that
# stage-1.csv really contains all 20 horizons.
forward_return_columns = [f"Forward Return t+{i}" for i in range(1, 21)]

# reindex() pins the label order and fills a zero for any label absent from a column.
value_counts = {
    col: df[col].value_counts().reindex([-1, 0, 1], fill_value=0)
    for col in forward_return_columns
}

# Transpose so that horizons are rows and labels are columns.
counts_df = pd.DataFrame(value_counts).T
counts_df.columns = ['Count -1', 'Count 0', 'Count 1']
counts_df = counts_df.assign(Total=counts_df.sum(axis=1))

display(counts_df)

Unnamed: 0,Count -1,Count 0,Count 1,Total
Forward Return t+1,1712,123,1684,3519
Forward Return t+2,1735,58,1726,3519
Forward Return t+3,1730,79,1710,3519
Forward Return t+4,1717,86,1716,3519
Forward Return t+5,1682,54,1783,3519
Forward Return t+6,1625,46,1848,3519
Forward Return t+7,1641,58,1820,3519
Forward Return t+8,1636,51,1832,3519
Forward Return t+9,1613,58,1848,3519
Forward Return t+10,1596,34,1889,3519


In [None]:
# Class balance of the "Direção" label in the stage-2 corpus, followed by the row total.
# NOTE(review): the saved counts equal the t+5 row of the forward-return table with
# the zero labels removed - confirm that stage 2 is derived from that horizon.
df = pd.read_csv("../data/interim/bdm-corpus-2/stage-2.csv")
counts = df["Direção"].value_counts()
print(counts, f"Total: {counts.sum()}", sep="\n")

Direção
Aumento       1783
Diminuição    1682
Name: count, dtype: int64
Total: 3465
