# Data Collection and Preparation

This notebook demonstrates the data collection pipeline for stock market volatility prediction using alternative data sources.

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from src.utils.config import load_config
from src.data.finance_yf import download_price_history
from src.data.trends_pytrends import fetch_trends
from src.data.twitter_snscrape import fetch_tweets_for_tickers
from src.data.reddit_praw import fetch_reddit_posts

## Configuration and Setup

In [None]:
# Load the project configuration and echo the directories it exposes
cfg = load_config()
print(
    f"Data directory: {cfg.data_dir}",
    f"Raw data: {cfg.raw_dir}",
    f"Processed data: {cfg.processed_dir}",
    sep="\n",
)

## 1. Financial Data Collection (Yahoo Finance)

In [None]:
# Ticker universe and date window; later cells reuse these three names
tickers = ['AAPL', 'MSFT', 'NVDA', 'SPY']
start_date = '2020-01-01'
end_date = datetime.now().date().isoformat()  # today, as YYYY-MM-DD

# Pull the price history for the whole window in one call
print(f"Downloading data for {tickers} from {start_date} to {end_date}")
prices = download_price_history(tickers, start_date, end_date)
print(
    f"Downloaded {len(prices)} rows of price data",
    f"Columns: {list(prices.columns)}",
    sep="\n",
)

# Last expression of the cell, so the notebook renders the preview
prices.head()

In [None]:
# Visualize price data: one panel per ticker, two panels per row.
# The grid is sized from len(tickers) instead of a fixed 2x2 layout, so the
# cell keeps working when the ticker list is edited (a fixed 2x2 grid raises
# IndexError as soon as a fifth ticker is added).
n_cols = 2
n_rows = max(1, -(-len(tickers) // n_cols))  # ceiling division, never zero rows
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(15, 5 * n_rows),  # 5 inches per row -> (15, 10) for four tickers
    squeeze=False,  # always a 2-D array of axes, even with a single row
)
panels = axes.ravel()  # row-major order: panel i is axes[i // 2, i % 2]
for ax, ticker in zip(panels, tickers):
    close_col = f'{ticker}_close'
    # Tickers without a matching close column leave their panel empty
    if close_col in prices.columns:
        prices[close_col].plot(ax=ax, title=f'{ticker} Close Price')
        ax.set_ylabel('Price ($)')
# With an odd number of tickers the last grid slot has nothing to show
for spare_ax in panels[len(tickers):]:
    spare_ax.set_visible(False)
plt.tight_layout()
plt.show()

## 2. Google Trends Data

In [None]:
# Pull Google Trends interest for the same tickers and date window
print("Fetching Google Trends data...")
trends = fetch_trends(tickers, start_date, end_date)
print(
    f"Downloaded {len(trends)} rows of trends data",
    f"Columns: {list(trends.columns)}",
    sep="\n",
)

# Last expression of the cell, so the notebook renders the preview
trends.head()

In [None]:
# Overlay the trends series of every ticker on a single set of axes
fig, ax = plt.subplots(figsize=(12, 6))
for ticker in tickers:
    # Skip tickers for which no trends column came back
    if ticker not in trends.columns:
        continue
    trends[ticker].plot(ax=ax, label=ticker, alpha=0.7)
ax.set(title='Google Trends Interest Over Time', ylabel='Interest Score')
ax.legend()
plt.show()

## 3. Social Media Data Collection

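Note: Twitter and Reddit data collection may take some time. Reddit collection requires API credentials. If either collector fails, the error is printed and an empty DataFrame is used so the rest of the notebook can still run.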

In [None]:
# Twitter data collection (using snscrape)
# Best-effort: any failure is reported and `tweets` falls back to an empty
# DataFrame, so later cells that read `tweets` still run.
print("Collecting Twitter data...")
try:
    tweets = fetch_tweets_for_tickers(tickers, start_date, end_date, limit_per_ticker=100)
    print(f"Collected {len(tweets)} tweets")
    if not tweets.empty:
        print(f"Tweet columns: {list(tweets.columns)}")
        # A bare `tweets.head()` nested inside try/if is never rendered by the
        # notebook (only a cell's final top-level expression is displayed),
        # so print the preview explicitly.
        print(tweets.head())
except Exception as e:
    print(f"Twitter collection failed: {e}")
    tweets = pd.DataFrame()

In [None]:
# Reddit data collection (requires credentials)
# Best-effort: any failure is reported and `reddit_posts` falls back to an
# empty DataFrame, so later cells that read `reddit_posts` still run.
print("Collecting Reddit data...")
try:
    # The date strings are parsed as naive datetimes, so .timestamp() interprets
    # them in the machine's local timezone.
    # NOTE(review): confirm whether fetch_reddit_posts expects UTC epoch seconds.
    start_ts = int(datetime.strptime(start_date, '%Y-%m-%d').timestamp())
    end_ts = int(datetime.strptime(end_date, '%Y-%m-%d').timestamp())
    reddit_posts = fetch_reddit_posts(tickers, start_ts, end_ts, limit_per_ticker=50)
    print(f"Collected {len(reddit_posts)} Reddit posts")
    if not reddit_posts.empty:
        print(f"Reddit columns: {list(reddit_posts.columns)}")
        # A bare `reddit_posts.head()` nested inside try/if is never rendered by
        # the notebook (only a cell's final top-level expression is displayed),
        # so print the preview explicitly.
        print(reddit_posts.head())
except Exception as e:
    print(f"Reddit collection failed: {e}")
    reddit_posts = pd.DataFrame()

## 4. Data Quality Assessment

In [None]:
# Row counts for every source, plus the total number of missing cells for the
# two tabular market sources
print(
    "=== Data Quality Report ===",
    f"Price data: {len(prices)} rows, {prices.isna().sum().sum()} missing values",
    f"Trends data: {len(trends)} rows, {trends.isna().sum().sum()} missing values",
    f"Twitter data: {len(tweets)} rows",
    f"Reddit data: {len(reddit_posts)} rows",
    sep="\n",
)

# First and last index value of the two indexed frames
print(
    "\n=== Date Ranges ===",
    f"Prices: {prices.index.min()} to {prices.index.max()}",
    f"Trends: {trends.index.min()} to {trends.index.max()}",
    sep="\n",
)

## 5. Save Raw Data

In [None]:
# Persist every collected frame as CSV under the configured raw-data directory
raw_dir = cfg.raw_dir
os.makedirs(raw_dir, exist_ok=True)

# Price and trends frames are written with their index
prices.to_csv(os.path.join(raw_dir, 'prices.csv'))
trends.to_csv(os.path.join(raw_dir, 'trends.csv'))

# Social frames are written without the index, and only when they hold data
if not tweets.empty:
    tweets_path = os.path.join(raw_dir, 'tweets.csv')
    tweets.to_csv(tweets_path, index=False)
if not reddit_posts.empty:
    reddit_path = os.path.join(raw_dir, 'reddit_posts.csv')
    reddit_posts.to_csv(reddit_path, index=False)

print("Raw data saved successfully!")