# HARLF v3: Data Collection

A simple data collection pipeline that gathers price data, derives technical indicators, and prepares the features used for NLP.

**Steps:**
1. Setup & imports
2. Collect price data 
3. Calculate technical indicators
4. Create NLP features
5. Visualize & export

In [1]:
# Imports and setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from pathlib import Path
import pandas as pd
import yfinance as yf
import numpy as np
import warnings
from datetime import datetime


print("✅ Ready for data collection")

✅ Ready for data collection


In [2]:
# CSV file with the portfolio holdings; it is read for its 'Ticker' column.
PORTFOLIO_FILE = "../data/portfolio_holdings.csv"  # Fixed path
# Look-back window passed to yf.download as its `period` argument.
PERIOD = "20y"

In [3]:
def collect_numerical_data(portfolio_file=PORTFOLIO_FILE, period=PERIOD):
    """Download closing prices for the portfolio and derive per-ticker features.

    The tickers are read from the ``Ticker`` column of ``portfolio_file``.
    Auto-adjusted closing prices are downloaded with ``yf.download`` and the
    following features are computed for every ticker:

    * ``log_return``: daily log return
    * ``sma10`` / ``sma30``: 10/30-day simple moving average relative to the
      current price (``SMA / price - 1``)
    * ``volatility_10``: 10-day rolling standard deviation of the log returns
    * ``log_return_mean_N`` / ``log_return_std_N`` / ``sharpe_N`` for
      ``N`` in (60, 120): rolling mean, rolling standard deviation and their
      ratio (mean / std, with a small epsilon in the denominator)

    Args:
        portfolio_file: Path of a CSV file containing a ``Ticker`` column.
        period: Look-back window forwarded to ``yf.download`` (e.g. ``"20y"``).

    Returns:
        A tuple ``(features_df, prices, log_returns, tickers)``:

        * ``features_df``: long-form DataFrame with one row per
          (ticker, date), values rounded to 4 decimals, missing values as 0.
        * ``prices``: wide DataFrame of closing prices rounded to 2 decimals,
          missing values as 0.
        * ``log_returns``: wide DataFrame of daily log returns, missing
          values as 0.
        * ``tickers``: list of the unique tickers found in the portfolio file.

    Raises:
        ValueError: If the portfolio file contains no tickers or the download
            returned no rows.
    """
    print(f"Starting data collection at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    portfolio = pd.read_csv(portfolio_file)
    # Blank rows in the CSV surface as NaN, which is not a valid ticker symbol.
    tickers = portfolio['Ticker'].dropna().unique().tolist()
    if not tickers:
        raise ValueError(f"No tickers found in {portfolio_file}")

    print(f" Portfolio loaded successfully with {len(tickers)} unique tickers")
    print(f"Tickers: {tickers}")

    data = yf.download(tickers, period=period, auto_adjust=True)['Close']
    if isinstance(data, pd.Series):
        # A single-ticker download may come back as a Series; the code below
        # expects one column per ticker.
        data = data.to_frame(name=tickers[0])
    data = data.round(2)

    if data.empty:
        # Fail early with a clear message instead of crashing on the date
        # formatting below.
        raise ValueError(f"No price data downloaded for tickers {tickers} (period={period!r})")

    no_data = [ticker for ticker in data.columns if data[ticker].isna().all()]
    if no_data:
        # These columns end up as all zeros in every returned frame.
        warnings.warn(f"No price data returned for: {no_data}; their values will be filled with 0")

    print(f" Data download completed. Shape: {data.shape}")
    print(f"Date range: {data.index.min().strftime('%Y-%m-%d')} to {data.index.max().strftime('%Y-%m-%d')}")
    print(f"Number of trading days: {len(data)}")

    def _finite(frame):
        # A price that rounds to 0.00 produces +/-inf in ratios and logs;
        # treat those like missing values so they are filled with 0 as well.
        return frame.replace([np.inf, -np.inf], np.nan)

    log_returns = _finite(np.log(data / data.shift(1))).fillna(0)

    # Moving averages expressed relative to the current price.
    sma10 = _finite(data.rolling(10).mean() / data - 1)  # 10-day SMA
    sma30 = _finite(data.rolling(30).mean() / data - 1)  # 30-day SMA

    # Volatility: 10-day rolling standard deviation of the log returns.
    volatility_10 = log_returns.rolling(10).std().fillna(0)

    # 60-day rolling statistics; the epsilon avoids a division by zero.
    log_return_mean_60 = log_returns.rolling(60).mean()
    log_return_std_60 = log_returns.rolling(60).std()
    sharpe_60 = log_return_mean_60 / (log_return_std_60 + 1e-8)

    # 120-day rolling statistics.
    log_return_mean_120 = log_returns.rolling(120).mean()
    log_return_std_120 = log_returns.rolling(120).std()
    sharpe_120 = log_return_mean_120 / (log_return_std_120 + 1e-8)

    print(f"\n Dataset Summary:")

    # Stack the per-ticker features into one long-form DataFrame for ML/RL.
    feature_frames = []
    for ticker in data.columns:
        ticker_features = pd.DataFrame({
            'Date': log_returns.index,
            'Ticker': ticker,
            'log_return': log_returns[ticker].values,
            'sma10': sma10[ticker].values,
            'sma30': sma30[ticker].values,
            'volatility_10': volatility_10[ticker].values,
            'log_return_mean_60': log_return_mean_60[ticker].values,
            'log_return_std_60': log_return_std_60[ticker].values,
            'sharpe_60': sharpe_60[ticker].values,
            'log_return_mean_120': log_return_mean_120[ticker].values,
            'log_return_std_120': log_return_std_120[ticker].values,
            'sharpe_120': sharpe_120[ticker].values
        })
        feature_frames.append(ticker_features.round(4))

    # Rolling windows leave NaNs at the start of every series; fill them with 0.
    features_df = pd.concat(feature_frames, ignore_index=True).fillna(0)

    print(f"Features shape: {features_df.shape}")
    print(f"\n Date range: {features_df.Date.min().strftime('%Y-%m-%d')} to {features_df.Date.max().strftime('%Y-%m-%d')}")

    # Price table: dates without a quote (e.g. before a listing) become 0.
    prices = data.fillna(0)
    print(f"Price data shape: {prices.shape}")
    print(f"\n Date range: {prices.index.min().strftime('%Y-%m-%d')} to {prices.index.max().strftime('%Y-%m-%d')}")

    return features_df, prices, log_returns, tickers

# Run the collection with the default portfolio file and period; the four results are saved to disk further down.
features_df, prices, log_returns, tickers = collect_numerical_data()

Starting data collection at 2025-08-07 11:31:53
 Portfolio loaded successfully with 18 unique tickers
Tickers: ['RDDT', 'NVDA', 'SMR', 'MU', 'MRVL', 'MSFT', 'ASML', 'AEM', 'AMD', 'VERU', 'AI', 'GOOGL', 'INGM', 'PLUG', 'IONQ', 'CHYM', 'RGTI', 'ARBE']


[*********************100%***********************]  18 of 18 completed

 Data download completed. Shape: (5031, 18)
Date range: 2005-08-08 to 2025-08-06
Number of trading days: 5031

 Dataset Summary:
Features shape: (90558, 12)

 Date range: 2005-08-08 to 2025-08-06
Price data shape: (5031, 18)

 Date range: 2005-08-08 to 2025-08-06





In [5]:
# Persist the training inputs: the feature table is written without its
# index, the price table keeps its index.
features_df.to_csv("../data/features_df_for_training.csv", index=False)
prices.to_csv("../data/price_data_for_training.csv", index=True)

# Log returns are stored separately for sentiment analysis validation.
log_returns.to_csv("../data/log_returns_data.csv", index=True)

# Ticker list used for RL, one symbol per line.
with open('rl_tickers.txt', 'w') as ticker_file:
    ticker_file.writelines(f"{ticker}\n" for ticker in tickers)

finished_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"\n Data collection completed successfully at {finished_at}")


 Data collection completed successfully at 2025-08-07 11:33:23
