In [3]:
import pandas as pd
import numpy as np
import yfinance as yf
import ta
from datetime import datetime, timedelta

In [4]:
def load_and_process_sentiment_data(filepath: str, ticker: str) -> pd.DataFrame:
    """Return one row of averaged sentiment columns per calendar day for ``ticker``.

    Reads the CSV at ``filepath``, keeps the rows whose 'Stock Name' equals
    ``ticker``, truncates each 'Date' timestamp to its calendar day, discards
    the 'Stock Name', 'Tweet' and 'Company Name' columns, and averages every
    remaining column per day.

    Returns:
        A DataFrame whose first column is 'Date' (``datetime.date`` values),
        followed by the per-day means of the remaining columns.
    """
    raw = pd.read_csv(filepath)

    per_ticker = (
        raw.loc[raw['Stock Name'] == ticker]
        # Parse the timestamp and keep only its calendar-day part.
        .assign(Day=lambda frame: pd.to_datetime(frame['Date']).dt.date)
        # The full timestamp and the text/identifier columns are not averaged.
        .drop(columns=['Stock Name', 'Tweet', 'Company Name', 'Date'])
        .rename(columns={'Day': 'Date'})
    )

    # Collapse to one row per day: mean of every remaining column.
    return per_ticker.groupby('Date').mean().reset_index()

In [5]:
def load_and_process_stock_data(ticker: str, sentiment_dates: pd.Series) -> pd.DataFrame:
    """Download daily prices for ``ticker`` and append technical indicators.

    The download window starts 50 calendar days before the earliest sentiment
    date, so the rolling indicators have history to warm up on, and ends one
    day after the latest sentiment date (yfinance documents ``end`` as
    exclusive).

    Args:
        ticker: Symbol passed to ``yf.download``.
        sentiment_dates: Dates that have sentiment data; only the earliest and
            latest values are used.

    Returns:
        The downloaded frame with 'Date' as a regular column of
        ``datetime.date`` values plus 'SMA_10', 'SMA_20', 'RSI', 'MACD' and
        'MACD_signal' columns computed from 'Close'.

    Raises:
        ValueError: If ``sentiment_dates`` is empty or the download returns
            no rows.
    """
    if len(sentiment_dates) == 0:
        raise ValueError(f"No sentiment dates available for {ticker}")

    warmup = timedelta(days=50)
    one_day = timedelta(days=1)

    # Shift only the two extremes instead of building two shifted Series.
    start_date = min(sentiment_dates) - warmup
    end_date = max(sentiment_dates) + one_day

    # Download stock data.
    # NOTE(review): the recorded cell output shows a warning raised at this
    # call; presumably yfinance's notice about the `auto_adjust` default -
    # confirm and consider passing `auto_adjust` explicitly.
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    if stock_data is None or stock_data.empty:
        # Fail loudly instead of carrying an empty frame through the pipeline.
        raise ValueError(
            f"No price data returned for {ticker} between {start_date} and {end_date}"
        )

    # Depending on the yfinance version/options the columns are either a
    # (field, ticker) MultiIndex or already flat; only flatten when needed.
    if isinstance(stock_data.columns, pd.MultiIndex):
        stock_data.columns = stock_data.columns.droplevel(level=1)
    stock_data = stock_data.reset_index()

    # Keep only the calendar day so the column matches the sentiment 'Date'.
    stock_data['Date'] = stock_data['Date'].dt.date

    # Add technical indicators
    stock_data['SMA_10'] = ta.trend.sma_indicator(stock_data['Close'], window=10)
    stock_data['SMA_20'] = ta.trend.sma_indicator(stock_data['Close'], window=20)
    stock_data['RSI'] = ta.momentum.rsi(stock_data['Close'], window=14)

    macd = ta.trend.MACD(stock_data['Close'])
    stock_data['MACD'] = macd.macd()
    stock_data['MACD_signal'] = macd.macd_signal()

    return stock_data

In [6]:
def merge_datasets(stock_data: pd.DataFrame, sentiment_data: pd.DataFrame) -> pd.DataFrame:
    """Inner-join prices and sentiment on 'Date', ordered by date.

    Only dates present in both frames survive. 'Date' is used for the join
    and the ordering and is then removed from the returned frame.
    """
    joined = stock_data.merge(sentiment_data, how='inner', on='Date')
    by_date = joined.sort_values(by='Date')
    # The date has served its purpose; downstream code is purely positional.
    return by_date.drop(columns=['Date'])

In [7]:
def split_dataset(data: pd.DataFrame, train_ratio=0.7, val_ratio=0.15):
    """Cut ``data`` into consecutive train / validation / test slices.

    Rows are taken in their existing order: the first ``train_ratio`` share
    becomes the training set, the next ``val_ratio`` share the validation
    set, and whatever is left the test set. Each share is converted to a
    whole number of rows with ``round``. The returned frames are copies.
    """
    n_rows = len(data)
    train_end = round(n_rows * train_ratio)
    val_end = train_end + round(n_rows * val_ratio)

    # Three back-to-back positional windows covering the whole frame.
    windows = (
        slice(None, train_end),
        slice(train_end, val_end),
        slice(val_end, None),
    )
    df_train, df_val, df_test = (data.iloc[window].copy() for window in windows)
    return df_train, df_val, df_test

In [8]:
def make_sequences(data: pd.DataFrame, lag: int = 5):
    """Build sliding windows of features and next-row direction labels.

    Rows are treated as consecutive time steps. Window ``i`` holds rows
    ``i .. i + lag - 1`` of the feature columns. Its label is 1 when the
    'Close' of the row right after the window is strictly higher than the
    'Close' of the window's last row, otherwise 0.

    Args:
        data: Frame containing every column listed in ``features``.
        lag: Number of consecutive rows per window; must be at least 1.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(len(data) - lag, lag, 13)`` and
        ``y`` has shape ``(len(data) - lag,)``. When ``data`` has ``lag`` rows
        or fewer, both are empty but keep those dimensions.

    Raises:
        ValueError: If ``lag`` is smaller than 1.
    """
    features = ['Close', 'High', 'Low', 'Open', 'Volume',
                'SMA_10', 'SMA_20', 'RSI', 'MACD', 'MACD_signal',
                'positive', 'neutral', 'negative']

    if lag < 1:
        raise ValueError("lag must be at least 1")

    # Pull the arrays out once instead of slicing the frame on every step.
    values = data[features].to_numpy()
    closes = data['Close'].to_numpy()

    # A window is only emitted when a following row exists to label it.
    n_windows = max(len(data) - lag, 0)
    if n_windows == 0:
        return np.empty((0, lag, len(features))), np.empty((0,), dtype=int)

    X = np.stack([values[start:start + lag] for start in range(n_windows)])

    # Label i compares row (i + lag - 1) with row (i + lag). Both rows exist
    # for every window, including the last one, so no label is made up.
    y = (closes[lag - 1:-1] < closes[lag:]).astype(int)

    return X, y

In [9]:
def process_ticker(ticker: str, sentiment_filepath: str = "data_sentiment.csv"):
    """Produce the train/validation/test sequence arrays for one ticker.

    Loads the daily sentiment for ``ticker``, downloads prices covering the
    same dates, joins the two, splits the result into three consecutive parts
    and converts each part into windows and labels.

    Returns:
        A 6-tuple ``(train_X, train_y, val_X, val_y, test_X, test_y)``.
    """
    sentiment = load_and_process_sentiment_data(sentiment_filepath, ticker)
    prices = load_and_process_stock_data(ticker, sentiment['Date'])
    parts = split_dataset(merge_datasets(prices, sentiment))

    # Each part is windowed on its own, in train -> val -> test order, and the
    # (X, y) pairs are flattened into a single tuple.
    arrays = []
    for part in parts:
        arrays.extend(make_sequences(part))
    return tuple(arrays)

In [10]:
def main():
    """Build and save the train/validation/test arrays for every ticker.

    For each symbol the pipeline is run via ``process_ticker`` and the six
    resulting arrays are written to ``datasets/<ticker>_data.npz`` under the
    keys train_X, train_y, val_X, val_y, test_X and test_y. The ``datasets``
    folder is created if it does not exist yet.
    """
    import os  # local import: only needed to prepare the output folder

    tickers = ['TSLA', 'MSFT', 'PG', 'META', 'AMZN', 'GOOG', 'AMD', 'AAPL']
    output_dir = "datasets"

    # np.savez_compressed does not create missing parent folders, so without
    # this the first save fails when datasets/ is absent.
    os.makedirs(output_dir, exist_ok=True)

    for ticker in tickers:
        print(f"Processing {ticker}...")

        train_X, train_y, val_X, val_y, test_X, test_y = process_ticker(ticker)

        # Save all datasets for this ticker in one compressed .npz file
        np.savez_compressed(
            f"{output_dir}/{ticker}_data.npz",
            train_X=train_X, train_y=train_y,
            val_X=val_X, val_y=val_y,
            test_X=test_X, test_y=test_y
        )

In [16]:
# Entry point: run the pipeline for every ticker when this cell/script is
# executed directly (not when the code is imported as a module).
if __name__ == "__main__":
    main()

Processing TSLA...


  stock_data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


Processing MSFT...


  stock_data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


Processing PG...


  stock_data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


Processing META...


  stock_data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


Processing AMZN...


  stock_data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


Processing GOOG...


  stock_data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


Processing AMD...


  stock_data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


Processing AAPL...


  stock_data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
