In [1]:
"""
001_prediction_function.py

Final deliverable with:
- Time-series cross-validation
- Stronger regularization (dropout, weight decay, early stopping)
- Expanded feature set (including volatility measures)
"""

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from datetime import time, timedelta

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def convert_return(x):
    if isinstance(x, str) and x.endswith('%'):
        return float(x[:-1].strip())/100
    return float(x)

def preprocess_sentiment_data(df):
    df = df.copy()
    df['Received_Time'] = pd.to_datetime(df['Received_Time'], utc=True)
    df['Received_Time_EST'] = df['Received_Time'].dt.tz_convert('America/New_York')
    cutoff = time(16,0)
    df['Date'] = df['Received_Time_EST'].apply(
        lambda x: pd.to_datetime(x.date()+timedelta(days=1))
        if x.time()>cutoff else pd.to_datetime(x.date())
    )
    df['Ticker'] = df['Ticker'].str.upper()
    return df

def create_features(df):
    # basic and net sentiment
    agg = df.groupby(['Ticker','Date']).agg(
        sentiment_mean=('Sentiment','mean'),
        sentiment_std=('Sentiment','std'),
        post_count=('Sentiment','count'),
        net_sentiment=('Sentiment','sum'),
        avg_confidence=('Confidence','mean'),
        avg_prob_pos=('Prob_POS','mean'),
        avg_prob_ntr=('Prob_NTR','mean'),
        avg_prob_neg=('Prob_NEG','mean'),
        avg_source_weight=('SourceWeight','mean'),
        avg_topic_weight=('TopicWeight','mean'),
        avg_relevance=('Relevance','mean'),
    ).fillna(0).reset_index()

    # weighted sentiment
    ws = df.groupby(['Ticker','Date']).apply(
        lambda d: (d.Sentiment*d.Relevance).sum()/d.Sentiment.count()
        if d.Sentiment.count()>0 else 0
    ).reset_index(name='weighted_sentiment')
    agg = agg.merge(ws, on=['Ticker','Date'], how='left')

    # time-series per ticker
    def ts_feats(g):
        g = g.sort_values('Date').copy()
        g['cum_sent'] = g.net_sentiment.cumsum()
        g['daily_change'] = g.net_sentiment.diff().fillna(0)
        g['ma_5'] = g.net_sentiment.rolling(5,min_periods=1).mean()
        g['ma_10'] = g.net_sentiment.rolling(10,min_periods=1).mean()
        g['past3'] = g.net_sentiment.shift(1).rolling(3,min_periods=1).sum().fillna(0)
        g['log_vol'] = np.log1p(g.post_count)
        # new volatility features
        g['vol_5'] = g.net_sentiment.rolling(5,min_periods=1).std().fillna(0)
        g['range_5'] = (g.net_sentiment.rolling(5,min_periods=1).max() - 
                        g.net_sentiment.rolling(5,min_periods=1).min()).fillna(0)
        return g

    agg = agg.groupby('Ticker').apply(ts_feats).reset_index(drop=True)

    # topic/source counts (abbreviated hereâ€”expand to full list as before)
    topics = [t.upper() for t in [
        "Biotech","Chart","Commentary","Daily Discussion","Daily Thread","DD","Discussion",
        "Distressed","Earnings Thread","Education","Energy","Fundamentals","Futures","Gain",
        "Help","Industry Report","Interview/Profile","Investor Letter","Long Thesis","Loss",
        "Macro","Meme","Mods","News","None","Options","Profit","Question","Retail","Satire",
        "Shitpost","Short Thesis","Special Situation","Stocks","Storytime","Strategy",
        "TAG ME PLS","Technicals","Thesis","WALL ST. \"LEAKS\"","Weekend Discussion",
        "WSBbooks","YOLO"
    ]]
    for src in ['WSB','INVESTING']:
        mask = df.Source.str.upper().str.contains(src, na=False)
        sub = df[mask]
        cnt = sub.groupby(['Ticker','Date'])['Reddit_Topic']\
                 .apply(lambda s: s.str.upper().isin(topics).value_counts())\
                 .unstack(fill_value=0)
        for t in topics:
            col = f"{src}_count_{t}"
            cnt[col] = cnt.get(t,0)
        cnt = cnt[[f"{src}_count_{t}" for t in topics]].reset_index()
        agg = agg.merge(cnt, on=['Ticker','Date'], how='left').fillna(0)

    return agg

class FeedforwardNet(nn.Module):
    def __init__(self, in_dim, drop=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim,128),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(64,1)
        )
    def forward(self,x): return self.net(x)

def train_model(sentiment_data, return_data):
    sd = preprocess_sentiment_data(sentiment_data)
    feats = create_features(sd)
    rd = return_data.copy()
    rd['Date']=pd.to_datetime(rd['Date']).dt.normalize()
    rd['Ticker']=rd['Ticker'].str.upper()
    rd['Return']=rd['Return'].apply(convert_return)
    data = feats.merge(rd[['Date','Ticker','Return']],on=['Date','Ticker']).dropna(subset=['Return'])
    data = data.sort_values('Date')
    X = data.drop(['Date','Ticker','Return'],axis=1).values
    y = data['Return'].values.reshape(-1,1)

    # cross-validation splits
    tscv = TimeSeriesSplit(n_splits=5)
    best_model, best_scaler, best_loss = None, None, np.inf

    for train_idx,val_idx in tscv.split(X):
        X_tr, X_va = X[train_idx], X[val_idx]
        y_tr, y_va = y[train_idx], y[val_idx]
        scaler = StandardScaler().fit(X_tr)
        X_tr_s, X_va_s = scaler.transform(X_tr), scaler.transform(X_va)

        # tensors
        Xt = torch.tensor(X_tr_s,dtype=torch.float32).to(device)
        yt = torch.tensor(y_tr,dtype=torch.float32).to(device)
        Xv = torch.tensor(X_va_s,dtype=torch.float32).to(device)
        yv = torch.tensor(y_va,dtype=torch.float32).to(device)

        model = FeedforwardNet(Xt.shape[1],drop=0.4).to(device)
        opt = optim.AdamW(model.parameters(),lr=5e-4,weight_decay=1e-3)
        crit = nn.MSELoss()

        # early stopping
        es_best, es_cnt = np.inf, 0
        for ep in range(200):
            model.train(); opt.zero_grad()
            loss = crit(model(Xt),yt)
            loss.backward(); opt.step()

            model.eval()
            with torch.no_grad():
                vloss = crit(model(Xv),yv).item()
            if vloss < es_best:
                es_best, es_cnt = vloss, 0
                best_state = model.state_dict()
            else:
                es_cnt += 1
                if es_cnt>=20: break

        model.load_state_dict(best_state)
        if es_best < best_loss:
            best_loss, best_model, best_scaler = es_best, model, scaler

    # final model on full data
    X_s = best_scaler.transform(X)
    Xt_all = torch.tensor(X_s,dtype=torch.float32).to(device)
    yt_all = torch.tensor(y,dtype=torch.float32).to(device)
    final = FeedforwardNet(X_s.shape[1],drop=0.4).to(device)
    final.load_state_dict(best_model.state_dict())
    opt = optim.AdamW(final.parameters(),lr=2e-4,weight_decay=1e-3)
    for ep in range(50):
        final.train(); opt.zero_grad()
        crit(final(Xt_all),yt_all).backward(); opt.step()

    return {
        'model': final,
        'scaler': best_scaler,
        'feature_columns': data.drop(['Date','Ticker','Return'],axis=1).columns.tolist(),
        'device': device
    }

def predict_returns(model, sentiment_data_today, stock_universe_today):
    """
    Generate predictions of next-day returns for all stocks in the universe.
    
    Parameters
    ----------
    model : dict
        Contains the trained model, scaler, feature columns, and device info.
    sentiment_data_today : DataFrame
        New sentiment data (for a single day).
    stock_universe_today : list
        List of stock tickers available today.
    
    Returns
    -------
    predictions : DataFrame
        A DataFrame with columns ['Ticker', 'Predicted_Return', 'Signal_Rank'].
    """
    # Preprocess today's sentiment data
    sd = preprocess_sentiment_data(sentiment_data_today)
    
    # Determine current date
    if sd.empty:
        today = pd.Timestamp.today().normalize()
    else:
        today = sd['Date'].max()
    
    # Create features up to today
    feats = create_features(sd[sd['Date'] <= today])
    
    # Filter or fill for the universe
    ft = feats[feats['Date'] == today].copy()
    tickers = [t.upper() for t in stock_universe_today]
    
    if ft.empty or 'Ticker' not in ft.columns:
        ft = pd.DataFrame({'Ticker': tickers, 'Date': today})
    else:
        ft = ft[ft['Ticker'].isin(tickers)]
        missing = set(tickers) - set(ft['Ticker'])
        for m in missing:
            ft = ft.append({'Ticker': m, 'Date': today}, ignore_index=True)
    
    # Ensure all feature columns exist
    for col in model['feature_columns']:
        if col not in ft.columns:
            ft[col] = 0
    
    ft = ft.sort_values('Ticker').reset_index(drop=True)
    
    # Scale and predict
    X = ft[model['feature_columns']].fillna(0).values
    Xs = model['scaler'].transform(X)
    Xt = torch.tensor(Xs, dtype=torch.float32).to(model['device'])
    
    net = model['model']
    net.eval()
    with torch.no_grad():
        preds = net(Xt).cpu().numpy().flatten()
    
    # Break ties and rank
    preds += np.random.normal(0, 1e-6, size=preds.shape)
    out = pd.DataFrame({
        'Ticker': ft['Ticker'],
        'Predicted_Return': preds
    })
    out['Signal_Rank'] = out['Predicted_Return'].rank(pct=True)
    
    return out[['Ticker', 'Predicted_Return', 'Signal_Rank']]

####################################
# Test Section (Runs if the script is executed directly)
####################################
if __name__ == "__main__":
    try:
        sentiment_data = pd.read_csv('./data/sentiment_train_2017_2021.csv')
        return_data = pd.read_csv('./data/return_train_2017_2021.csv')
        print("Data loaded successfully.")
    except FileNotFoundError:
        print("Data files not found. Please check file paths.")
        sentiment_data = None
        return_data = None

    if sentiment_data is not None and return_data is not None:
        # Normalize the return_data Date column.
        return_data['Date'] = pd.to_datetime(return_data['Date']).dt.normalize()
        
        model = train_model(sentiment_data, return_data)
        
        sample_day = pd.to_datetime('2021-06-01').normalize()
        preprocessed_sentiment = preprocess_sentiment_data(sentiment_data)
        sentiment_data_today = preprocessed_sentiment[preprocessed_sentiment['Date'] == sample_day].copy()
        
        stock_universe_today = return_data[return_data['Date'] == sample_day]['Ticker'].unique().tolist()
        
        predictions = predict_returns(model, sentiment_data_today=sentiment_data_today, stock_universe_today=stock_universe_today)
        print("Sample predictions:")
        print(predictions.head())

Data loaded successfully.


  ws = df.groupby(['Ticker','Date']).apply(
  agg = agg.groupby('Ticker').apply(ts_feats).reset_index(drop=True)
  ws = df.groupby(['Ticker','Date']).apply(
  agg = agg.groupby('Ticker').apply(ts_feats).reset_index(drop=True)


AttributeError: 'DataFrame' object has no attribute 'append'