# In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit

# Read the two raw training inputs (daily returns and Reddit sentiment posts)
# from CSV files in the current working directory.
returns_df, reddit_df = (
    pd.read_csv(csv_path)
    for csv_path in ('return_train_2017_2021.csv', 'sentiment_train_2017_2021.csv')
)

# Data cleaning functions
def clean_returns_data(df):
    """Return a cleaned copy of the returns table.

    The input frame is not modified. On the copy:

    * 'Date' is parsed with ``pd.to_datetime``.
    * 'Return' is converted from percent strings (e.g. ``'1.5%'``) to
      fractional floats (``0.015``) whenever the column is not already
      numeric.
    * Rows whose 'Return' is missing are dropped.

    Args:
        df: DataFrame with at least the 'Date' and 'Return' columns.

    Returns:
        A new DataFrame with parsed dates and numeric, non-null returns.
    """
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])

    # Test for "not numeric" rather than "dtype == object": percent strings
    # may also arrive in one of pandas' dedicated string dtypes, which an
    # object-dtype comparison silently skips.
    if not pd.api.types.is_numeric_dtype(df['Return']):
        as_text = df['Return'].str.strip().str.rstrip('%')
        df['Return'] = as_text.astype('float') / 100

    return df.dropna(subset=['Return'])

def clean_sentiment_data(df):
    """Return a copy of the posts table with New York timestamps and dates.

    The input frame is not modified. On the copy:

    * 'Received_Time' is parsed, interpreted as UTC when it carries no
      timezone, and converted to 'America/New_York'.
    * 'Effective_Date' is New York midnight of the day the post is
      attributed to: the post's own local date, or the following calendar
      date when the post arrived at or after 16:00 local time (presumably
      the US market close).

    Args:
        df: DataFrame with a 'Received_Time' column.

    Returns:
        A new DataFrame with a tz-aware 'Received_Time' and a tz-aware
        'Effective_Date' that always falls on local midnight.
    """
    df = df.copy()

    # utc=True localizes naive timestamps as UTC and converts already
    # tz-aware ones, instead of raising on the latter.
    received = pd.to_datetime(df['Received_Time'], utc=True).dt.tz_convert('America/New_York')
    df['Received_Time'] = received

    # Do the day arithmetic on naive wall-clock dates. Adding
    # Timedelta(days=1) to a tz-aware midnight adds 24 absolute hours, which
    # on DST-change days lands on 23:00 of the same day or 01:00 of the next
    # one, giving wrong date keys and splitting one day into two groups.
    local_day = received.dt.tz_localize(None).dt.normalize()
    following_day = local_day + pd.Timedelta(days=1)
    effective = local_day.where(received.dt.hour < 16, following_day)

    # Local midnight is never ambiguous or nonexistent in this zone, so
    # re-attaching the timezone is safe.
    df['Effective_Date'] = effective.dt.tz_localize('America/New_York')
    return df

# Replace both raw frames with their cleaned copies before feature engineering.
returns_df, reddit_df = (
    clean_returns_data(returns_df),
    clean_sentiment_data(reddit_df),
)

def calculate_skew(x):
    """Return the sample skewness of ``x``, or 0.0 when it is undefined.

    Missing values are ignored. The result is 0.0 when fewer than three
    observations remain or when all remaining observations are identical;
    in the latter case the variance is zero and ``stats.skew`` would
    otherwise be asked to divide by it (which can yield NaN and a runtime
    warning).

    Args:
        x: One-dimensional array-like of numeric values.

    Returns:
        The skewness as a Python float.
    """
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    if values.size <= 2 or np.ptp(values) == 0:
        return 0.0
    return float(stats.skew(values))

def create_features(df, rolling_windows=(3, 7)):
    """Aggregate post-level sentiment into one feature row per ticker and day.

    Args:
        df: Post-level DataFrame. The 'Ticker', 'Effective_Date' and
            'Sentiment' columns are read; 'Author' is used when present.
        rolling_windows: Window lengths for the trailing means of daily
            sentiment. Each window counts that ticker's rows in the
            aggregated table (days that have at least one post), not
            calendar days.

    Returns:
        DataFrame sorted by ('Ticker', 'Effective_Date') with the columns
        'Daily_Sentiment', 'Sentiment_Std', 'Sentiment_Skew', 'Post_Count',
        'Log_Post_Count', one 'Rolling_Avg_{window}D' per window, and
        'Unique_Author_Count' when 'Author' was available. Missing values
        in the first four are replaced by 0.
    """
    # Per-(ticker, day) aggregations; counting 'Ticker' yields the number of
    # posts in each group.
    agg_spec = {
        'Sentiment': ['mean', 'std', calculate_skew],
        'Ticker': 'count',
    }
    if 'Author' in df.columns:
        agg_spec['Author'] = 'nunique'

    daily_agg = df.groupby(['Ticker', 'Effective_Date']).agg(agg_spec)

    # Flatten the (column, aggregation) MultiIndex into 'column_aggregation'
    # names; the custom aggregator appears under its function name.
    daily_agg.columns = [
        f"{col[0]}_{col[1]}" if isinstance(col, tuple) else col
        for col in daily_agg.columns
    ]
    daily_agg = daily_agg.reset_index()

    rename_map = {
        'Sentiment_mean': 'Daily_Sentiment',
        'Sentiment_std': 'Sentiment_Std',
        'Sentiment_calculate_skew': 'Sentiment_Skew',
        'Ticker_count': 'Post_Count',
    }
    if 'Author_nunique' in daily_agg.columns:
        rename_map['Author_nunique'] = 'Unique_Author_Count'
    daily_agg = daily_agg.rename(columns=rename_map)

    # Guarantee the core columns exist and carry no NaNs (for example, the
    # standard deviation of a single-post day is NaN).
    required = ['Daily_Sentiment', 'Sentiment_Std', 'Sentiment_Skew', 'Post_Count']
    for col in required:
        if col not in daily_agg.columns:
            daily_agg[col] = 0
        daily_agg[col] = daily_agg[col].fillna(0)

    daily_agg['Log_Post_Count'] = np.log1p(daily_agg['Post_Count'])

    # Rolling means depend on chronological order within each ticker.
    daily_agg = daily_agg.sort_values(['Ticker', 'Effective_Date'])

    # min_periods=1 lets the first rows of each ticker average over whatever
    # history exists instead of producing NaN.
    for window in rolling_windows:
        col_name = f'Rolling_Avg_{window}D'
        daily_agg[col_name] = daily_agg.groupby('Ticker')['Daily_Sentiment'].transform(
            lambda x: x.rolling(window, min_periods=1).mean()
        )

    return daily_agg

# Build the per-ticker, per-day sentiment feature table.
features_df = create_features(reddit_df)

# Pair each return row with the features whose Effective_Date is the calendar
# day before the return's Date; string keys sidestep timezone mismatches
# between the two date columns.
# NOTE(review): the one-day offset is in calendar days, so return dates whose
# previous calendar day has no feature row are dropped by the inner join -
# confirm this is the intended alignment.
returns_df['Signal_Date'] = returns_df['Date'] - pd.Timedelta(days=1)
features_df['Date_Key'] = features_df['Effective_Date'].dt.strftime('%Y-%m-%d')
returns_df['Signal_Date_Key'] = returns_df['Signal_Date'].dt.strftime('%Y-%m-%d')

# Inner join keeps only rows that have both features and a target.
merged_df = (
    pd.merge(
        features_df,
        returns_df,
        left_on=['Ticker', 'Date_Key'],
        right_on=['Ticker', 'Signal_Date_Key'],
        how='inner',
    )
    .drop(columns=['Date_Key', 'Signal_Date_Key', 'Signal_Date'])
    .rename(columns={'Return': 'Next_Day_Return'})
    # The merge output follows features_df (ticker-major order), while
    # TimeSeriesSplit below assumes rows are in chronological order.
    .sort_values(['Date', 'Ticker'])
    .reset_index(drop=True)
)

print("NaN values in merged data:", merged_df.isna().sum().sum())

# Chronological hold-out: everything before 2021 trains, 2021 validates.
train_df = merged_df[merged_df['Date'].dt.year < 2021].copy()
val_df = merged_df[merged_df['Date'].dt.year == 2021].copy()

# Model inputs.
# NOTE(review): required_cols was not defined anywhere in this cell; the list
# below is the set of numeric features produced by create_features with its
# default windows - confirm it matches the intended feature set.
required_cols = [
    'Daily_Sentiment',
    'Sentiment_Std',
    'Sentiment_Skew',
    'Log_Post_Count',
    'Rolling_Avg_3D',
    'Rolling_Avg_7D',
]

X_train = train_df[required_cols].fillna(0)
y_train = train_df['Next_Day_Return'].fillna(0)
X_val = val_df[required_cols].fillna(0)
y_val = val_df['Next_Day_Return'].fillna(0)

# Explicit checks (not assert, which is stripped under -O).
if X_train.isna().any().any():
    raise ValueError("Training features contain NaNs")
if y_train.isna().any():
    raise ValueError("Training targets contain NaNs")

# Fit the scaler on the training data only, then apply it to validation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Ridge regression; alpha is chosen over a log grid with forward-chaining
# time-series folds.
ridge_model = RidgeCV(alphas=np.logspace(-3, 3, 13), cv=TimeSeriesSplit(n_splits=5))
ridge_model.fit(X_train_scaled, y_train)

# Evaluation on the 2021 hold-out.
y_val_pred = ridge_model.predict(X_val_scaled)
print(f"Validation MSE: {np.mean((y_val - y_val_pred)**2):.6f}")
print(f"Validation MAE: {np.mean(np.abs(y_val - y_val_pred)):.6f}")
print(f"Validation R²: {ridge_model.score(X_val_scaled, y_val):.4f}")

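# --- Captured cell output (as recorded in this export) ---
# The first line is presumably the source-line echo of a runtime warning
# raised inside calculate_skew; the warning header is missing from the export.
#   return stats.skew(x) if len(x) > 2 else 0
#
# NaN values in merged data: 0
# Validation MSE: 0.001863
# Validation MAE: 0.023118
# Validation R²: -0.0007
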
