In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from scipy import stats
import pickle
import warnings
warnings.filterwarnings('ignore')


In [3]:
print("Loading data...")

# Replace these paths with the actual data file paths
sentiment_data_path = "sentiment_train_2017_2021.csv"
return_data_path = "return_train_2017_2021.csv"

# Load the data
sentiment_data = pd.read_csv(sentiment_data_path)
return_data = pd.read_csv(return_data_path)

# Display basic shapes
print("Sentiment data shape:", sentiment_data.shape)
print("Return data shape:", return_data.shape)


Loading data...
Sentiment data shape: (11929999, 21)
Return data shape: (2459589, 5)


In [4]:
def ensure_date_column(df):
    if 'Date' not in df.columns:
        if 'Received_Time' in df.columns:
            df['Date'] = pd.to_datetime(df['Received_Time']).dt.floor('D')
        elif 'Post_Time' in df.columns:
            df['Date'] = pd.to_datetime(df['Post_Time']).dt.floor('D')
        else:
            raise KeyError("No 'Date', 'Received_Time', or 'Post_Time' column found in sentiment data.")
    else:
        df['Date'] = pd.to_datetime(df['Date'])
    return df

sentiment_data = ensure_date_column(sentiment_data)
return_data['Date'] = pd.to_datetime(return_data['Date'])

# Optional: display a sample to verify
print(sentiment_data.head())


                  StoryID                Post_Time        Received_Time  \
0  RR20170101VFVFDB4TGnM=  2017-01-01 00:03:09.000  2017-01-01 00:05:09   
1  RR20170101VFVFDBsVG3M=  2017-01-01 00:06:59.000  2017-01-01 00:08:59   
2  RR20170101VFVFD1RXCXM=  2017-01-01 00:15:36.000  2017-01-01 00:17:36   
3  RR20170101VFVFD1QMCnM=  2017-01-01 00:15:56.000  2017-01-01 00:17:56   
4  RR20170101VFVFDg9QCHM=  2017-01-01 00:45:04.000  2017-01-01 00:47:04   

  Ticker Country          ISIN  Relevance  Sentiment  Confidence  Prob_POS  \
0   HOOD     USA  US7707001027   1.000000          0    0.874448  0.041390   
1    PRI     USA  US74164M1080   1.000000         -1    0.709253  0.059022   
2    BHE     USA  US08160H1014   1.000000          0    0.461974  0.021243   
3   LOGM     USA  US54142L1098   0.703235         -1    0.531746  0.068879   
4    SUN     USA  US86765K1097   0.254271          1    0.212626  0.475079   

   ...  Reddit_Topic  TopicWeight      Alex_Topic          Source  \
0  ...     

In [5]:
def engineer_features(sentiment_data, return_data=None, training=True):
    """
    Engineer features from sentiment data.
    
    Parameters:
    -----------
    sentiment_data : DataFrame
        The Reddit sentiment data
    return_data : DataFrame, optional
        The stock return data (used during training)
    training : bool, default True
        Whether this is for training or prediction
        
    Returns:
    --------
    features_df : DataFrame
        DataFrame with engineered features
    """
    # Ensure the 'Date' column exists
    sentiment_data = ensure_date_column(sentiment_data)
    
    # Copy the data to avoid modifying the original
    sentiment = sentiment_data.copy()
    
    # Group by Date and Ticker
    grouped = sentiment.groupby(['Date', 'Ticker'])
    daily_features = []
    
    for (date, ticker), group in grouped:
        if len(group) < 3:
            continue
        
        # Calculate basic sentiment statistics
        mean_sentiment = group['Sentiment'].mean()
        median_sentiment = group['Sentiment'].median()
        std_sentiment = group['Sentiment'].std() or 0
        min_sentiment = group['Sentiment'].min()
        max_sentiment = group['Sentiment'].max()
        post_count = len(group)
        log_post_count = np.log1p(post_count)
        mean_prob_pos = group['Prob_POS'].mean()
        mean_prob_neg = group['Prob_NEG'].mean()
        mean_prob_ntr = group['Prob_NTR'].mean()
        max_probs = group[['Prob_POS', 'Prob_NEG', 'Prob_NTR']].max(axis=1)
        mean_certainty = max_probs.mean()
        
        def entropy(row):
            probs = [row['Prob_POS'], row['Prob_NEG'], row['Prob_NTR']]
            probs = [p for p in probs if p > 0]
            return -sum(p * np.log(p) for p in probs)
        mean_entropy = group.apply(entropy, axis=1).mean()
        
        unique_authors = group['Author'].nunique()
        author_ratio = unique_authors / post_count
        pos_ratio = (group['Sentiment'] > 0).mean()
        neg_ratio = (group['Sentiment'] < 0).mean()
        ntr_ratio = (group['Sentiment'] == 0).mean()
        sentiment_skew = group['Sentiment'].skew() or 0
        weighted_sentiment = (group['Sentiment'] * group['Prob_POS']).sum() / group['Prob_POS'].sum() \
            if group['Prob_POS'].sum() > 0 else mean_sentiment
        
        features = {
            'Date': date,
            'Ticker': ticker,
            'mean_sentiment': mean_sentiment,
            'median_sentiment': median_sentiment,
            'std_sentiment': std_sentiment,
            'min_sentiment': min_sentiment,
            'max_sentiment': max_sentiment,
            'sentiment_range': max_sentiment - min_sentiment,
            'post_count': post_count,
            'log_post_count': log_post_count,
            'mean_prob_pos': mean_prob_pos,
            'mean_prob_neg': mean_prob_neg,
            'mean_prob_ntr': mean_prob_ntr,
            'mean_certainty': mean_certainty,
            'mean_entropy': mean_entropy,
            'unique_authors': unique_authors,
            'author_ratio': author_ratio,
            'pos_ratio': pos_ratio,
            'neg_ratio': neg_ratio,
            'ntr_ratio': ntr_ratio,
            'sentiment_skew': sentiment_skew,
            'weighted_sentiment': weighted_sentiment
        }
        daily_features.append(features)
    
    features_df = pd.DataFrame(daily_features)
    if features_df.empty:
        return pd.DataFrame()
    
    # Merge with returns if training
    if training and return_data is not None:
        return_data['Date'] = pd.to_datetime(return_data['Date'])
        features_df = pd.merge(
            features_df,
            return_data[['Date', 'Ticker', 'Return']],
            on=['Date', 'Ticker'],
            how='left'
        )
        missing_returns = features_df['Return'].isnull().sum()
        if missing_returns > 0:
            print(f"Warning: {missing_returns} rows have missing returns and will be dropped.")
            features_df = features_df.dropna(subset=['Return'])
    
    return features_df


In [9]:

def train_model(sentiment_data, return_data):
    """
    Train a model using sentiment features to predict next-day returns.
    
    Parameters:
    -----------
    sentiment_data : DataFrame
        The Reddit sentiment data for training (e.g., sentiment_train_2017_2021.csv)
    return_data : DataFrame
        The stock return data for training (e.g., return_train_2017_2021.csv)
        
    Returns:
    --------
    model_info : dict
        Dictionary containing the trained model object and necessary metadata.
    """
    # Ensure that sentiment_data has a 'Date' column.
    def ensure_date_column(df):
        if 'Date' not in df.columns:
            if 'Received_Time' in df.columns:
                df['Date'] = pd.to_datetime(df['Received_Time']).dt.floor('D')
            elif 'Post_Time' in df.columns:
                df['Date'] = pd.to_datetime(df['Post_Time']).dt.floor('D')
            else:
                raise KeyError("No 'Date', 'Received_Time', or 'Post_Time' column found in sentiment data.")
        else:
            df['Date'] = pd.to_datetime(df['Date'])
        return df

    sentiment_data = ensure_date_column(sentiment_data)
    return_data['Date'] = pd.to_datetime(return_data['Date'])
    
    # Use the previously defined engineer_features function
    features_df = engineer_features(sentiment_data, return_data, training=True)
    
    if features_df.empty:
        raise ValueError("No features could be engineered from the provided data.")
    
    # Prepare training data
    X = features_df.drop(['Date', 'Ticker', 'Return'], axis=1)
    y = features_df['Return']
    
    # Fill any remaining missing values with feature means
    X = X.fillna(X.mean())
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    print("Training XGBoost model...")
    model = xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    model.fit(X_scaled, y)
    
    # Package the model along with scaler and feature names
    model_info = {
        'model': model,
        'scaler': scaler,
        'feature_names': X.columns.tolist(),
        'model_type': 'XGBoost'
    }
    
    return model_info


In [6]:
print("Engineering features for training data...")
features_df = engineer_features(sentiment_data, return_data, training=True)
print("Features dataframe shape:", features_df.shape)
print(features_df.head())

Engineering features for training data...
Features dataframe shape: (184284, 23)
         Date Ticker  mean_sentiment  median_sentiment  std_sentiment  \
51 2017-01-03   AAPL       -0.222222              -1.0       0.942809   
52 2017-01-03    AIG       -1.000000              -1.0       0.000000   
53 2017-01-03    AMD        0.369565               1.0       0.798852   
54 2017-01-03   AMGN        0.000000               0.0       1.000000   
55 2017-01-03   AMZN       -0.363636              -1.0       0.924416   

    min_sentiment  max_sentiment  sentiment_range  post_count  log_post_count  \
51             -1              1                2          18        2.944439   
52             -1             -1                0           4        1.609438   
53             -1              1                2          46        3.850148   
54             -1              1                2           3        1.386294   
55             -1              1                2          11        2.4849

In [7]:
# For example, if 'Return' is a percentage string, convert it:
if features_df['Return'].dtype == object:
    features_df['Return'] = features_df['Return'].str.rstrip('%').astype('float') / 100

numeric_features_df = features_df.drop(['Date', 'Ticker'], axis=1)
corr_with_return = numeric_features_df.corr()['Return'].sort_values(ascending=False)
print("Feature correlations with Return:")
print(corr_with_return)


Feature correlations with Return:
Return                1.000000
mean_prob_pos         0.016441
mean_sentiment        0.016027
pos_ratio             0.014095
median_sentiment      0.013822
mean_certainty        0.012993
weighted_sentiment    0.012409
unique_authors        0.011744
post_count            0.010453
max_sentiment         0.005932
log_post_count        0.005897
min_sentiment         0.001979
sentiment_range       0.000729
author_ratio         -0.001694
ntr_ratio            -0.003492
std_sentiment        -0.003508
mean_prob_ntr        -0.005703
sentiment_skew       -0.010114
mean_entropy         -0.013041
neg_ratio            -0.013582
mean_prob_neg        -0.016124
Name: Return, dtype: float64


In [10]:

# Call the train_model function using the sentiment and return data
model_info = train_model(sentiment_data, return_data)
print("Training complete!")
print("Model Information:")
print(model_info)


Training XGBoost model...


ValueError: could not convert string to float: '-0.11%'

In [None]:

# For demonstration, use the most recent date in the sentiment dataset as 'today'
today_date = sentiment_data['Date'].max()
print("Using sample date:", today_date)

# Extract sentiment data corresponding to the most recent date
sentiment_today = sentiment_data[sentiment_data['Date'] == today_date].copy()

# Construct a sample stock universe from the tickers present today
stock_universe_today = sentiment_today['Ticker'].unique().tolist()
print("Number of tickers in today's universe:", len(stock_universe_today))

# Generate predictions using the prediction function
predictions = predict_returns(model_info, sentiment_today, stock_universe_today)
print("Sample predictions:")
print(predictions.head())


In [None]:

print("=== Cell 9: Evaluation and Visualization ===")
# Re-engineer features for the training data (using the same function)
features_train = engineer_features(sentiment_data, return_data, training=True)

# Separate predictor variables (X) and target variable (y)
X_train = features_train.drop(['Date', 'Ticker', 'Return'], axis=1)
y_train = features_train['Return']

# Fill missing values if any
X_train = X_train.fillna(X_train.mean())

# Scale features using the trained scaler from model_info
X_train_scaled = model_info['scaler'].transform(X_train)

# Generate predictions on the training set
y_pred_train = model_info['model'].predict(X_train_scaled)

# Plot Actual vs Predicted Returns
plt.figure(figsize=(8, 6))
plt.scatter(y_train, y_pred_train, alpha=0.5, label="Predictions")
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--', label="Ideal Fit")
plt.xlabel("Actual Return")
plt.ylabel("Predicted Return")
plt.title("Actual vs Predicted Returns on Training Data")
plt.legend()
plt.tight_layout()
plt.show()

# Calculate and print evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
mse_train = mean_squared_error(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
print(f"Training MSE: {mse_train:.6f}")
print(f"Training MAE: {mae_train:.6f}")
print(f"Training RÂ²: {r2_train:.6f}")


In [None]:
print("=== Cell 10: Conclusions and Next Steps ===")
print("Key Insights:")
print("- The model has been trained using engineered sentiment features and shows moderate correlation between actual and predicted returns.")
print("- The scatter plot indicates how closely the model's predictions follow the ideal 45-degree line.")
print("")
print("Next Steps:")
print("1. Experiment with alternative models (e.g., Ridge, Lasso, ensemble methods).")
print("2. Enhance feature engineering by including additional temporal or interaction features.")
print("3. Evaluate the model on validation data with a time-based split to ensure robustness.")
print("4. Test the prediction function with live or out-of-sample data (2022-2024) for further analysis.")