In [5]:
import numpy as np
import pandas as pd
import sklearn as sk

# Load raw data
tweets_df = pd.read_csv("../data/raw/elonmusk_raw.csv")
stock_df = pd.read_csv("../data/raw/teslastock_raw.csv")

print(tweets_df.head())
print(stock_df.head())

                    id                                                url  \
0  1655159652990976000  https://x.com/elonmusk/status/1655159652990976000   
1  1657261624867299339  https://x.com/elonmusk/status/1657261624867299339   
2  1623774484795920384  https://x.com/elonmusk/status/1623774484795920384   
3  1656900119202254854  https://x.com/elonmusk/status/1656900119202254854   
4  1616531874763116544  https://x.com/elonmusk/status/1616531874763116544   

                                          twitterUrl  \
0  https://twitter.com/elonmusk/status/1655159652...   
1  https://twitter.com/elonmusk/status/1657261624...   
2  https://twitter.com/elonmusk/status/1623774484...   
3  https://twitter.com/elonmusk/status/1656900119...   
4  https://twitter.com/elonmusk/status/1616531874...   

                                            fullText  retweetCount  \
0  RT @einarvollset: I read @paulg’s  “How to Mak...           NaN   
1                            https://t.co/Zjn6r15lrR        

  tweets_df = pd.read_csv("../data/raw/elonmusk_raw.csv")


In [6]:
# Drop unnecessary columns
inputs = tweets_df[["createdAt", "fullText"]]
labels = stock_df[["Date", "Open", "Close"]]

# Merge datasets on date
inputs["Date"] = pd.to_datetime(inputs["createdAt"]).dt.date
labels["Date"] = pd.to_datetime(labels["Date"]).dt.date
data = pd.merge(inputs, labels, on="Date", how="left")

# create daily change column to be used as continuous label
data["day_change"] = data["Close"] - data["Open"]

# Basic preprocessing - check and remove for NaN values
print(f'Dropping {data["day_change"].isna().sum()} / {len(data)} NaN values')
data = data.dropna(subset=["day_change"])

print("Samples in dataset:", len(data))

Dropping 20733 / 55099 NaN values
Samples in dataset: 34366


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputs["Date"] = pd.to_datetime(inputs["createdAt"]).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels["Date"] = pd.to_datetime(labels["Date"]).dt.date


In [7]:
from sklearn.model_selection import train_test_split

# split it into train, val and test sets
X, y = data[["fullText"]], data["day_change"]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, shuffle=True
)

print(f"Train size: {len(X_train)}, Val size: {len(X_val)}, Test size: {len(X_test)}")

Train size: 24056, Val size: 5155, Test size: 5155


In [8]:
# Let's vectorize the text data - use bag of words for simplicity and baseline
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train["fullText"])
X_val_bow = bow_vectorizer.transform(X_val["fullText"])

X_train_bow.shape  # second dimension gives the vocab size

(24056, 24203)

In [9]:
# Lets finally fit a linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def fit_linear_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)

    # Get metrics
    R2_train = model.score(X_train, y_train)
    MSE_train = mean_squared_error(y_train, model.predict(X_train))

    R2_val = model.score(X_val, y_val)
    MSE_val = mean_squared_error(y_val, model.predict(X_val))

    print(f"Train R2: {R2_train}, Train MSE: {MSE_train}")
    print(f"Validation R2: {R2_val}, Validation MSE: {MSE_val}")


fit_linear_model(LinearRegression(), X_train_bow, y_train, X_val_bow, y_val)

Train R2: 0.7195640785765807, Train MSE: 13.317850195816819
Validation R2: -18.12687994289484, Validation MSE: 888.6973676683656


We can see that the simple Bag-of-words vectorization + linear regression overfits badly (low train error, high validation error). However, we have established a baseline.

To make our baseline slightly better, we can try to reduce our model's complexity by using some regularization: Ridge and Lasso Regression.


In [14]:
# Fit regularized linear models - Ridge and Lasso
from sklearn.linear_model import Lasso, Ridge

# default hyperparameters used
print("=== Ridge Regression ===")
fit_linear_model(Ridge(alpha=1.0), X_train_bow, y_train, X_val_bow, y_val)

print("=== Lasso Regression ===")
fit_linear_model(Lasso(alpha=1.0), X_train_bow, y_train, X_val_bow, y_val)

=== Ridge Regression ===
Train R2: 0.5465154048775713, Train MSE: 21.53589979948549
Validation R2: -0.2468215870331516, Validation MSE: 57.93140677709266
=== Lasso Regression ===
Train R2: 0.0, Train MSE: 47.48981559929589
Validation R2: -0.0007537706181983683, Validation MSE: 46.49829163396627


Regularization seems to have improved our baseline's performance significantly compared to vanilla OLS, but still nowhere near where where we'd like to be. Using Lasso (which is implicitly a form of feature selection as well, since certain parameters will simply go to 0 during training), we've managed to get an R^2 of approximately 0, which is approximately what we would expect if simply using the mean of the training dataset. It's simply better to predict the mean at the moment.
