In [1]:
import pandas as pd
import matplotlib.dates as mdates

In [2]:
# Load the aggregated fixed-returns CSV into the working DataFrame.
csv_path = r"../Data/Aggregated_Final_Lin_Rakeen_Fixed_returns.csv"
data = pd.read_csv(csv_path)

In [3]:
def calculat_magnitude_metrics(results):
    """Score how often predicted price moves fall in the same dollar bin as actual moves.

    Four columns are added to ``results`` in place:

    * ``daily_price_change_in_dollars`` - row-to-row difference of ``closing_price``.
    * ``magnitude_bins`` - bin label (1-10) of that actual change.
    * ``predicted_price_change_in_dollars`` - ``y_pred`` multiplied by the
      previous row's ``open_price``.
    * ``predicted_magnitude_bins`` - bin label (1-10) of the predicted change.

    Rows where either bin label is missing are left out of the scoring. The
    weighted F1 score and the accuracy of predicted vs. actual bins are printed.

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain ``closing_price``, ``open_price`` and ``y_pred`` columns.

    Returns
    -------
    tuple
        ``(f1, accuracy)`` - the two values that are printed.
    """
    # Local import keeps this cell independent of import cells further down.
    from sklearn.metrics import f1_score

    inf = float("inf")
    # pd.cut uses right-closed intervals by default, so the ten bins are:
    # (-inf, -1320], (-1320, -989], (-989, -659], (-659, -329], (-329, -1],
    # (-1, 330], (330, 660], (660, 990], (990, 1320], (1320, inf]
    bins = [-inf, -1320, -989, -659, -329, -1, 330, 660, 990, 1320, inf]
    labels = list(range(1, 11))

    results["daily_price_change_in_dollars"] = results["closing_price"].diff()
    results["magnitude_bins"] = pd.cut(
        results["daily_price_change_in_dollars"], bins=bins, labels=labels
    )

    results["predicted_price_change_in_dollars"] = results["y_pred"] * results[
        "open_price"
    ].shift(1)
    results["predicted_magnitude_bins"] = pd.cut(
        results["predicted_price_change_in_dollars"], bins=bins, labels=labels
    )

    # Only the two bin columns matter for scoring; NaNs in unrelated columns of
    # the caller's frame must not shrink the evaluated sample.
    scored = results.dropna(subset=["magnitude_bins", "predicted_magnitude_bins"])

    # Weighted F1 of the multi-class bin classification.
    f1 = f1_score(
        scored["magnitude_bins"],
        scored["predicted_magnitude_bins"],
        average="weighted",
    )
    print(f"Magnitude F1 Score: {f1}")

    # Share of rows whose predicted bin equals the actual bin.
    accuracy = (
        scored["magnitude_bins"] == scored["predicted_magnitude_bins"]
    ).sum() / len(scored)
    print(f"Magnitude Accuracy: {accuracy}")

    return f1, accuracy

In [4]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error


def train_evaluate(mdl, X_train, y_train, X_test, y_test):
    """Fit ``mdl`` on the training split and report its hold-out performance.

    Prints the mean squared error of the test predictions and the value of
    ``mdl.score(X_test, y_test)`` (labelled "R Squared").

    Returns
    -------
    tuple
        ``(mdl, results)`` where ``mdl`` is the fitted estimator that was passed
        in and ``results`` is a DataFrame with columns ``y_test``, ``y_pred``
        and ``residuals`` (``y_test - y_pred``).
    """
    mdl.fit(X_train, y_train)
    predictions = mdl.predict(X_test)

    # Hold-out error and goodness of fit.
    print(f"Mean Squared Error: {mean_squared_error(y_test, predictions)}")
    print(f"R Squared: {mdl.score(X_test, y_test)}")

    # Actuals, predictions and their difference side by side.
    results = pd.DataFrame({"y_test": y_test, "y_pred": predictions})
    results["residuals"] = results["y_test"] - results["y_pred"]

    return mdl, results

In [5]:
def calculat_direction_metrics(results):
    """Compute up/down classification metrics from actual and predicted values.

    A value strictly greater than zero counts as direction 1 ("up"); anything
    else, including zero and NaN, counts as direction 0. Two columns are added
    to ``results`` in place: ``actual_direction`` (from ``y_test``) and
    ``Pred_direction`` (from ``y_pred``). The F1 score and accuracy are printed.

    Ratios whose denominator is zero (for example precision when nothing was
    predicted "up") are reported as 0.0 instead of raising ZeroDivisionError.

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain ``y_test`` and ``y_pred`` columns.

    Returns
    -------
    tuple
        ``(tp, fp, fn, tn, precision, recall, f1, accuracy)``.
    """

    def _ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 rather than crashing the run.
        return numerator / denominator if denominator else 0.0

    # Vectorised sign test; same 1/0 encoding as a row-wise `1 if x > 0 else 0`.
    results["actual_direction"] = (results["y_test"] > 0).astype(int)
    results["Pred_direction"] = (results["y_pred"] > 0).astype(int)

    actual_up = results["actual_direction"] == 1
    predicted_up = results["Pred_direction"] == 1

    # Confusion-matrix cells, with "up" as the positive class.
    tp = int((actual_up & predicted_up).sum())
    fp = int((~actual_up & predicted_up).sum())
    fn = int((actual_up & ~predicted_up).sum())
    tn = int((~actual_up & ~predicted_up).sum())

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)

    f1 = _ratio(2 * (precision * recall), precision + recall)
    print(f"Direction F1 Score: {f1}")

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    print(f"Direction Accuracy: {accuracy}")

    return tp, fp, fn, tn, precision, recall, f1, accuracy

In [6]:
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole session.
# NOTE(review): this is a global setting, so it also hides genuine
# chained-assignment mistakes in every later cell.
pd.options.mode.chained_assignment = None

In [7]:
# # make sweetviz report
# import sweetviz as sv

# report = sv.analyze(data)

# # save report
# report.show_html("../EDA/fixed_return_report.html")

In [8]:
# Drop the magnitude and Direction columns.
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
data = data.drop(columns=["magnitude", "Direction"], errors="ignore")
data.head(15)

Unnamed: 0,date_dt,positive_count_bitcoin,negative_count_bitcoin,neutral_count_bitcoin,positive_avg_score_bitcoin,negative_avg_score_bitcoin,neutral_avg_score_bitcoin,total_news_count_bitcoin,total_news_score_bitcoin,signal_bitcoin,...,T5YIE_lag1,T5YIE_lag2,S&P500_Close_lag1,S&P500_Close_lag2,VIX_Close_lag1,VIX_Close_lag2,S&P_Return_lag1,S&P_Return_lag2,gg_trend,Daily Return
0,2021-04-22,1.0,1.0,5.0,0.642198,0.476258,0.73161,7.0,0.165941,0.0,...,2.52,2.56,4134.939941,4163.259766,18.68,17.290001,-0.006802,-0.005307,0.137663,-0.040582
1,2021-04-23,1.0,1.0,5.0,0.642198,0.476258,0.73161,7.0,0.165941,0.0,...,2.53,2.52,4173.419922,4134.939941,17.5,18.68,0.009306,-0.006802,0.137663,-0.013001
2,2021-04-24,1.0,11.0,3.0,0.586267,0.906848,0.899653,15.0,-9.389061,-0.833333,...,2.42,2.53,4134.97998,4173.419922,18.709999,17.5,-0.009211,0.009306,0.137663,-0.02062
3,2021-04-25,1.0,11.0,3.0,0.586267,0.906848,0.899653,15.0,-9.389061,-0.833333,...,2.44,2.42,4180.169922,4134.97998,17.33,18.709999,0.010929,-0.009211,0.018928,-0.021133
4,2021-04-26,1.0,11.0,3.0,0.586267,0.906848,0.899653,15.0,-9.389061,-0.833333,...,2.44,2.44,4180.169922,4180.169922,17.33,17.33,0.0,0.010929,0.018928,0.09748
5,2021-04-27,4.0,0.0,4.0,0.667552,0.0,0.856313,8.0,2.670206,1.0,...,2.44,2.44,4180.169922,4180.169922,17.33,17.33,0.0,0.0,0.018928,0.018548
6,2021-04-28,4.0,0.0,4.0,0.667552,0.0,0.856313,8.0,2.670206,1.0,...,2.47,2.44,4187.620117,4180.169922,17.639999,17.33,0.001782,0.0,0.018928,-0.003794
7,2021-04-29,4.0,0.0,4.0,0.667552,0.0,0.856313,8.0,2.670206,1.0,...,2.51,2.47,4186.720215,4187.620117,17.559999,17.639999,-0.000215,0.001782,0.018928,-0.02343
8,2021-04-30,3.0,1.0,4.0,0.75842,0.778473,0.95634,8.0,1.496786,0.5,...,2.54,2.51,4183.180176,4186.720215,17.280001,17.559999,-0.000846,-0.000215,0.018928,0.075415
9,2021-05-01,3.0,1.0,4.0,0.75842,0.778473,0.95634,8.0,1.496786,0.5,...,2.56,2.54,4211.470215,4183.180176,17.610001,17.280001,0.006763,-0.000846,0.018928,0.001348


**Linear Regression Implementation**

In [9]:
# linear regression model
from sklearn.linear_model import LinearRegression


# Index the frame by date. The guard keeps the cell re-runnable: once
# "date_dt" has become the index it is no longer a column, and an unguarded
# set_index would raise KeyError on a second execution.
if "date_dt" in data.columns:
    data = data.set_index("date_dt")
data.index = pd.to_datetime(data.index)

# Target: 15-row rolling mean of the daily return. The first 14 rows have no
# full window and are removed by dropna (together with any other row holding a
# NaN). Guarded so that re-running the cell does not trim further rows.
if "Daily_Return_rolling_15_avg" not in data.columns:
    data["Daily_Return_rolling_15_avg"] = (
        data["Daily Return"].rolling(window=15).mean()
    )
    data = data.dropna()
data.head(20)

Unnamed: 0_level_0,positive_count_bitcoin,negative_count_bitcoin,neutral_count_bitcoin,positive_avg_score_bitcoin,negative_avg_score_bitcoin,neutral_avg_score_bitcoin,total_news_count_bitcoin,total_news_score_bitcoin,signal_bitcoin,TWITTER_SENTIMENT_DAILY_AVG,...,T5YIE_lag2,S&P500_Close_lag1,S&P500_Close_lag2,VIX_Close_lag1,VIX_Close_lag2,S&P_Return_lag1,S&P_Return_lag2,gg_trend,Daily Return,Daily_Return_rolling_15_avg
date_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-05-06,0.0,1.0,2.0,0.0,0.827074,0.900657,3.0,-0.827074,-1.0,-0.0058,...,2.6,4164.660156,4192.660156,19.48,18.309999,-0.006678,0.002748,0.122346,-0.018055,0.003011
2021-05-07,0.0,1.0,2.0,0.0,0.827074,0.900657,3.0,-0.827074,-1.0,-0.0075,...,2.62,4167.589844,4164.660156,19.15,19.48,0.000703,-0.006678,0.122346,0.016877,0.006842
2021-05-08,0.0,1.0,2.0,0.0,0.827074,0.900657,3.0,-0.827074,-1.0,-0.0228,...,2.68,4201.620117,4167.589844,18.389999,19.15,0.008165,0.000703,0.122346,0.024922,0.00937
2021-05-09,0.0,1.0,2.0,0.0,0.827074,0.900657,3.0,-0.827074,-1.0,-0.0228,...,2.66,4232.600098,4201.620117,16.690001,18.389999,0.007373,0.008165,0.105151,-0.009766,0.010093
2021-05-10,2.0,1.0,0.0,0.869209,0.913061,0.0,3.0,0.825356,0.333333,-0.0228,...,2.65,4232.600098,4232.600098,16.690001,16.690001,0.0,0.007373,0.105151,-0.041596,0.008729
2021-05-11,2.0,1.0,0.0,0.869209,0.913061,0.0,3.0,0.825356,0.333333,0.0048,...,2.65,4232.600098,4232.600098,16.690001,16.690001,0.0,0.0,0.105151,0.01501,0.003231
2021-05-12,2.0,0.0,0.0,0.949838,0.0,0.0,2.0,1.899676,1.0,0.0153,...,2.65,4188.430176,4232.600098,19.66,16.690001,-0.010436,0.0,0.105151,-0.142967,-0.007536
2021-05-13,2.0,0.0,0.0,0.949838,0.0,0.0,2.0,1.899676,1.0,0.013,...,2.71,4152.100098,4188.430176,21.84,19.66,-0.008674,-0.010436,0.105151,0.011443,-0.006521
2021-05-14,1.0,8.0,2.0,0.906305,0.798987,0.774572,11.0,-5.485593,-0.777778,-0.0168,...,2.68,4063.040039,4152.100098,27.59,21.84,-0.021449,-0.008674,0.105151,0.0033,-0.004739
2021-05-15,0.0,1.0,0.0,0.0,0.779942,0.0,1.0,-0.779942,-1.0,0.0148,...,2.72,4112.5,4063.040039,23.129999,27.59,0.012173,-0.021449,0.105151,-0.064599,-0.014073


In [10]:
# Print the index range, column dtypes and non-null counts of the prepared frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1049 entries, 2021-05-06 to 2024-03-19
Data columns (total 67 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   positive_count_bitcoin          1049 non-null   float64
 1   negative_count_bitcoin          1049 non-null   float64
 2   neutral_count_bitcoin           1049 non-null   float64
 3   positive_avg_score_bitcoin      1049 non-null   float64
 4   negative_avg_score_bitcoin      1049 non-null   float64
 5   neutral_avg_score_bitcoin       1049 non-null   float64
 6   total_news_count_bitcoin        1049 non-null   float64
 7   total_news_score_bitcoin        1049 non-null   float64
 8   signal_bitcoin                  1049 non-null   float64
 9   TWITTER_SENTIMENT_DAILY_AVG     1049 non-null   float64
 10  TWITTER_PUBLICATION_COUNT       1049 non-null   float64
 11  TWITTER_NEG_SENTIMENT_COUNT     1049 non-null   float64
 12  TWITTER_POS_SENT

## LR model with base features

In [11]:
# Baseline linear regression: fit on rows before `split_date`, evaluate on rows
# from `split_date` onward, and keep the feature names for later use.
from sklearn.metrics import mean_squared_error
import joblib

# Remove any row that still contains a NaN before selecting features.
data = data.dropna()

# Features chosen based on Pearson correlation
# (the last entry is the target; it is dropped from X just below)
cols = [
    "stoch_%K",
    "stoch_%D",
    "S&P500 Return",
    "rsi",
    "negative_count_bitcoin",
    "positive_count_bitcoin",
    "total_news_score_bitcoin",
    "negative_avg_score_bitcoin",
    "positive_avg_score_bitcoin",
    "signal_bitcoin",
    "gg_trend",
    "Daily_Return_rolling_15_avg",
]
X = data[cols].drop(columns=["Daily_Return_rolling_15_avg"])
# Rolling sum of the news sentiment score over the last 100 rows
# (the first 99 rows have no full window and come out as NaN)
X["total_news_score_lag_100_sum_bit"] = (
    data["total_news_score_bitcoin"].rolling(100).sum()
)
X["total_news_score_lag_100_sum_coin"] = (
    data["total_news_score_coinbase"].rolling(100).sum()
)
# Target: the 15-row rolling mean of the daily return.
y = data["Daily_Return_rolling_15_avg"]

# Drop rows with missing feature values, then keep only the matching targets
X = X.dropna()
y = y[y.index.isin(X.index)]

# Chronological split: rows before split_date train, rows on/after it test.
split_date = "2023-03-01"
X_train = X[X.index < split_date]
X_test = X[X.index >= split_date]
y_train = y[y.index < split_date]
y_test = y[y.index >= split_date]

# fit model
model = LinearRegression()
model.fit(X_train, y_train)

# make predictions
y_pred = model.predict(X_test)

# evaluate model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

r_squared = model.score(X_test, y_test)
print(f"R Squared: {r_squared}")

# Adjusted R squared on the test split: n = number of test rows,
# p = number of features.
n = X_test.shape[0]
p = X_test.shape[1]
adj_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)
print(f"Adjusted R Squared: {adj_r_squared}")

# save model (currently disabled)
# joblib.dump(model, "../Modeling/models/LR_model_ideal.pkl")

# Keep the names of the feature columns used for this baseline.
baseline_features = X.columns

Mean Squared Error: 1.812880543811274e-05
R Squared: 0.5703388500450819
Adjusted R Squared: 0.5552833380520524


In [12]:
# Close/Open prices for the test period, moved up one row with shift(-1): each
# date carries the following row's price and the final row becomes NaN.
daily_close = data.loc[data.index >= split_date, "Close"].shift(-1)
daily_open = data.loc[data.index >= split_date, "Open"].shift(-1)

In [13]:
# Number of consecutive previous rows a predicted direction must hold before a
# buy/sell signal is raised.
SIGNAL_STREAK = 4
# Number of consecutive previous close-to-close declines that also force a sell.
DECLINE_STREAK = 9


def _held_for_previous_rows(series, value, n_rows):
    """Return a boolean Series: True where `series` equalled `value` on each of the previous `n_rows` rows."""
    held = pd.Series(True, index=series.index)
    for lag in range(1, n_rows + 1):
        # Each comparison is evaluated on its own and then AND-ed in. Writing
        # `a == 1 & b` instead would parse as `a == (1 & b)`, because `&`
        # binds tighter than `==`.
        held &= series.shift(lag) == value
    return held


def _fell_for_previous_rows(prices, n_rows):
    """Return a boolean Series: True where `prices` dropped row-over-row on each of the previous `n_rows` rows."""
    falling = pd.Series(True, index=prices.index)
    for lag in range(1, n_rows + 1):
        falling &= prices.shift(lag) < prices.shift(lag + 1)
    return falling


mdl_sets = {
    "LinearRegression": LinearRegression(),
    # Fixed seed so the forest's scores are reproducible from run to run.
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
}

# The chronological split does not depend on the model, so it is done once.
split_date = "2023-03-01"
X_train = X[X.index < split_date]
X_test = X[X.index >= split_date]
y_train = y[y.index < split_date]
y_test = y[y.index >= split_date]

for m_key, m_value in mdl_sets.items():
    print(f"Results for {m_key}, baseline_features:")

    model, results = train_evaluate(
        m_value,
        X_train,
        y_train,
        X_test,
        y_test,
    )
    calculat_direction_metrics(results)

    # Use the predicted moving average to build trading signals:
    # 1 when the predicted value is positive, otherwise 0.
    results["predicted_direction"] = (results["y_pred"] > 0).astype(int)

    # Buy when the predicted direction was 1 on each of the previous 4 rows.
    results["buy"] = _held_for_previous_rows(
        results["predicted_direction"], 1, SIGNAL_STREAK
    )

    # Sell when the predicted direction was 0 on each of the previous 4 rows.
    results["sell"] = _held_for_previous_rows(
        results["predicted_direction"], 0, SIGNAL_STREAK
    )

    results["closing_price"] = daily_close

    results["open_price"] = daily_open

    # Also sell when the closing price fell on each of the previous 9 rows.
    results["sell_2"] = results["sell"] | _fell_for_previous_rows(
        results["closing_price"], DECLINE_STREAK
    )

    results["buy_previous"] = results["buy"].shift(1)
    results["change_buy"] = False
    results["change_sell"] = False
    calculat_magnitude_metrics(results)
    print("\n")

Results for LinearRegression, baseline_features:
Mean Squared Error: 1.812880543811274e-05
R Squared: 0.5703388500450819
Direction F1 Score: 0.8765957446808511
Direction Accuracy: 0.8493506493506493
Magnitude F1 Score: 0.17105296147005072
Magnitude Accuracy: 0.21148825065274152


Results for RandomForestRegressor, baseline_features:
Mean Squared Error: 1.7103081258618475e-05
R Squared: 0.5946489918248344
Direction F1 Score: 0.8930041152263375
Direction Accuracy: 0.8649350649350649
Magnitude F1 Score: 0.16836412725556862
Magnitude Accuracy: 0.21671018276762402


