In [1]:
import pandas as pd
import numpy as np
import fastparquet
import pyarrow

In [2]:
# Load the three raw inputs (tweets, news, prices) from parquet files in the
# working directory, in that order.
df_tweet, df_news, df_price = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "english_crypto_tweets_2014_2019.parquet",
        "crypto_news_2013_2018_n_2021_2023.parquet",
        "crypto_prices_2014_2021.parquet",
    )
)

In [3]:
# Inspect column names, dtypes and non-null counts of the tweet table.
df_tweet.info()

In [4]:
# Inspect column names, dtypes and non-null counts of the news table.
df_news.info()

In [5]:
# Inspect column names, dtypes and non-null counts of the price table.
df_price.info()

In [6]:
from datetime import timedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from scipy.stats import spearmanr

In [7]:
# 2) Sentiment scoring
# Single VADER analyzer instance; score_series reads it as a module-level name.
analyzer = SentimentIntensityAnalyzer()

def score_series(df, text_col, out_col, scorer=None):
    """Add a compound-sentiment column to ``df`` and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to score. It is modified in place: ``out_col``
        is created (or overwritten) on it.
    text_col : str
        Name of the column containing the text.
    out_col : str
        Name of the column that receives the compound score.
    scorer : object, optional
        Any object exposing ``polarity_scores(text) -> {"compound": float}``.
        Defaults to the module-level ``analyzer``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``out_col``.

    Notes
    -----
    Values that are not ``str`` (``None``, ``NaN``, numbers) receive ``NaN``
    instead of raising, so rows without text simply drop out of later means
    rather than aborting the whole scoring pass.
    """
    scorer = analyzer if scorer is None else scorer

    def _compound(text):
        # The analyzer iterates over its input, so anything but a string
        # (typically a missing value) would raise a TypeError.
        if not isinstance(text, str):
            return float("nan")
        return scorer.polarity_scores(text)["compound"]

    df[out_col] = df[text_col].map(_compound)
    return df

# Score the "text" column of each corpus. score_series writes the new column
# onto the frame it is given and returns that same frame.
df_tweet = score_series(df_tweet, "text", "tweet_sentiment")
df_news  = score_series(df_news,  "text", "news_sentiment")

In [8]:
# Normalise the tweet "date" column to Python date objects.
#
# Work on a string view of the column: the regex match and the string
# concatenation below then behave the same whatever dtype the column has,
# which also lets this cell be re-run after "date" has already been converted.
raw_dates = df_tweet["date"].astype("string")

# 1) First try to parse full YYYY-MM-DD (and time if present).
#    infer_datetime_format is intentionally not passed: it is deprecated in
#    pandas 2.x and only affected parsing speed in earlier versions.
parsed = pd.to_datetime(raw_dates, errors="coerce")

# 2) Find the ones that failed (NaT) but are exactly 4 digits -> treat as year.
#    na=False keeps the mask strictly boolean when the raw value is missing.
mask_year_only = parsed.isna() & raw_dates.str.fullmatch(r"\d{4}", na=False)

# Build a YYYY-01-01 string for those and parse it with an explicit format.
year_as_date_str = raw_dates[mask_year_only] + "-01-01"
parsed.loc[mask_year_only] = pd.to_datetime(year_as_date_str, format="%Y-%m-%d")

# 3) Assign back as Python dates (unparseable rows become NaT).
df_tweet["date"] = parsed.dt.date

# 4) Drop any rows that still failed to parse.
df_tweet = df_tweet[df_tweet["date"].notna()]

In [9]:
# Spot-check the first rows after sentiment scoring and date normalisation.
df_tweet.head()

In [10]:
# Derive a lowercase "date" column of Python date objects from the price
# table's "Date" column; the later merge joins the tables on "date".
df_price["date"]  = pd.to_datetime(df_price["Date"]).dt.date

In [11]:
# 3) Normalize & aggregate by date
# Only the "year" column is used here; parsing it with format="%Y" yields
# January 1st of that year for every news row.
# NOTE(review): because of that, the later left-merge onto daily price dates can
# only match news sentiment on Jan 1 rows, and every other day gets the fill
# value. Confirm whether a finer-grained news date is available, or whether the
# yearly value should be spread over the whole year.
df_news["date"] = pd.to_datetime(df_news["year"], format="%Y").dt.date

In [15]:
# Daily mean sentiment per source.
# The grouped column already carries its final name, so no rename is needed
# (there is no "sentiment_score" column to rename).
tweet_agg = df_tweet.groupby("date")["tweet_sentiment"].mean().reset_index()
news_agg = df_news.groupby("date")["news_sentiment"].mean().reset_index()

# 4) Merge with price
# Left-join keeps every price row. The result is put in chronological order
# because the shift()/rolling() features computed afterwards are positional.
# NOTE(review): this assumes one price row per date; if the price file holds
# several assets, filter to a single one before this step.
df = (
    df_price[["date", "Close"]]
    .merge(tweet_agg, on="date", how="left")
    .merge(news_agg, on="date", how="left")
    .sort_values("date", kind="stable")
    .reset_index(drop=True)
)

# Days without any tweet/news get a neutral sentiment of 0. Only the sentiment
# columns are filled: a missing Close must stay missing rather than become a
# price of 0, which would produce -100% / infinite returns.
sentiment_cols = ["tweet_sentiment", "news_sentiment"]
df[sentiment_cols] = df[sentiment_cols].fillna(0)

In [16]:
# 4.1 previous close and simple one-day return
df["Close_lag1"] = df["Close"].shift(1)
price_change = df["Close"] - df["Close_lag1"]
df["return_1d"] = price_change / df["Close_lag1"]

# 4.2 rolling windows: mean sentiment per source, then return volatility
for window in (3, 7):
    for source in ("tweet", "news"):
        df[f"{source}_sent_roll{window}"] = (
            df[f"{source}_sentiment"].rolling(window).mean()
        )
    df[f"volatility_{window}d"] = df["return_1d"].rolling(window).std()

# 4.3 simple moving average (SMA) of the close
for window in (5, 10):
    df[f"sma_{window}"] = df["Close"].rolling(window).mean()

# 4.4 RSI (14-day), built from plain rolling means of gains and losses
delta = df["Close"].diff()
gain = delta.clip(lower=0)          # positive moves, zero otherwise
loss = -delta.clip(upper=0)         # magnitude of negative moves, zero otherwise
avg_gain = gain.rolling(14).mean()
avg_loss = loss.rolling(14).mean()
rs = avg_gain / avg_loss
df["rsi_14"] = 100 - (100 / (1 + rs))

In [17]:
# Keep only rows where the sentiment and the listed technical features exist.
corr_required = [
    "tweet_sentiment", "news_sentiment", "tweet_sent_roll3", "sma_5", "rsi_14"
]
corr_df = df.dropna(subset=corr_required)

close_vals = corr_df["Close"]
tweet_vals = corr_df["tweet_sentiment"]
news_vals = corr_df["news_sentiment"]

# Linear (Pearson) and rank (Spearman) correlation of each source with Close.
print("Pearson corr tweet vs Close:", tweet_vals.corr(close_vals))
print("Spearman corr  tweet vs Close:", spearmanr(tweet_vals, close_vals)[0])

print("Pearson corr news vs Close:", news_vals.corr(close_vals))
print("Spearman corr  news vs Close:", spearmanr(news_vals, close_vals)[0])

In [20]:
# Binary target: 1 when the next row's close is higher than this row's close.
df["next_close"] = df["Close"].shift(-1)
df["label"]      = (df["next_close"] > df["Close"]).astype(int)

feature_cols = ["tweet_sentiment","news_sentiment","tweet_sent_roll3","tweet_sent_roll7","news_sent_roll3","news_sent_roll7","sma_5","sma_10","rsi_14","volatility_3d"]

# Drop rows that cannot be used for supervised learning:
#   * the last row has no next close; its comparison above evaluates to False,
#     so "label" is 0 there rather than missing, and filtering on "label"
#     would keep that bogus example, so the filter is on "next_close";
#   * the first rows have NaN rolling/RSI features from window warm-up.
model_df = df.dropna(subset=feature_cols + ["next_close"])

X = model_df[feature_cols]
y = model_df["label"]

# Chronological split (no shuffling): the most recent 20% is the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

In [None]:
# Random forest baseline; class_weight="balanced" reweights the two classes,
# and the fixed seed keeps runs repeatable.
rf_params = {
    "n_estimators": 100,
    "class_weight": "balanced",
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)

rf.fit(X_train, y_train)

In [31]:
from sklearn.base import clone
from sklearn.metrics import roc_curve

# Choose the decision threshold WITHOUT looking at the test labels.
# Tuning it on (y_test, probs) and then scoring the same test set makes the
# reported accuracy optimistic. Instead, an unfitted copy of the forest is
# trained on the first 80% of the training period and the threshold is picked
# on the remaining, most recent 20% of it (chronological, like the main split).
n_fit = int(len(X_train) * 0.8)
X_fit, y_fit = X_train.iloc[:n_fit], y_train.iloc[:n_fit]
X_val, y_val = X_train.iloc[n_fit:], y_train.iloc[n_fit:]

if y_fit.nunique() < 2 or y_val.nunique() < 2:
    # A ROC curve needs both classes; fall back to the conventional cut-off.
    opt_thresh = 0.5
else:
    val_model = clone(rf).fit(X_fit, y_fit)
    val_probs = val_model.predict_proba(X_val)[:, 1]
    val_fpr, val_tpr, val_th = roc_curve(y_val, val_probs)
    # Entry 0 of the thresholds is a sentinel meaning "predict nothing
    # positive" (infinite or above every score), so it is excluded from the
    # search for the point maximising Youden's J = TPR - FPR.
    best_idx = int(np.argmax((val_tpr - val_fpr)[1:])) + 1
    opt_thresh = val_th[best_idx]

# Test-set probabilities from the fully trained forest. The test ROC points
# are kept for reporting/plotting only; they do not influence opt_thresh.
probs = rf.predict_proba(X_test)[:,1]
fpr, tpr, th = roc_curve(y_test, probs)
y_pred = (probs >= opt_thresh).astype(int)

In [35]:
# Test-set report for the random forest: thresholded predictions for accuracy
# and the per-class table, raw class-1 probabilities for ROC AUC.
rf_test_accuracy = accuracy_score(y_test, y_pred)
rf_test_report = classification_report(y_test, y_pred)
rf_test_scores = rf.predict_proba(X_test)[:, 1]

print("RandomForestClassifier")
print("Test Accuracy:", rf_test_accuracy)
print(rf_test_report)
print("Test ROC AUC:", roc_auc_score(y_test, rf_test_scores))

In [27]:
import xgboost as xgb

# Ratio of negative to positive training examples, passed to XGBoost as
# scale_pos_weight.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())

xclf = xgb.XGBClassifier(
    n_estimators=100,
    scale_pos_weight=n_negative / n_positive,
    eval_metric="auc",
    use_label_encoder=False,
    random_state=42,
)
xclf.fit(X_train, y_train)

In [36]:
# Test-set report for the XGBoost classifier.
y_pred_xgb = xclf.predict(X_test)
# Class-1 probabilities must come from xclf itself; scoring the random
# forest's probabilities here would just repeat the RF AUC under an
# "XGBoost" heading.
xgb_test_probs = xclf.predict_proba(X_test)[:,1]

print("XGBoost")
print("Test Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
print("Test ROC AUC:", roc_auc_score(y_test, xgb_test_probs))

In [34]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# Time-ordered 5-split cross-validation of the random forest, scored by
# ROC AUC.
tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(estimator=rf, X=X, y=y, scoring="roc_auc", cv=tscv)
print("CV AUC scores:", scores)

In [40]:
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# 1) Base estimator whose hyperparameters are tuned.
#    It is deliberately NOT named `xgb`: that name is the alias of the xgboost
#    module (`import xgboost as xgb`), and rebinding it to an estimator makes
#    any later or re-run `xgb.XGBClassifier(...)` call fail.
xgb_base = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# 2) Specify the hyperparameter search space
search_spaces = {
    'max_depth': Integer(3, 10),
    'learning_rate': Real(1e-3, 1e-1, prior='log-uniform'),
    'subsample': Real(0.5, 1.0),
    'colsample_bytree': Real(0.5, 1.0),
    'reg_alpha': Real(1e-5, 1e2, prior='log-uniform'),
    'reg_lambda': Real(1e-5, 1e2, prior='log-uniform'),
    'min_child_weight': Integer(1, 10),
    'n_estimators': Integer(50, 300)
}

# 3) Use TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# 4) Set up Bayesian search with BayesSearchCV
bayes_cv = BayesSearchCV(
    estimator=xgb_base,
    search_spaces=search_spaces,
    n_iter=32,                    # number of parameter settings sampled
    scoring='roc_auc',
    cv=tscv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

In [42]:
# 5) Fit the Bayesian search on the training portion only
bayes_cv.fit(X_train, y_train)

# 6) Report the best cross-validated score and the parameters that produced it
best_cv_auc = bayes_cv.best_score_
best_cv_params = bayes_cv.best_params_
print("Best AUC (CV):", best_cv_auc)
print("Best hyperparameters:", best_cv_params)

In [43]:
# 7) Score the tuned model on the held-out test set
best_model = bayes_cv.best_estimator_

# Probability of class 1 for each test row, cut at 0.5 for hard predictions.
y_prob = best_model.predict_proba(X_test)[:, 1]
predicted_up = y_prob >= 0.5
y_pred_bayes_xgb = predicted_up.astype(int)

test_auc = roc_auc_score(y_test, y_prob)
print("Test ROC AUC:", test_auc)
print(classification_report(y_test, y_pred_bayes_xgb))

In [46]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error

# Regression target: the one-row-ahead return of the close.
df["ret_1d"] = df["Close"].pct_change(1)
df["ret_next"] = df["ret_1d"].shift(-1)
reg_df = df.dropna(subset=["ret_next"])

# Same feature set as the classifiers, split chronologically 80/20.
Xr, yr = reg_df[feature_cols], reg_df["ret_next"]
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr, yr, shuffle=False, test_size=0.2
)

# 1) Define & train the baseline regressor
hgb = HistGradientBoostingRegressor(random_state=42, max_iter=100)
hgb.fit(Xr_train, yr_train)

# 2) Evaluate on the held-out tail
yr_pred = hgb.predict(Xr_test)
baseline_rmse = root_mean_squared_error(yr_test, yr_pred)
print("RMSE:", baseline_rmse)

In [48]:
# Bayesian hyperparameter search for the HistGradientBoostingRegressor.
# No `sklearn.experimental.enable_hist_gradient_boosting` import is used: the
# estimator is imported directly from sklearn.ensemble in this notebook, and
# root_mean_squared_error already requires a scikit-learn release in which that
# opt-in is obsolete and only emits a warning.
search_spaces2 = {
    "learning_rate":    Real(1e-3, 1e-1, prior="log-uniform"),
    "max_iter":         Integer(50, 500),
    "max_depth":        Integer(2, 20),
    "min_samples_leaf": Integer(1, 50),
    "l2_regularization":Real(1e-5, 1e1, prior="log-uniform"),
}
# Time-ordered folds for cross-validation.
tscv = TimeSeriesSplit(n_splits=5)
bayes_hgb = BayesSearchCV(
    estimator=HistGradientBoostingRegressor(random_state=42),
    search_spaces=search_spaces2,
    scoring="neg_root_mean_squared_error",  # negated RMSE: higher is better
    cv=tscv,
    n_iter=32,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

# 5) Run the search on the training portion only
bayes_hgb.fit(Xr_train, yr_train)

In [49]:
# The search maximises negated RMSE, so flip the sign back for reporting.
best_hgb_params = bayes_hgb.best_params_
best_cv_rmse = -bayes_hgb.best_score_
print("Best params:", best_hgb_params)
print("Best CV RMSE:", best_cv_rmse)

# 6) Evaluate the tuned regressor on the hold-out test set
best_hgb = bayes_hgb.best_estimator_
yr_pred2 = best_hgb.predict(Xr_test)
rmse_test = root_mean_squared_error(yr_test, yr_pred2)
print("Test RMSE:", rmse_test)

In [55]:
import joblib

# Persist the tuned regressor next to the notebook.
model_path = "best_hgb_model.pkl"
joblib.dump(value=best_hgb, filename=model_path)