In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import re
import warnings
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import dask.dataframe as dd
import pandas as pd
nltk.download('vader_lexicon', quiet=True)

# --- Input files ------------------------------------------------------------
tweets_path = "Bitcoin_tweets.csv"
news_path = "cryptonews.csv"

# The user_* columns are forced to "object" so dask does not have to infer a
# numeric/boolean dtype for them; malformed CSV rows are skipped on read.
tweet_dtypes = dict.fromkeys(
    ["user_favourites", "user_friends", "user_followers", "user_verified"],
    "object",
)
news_dtypes = {"title": "object", "text": "object", "date": "object"}

df_tweets = dd.read_csv(
    tweets_path,
    dtype=tweet_dtypes,
    on_bad_lines="skip",
    engine="python",
    blocksize="256MB",
)
df_news = dd.read_csv(
    news_path,
    dtype=news_dtypes,
    on_bad_lines="skip",
    engine="python",
)

# len() on a dask frame triggers a computation, so each print reads its CSV.
print("✅ Tweets loaded:", len(df_tweets))
print("✅ News loaded:", len(df_news))

# Normalise timestamps to day resolution (unparseable values become NaT and
# are dropped), then keep a reproducible random subset of each source.
df_tweets = (
    df_tweets
    .assign(date=dd.to_datetime(df_tweets["date"], errors="coerce").dt.floor("d"))
    .dropna(subset=["date"])
    .sample(frac=0.15, random_state=42)
)
df_news = (
    df_news
    .assign(date=dd.to_datetime(df_news["date"], errors="coerce").dt.floor("d"))
    .dropna(subset=["date"])
    .sample(frac=0.3, random_state=42)
)


✅ Tweets loaded: 4693094
✅ News loaded: 31037


In [None]:
# Give the text columns source-specific names so they stay distinguishable
# after the join. The guards make the cell safe to re-run once renamed.
if "text" in df_tweets.columns:
    df_tweets = df_tweets.rename(columns={"text": "text_tweet"})
if "text" in df_news.columns or "title" in df_news.columns:
    df_news = df_news.rename(columns={"text": "text_news", "title": "title_news"})

print("merging..")

# Lazy dask outer join on the calendar day.
# NOTE(review): "date" is not unique on either side, so every tweet of a day
# is paired with every news row of that day; downstream row counts are counts
# of tweet/news pairs rather than of tweets — confirm this is intended.
tweet_side = df_tweets[["date", "text_tweet"]]
news_side = df_news[["date", "text_news", "title_news"]]
merged = tweet_side.merge(news_side, on="date", how="outer")


🧩 merging tweets/news (dask)...


In [None]:
# Reset the name first so a re-run does not keep the previous result referenced
# while the new sample is being computed.
merged1 = None

# Materialise a reproducible 10% sample of the lazy dask join as a pandas frame.
merged1 = (
    merged[["date", "text_tweet", "text_news", "title_news"]]
    .sample(frac=0.1, random_state=42)
    .compute()
)

# Rows produced by the outer join may lack one side; use "" instead of NaN so
# the concatenation below never yields the literal string "nan".
for c in ("text_tweet", "text_news", "title_news"):
    merged1[c] = merged1[c].fillna("") if c in merged1.columns else ""

# One combined document per row: tweet text, then news title, then news body.
merged1["text_all"] = (
    merged1["text_tweet"].astype(str)
    + " "
    + merged1["title_news"].astype(str)
    + " "
    + merged1["text_news"].astype(str)
).str.strip()

def clean_text(s, keep_words=200):
    """Normalise a raw text value into lowercase, letters-only tokens.

    The value is stringified and lowercased; URLs (``http...``), ``@mentions``
    and ``#`` signs are blanked out, every remaining character that is not
    ``a-z`` or whitespace is replaced by a space, and whitespace runs are
    collapsed.

    Parameters
    ----------
    s : any
        Value to clean; it is passed through ``str()`` first.
    keep_words : int, default 200
        Maximum number of leading tokens to keep.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    text = str(s).lower()
    # Order matters: URLs and mentions must go before the letters-only pass,
    # otherwise their alphabetic fragments would be kept as words.
    for pattern in (r"http\S+", r"@\w+", r"#", r"[^a-z\s]"):
        text = re.sub(pattern, " ", text)
    tokens = re.sub(r"\s+", " ", text).strip().split()
    tokens = tokens[:keep_words] if len(tokens) > keep_words else tokens
    return " ".join(tokens)
# Replace the combined text with its cleaned form, one row at a time.
merged1["text_all"] = merged1["text_all"].map(clean_text)
print("text cleaned")



Trying to compute a sample frac= 0.1
Computed merged_small rows: 555901
Cleaning texts (simple)...


In [None]:
print("Aggregating per-day (pandas)...")
# One row per calendar day: all cleaned texts joined into a single document,
# plus the number of sampled rows that contributed to it.
daily = merged1.groupby("date").agg(
    text_all=("text_all", lambda s: " ".join(filter(None, map(str, s)))),
    tweet_count=("text_all", "count"),
).reset_index()

# Average number of words per contributing row. The previous formula divided
# the word count by (number of spaces + 1), which is the word count again for
# whitespace-normalised text, so the feature was ~1.0 for every day.
daily_words = daily["text_all"].str.split().str.len().fillna(0)
daily["avg_word_count"] = daily_words / daily["tweet_count"].clip(lower=1)

print("Downloading BTC prices from yfinance...")
# Pad the window so rolling features and the next-day label exist at the edges.
start = pd.to_datetime(daily["date"].min()) - pd.Timedelta(days=5)
end = pd.to_datetime(daily["date"].max()) + pd.Timedelta(days=2)
btc = yf.download("BTC-USD", start=start, end=end, progress=False)
if btc.empty:
    # Fail here with a clear message rather than on a confusing KeyError later.
    raise RuntimeError(f"yfinance returned no BTC-USD rows for {start.date()}..{end.date()}")
btc = btc.reset_index()

# yfinance may return (field, ticker) MultiIndex columns; flatten them to
# "field_ticker" strings before any column is addressed by name.
if isinstance(btc.columns, pd.MultiIndex):
    btc.columns = [
        "_".join([str(i) for i in col if (i is not None and str(i) != "")]).strip()
        for col in btc.columns
    ]
btc.columns = [str(c).lower().replace(" ", "_") for c in btc.columns]

if "date" not in btc.columns:
    raise KeyError(f"no 'date' column in yfinance result; got {btc.columns.tolist()}")
btc["date"] = pd.to_datetime(btc["date"]).dt.floor("d")
def find_col(key):
    """Return the name of the ``btc`` column that best matches ``key``.

    Matching is tried in decreasing order of strictness so that, for example,
    ``"close"`` resolves to ``"close"`` or ``"close_btc-usd"`` rather than to
    an ``"adj_close..."`` column that merely appears earlier in the frame:

    1. exact name,
    2. name starting with ``key + "_"`` (flattened ``field_ticker`` names),
    3. any name containing ``key`` (the original behaviour, kept as fallback).

    Parameters
    ----------
    key : str
        Lower-case field name to look for, e.g. ``"close"``.

    Returns
    -------
    The matching column label from the module-level ``btc`` frame, or ``None``
    if no column contains ``key``.
    """
    names = list(btc.columns)
    tiers = (
        lambda name: name == key,
        lambda name: name.startswith(f"{key}_"),
        lambda name: key in name,
    )
    for matches in tiers:
        for c in names:
            if matches(str(c)):
                return c
    return None
# Resolve the actual column names produced by yfinance for each price field.
col_date   = find_col("date")
col_close  = find_col("close")
col_open   = find_col("open")
col_high   = find_col("high")
col_low    = find_col("low")
col_volume = find_col("volume")

needed = {"date":col_date, "close":col_close, "open":col_open, "high":col_high, "low":col_low, "volume":col_volume}
# find_col returns None for a missing field; stop with a readable message
# instead of failing on btc[None] further down.
missing = [role for role, column in needed.items() if column is None]
if missing:
    raise KeyError(f"price frame is missing column(s) {missing}; got {btc.columns.tolist()}")

# Coerce the price fields to numbers (unparseable values become NaN).
for role in ("close", "open", "high", "low", "volume"):
    btc[needed[role]] = pd.to_numeric(btc[needed[role]], errors="coerce")
btc = btc.rename(columns={column: role for role, column in needed.items()})

btc = btc.sort_values("date").reset_index(drop=True)
btc = btc.dropna(subset=["close"]).reset_index(drop=True)

# Label: 1 when the next row's close is higher than this row's close.
# The final row has no successor and is dropped.
btc["next_close"] = btc["close"].shift(-1)
btc = btc.dropna(subset=["next_close"]).reset_index(drop=True)
btc["y"] = (btc["next_close"].values > btc["close"].values).astype(int)

# Price-derived features. .bfill() replaces the deprecated
# fillna(method="bfill"); it fills the leading NaNs of each rolling window
# with the first available value.
btc["return_1d"] = btc["close"].pct_change().fillna(0)
btc["ma_3"] = btc["close"].rolling(3).mean().bfill()
btc["ma_7"] = btc["close"].rolling(7).mean().bfill()
btc["volatility_7d"] = btc["return_1d"].rolling(7).std().fillna(0)
# Intraday range relative to the open; a zero open yields 0 instead of inf.
btc["range"] = ((btc["high"] - btc["low"]) / btc["open"].replace(0, np.nan)).fillna(0)

print("btc rows:", len(btc))
print("columns:", btc.columns.tolist())
print(btc[['date','open','high','low','close','volume','y']].head())

# Keep only days that have both text and a price row.
dataset = pd.merge(daily, btc[["date","open","high","low","close","volume","y","return_1d","ma_3","ma_7","volatility_7d","range"]], on="date", how="inner")
dataset = dataset.sort_values("date").reset_index(drop=True)
dataset = dataset[dataset["text_all"].str.strip().astype(bool)].reset_index(drop=True)
print("dataset rows:", dataset.shape[0])

# NOTE(review): VADER is scored once on the whole day's concatenated text; its
# compound score is intended for short texts and presumably saturates near
# +/-1 on documents this long — consider averaging per-row scores. TODO confirm.
sa = SentimentIntensityAnalyzer()
dataset["sentiment_compound"] = dataset["text_all"].apply(lambda t: sa.polarity_scores(t)["compound"] if isinstance(t,str) and t.strip() else 0.0)




Aggregating per-day (pandas)...
Downloading BTC prices from yfinance...
btc processed rows: 1052
columns now: ['date', 'close', 'high', 'low', 'open', 'volume', 'next_close', 'y', 'return_1d', 'ma_3', 'ma_7', 'volatility_7d', 'range']
        date          open          high           low         close  \
0 2021-01-31  34270.878906  34288.332031  32270.175781  33114.359375   
1 2021-02-01  33114.578125  34638.214844  32384.228516  33537.175781   
2 2021-02-02  33533.199219  35896.882812  33489.218750  35510.289062   
3 2021-02-03  35510.820312  37480.187500  35443.984375  37472.089844   
4 2021-02-04  37475.105469  38592.175781  36317.500000  36926.066406   

        volume  y  
0  52754542671  1  
1  61400400660  1  
2  63088585433  1  
3  61166818159  0  
4  68838074392  1  
Dataset rows after merge: 650
After dropping empty text rows: 650
Computing simple sentiment (VADER) per day...


In [None]:
import numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

# Work on a date-ordered copy so the modelling cells never modify `dataset`
# (sort_values already returns a new frame).
df = dataset.sort_values("date").reset_index(drop=True)
def add_lags_and_rolls(df):
    """Return a copy of ``df`` with lagged and rolling features appended.

    The input is expected to be ordered by date and to contain the columns
    ``return_1d``, ``tweet_count``, ``sentiment_compound``, ``close``,
    ``ma_3`` and ``ma_7``. All shifts are by row, not by calendar day, so a
    gap in the dates makes a "lag 1" value older than one day.

    Added columns
    -------------
    ``return_lag{1,2,3}``, ``tweet_count_lag{1,2,3}``, ``sentiment_lag{1,2,3}``
        The value from 1-3 rows earlier (0 where there is no earlier row).
    ``sentiment_roll3``, ``tweet_count_roll3``, ``return_roll3``
        Mean of the three preceding rows, excluding the current one.
    ``prev_close``, ``prev_return``
        Previous row's close (the first row uses its own close) and the
        relative change from it.
    ``ma_3_diff``, ``ma_7_diff``
        Close minus the corresponding moving average.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame; the argument is not modified.
    """
    df = df.copy()
    for lag in (1,2,3):
        df[f"return_lag{lag}"] = df["return_1d"].shift(lag).fillna(0)
        df[f"tweet_count_lag{lag}"] = df["tweet_count"].shift(lag).fillna(0)
        df[f"sentiment_lag{lag}"] = df["sentiment_compound"].shift(lag).fillna(0)
    # shift(1) after the rolling mean keeps the current row out of its own window.
    df["sentiment_roll3"] = df["sentiment_compound"].rolling(3).mean().shift(1).fillna(0)
    df["tweet_count_roll3"] = df["tweet_count"].rolling(3).mean().shift(1).fillna(0)
    df["return_roll3"] = df["return_1d"].rolling(3).mean().shift(1).fillna(0)
    # .bfill() replaces the deprecated fillna(method="bfill"); it gives the
    # first row its own close, so prev_return starts at 0.
    df["prev_close"] = df["close"].shift(1).bfill()
    df["prev_return"] = (df["close"] - df["prev_close"]) / df["prev_close"]
    df["ma_3_diff"] = df["close"] - df["ma_3"]
    df["ma_7_diff"] = df["close"] - df["ma_7"]
    return df
# Append the lag/rolling feature columns to the modelling frame.
df = df.pipe(add_lags_and_rolls)
def build_features(df, tfidf_max=1000, ngram=(1,2), min_df=2):
    """Build a dense feature matrix from daily text and numeric columns.

    A TF-IDF vectorizer is fitted on ``df["text_all"]`` and a StandardScaler on
    the numeric columns that are present; both are fitted on *every* row of
    ``df``. Pass only training rows if the result is used for evaluation, and
    apply the returned ``vec``/``scaler`` to held-out rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text_all``; numeric feature columns are optional and
        picked up when present.
    tfidf_max : int
        ``max_features`` for the vectorizer.
    ngram : tuple
        ``ngram_range`` for the vectorizer.
    min_df : int
        ``min_df`` for the vectorizer.

    Returns
    -------
    X : numpy.ndarray
        TF-IDF columns followed by the scaled numeric columns.
    vec : TfidfVectorizer
        The fitted vectorizer.
    scaler : StandardScaler
        The fitted scaler.
    present : list of str
        Numeric column names actually used, in matrix order.
    """
    candidate_cols = [
        # same-day price features
        "open", "high", "low", "close", "volume",
        "return_1d", "ma_3", "ma_7", "volatility_7d", "range",
        # same-day text features
        "tweet_count", "avg_word_count", "sentiment_compound",
        # lagged features
        "return_lag1", "return_lag2", "return_lag3",
        "tweet_count_lag1", "tweet_count_lag2", "tweet_count_lag3",
        "sentiment_lag1", "sentiment_lag2", "sentiment_lag3",
        # rolling and derived features
        "sentiment_roll3", "tweet_count_roll3", "return_roll3",
        "prev_return", "ma_3_diff", "ma_7_diff",
    ]
    present = [name for name in candidate_cols if name in df.columns]

    vec = TfidfVectorizer(
        max_features=tfidf_max,
        stop_words="english",
        ngram_range=ngram,
        min_df=min_df,
    )
    tfidf_dense = vec.fit_transform(df["text_all"].astype(str)).toarray()

    # Missing numeric values are treated as 0 before standardisation.
    scaler = StandardScaler()
    numeric_scaled = scaler.fit_transform(df[present].fillna(0).values)

    return np.hstack([tfidf_dense, numeric_scaled]), vec, scaler, present
# Candidate "meaningful up-move" thresholds for the next-day return label.
thresholds = [0.0065, 0.005,0.0055,0.006,0.007,0.0075]

# Label source: the true next-day close from the full price history in `btc`.
# `df` only holds days that also have text, so shifting `close` inside `df`
# would compare against the next *available* day, which may be many days later.
df_thr = (
    df.drop(columns=["next_close"], errors="ignore")
      .merge(btc[["date", "next_close"]], on="date", how="left", validate="many_to_one")
      .dropna(subset=["next_close"])
      .sort_values("date")
      .reset_index(drop=True)
)
next_day_return = ((df_thr["next_close"] - df_thr["close"]) / df_thr["close"]).values

# Chronological 80/20 split. The features do not depend on the threshold, so
# they are built once. TF-IDF and the scaler are fitted on the training rows
# only and then applied to the held-out rows, so no test information leaks in.
n = len(df_thr)
split = int(0.8 * n)
train_rows, test_rows = df_thr.iloc[:split], df_thr.iloc[split:]
X_train, vec, scaler, num_cols_used = build_features(train_rows, tfidf_max=1000, ngram=(1,2), min_df=2)
X_test = np.hstack([
    vec.transform(test_rows["text_all"].astype(str)).toarray(),
    scaler.transform(test_rows[num_cols_used].fillna(0).values),
])
# NOTE(review): inside GridSearchCV the validation folds still share the
# vectorizer/scaler fitted on all training rows; wrapping them in a Pipeline
# would remove that as well.

param_grid ={"n_estimators":[160,180,200,220,240,260,280,300 ,320,340,360,380,400,420,440],
            "max_depth":[3,4,5,6,7,8,9,10,12],
            "min_samples_split":[4,6,7,8,9,10,11,12,13,14,15]}
tscv = TimeSeriesSplit(n_splits=4)

for thr in thresholds:
    # Only the label changes with the threshold.
    y = (next_day_return > thr).astype(int)
    y_train, y_test = y[:split], y[split:]

    rf = RandomForestClassifier(class_weight='balanced_subsample', random_state=42, n_jobs=-1)
    grid = GridSearchCV(rf, param_grid, cv=tscv, scoring="accuracy", n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    best = grid.best_estimator_
    print("Best params:", grid.best_params_, "Best CV:", grid.best_score_)

    y_pred = best.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Final Test acc:", acc)
    print(f"thr={thr:.4f} -> test acc={acc:.4f}")

    # Class balance shifts with the threshold, so accuracies are comparable
    # only against the accuracy of always predicting the majority class.
    positive_rate = float(np.mean(y_test))
    print(f"thr={thr:.4f} -> majority-class baseline={max(positive_rate, 1 - positive_rate):.4f}")


Fitting 4 folds for each of 1485 candidates, totalling 5940 fits
Best params: {'max_depth': 8, 'min_samples_split': 6, 'n_estimators': 180} Best CV: 0.6213592233009709
Final Test acc: 0.6461538461538462
thr=0.0065 -> test acc=0.6462
Fitting 4 folds for each of 1485 candidates, totalling 5940 fits
Best params: {'max_depth': 3, 'min_samples_split': 11, 'n_estimators': 240} Best CV: 0.5898058252427184
Final Test acc: 0.6615384615384615
thr=0.0050 -> test acc=0.6615
Fitting 4 folds for each of 1485 candidates, totalling 5940 fits
Best params: {'max_depth': 10, 'min_samples_split': 8, 'n_estimators': 160} Best CV: 0.5995145631067961
Final Test acc: 0.6384615384615384
thr=0.0055 -> test acc=0.6385
Fitting 4 folds for each of 1485 candidates, totalling 5940 fits
Best params: {'max_depth': 3, 'min_samples_split': 14, 'n_estimators': 220} Best CV: 0.6067961165048543
Final Test acc: 0.6692307692307692
thr=0.0060 -> test acc=0.6692
Fitting 4 folds for each of 1485 candidates, totalling 5940 fits


In [None]:
# Label source: the true next-day close from the full price history in `btc`.
# `df` only holds days that also have text, so shifting `close` inside `df`
# would use the next *available* day, which may be many days later.
df_final = (
    df.drop(columns=["next_close"], errors="ignore")
      .merge(btc[["date", "next_close"]], on="date", how="left", validate="many_to_one")
      .dropna(subset=["next_close"])
      .sort_values("date")
      .reset_index(drop=True)
)
# 1 when the next-day return exceeds 0.75%.
df_final["y"] = ((df_final["next_close"] - df_final["close"]) / df_final["close"] > 0.0075).astype(int)
y = df_final["y"].values

# Chronological 80/20 split.
split = int(0.8 * len(y))
train_rows, test_rows = df_final.iloc[:split], df_final.iloc[split:]
y_train, y_test = y[:split], y[split:]

# build_features fits TF-IDF and the scaler on every row it receives, so it is
# given the training rows only; the fitted objects then transform the test rows.
X_train, vec, scaler, num_cols_used = build_features(train_rows, tfidf_max=1500, ngram=(1,2), min_df=2)
X_test = np.hstack([
    vec.transform(test_rows["text_all"].astype(str)).toarray(),
    scaler.transform(test_rows[num_cols_used].fillna(0).values),
])

rf = RandomForestClassifier(class_weight='balanced_subsample',
                            random_state=42,
                            max_depth=5,
                            min_samples_split=10,
                            n_estimators=180,
                            n_jobs=-1)
rf.fit(X_train , y_train)
y_pred = rf.predict(X_test)

print("Final Test acc:", accuracy_score(y_test, y_pred))
# Accuracy is informative only relative to always predicting the majority class.
positive_rate = float(np.mean(y_test))
print("Majority-class baseline acc:", max(positive_rate, 1 - positive_rate))
print(classification_report(y_test, y_pred))
# Rows are the true class, columns the predicted class.
print(confusion_matrix(y_test, y_pred))


Final Test acc: 0.676923076923077
              precision    recall  f1-score   support

           0       0.69      0.97      0.81        91
           1       0.00      0.00      0.00        39

    accuracy                           0.68       130
   macro avg       0.35      0.48      0.40       130
weighted avg       0.49      0.68      0.57       130

