In [36]:
# --- Import stuff
import pandas as pd
import zipfile
import os
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

In [37]:
def unzip_and_load_crypto(zip_folder_path):
    all_crypto = []
    zip_files = glob(os.path.join(zip_folder_path, '*.zip'))

    for zip_file in zip_files:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            extract_path = os.path.splitext(zip_file)[0]
            os.makedirs(extract_path, exist_ok=True)
            zip_ref.extractall(extract_path)

        csv_files = glob(os.path.join(extract_path, '*.csv'))
        for csv in csv_files:
            df = pd.read_csv(csv)
            all_crypto.append(df)

    crypto_df = pd.concat(all_crypto, ignore_index=True)
    return crypto_df


In [38]:
def load_reddit_sentiment(sentiment_folder_path):
    all_reddit = []
    csv_files = glob(os.path.join(sentiment_folder_path, '*.csv'))

    for csv in csv_files:
        if os.path.isfile(csv):  # Fix: only open real files
            df = pd.read_csv(csv)
            all_reddit.append(df)

    reddit_df = pd.concat(all_reddit, ignore_index=True)
    return reddit_df

In [39]:
def preprocess_data(crypto_df, reddit_df):
    crypto_df['timestamp'] = pd.to_datetime(crypto_df['timestamp']) # Converts timestampl colums from string to datetime objs
    reddit_df['timestamp'] = pd.to_datetime(reddit_df['timestamp'])

    crypto_df.set_index('timestamp', inplace=True) # Set timestamp as the index of each DataFrame
    reddit_df.set_index('timestamp', inplace=True)

    # THIS SETS HOUR INTERVALSA, MAY CHANGE
    crypto_hourly = crypto_df['close'].resample('h').last()  # Hour intervals, for now
    reddit_hourly = reddit_df['sentiment'].resample('h').mean()

    # Shift the crypto closing prices 6 hours backwards, for every hour we now have price 6 hours into FUTURE
    future_price = crypto_hourly.shift(-6)  # 6 hours into future

    # This DA DATA, contains everything listed under me
    data = pd.DataFrame({
        'sentiment': reddit_hourly,
        'close_price': crypto_hourly,
        'future_close_price': future_price
    })

    data.dropna(inplace=True) # Drops any rows where data is missing, Can happen if at beggining or end, or if no reddit post synced with kraken data
    
    return data


In [40]:
def create_lagged_features(data, max_lag_hours=12):
    for lag in range(1, max_lag_hours + 1):
        data[f'sentiment_prev_{lag}h'] = data['sentiment'].shift(lag)
        data[f'close_price_prev_{lag}h'] = data['close_price'].shift(lag)

    # Create target: price movement 6 hours later
    data['target'] = (data['future_close_price'] > data['close_price']).astype(int)

    data.dropna(inplace=True)
    return data

In [41]:
def split_data(data):
    feature_cols = [col for col in data.columns if 'sentiment' in col or 'close_price_prev' in col]
    X = data[feature_cols]
    y = data['target']

    train_end = int(len(X) * 0.7)
    valid_end = int(len(X) * 0.85)

    X_train = X.iloc[:train_end]
    y_train = y.iloc[:train_end]

    X_valid = X.iloc[train_end:valid_end]
    y_valid = y.iloc[train_end:valid_end]

    X_test = X.iloc[valid_end:]
    y_test = y.iloc[valid_end:]

    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [42]:
def train_and_evaluate(X_train, X_valid, X_test, y_train, y_valid, y_test):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_valid_scaled = scaler.transform(X_valid)
    X_test_scaled = scaler.transform(X_test)

    print("[INFO] Training SVM model...")
    svm_model = SVC(kernel='rbf')
    svm_model.fit(X_train_scaled, y_train)

    y_test_pred_svm = svm_model.predict(X_test_scaled)

    print("\nSVM Model Test Set Performance:")
    print(classification_report(y_test, y_test_pred_svm))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred_svm))
    print(f"Accuracy: {accuracy_score(y_test, y_test_pred_svm):.4f}")
    print(f"Precision: {precision_score(y_test, y_test_pred_svm):.4f}")
    print(f"Recall: {recall_score(y_test, y_test_pred_svm):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_test_pred_svm):.4f}")

    print("\n[INFO] Training XGBoost model...")
    xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    xgb_model.fit(X_train, y_train)

    y_test_pred_xgb = xgb_model.predict(X_test)

    print("\nXGBoost Model Test Set Performance:")
    print(classification_report(y_test, y_test_pred_xgb))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred_xgb))
    print(f"Accuracy: {accuracy_score(y_test, y_test_pred_xgb):.4f}")
    print(f"Precision: {precision_score(y_test, y_test_pred_xgb):.4f}")
    print(f"Recall: {recall_score(y_test, y_test_pred_xgb):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_test_pred_xgb):.4f}")

    return svm_model, xgb_model


In [43]:
def main():
    crypto_folder = 'data/'  # update this
    reddit_folder = 'data/'   # update this

    print("[INFO] Loading crypto data...")
    crypto_df = unzip_and_load_crypto(crypto_folder)

    print("[INFO] Loading reddit sentiment data...")
    reddit_df = load_reddit_sentiment(reddit_folder)

    print("[INFO] Preprocessing data...")
    data = preprocess_data(crypto_df, reddit_df)

    print("[INFO] Creating lagged features...")
    data = create_lagged_features(data, max_lag_hours=12)

    print("[INFO] Splitting data...")
    X_train, X_valid, X_test, y_train, y_valid, y_test = split_data(data)

    print("[INFO] Training and evaluating model...")
    model = train_and_evaluate(X_train, X_valid, X_test, y_train, y_valid, y_test)

    print("[INFO] Done.")

In [44]:
if __name__ == "__main__":
    main()

[INFO] Loading crypto data...
[INFO] Loading reddit sentiment data...
[INFO] Preprocessing data...
[INFO] Creating lagged features...
[INFO] Splitting data...
[INFO] Training and evaluating model...
[INFO] Training SVM model...

SVM Model Test Set Performance:
              precision    recall  f1-score   support

           0       0.73      0.22      0.34        50
           1       0.30      0.81      0.44        21

    accuracy                           0.39        71
   macro avg       0.52      0.51      0.39        71
weighted avg       0.61      0.39      0.37        71

Confusion Matrix:
[[11 39]
 [ 4 17]]
Accuracy: 0.3944
Precision: 0.3036
Recall: 0.8095
F1 Score: 0.4416

[INFO] Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Model Test Set Performance:
              precision    recall  f1-score   support

           0       0.73      0.32      0.44        50
           1       0.31      0.71      0.43        21

    accuracy                           0.44        71
   macro avg       0.52      0.52      0.44        71
weighted avg       0.60      0.44      0.44        71

Confusion Matrix:
[[16 34]
 [ 6 15]]
Accuracy: 0.4366
Precision: 0.3061
Recall: 0.7143
F1 Score: 0.4286
[INFO] Done.
