In [37]:
# --- Import stuff
import pandas as pd
import zipfile
import os
from glob import glob
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import numpy as np
import json
import numpy as np
from datetime import datetime

In [38]:
def unzip_and_load_crypto(zip_folder_path):
    all_crypto = []
    zip_files = glob(os.path.join(zip_folder_path, '*.zip'))

    for zip_file in zip_files:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            extract_path = os.path.splitext(zip_file)[0]
            os.makedirs(extract_path, exist_ok=True)
            zip_ref.extractall(extract_path)

        csv_files = glob(os.path.join(extract_path, '*.csv'))
        for csv in csv_files:
            df = pd.read_csv(csv)
            all_crypto.append(df)

    crypto_df = pd.concat(all_crypto, ignore_index=True)
    return crypto_df


In [39]:
def load_reddit_sentiment(sentiment_folder_path):
    all_reddit = []
    csv_files = glob(os.path.join(sentiment_folder_path, '*.csv'))

    for csv in csv_files:
        if os.path.isfile(csv):  # Fix: only open real files
            df = pd.read_csv(csv)
            all_reddit.append(df)

    reddit_df = pd.concat(all_reddit, ignore_index=True)
    return reddit_df

In [40]:
def preprocess_data(crypto_df, reddit_df):
    crypto_df['timestamp'] = pd.to_datetime(crypto_df['timestamp']) # Converts timestampl colums from string to datetime objs
    reddit_df['timestamp'] = pd.to_datetime(reddit_df['timestamp'])

    crypto_df.set_index('timestamp', inplace=True) # Set timestamp as the index of each DataFrame
    reddit_df.set_index('timestamp', inplace=True)

    # THIS SETS HOUR INTERVALSA, MAY CHANGE
    crypto_hourly = crypto_df['close'].resample('h').last()  # Hour intervals, for now
    reddit_hourly = reddit_df['sentiment'].resample('h').mean()

    # Shift the crypto closing prices 6 hours backwards, for every hour we now have price 6 hours into FUTURE
    future_price = crypto_hourly.shift(-6)  # 6 hours into future

    # This DA DATA, contains everything listed under me
    data = pd.DataFrame({
        'sentiment': reddit_hourly,
        'close_price': crypto_hourly,
        'future_close_price': future_price
    })

    data.dropna(inplace=True) # Drops any rows where data is missing, Can happen if at beggining or end, or if no reddit post synced with kraken data
    
    return data


In [41]:
def create_lagged_features(data, max_lag_hours=12):
    for lag in range(1, max_lag_hours + 1):
        data[f'sentiment_prev_{lag}h'] = data['sentiment'].shift(lag)
        data[f'close_price_prev_{lag}h'] = data['close_price'].shift(lag)

    # Create target: price movement 6 hours later
    data['target'] = (data['future_close_price'] > data['close_price']).astype(int)

    data.dropna(inplace=True)
    return data

In [42]:
def split_data(data):
    feature_cols = [col for col in data.columns if 'sentiment' in col or 'close_price_prev' in col]
    X = data[feature_cols]
    y = data['target']

    train_end = int(len(X) * 0.7)
    valid_end = int(len(X) * 0.85)

    X_train = X.iloc[:train_end]
    y_train = y.iloc[:train_end]

    X_valid = X.iloc[train_end:valid_end]
    y_valid = y.iloc[train_end:valid_end]

    X_test = X.iloc[valid_end:]
    y_test = y.iloc[valid_end:]

    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [43]:
def train_and_evaluate(X_train, X_valid, X_test, y_train, y_valid, y_test, output_file='testresults.json'):
    # Import numpy if not already imported
    import numpy as np
    
    # Scale the features for better model performance
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_valid_scaled = scaler.transform(X_valid)
    X_test_scaled = scaler.transform(X_test)
    
    # Dictionary to store performance metrics
    performance_metrics = {}
    
    # ---- Improved SVM Model ----
    print("[INFO] Training improved SVM model...")
    
    # Use a different kernel (linear instead of rbf)
    # Linear kernels often work better for time-series financial data with many features
    svm_model = SVC(kernel='linear', probability=True, C=1.0, class_weight='balanced')
    svm_model.fit(X_train_scaled, y_train)
    
    # Evaluate on validation set
    svm_valid_pred = svm_model.predict(X_valid_scaled)
    svm_valid_acc = accuracy_score(y_valid, svm_valid_pred)
    print(f"SVM Validation Accuracy: {svm_valid_acc:.4f}")
    
    # Get predictions on test set
    y_test_pred_svm = svm_model.predict(X_test_scaled)
    
    # Store SVM metrics
    performance_metrics['svm'] = {
        'accuracy': accuracy_score(y_test, y_test_pred_svm),
        'precision': precision_score(y_test, y_test_pred_svm),
        'recall': recall_score(y_test, y_test_pred_svm),
        'f1_score': f1_score(y_test, y_test_pred_svm),
        'confusion_matrix': confusion_matrix(y_test, y_test_pred_svm).tolist()
    }
    
    print("\nSVM Model Test Set Performance:")
    print(classification_report(y_test, y_test_pred_svm))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred_svm))
    print(f"Accuracy: {performance_metrics['svm']['accuracy']:.4f}")
    print(f"Precision: {performance_metrics['svm']['precision']:.4f}")
    print(f"Recall: {performance_metrics['svm']['recall']:.4f}")
    print(f"F1 Score: {performance_metrics['svm']['f1_score']:.4f}")
    
    # ---- Random Forest Model ----
    print("\n[INFO] Training Random Forest model...")
    rf_model = RandomForestClassifier(
        n_estimators=100, 
        max_depth=5,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42
    )
    rf_model.fit(X_train, y_train)
    
    # Evaluate on validation set
    rf_valid_pred = rf_model.predict(X_valid)
    rf_valid_acc = accuracy_score(y_valid, rf_valid_pred)
    print(f"Random Forest Validation Accuracy: {rf_valid_acc:.4f}")
    
    # Get predictions on test set
    y_test_pred_rf = rf_model.predict(X_test)
    
    # Store RF metrics
    performance_metrics['random_forest'] = {
        'accuracy': accuracy_score(y_test, y_test_pred_rf),
        'precision': precision_score(y_test, y_test_pred_rf),
        'recall': recall_score(y_test, y_test_pred_rf),
        'f1_score': f1_score(y_test, y_test_pred_rf),
        'confusion_matrix': confusion_matrix(y_test, y_test_pred_rf).tolist()
    }
    
    print("\nRandom Forest Model Test Set Performance:")
    print(classification_report(y_test, y_test_pred_rf))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred_rf))
    print(f"Accuracy: {performance_metrics['random_forest']['accuracy']:.4f}")
    print(f"Precision: {performance_metrics['random_forest']['precision']:.4f}")
    print(f"Recall: {performance_metrics['random_forest']['recall']:.4f}")
    print(f"F1 Score: {performance_metrics['random_forest']['f1_score']:.4f}")
    
    # ---- XGBoost Model ----
    print("\n[INFO] Training tuned XGBoost model...")
    xgb_model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),  # Adjust for class imbalance
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False
    )
    xgb_model.fit(X_train, y_train)
    
    # Evaluate on validation set
    xgb_valid_pred = xgb_model.predict(X_valid)
    xgb_valid_acc = accuracy_score(y_valid, xgb_valid_pred)
    print(f"XGBoost Validation Accuracy: {xgb_valid_acc:.4f}")
    
    # Get predictions on test set
    y_test_pred_xgb = xgb_model.predict(X_test)
    
    # Store XGBoost metrics
    performance_metrics['xgboost'] = {
        'accuracy': accuracy_score(y_test, y_test_pred_xgb),
        'precision': precision_score(y_test, y_test_pred_xgb),
        'recall': recall_score(y_test, y_test_pred_xgb),
        'f1_score': f1_score(y_test, y_test_pred_xgb),
        'confusion_matrix': confusion_matrix(y_test, y_test_pred_xgb).tolist()
    }
    
    print("\nXGBoost Model Test Set Performance:")
    print(classification_report(y_test, y_test_pred_xgb))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred_xgb))
    print(f"Accuracy: {performance_metrics['xgboost']['accuracy']:.4f}")
    print(f"Precision: {performance_metrics['xgboost']['precision']:.4f}")
    print(f"Recall: {performance_metrics['xgboost']['recall']:.4f}")
    print(f"F1 Score: {performance_metrics['xgboost']['f1_score']:.4f}")
    
    # ---- Ensemble approach ----
    print("\n[INFO] Creating ensemble prediction...")
    
    # Create ensemble prediction (majority voting)
    ensemble_pred = np.zeros(len(y_test))
    
    # Get probability predictions
    svm_proba = svm_model.predict_proba(X_test_scaled)[:, 1]
    rf_proba = rf_model.predict_proba(X_test)[:, 1]
    xgb_proba = xgb_model.predict_proba(X_test)[:, 1]
    
    # Weighted average of probabilities (giving more weight to the best performing model)
    # Check which model had best validation accuracy and weight accordingly
    best_val_acc = max(svm_valid_acc, rf_valid_acc, xgb_valid_acc)
    
    if best_val_acc == svm_valid_acc:
        weights = [0.5, 0.25, 0.25]  # SVM gets more weight
    elif best_val_acc == rf_valid_acc:
        weights = [0.25, 0.5, 0.25]  # Random Forest gets more weight
    else:
        weights = [0.25, 0.25, 0.5]  # XGBoost gets more weight
    
    # Calculate weighted probabilities
    ensemble_proba = (
        weights[0] * svm_proba + 
        weights[1] * rf_proba + 
        weights[2] * xgb_proba
    )
    
    # Convert to binary predictions
    ensemble_pred = (ensemble_proba > 0.5).astype(int)
    
    # Store ensemble metrics
    performance_metrics['ensemble'] = {
        'accuracy': accuracy_score(y_test, ensemble_pred),
        'precision': precision_score(y_test, ensemble_pred),
        'recall': recall_score(y_test, ensemble_pred),
        'f1_score': f1_score(y_test, ensemble_pred),
        'confusion_matrix': confusion_matrix(y_test, ensemble_pred).tolist(),
        'weights': weights
    }
    
    print("\nEnsemble Model Test Set Performance:")
    print(classification_report(y_test, ensemble_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, ensemble_pred))
    print(f"Accuracy: {performance_metrics['ensemble']['accuracy']:.4f}")
    print(f"Precision: {performance_metrics['ensemble']['precision']:.4f}")
    print(f"Recall: {performance_metrics['ensemble']['recall']:.4f}")
    print(f"F1 Score: {performance_metrics['ensemble']['f1_score']:.4f}")
    
    # Extract feature importances from Random Forest
    feature_importances = None
    if hasattr(rf_model, 'feature_importances_'):
        print("\nRandom Forest Feature Importance:")
        feature_importances = {}
        for feature, importance in zip(X_train.columns, rf_model.feature_importances_):
            feature_importances[feature] = importance
            print(f"{feature}: {importance:.4f}")
    
    # Save results to file
    save_results_to_file(
        models={'svm': svm_model, 'rf': rf_model, 'xgb': xgb_model},
        performance_metrics=performance_metrics,
        feature_importances=feature_importances,
        filename=output_file
    )
    
    return {
        'svm_model': svm_model, 
        'rf_model': rf_model, 
        'xgb_model': xgb_model,
        'ensemble_weights': weights,
        'scaler': scaler,
        'performance_metrics': performance_metrics
    }

In [44]:
def save_results_to_file(self, models, performance_metrics, feature_importances=None, filename='testresults.json'):
    """
    Save model evaluation results to an external file.
    
    Args:
        models: Dictionary containing trained models
        performance_metrics: Dictionary containing accuracy, precision, recall, and f1 scores
        feature_importances: Optional dictionary containing feature importance
        filename: Name of the output file (default is 'testresults.json')
    
    Returns:
        None
    """
    
    # Convert numpy arrays and other non-serializable objects to lists
    def serialize_item(item):
        if isinstance(item, np.ndarray):
            return item.tolist()
        elif isinstance(item, np.float32) or isinstance(item, np.float64):
            return float(item)
        elif isinstance(item, np.int32) or isinstance(item, np.int64):
            return int(item)
        else:
            return item
    
    # Create results dictionary
    results = {
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'performance_metrics': performance_metrics
    }
    
    # Add feature importances if available
    if feature_importances:
        results['feature_importances'] = {k: serialize_item(v) for k, v in feature_importances.items()}
    
    # Determine file extension and save accordingly
    file_ext = filename.split('.')[-1].lower()
    
    if file_ext == 'json':
        # Save as JSON
        with open(filename, 'w') as f:
            json.dump(results, f, indent=4)
    elif file_ext in ['csv', 'txt']:
        # Save as CSV or text file
        import csv
        with open(filename, 'w', newline='') as f:
            if file_ext == 'csv':
                writer = csv.writer(f)
                # Write headers
                writer.writerow(['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
                # Write data
                for model, metrics in performance_metrics.items():
                    writer.writerow([model, metrics['accuracy'], metrics['precision'], 
                                    metrics['recall'], metrics['f1_score']])
            else:  # text file
                f.write(f"Model Evaluation Results - {results['timestamp']}\n\n")
                for model, metrics in performance_metrics.items():
                    f.write(f"Model: {model}\n")
                    f.write(f"Accuracy: {metrics['accuracy']}\n")
                    f.write(f"Precision: {metrics['precision']}\n")
                    f.write(f"Recall: {metrics['recall']}\n")
                    f.write(f"F1 Score: {metrics['f1_score']}\n\n")
                
                if feature_importances:
                    f.write("Feature Importances:\n")
                    for feature, importance in feature_importances.items():
                        f.write(f"{feature}: {importance}\n")
    else:
        # Default to pickle for other extensions
        import pickle
        with open(filename, 'wb') as f:
            pickle.dump(results, f)
    
    print(f"[INFO] Results saved to {filename}")

In [45]:
def main():
    crypto_folder = 'data/'  # update this
    reddit_folder = 'data/'   # update this
    output_file = 'testresults.json'  # you can change to .csv, .txt, or .pkl

    print("[INFO] Loading crypto data...")
    crypto_df = unzip_and_load_crypto(crypto_folder)

    print("[INFO] Loading reddit sentiment data...")
    reddit_df = load_reddit_sentiment(reddit_folder)

    print("[INFO] Preprocessing data...")
    data = preprocess_data(crypto_df, reddit_df)

    print("[INFO] Creating lagged features...")
    data = create_lagged_features(data, max_lag_hours=12)

    print("[INFO] Splitting data...")
    X_train, X_valid, X_test, y_train, y_valid, y_test = split_data(data)

    print("[INFO] Training and evaluating model...")
    model = train_and_evaluate(X_train, X_valid, X_test, y_train, y_valid, y_test, output_file)

    print("[INFO] Done.")

In [46]:
if __name__ == "__main__":
    main()

[INFO] Loading crypto data...
[INFO] Loading reddit sentiment data...
[INFO] Preprocessing data...
[INFO] Creating lagged features...
[INFO] Splitting data...
[INFO] Training and evaluating model...
[INFO] Training improved SVM model...
SVM Validation Accuracy: 0.6111

SVM Model Test Set Performance:
              precision    recall  f1-score   support

           0       0.81      0.41      0.55        51
           1       0.36      0.77      0.49        22

    accuracy                           0.52        73
   macro avg       0.58      0.59      0.52        73
weighted avg       0.67      0.52      0.53        73

Confusion Matrix:
[[21 30]
 [ 5 17]]
Accuracy: 0.5205
Precision: 0.3617
Recall: 0.7727
F1 Score: 0.4928

[INFO] Training Random Forest model...
Random Forest Validation Accuracy: 0.5278

Random Forest Model Test Set Performance:
              precision    recall  f1-score   support

           0       0.79      0.22      0.34        51
           1       0.32      0.86

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Validation Accuracy: 0.6111

XGBoost Model Test Set Performance:
              precision    recall  f1-score   support

           0       0.76      0.37      0.50        51
           1       0.33      0.73      0.46        22

    accuracy                           0.48        73
   macro avg       0.55      0.55      0.48        73
weighted avg       0.63      0.48      0.49        73

Confusion Matrix:
[[19 32]
 [ 6 16]]
Accuracy: 0.4795
Precision: 0.3333
Recall: 0.7273
F1 Score: 0.4571

[INFO] Creating ensemble prediction...

Ensemble Model Test Set Performance:
              precision    recall  f1-score   support

           0       0.83      0.29      0.43        51
           1       0.35      0.86      0.49        22

    accuracy                           0.47        73
   macro avg       0.59      0.58      0.46        73
weighted avg       0.69      0.47      0.45        73

Confusion Matrix:
[[15 36]
 [ 3 19]]
Accuracy: 0.4658
Precision: 0.3455
Recall: 0.8636
F1 S