In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_auc_score
from xgboost import XGBClassifier

# Load and sort data
df = pd.read_csv('/Users/cyruskurd/Documents/grad_programming/AML/Project work/combined_data_with_y.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp')

# Define maxout parameters
num_days_maxout = 3
threshold = 1.095 ** num_days_maxout

# Feature Engineering
df['SMA_5'] = df['close'].rolling(window=5).mean()
df['SMA_10'] = df['close'].rolling(window=10).mean()
df['SMA_20'] = df['close'].rolling(window=20).mean()
df['Bollinger_Upper'] = df['SMA_20'] + (df['close'].rolling(window=20).std() * 2)
df['Bollinger_Lower'] = df['SMA_20'] - (df['close'].rolling(window=20).std() * 2)
df['EMA_10'] = df['close'].ewm(span=10, adjust=False).mean()

# RSI Calculation
delta = df['close'].diff(1)
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)
avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()
rs = avg_gain / avg_loss
df['RSI'] = 100 - (100 / (1 + rs))

# Additional Indicators
df['ATR'] = df['close'].rolling(window=14).std()
df['Volume_SMA_10'] = df['vol'].rolling(window=10).mean()
df['Volume_Spike'] = (df['vol'] > df['Volume_SMA_10']).astype(int)
df['Rolling_Std_20'] = df['close'].rolling(window=20).std()

# Drop rows with NaN values
df.dropna(inplace=True)

# Define features and target
X = df[['SMA_5', 'SMA_10', 'SMA_20', 'EMA_10', 'Bollinger_Upper', 'Bollinger_Lower', 'RSI', 'ATR', 'Volume_Spike', 'Rolling_Std_20']]
y = df['y']

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test-validation split
split_date = '2020-01-01'

# Convert y to binary labels based on a threshold
binary_threshold = 0.1
y_binary = (y >= binary_threshold).astype(int)

# Redefine the train-test split with the binary labels
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_scaled[df['timestamp'] < split_date],
    y_binary[df['timestamp'] < split_date],
    test_size=0.2, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

# Handle class imbalance using scale_pos_weight
negative_counts = (y_train == 0).sum()
positive_counts = (y_train == 1).sum()
scale_pos_weight = negative_counts / positive_counts

# Train XGBoost classifier
xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42, n_estimators=100,
                          max_depth=5, learning_rate=0.1, subsample=0.8)

xgb_model.fit(X_train, y_train)

# Predictions
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
threshold = 0.5
y_pred = (y_pred_proba >= threshold).astype(int)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"XGBoost Accuracy: {accuracy:.4f}")
print(f"XGBoost F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print(classification_report(y_test, y_pred))


XGBoost Accuracy: 0.5482
XGBoost F1 Score: 0.0796
ROC AUC Score: 0.6380
Confusion Matrix:
 [[1025130  855388]
 [  20663   37875]]
              precision    recall  f1-score   support

           0       0.98      0.55      0.70   1880518
           1       0.04      0.65      0.08     58538

    accuracy                           0.55   1939056
   macro avg       0.51      0.60      0.39   1939056
weighted avg       0.95      0.55      0.68   1939056



In [8]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# function to display the class distribution of each resampled dataset
def print_class_distribution(y, label):
    counter = Counter(y)
    print(f"{label} class distribution: {counter}")

# function to evaluate the model performance
def evaluate_model(model, X_test, y_test, threshold=0.5):
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba >= threshold).astype(int)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print("Confusion Matrix:\n", conf_matrix)
    print(classification_report(y_test, y_pred))
    return accuracy, f1, roc_auc

# Dictionary to store results for each resampling method
results = {}

# Original Model (no resampling, with scale_pos_weight)
print("Original Model:")
xgb_model_original = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42, n_estimators=100,
                                   max_depth=5, learning_rate=0.1, subsample=0.8)
xgb_model_original.fit(X_train, y_train)
results['Original'] = evaluate_model(xgb_model_original, X_test, y_test)

# Oversampling
print("\nOversampling:")
oversampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)
print_class_distribution(y_train_over, "Oversampled Train")
xgb_model_over = XGBClassifier(random_state=42, n_estimators=100, max_depth=5, learning_rate=0.1, subsample=0.8)
xgb_model_over.fit(X_train_over, y_train_over)
results['Oversampling'] = evaluate_model(xgb_model_over, X_test, y_test)

# Undersampling
print("\nUndersampling:")
undersampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)
print_class_distribution(y_train_under, "Undersampled Train")
xgb_model_under = XGBClassifier(random_state=42, n_estimators=100, max_depth=5, learning_rate=0.1, subsample=0.8)
xgb_model_under.fit(X_train_under, y_train_under)
results['Undersampling'] = evaluate_model(xgb_model_under, X_test, y_test)

# SMOTE
print("\nSMOTE:")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print_class_distribution(y_train_smote, "SMOTE Train")
xgb_model_smote = XGBClassifier(random_state=42, n_estimators=100, max_depth=5, learning_rate=0.1, subsample=0.8)
xgb_model_smote.fit(X_train_smote, y_train_smote)
results['SMOTE'] = evaluate_model(xgb_model_smote, X_test, y_test)

# Display summary of results for each method
for method, scores in results.items():
    print(f"\n{method} - Accuracy: {scores[0]:.4f}, F1 Score: {scores[1]:.4f}, ROC AUC: {scores[2]:.4f}")

Original Model:
Accuracy: 0.5482
F1 Score: 0.0796
ROC AUC Score: 0.6380
Confusion Matrix:
 [[1025130  855388]
 [  20663   37875]]
              precision    recall  f1-score   support

           0       0.98      0.55      0.70   1880518
           1       0.04      0.65      0.08     58538

    accuracy                           0.55   1939056
   macro avg       0.51      0.60      0.39   1939056
weighted avg       0.95      0.55      0.68   1939056


Oversampling:
Oversampled Train class distribution: Counter({0: 5642325, 1: 5642325})
Accuracy: 0.5498
F1 Score: 0.0796
ROC AUC Score: 0.6377
Confusion Matrix:
 [[1028297  852221]
 [  20776   37762]]
              precision    recall  f1-score   support

           0       0.98      0.55      0.70   1880518
           1       0.04      0.65      0.08     58538

    accuracy                           0.55   1939056
   macro avg       0.51      0.60      0.39   1939056
weighted avg       0.95      0.55      0.68   1939056


Undersampling: