In [10]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
from keras.layers import Input, Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Ticker symbols to process. The main loop iterates over this list and uses
# each symbol to build the per-year input CSV path and the per-ticker results
# CSV path.
tickers = ['XLF', 'XLU', 'QQQ', 'SPY', 'XLP', 'EWZ', 'EWH', 'XLY', 'XLE']

In [2]:
def handle_class_imbalance(train_df):
    """
    Handles class imbalance by oversampling the minority classes.

    Every class found in the 'Label' column is brought up to the size of the
    largest class. All original rows are kept; smaller classes are topped up
    with rows drawn from that class with replacement (fixed seed, so the
    result is deterministic for a given input).

    Args:
        train_df (pd.DataFrame): Training data containing a 'Label' column.
            Rows whose label is NaN are dropped by the group-by.

    Returns:
        pd.DataFrame: A shuffled frame in which each class present in the
        input has the same number of rows. Original index labels are
        preserved, so duplicated rows share an index label. An empty input
        yields an empty copy.
    """
    # Nothing to balance; also avoids calling max() on an empty sequence.
    if train_df.empty:
        return train_df.copy()

    # Group by the labels that are actually present instead of assuming the
    # classes are exactly 0, 1 and 2: sampling from an absent (empty) class
    # raises ValueError. sort=True keeps classes in ascending label order.
    class_groups = [group for _, group in train_df.groupby('Label', sort=True)]
    if not class_groups:
        # Every label was NaN, so there is no class to balance.
        return train_df.iloc[0:0].copy()

    # Calculate the size of the largest class
    max_size = max(len(group) for group in class_groups)

    # Keep each class intact and only draw the rows it is missing, so the
    # majority class is never resampled and no original row is lost.
    parts = []
    for group in class_groups:
        parts.append(group)
        deficit = max_size - len(group)
        if deficit > 0:
            parts.append(group.sample(deficit, replace=True, random_state=42))

    # Combine the classes
    balanced_train_df = pd.concat(parts)

    # Shuffle the data (frac=1 returns every row in a random order)
    balanced_train_df = balanced_train_df.sample(frac=1, random_state=42)

    return balanced_train_df

In [5]:
def train_cnn(train_df, test_df, label_encoder, params, model):
    """
    Trains and evaluates a CNN on the given train and test data.

    Args:
        train_df (pd.DataFrame): Training rows. Must contain a 'Label' column
            (passed to to_categorical, so it is expected to hold integer
            class ids). The remaining cells of each row are stacked and
            reshaped into one input_w x input_h image.
        test_df (pd.DataFrame): Test rows with the same layout as train_df.
        label_encoder: Fitted encoder whose classes_ attribute supplies the
            class names used in the report.
        params (dict): Must provide "input_w", "input_h", "num_classes",
            "batch_size" and "epochs".
        model: Compiled Keras model. It is fitted in place, so its weights
            are modified by this call.

    Returns:
        dict: The classification_report output (output_dict=True), keyed by
        class name plus the aggregate rows.
    """
    print("Training is starting ...")

    width, height = params["input_w"], params["input_h"]
    num_classes = params["num_classes"]

    def to_images(frame):
        # Stack the per-column sequences of each row into one 2-D image, then
        # append the single channel dimension that Conv2D expects. The image
        # size comes from params instead of a hard-coded 15 x 15.
        feature_rows = frame.drop(columns=['Label']).values
        images = np.array([np.stack(row).reshape(width, height) for row in feature_rows])
        return images.reshape(len(feature_rows), width, height, 1)

    # Extract features and labels
    X_train = to_images(train_df)
    X_test = to_images(test_df)

    # Encode the labels into categorical (one-hot) values
    y_train = to_categorical(train_df['Label'].values, num_classes=num_classes)
    y_test = to_categorical(test_df['Label'].values, num_classes=num_classes)

    # Train the model. The test set is passed as validation data only to log
    # per-epoch metrics; no callback here uses it to alter training.
    model.fit(X_train, y_train, batch_size=params["batch_size"], epochs=params["epochs"], verbose=1, validation_data=(X_test, y_test))

    # Evaluate the model
    score = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test loss: {score[0]}")
    print(f"Test accuracy: {score[1]}")

    # Generate predictions
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    # Pin the label set explicitly. Without `labels`, the report infers the
    # classes from y_true/y_pred and raises when that count differs from
    # len(target_names), e.g. when a test year lacks one class.
    class_ids = np.arange(len(label_encoder.classes_))
    class_names = [str(name) for name in label_encoder.classes_]

    # Print classification report and confusion matrix
    print("Classification Report:")
    report = classification_report(
        y_true_classes,
        y_pred_classes,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0,  # same value as the default, without the warning
    )
    print(report)

    print("Confusion Matrix:")
    # Same label set so the matrix always has one row/column per class.
    print(confusion_matrix(y_true_classes, y_pred_classes, labels=class_ids))

    return report

In [11]:
# Hyper-parameters shared by every (ticker, year) run. Defined once because
# they do not change between iterations.
params = {
    "input_w": 15,
    "input_h": 15,
    "num_classes": 3,
    "batch_size": 32,
    "epochs": 20
}

# First and last calendar year used as the held-out test year (inclusive).
FIRST_TEST_YEAR = 2019
LAST_TEST_YEAR = 2024

# Directory that receives one results CSV per ticker.
RESULTS_DIR = Path('./results/greyscale')


def build_cnn_model(params):
    """
    Builds and compiles a newly initialised CNN classifier.

    Args:
        params (dict): Must provide "input_w", "input_h" (input image size)
            and "num_classes" (width of the softmax output layer).

    Returns:
        A compiled Keras Sequential model with untrained weights.
    """
    cnn = Sequential()
    cnn.add(Input(shape=(params["input_w"], params["input_h"], 1)))  # Explicitly define the input shape
    cnn.add(Conv2D(32, (3, 3), activation='relu'))
    cnn.add(Conv2D(64, (3, 3), activation='relu'))
    cnn.add(MaxPooling2D(pool_size=(2, 2)))
    cnn.add(Dropout(0.25))
    cnn.add(Flatten())
    cnn.add(Dense(128, activation='relu'))
    cnn.add(Dropout(0.5))
    cnn.add(Dense(params["num_classes"], activation='softmax'))

    # Compile the model
    cnn.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return cnn


# Main loop for processing tickers
for ticker in tickers:

    classification_reports = []

    for year in range(FIRST_TEST_YEAR, LAST_TEST_YEAR + 1):
        df = pd.read_csv(f'./data/test_years/{ticker}_{year}.csv')

        # Convert string representations of lists back to Python lists
        for col in df.columns:
            if col not in ['Date', 'Label']:
                df[col] = df[col].apply(ast.literal_eval)

        # Encode the labels before resampling
        label_encoder = LabelEncoder()
        df['Label'] = label_encoder.fit_transform(df['Label'])

        # Walk-forward split: rows dated before `year` are used for training,
        # rows dated in `year` for testing. The year is parsed once from the
        # first four characters of the Date string.
        row_years = df['Date'].str[:4].astype(int)
        train_df = df[row_years < year].drop(columns=['Date'])
        test_df = df[row_years == year].drop(columns=['Date'])

        # Handle class imbalance in the training set
        train_df = handle_class_imbalance(train_df)

        # Build a new model for every run. Reusing one model instance would
        # carry weights fitted on earlier tickers and later years into this
        # run, leaking future information into the evaluation.
        model = build_cnn_model(params)

        # Train and evaluate the CNN
        report = train_cnn(train_df, test_df, label_encoder, params, model)

        # Add ticker and year to the report, skipping the aggregate rows
        for label, metrics in report.items():
            if label not in ['accuracy', 'micro avg', 'macro avg', 'weighted avg']:
                classification_reports.append({
                    'Ticker': ticker,
                    'Year': year,
                    'Class': label,
                    'Precision': metrics['precision'],
                    'Recall': metrics['recall'],
                    'F1-Score': metrics['f1-score'],
                    'Support': metrics['support']
                })

    # Save classification reports to CSV. Create the directory first so a
    # missing folder does not discard a completed training run.
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    classification_reports_df = pd.DataFrame(classification_reports)
    classification_reports_df.to_csv(RESULTS_DIR / f'results_{ticker}.csv', index=False)

Training is starting ...
Epoch 1/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.5210 - loss: 0.9260 - val_accuracy: 0.7817 - val_loss: 0.5485
Epoch 2/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8350 - loss: 0.4206 - val_accuracy: 0.8571 - val_loss: 0.3355
Epoch 3/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8853 - loss: 0.3201 - val_accuracy: 0.8770 - val_loss: 0.3019
Epoch 4/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9142 - loss: 0.2334 - val_accuracy: 0.8413 - val_loss: 0.3523
Epoch 5/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9333 - loss: 0.1947 - val_accuracy: 0.8095 - val_loss: 0.4091
Epoch 6/20
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9401 - loss: 0.1664 - val_accuracy: 0.8452 - val_loss: 0.35