In [None]:
import random
import pandas as pd
import ydf
import wandb

Train a gradient boosted trees model (one per train/validation fold)

Current best model: num_trees=500, l2_regularization=0.00

In [None]:
# Hyperparameters for this run; also mirrored into the W&B run config below.
FLAGS = dict(num_trees=50,
             loss_fn="MULTINOMIAL_LOG_LIKELIHOOD",
             l2_penalty=0.001)

# Name of the label column; defined once instead of repeating the literal.
label = "DELTA_20_QUINTILES"

wandb.init(
    project = "stonks",
    config = {
        "num_trees": FLAGS['num_trees'],
        "l2_penalty": FLAGS['l2_penalty'],
        "architecture": "GradientBoostedTrees",
    }
)

# One trained model per fold, in fold order.
models = []
try:
    # range(11) covers folds 0..10 inclusive.
    # NOTE(review): this cell reads from covset0/ directly, whereas later
    # cells read from covset0/{subdir}/ — confirm which layout is current.
    for i in range(11):
        # Load training and validation data for this fold
        train_df = pd.read_csv(f"covset0/train_{i}.csv")
        valid_df = pd.read_csv(f"covset0/valid_{i}.csv")

        # Separate features and labels
        X_train = train_df.drop(columns=[label])
        y_train = train_df[label]

        X_valid = valid_df.drop(columns=[label])
        y_valid = valid_df[label]

        # Recombine for YDF; this places the label as the last column.
        train_data = pd.concat([X_train, y_train], axis=1)
        valid_data = pd.concat([X_valid, y_valid], axis=1)

        # Configure the Gradient Boosted Trees learner, then train it.
        gbt_learner = ydf.GradientBoostedTreesLearner(
            task=ydf.Task.CLASSIFICATION,
            label=label,
            l2_regularization=FLAGS['l2_penalty'],
            num_trees=FLAGS['num_trees'],
            loss=FLAGS['loss_fn'],
        )
        # Use the same validation frame for training-time validation and for
        # the evaluation below (the original mixed valid_df and valid_data,
        # which hold the same rows and columns).
        model = gbt_learner.train(train_data, valid=valid_data)

        # Evaluate the model on this fold's validation data
        evaluation = model.evaluate(valid_data)

        models.append(model)

        # Log the per-fold validation loss to W&B
        wandb.log({
            'tv_set': i,
            'Cross_Entropy': evaluation.loss,
        })

        print(f"Completed training and validation for fold {i}")
except BaseException:
    # Close the run even when a fold fails, and mark it as failed so it is
    # not left open or recorded as a success.
    wandb.finish(exit_code=1)
    raise
else:
    # Finalize WandB run
    wandb.finish()

In [None]:
# Predict on the most recently loaded validation frame with the most recently
# trained model. The original called predict_proba(valid): `valid` is never
# defined in this notebook, and the rest of the notebook uses predict() to
# get the model's predictions.
model.predict(valid_df)

Train a Random Forest Model

In [None]:
# Train one Random Forest model per train/validation fold.
norm = False
subdir = 'norm' if norm else 'unnorm'  # which copy of the data to read

# Name of the label column; defined once instead of repeating the literal.
label = "DELTA_20_QUINTILES"

# Despite the name, this list holds the trained models returned by .train().
learners = []
for i in range(11):  # folds 0..10 inclusive
    # Load training and validation data for this fold
    train_file = f"covset0/{subdir}/train_{i}.csv"
    valid_file = f"covset0/{subdir}/valid_{i}.csv"

    train_df = pd.read_csv(train_file)
    valid_df = pd.read_csv(valid_file)

    # Separate features and labels
    X_train = train_df.drop(columns=[label])
    y_train = train_df[label]

    X_valid = valid_df.drop(columns=[label])
    # The original line ended in a stray "dd", a syntax error that kept the
    # whole cell from running.
    y_valid = valid_df[label]

    # Recombine for YDF; this places the label as the last column.
    # valid_data is read by a later evaluation cell.
    train_data = pd.concat([X_train, y_train], axis=1)
    valid_data = pd.concat([X_valid, y_valid], axis=1)

    # Train the Random Forest model (on train_df, as in the original cell)
    learner = ydf.RandomForestLearner(task=ydf.Task.CLASSIFICATION,
                                      label=label).train(train_df)

    learners.append(learner)



In [None]:
# Take the first entry of `learners` (fold 0) as the model the following
# inspection cells operate on.
model = learners[0]

In [None]:
# Exploratory: list the attribute and method names available on the model.
dir(model)

In [None]:
# Display the variable importances reported by the model.
model.variable_importances()

In [None]:
# Display the model's predictions for `valid_df`.
# NOTE(review): `valid_df` is whatever the training loop loaded last (fold 10
# with range(11)), while `model` is learners[0] — confirm this pairing of
# fold-0 model and last-fold data is intended.
model.predict(valid_df)

In [None]:
# Run analyze() for the fold-0 entry of `learners` on `valid_df`. Despite the
# variable name, this stores the result of analyze(), not of evaluate().
evaluation = learners[0].analyze(valid_df)

In [None]:
# Display the current value of `evaluation`.
evaluation

In [None]:
# Evaluate the model on `valid_data` and display the result.
# NOTE(review): `valid_data` comes from the last loop iteration, not
# necessarily from the fold `model` was trained on — confirm.
model.evaluate(valid_data)

Combine all in-sample data into train_data
and all out-of-sample data into valid_data.

In [1]:
import wandb
import ydf
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score

# wandb.init(project='stonks')

# Configuration: pick the normalised or un-normalised copy of each fold.
norm = False
learners = []
subdir = 'norm' if norm else 'unnorm'

# Read every fold's train/validation CSV (folds 0..10) and collect the frames.
tdf = []
vdf = []
for i in range(11):
    train_df = pd.read_csv(f'covset0/{subdir}/train_{i}.csv')
    valid_df = pd.read_csv(f'covset0/{subdir}/valid_{i}.csv')

    tdf.append(train_df)
    vdf.append(valid_df)

# Stack the folds row-wise. Each CSV is read with its own default 0..k index,
# so without ignore_index=True the pooled frames would carry repeated row
# labels; a fresh 0..n-1 index keeps label-based alignment unambiguous.
train_data = pd.concat(tdf, axis=0, ignore_index=True)
valid_data = pd.concat(vdf, axis=0, ignore_index=True)


In [2]:
# Fit a 300-tree Random Forest classifier on the pooled training frame.
# `learner` ends up holding the trained model returned by .train().
label = 'DELTA_20_QUINTILES'
rf_learner = ydf.RandomForestLearner(
    task=ydf.Task.CLASSIFICATION,
    label=label,
    num_trees=300,
)
learner = rf_learner.train(train_data)

Train model on 2772 examples
Model trained in 0:00:01.264144


In [27]:
label = 'DELTA_20_QUINTILES'

# learner = ydf.RandomForestLearner(task=ydf.Task.CLASSIFICATION, label=label, num_trees=10000,
#                                   winner_take_all=False, growing_strategy='BEST_FIRST_GLOBAL').train(train_data)

# Train gradient boosted trees on the pooled data; `learner` holds the
# trained model returned by .train().
# NOTE(review): valid_data is handed to train() as the validation set and is
# also the set scored below, so the accuracies printed here are not measured
# on data the training procedure never saw — confirm this is acceptable.
learner = ydf.GradientBoostedTreesLearner(task=ydf.Task.CLASSIFICATION,
                                        label=label,
                                        l2_regularization=0.000,
                                        growing_strategy='BEST_FIRST_GLOBAL',
                                        num_trees=300,
                                        loss='MULTINOMIAL_LOG_LIKELIHOOD').train(train_data, valid=valid_data)

# One column of predicted probability per label class.
class_names = list(learner.label_classes())
valid_preds = learner.predict(valid_data)
preds = pd.DataFrame(valid_preds, columns=class_names)

# Derive both summary columns from the class columns only. The original ran
# idxmax() after adding 'Probs', so 'Probs' itself took part in the argmax
# and the result was right only because idxmax returns the first maximum.
class_probs = preds[class_names]
preds['Probs'] = class_probs.max(axis=1)
preds['Predicted'] = class_probs.idxmax(axis=1)

# Drop the index on both series so they align by position
true_classes = valid_data["DELTA_20_QUINTILES"].reset_index(drop=True)
predicted_classes = preds['Predicted'].astype(int).reset_index(drop=True)

# Check if lengths match
assert len(true_classes) == len(predicted_classes), "Lengths of true and predicted classes do not match."

# Keep rows where either the true or the predicted class is 1 or 5
filter_mask = (true_classes.isin([1, 5])) | (predicted_classes.isin([1, 5]))

# Apply the filter
filtered_tclass = true_classes[filter_mask]
filtered_pclass = predicted_classes[filter_mask]

# Calculate accuracies: over all rows, and over the class-1/5 subset
total_accuracy = accuracy_score(true_classes, predicted_classes)
onefive_accuracy = accuracy_score(filtered_tclass, filtered_pclass)

print("Accuracy: ", total_accuracy)
print("1/5 accuracy: ", onefive_accuracy)

Train model on 2772 training examples and 660 validation examples
Model trained in 0:00:01.363003
Accuracy:  0.5151515151515151
1/5 accuracy:  0.5128939828080229


In [32]:
import numpy as np

# Penalty charged for each (true class, predicted class) pair: row = true
# class, column = predicted class, both in the order 1..5. A correct
# prediction costs 0.
# NOTE(review): the row for true class 3 charges 1 for the neighbouring
# predictions 2 and 4, while every other row charges 0.5 for a neighbour —
# confirm this asymmetry is intended.
penalty_matrix = np.array([
    [0, 0.5, 1, 1, 1],   # True class is 1
    [0.5, 0, 0.5, 1, 1], # True class is 2
    [1, 1, 0, 1, 1], # True class is 3
    [1, 1, 0.5, 0, 0.5], # True class is 4
    [1, 1, 1, 0.5, 0],   # True class is 5
])

# Convert the true and predicted classes to numpy arrays for indexing
true_classes = valid_data["DELTA_20_QUINTILES"].to_numpy()
predicted_classes = preds['Predicted'].astype(int).to_numpy()

# Look up every pair's penalty in one vectorised step instead of a Python
# loop. Classes are 1-based and the matrix is 0-based, hence the "- 1".
penalties = penalty_matrix[true_classes - 1, predicted_classes - 1]

# Weighted accuracy is one minus the mean penalty
weighted_accuracy = 1 - np.mean(penalties)

# Print the weighted accuracy
print("Coping Accuracy: ", weighted_accuracy)

Coping Accuracy:  0.6522727272727273
