## Training for Random Forest begins here

In [1]:
import wandb
import ydf
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np


# Directory holding the pre-split TOP_5 covariate CSVs; change it here to point
# the notebook at another covariate set.
DATA_DIR = 'covariatesbig'

# Training and validation frames for the TOP_5 classifier. Later cells read the
# 'TOP_5', 'Ticker', 'PCT_CHANGE_20' and 'index' columns from these frames.
train_data = pd.read_csv(f'{DATA_DIR}/train_top_5.csv')
valid_data = pd.read_csv(f'{DATA_DIR}/valid_top_5.csv')


In [2]:
# Remove columns that must not be used as model inputs before training
# (presumably the identifier, the date key and the realised 20-day return --
# TODO confirm against the data dictionary).
# A new frame is bound instead of mutating in place, and errors='ignore' makes
# the cell safe to re-run: a second run no longer raises KeyError because the
# columns are already gone.
train_data = train_data.drop(columns=['Ticker', 'PCT_CHANGE_20', 'index'], errors='ignore')

In [None]:
import numpy as np
# Train a random-forest classifier for the binary TOP_5 label and report its
# accuracy on the validation set at a 0.5 probability cut-off.
label = 'TOP_5'
threshold = 0.5  # predictions >= threshold are counted as class 1

# Note: an earlier experiment used num_trees=10000, winner_take_all=False and
# growing_strategy='BEST_FIRST_GLOBAL'; the current run uses the settings below.
learner = ydf.RandomForestLearner(
    task=ydf.Task.CLASSIFICATION,
    label=label,
    max_depth=100,
    num_trees=100,
).train(train_data)

# Predict on the validation features with the label column removed.
# Assumes predict() returns one class-1 probability per row -- TODO confirm
# against the ydf documentation for binary classification.
valid_preds = learner.predict(valid_data.drop(columns=[label]))
predicted_classes = (valid_preds >= threshold).astype(int)

# Keep hard predictions and their probabilities side by side for later cells.
preds = pd.DataFrame({
    'Predicted': predicted_classes,
    'Probs': valid_preds,
})

# Give both series a fresh 0..n-1 index so the element-wise comparison below
# is positional rather than aligned on whatever index valid_data carries.
true_classes = valid_data[label].reset_index(drop=True)
predicted_classes = preds['Predicted'].reset_index(drop=True)

assert len(true_classes) == len(predicted_classes), "Lengths of true and predicted classes do not match."

# Fraction of validation rows whose thresholded prediction equals the label.
total_accuracy = np.mean(true_classes == predicted_classes)

print("Accuracy: ", total_accuracy)


Train model on 281484 examples


In [2]:
# Location of the previously saved ydf model that the following cells score with.
MODEL_PATH = './models/1k_top25'
model = ydf.load_model(MODEL_PATH)

In [3]:
# From here on `learner` refers to the model loaded from disk, replacing any
# learner trained earlier in the session.
learner = model
covpred = pd.read_csv('covariatesbig/covpred_top_5.csv')

# Score every row of covpred and threshold the scores at 0.5.
# Assumes predict() yields the class-1 probability per row -- TODO confirm.
threshold = 0.5
crossval_preds = learner.predict(covpred)
predicted_classes = (crossval_preds >= threshold).astype(int)

# Collect hard predictions, probabilities and dates in one frame.
preds = pd.DataFrame(
    {
        'Predicted': predicted_classes,
        'Probs': crossval_preds,
        "Dates": covpred["Dates"],
    }
)
# Align on covpred's index so the column assignments below match row for row.
preds.index = covpred.index

# Attach the predictions to covpred itself, then keep a compact view of them.
covpred['Predicted'] = preds['Predicted']
covpred['Confidence'] = preds['Probs']
crossvalpred_data = covpred[['Ticker', 'Predicted', "Confidence", "Dates"]]

# Display only the rows predicted as class 1.
crossvalpred_data.loc[crossvalpred_data["Predicted"].eq(1)]


Unnamed: 0,Ticker,Predicted,Confidence,Dates
1746,SMGR,1,0.508333,2024-06-12
1748,SMGR,1,0.535,2024-06-14
1749,SMGR,1,0.53,2024-06-17
1750,SMGR,1,0.530833,2024-06-18
1751,SMGR,1,0.5325,2024-06-19
1752,SMGR,1,0.529166,2024-06-20
1753,SMGR,1,0.5175,2024-06-21
1754,SMGR,1,0.514166,2024-06-24
1755,SMGR,1,0.5425,2024-06-25
1756,SMGR,1,0.545,2024-06-26


In [None]:
# Run ydf's model analysis on the validation set. The original argument `vali`
# is not defined anywhere in the notebook (NameError); `valid_data` is the
# validation frame loaded at the top.
learner.analyze(valid_data)

In [None]:
# Sanity check: the prediction arrays and the validation frame should have
# matching lengths.
for _name, _size in (
    ('pred_classes', predicted_classes.shape),
    ('valid_preds', len(valid_preds)),
    ('valid_data', len(valid_data)),
):
    print(_name, _size)

In [None]:
# Score the validation set and build a per-row table of prediction, probability,
# date key, ticker and realised return.
# Assumes predict() yields the class-1 probability per row -- TODO confirm.
threshold = 0.5
valid_preds = learner.predict(valid_data.drop(columns=['TOP_5']))
predicted_classes = (valid_preds >= threshold).astype(int)

preds = pd.DataFrame({
    'Predicted': predicted_classes,
    'Probs': valid_preds,
    'Dates': valid_data["index"],
    'Ticker': valid_data['Ticker'],
    'Return': valid_data['PCT_CHANGE_20'],
})

# Explicit copy: a column is added to this frame right below, and writing to a
# slice of `preds` would raise SettingWithCopyWarning.
crossval_data = preds.loc[:, ["Dates", 'Ticker', 'Predicted', 'Probs', 'Return']].copy()

# 'index' = 'Dates' with its last '-'-separated component removed
# (presumably 'YYYY-MM-DD' -> 'YYYY-MM'; verify the actual key format).
crossval_data['index'] = crossval_data['Dates'].apply(lambda x: '-'.join(x.split('-')[:-1]))

# Rows predicted as class 1. Copied so later cells can add columns to it
# without writing to a view of crossval_data.
positive_preds = crossval_data[crossval_data["Predicted"] == 1].copy()

In [None]:
import pandas as pd

# Summarise realised returns of the positive predictions by probability bucket.

# Step 1: bucket the predicted probability (in percent) into 5-point bins.
# Edges are 50, 55, ..., 105, so with right=False the bins are
# [50-55), [55-60), ..., [100-105). The final bin is required: without the 105
# edge a probability of exactly 1.0 falls outside every bin and is dropped
# from the statistics as NaN.
bins = list(range(50, 110, 5))
labels = [f'{i}-{i+5}' for i in bins[:-1]]  # '50-55', '55-60', ..., '100-105'

# assign() returns a new frame, so this neither writes to a slice of
# crossval_data (SettingWithCopyWarning) nor breaks when the cell is re-run.
positive_preds = positive_preds.assign(
    Prob_Bin=pd.cut(positive_preds['Probs'] * 100, bins=bins, labels=labels, right=False)
)

# Step 2: per-bin return statistics. observed=False keeps empty bins in the
# table (the historical default) and avoids relying on an implicit default for
# categorical group keys.
stats_df = (
    positive_preds
    .groupby('Prob_Bin', observed=False)['Return']
    .agg(['mean', 'median', 'min', 'max', 'std'])
    .reset_index()
)

stats_df

In [None]:
# List the columns currently present on the positive-prediction frame.
positive_preds.columns

In [None]:
# Variable importances reported by the current model; a later cell reads the
# 'SUM_SCORE' entry of this mapping.
importances_by_metric = learner.variable_importances()
importances_by_metric

In [None]:
# Drop the last '-'-separated component from each date key.
# When crossval_data still carries the untouched 'Dates' column, derive the key
# from it so that running this cell (again) always yields the same value;
# trimming 'index' in place would strip one more component on every run.
# NOTE(review): falls back to trimming 'index' itself when 'Dates' is absent.
trim_source = 'Dates' if 'Dates' in crossval_data.columns else 'index'
crossval_data['index'] = crossval_data[trim_source].apply(lambda x: '-'.join(x.split('-')[:-1]))

In [None]:
# Compact validation view: date key, volatility, true label, ticker, prediction.
# No visible cell adds a 'Predicted' column to valid_data, so on a fresh kernel
# the original selection raised KeyError. If the column is absent, attach the
# current `predicted_classes` positionally instead.
# NOTE(review): this assumes `predicted_classes` was computed from valid_data;
# a length mismatch raises ValueError rather than mis-aligning rows.
_base_columns = ["index", "VOLATILITY_20", 'TOP_5', 'Ticker']
if 'Predicted' in valid_data.columns:
    crossval_data = valid_data.loc[:, _base_columns + ['Predicted']].copy()
else:
    crossval_data = valid_data.loc[:, _base_columns].copy()
    crossval_data['Predicted'] = np.asarray(predicted_classes)

In [None]:
# Display the rows whose prediction is class 1.
crossval_data.loc[crossval_data["Predicted"].eq(1)]

In [None]:
import pandas as pd

# Sharpe ratio of an equal-weighted portfolio that, in every window, holds the
# stocks predicted as class 1.

# Fail early with an explicit message if crossval_data was built by a cell
# that did not keep the columns this computation reads.
_required_columns = ['Window', 'Predicted', 'PCT_CHANGE_20']
_missing_columns = [col for col in _required_columns if col not in crossval_data.columns]
if _missing_columns:
    raise KeyError(
        f"crossval_data is missing columns needed for the Sharpe ratio: {_missing_columns}"
    )

# One portfolio return per window: the mean PCT_CHANGE_20 of the predicted
# stocks. Windows with no positive prediction contribute nothing, exactly as
# in the per-window loop this replaces.
predicted_top_5 = crossval_data[crossval_data['Predicted'] == 1]
all_returns_series = (
    predicted_top_5
    .groupby('Window', sort=False)['PCT_CHANGE_20']
    .mean()
    .reset_index(drop=True)
)
all_returns = all_returns_series.tolist()

# Convert the annual risk-free rate to a 20-trading-day rate (252 days/year).
annual_risk_free_rate = 0.06  # Annual risk-free rate of 6%
risk_free_rate_20_day = (1 + annual_risk_free_rate) ** (20 / 252) - 1

# Per-period (not annualised) Sharpe ratio. It is NaN when fewer than two
# windows have positive predictions, because the sample std is then undefined.
excess_returns = all_returns_series - risk_free_rate_20_day
overall_sharpe_ratio = excess_returns.mean() / excess_returns.std()

# Output the overall Sharpe Ratio
print(f"Overall Sharpe Ratio across all windows: {overall_sharpe_ratio}")

In [None]:
# Horizontal bar chart of the model's 'SUM_SCORE' variable importances.
variable_importance = learner.variable_importances()['SUM_SCORE']

# Each entry unpacks as (importance value, feature name); split into two tuples.
importance_values, features = zip(*variable_importance)

fig, ax = plt.subplots(figsize=(10, 12))
ax.barh(features, importance_values, color='skyblue')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
ax.set_title('Variable Importance')
ax.invert_yaxis()  # first entry is drawn at the top
plt.show()

In [None]:
# Train a 5-class random forest on the return-quintile label and report plain
# accuracy, accuracy restricted to rows involving class 1 or 5, and a
# penalty-weighted accuracy.
# NOTE: this rebinds train_data / valid_data, replacing the TOP_5 frames loaded
# earlier in the notebook.
train_data = pd.read_csv('covset3_full/train_quintiles.csv')
valid_data = pd.read_csv('covset3_full/valid_quintiles.csv')

label = 'DELTA_20_QUINTILES'
learner1 = ydf.RandomForestLearner(
    task=ydf.Task.CLASSIFICATION,
    label=label,
    categorical_algorithm='CART',
    max_depth=25,
    num_trees=300,
).train(train_data)

# One probability column per class, named after the model's label classes.
valid_preds = learner1.predict(valid_data)
class_columns = list(learner1.label_classes())
preds = pd.DataFrame(valid_preds, columns=class_columns)

# Take max / argmax over the class columns only. Previously idxmax ran after
# 'Probs' had been appended and returned a class label only because the class
# columns happen to precede 'Probs' in the frame.
class_probs = preds[class_columns]
preds['Probs'] = class_probs.max(axis=1)
preds['Predicted'] = class_probs.idxmax(axis=1)

# Positional (0..n-1) index on both series so comparisons are row for row.
true_classes = valid_data[label].reset_index(drop=True)
predicted_classes = preds['Predicted'].astype(int).reset_index(drop=True)

assert len(true_classes) == len(predicted_classes), "Lengths of true and predicted classes do not match."

# Rows where either the truth or the prediction is class 1 or class 5.
filter_mask = true_classes.isin([1, 5]) | predicted_classes.isin([1, 5])
filtered_tclass = true_classes[filter_mask]
filtered_pclass = predicted_classes[filter_mask]

total_accuracy = accuracy_score(true_classes, predicted_classes)
onefive_accuracy = accuracy_score(filtered_tclass, filtered_pclass)

print("Accuracy: ", total_accuracy)
print("1/5 accuracy: ", onefive_accuracy)

# Penalty matrix indexed [true class - 1, predicted class - 1]: 0 on the
# diagonal, `weight` for confusing 1<->2 or 4<->5, and 1 for every other error.
weight = 0.5
penalty_matrix = np.array([
    [0, weight, 1, 1, 1],  # True class is 1
    [weight, 0, 1, 1, 1],  # True class is 2
    [1, 1, 0, 1, 1],  # True class is 3
    [1, 1, 1, 0, weight],  # True class is 4
    [1, 1, 1, weight, 0],  # True class is 5
])

true_classes = valid_data[label].to_numpy()
predicted_classes = preds['Predicted'].astype(int).to_numpy()

# Labels must be 1..5: a label of 0 would otherwise wrap to the last
# row/column via negative indexing and yield a wrong penalty silently.
valid_labels = np.arange(1, penalty_matrix.shape[0] + 1)
assert np.isin(true_classes, valid_labels).all(), "True classes must lie in 1..5."
assert np.isin(predicted_classes, valid_labels).all(), "Predicted classes must lie in 1..5."

# Look up every row's penalty in a single vectorised indexing step.
penalties = penalty_matrix[true_classes - 1, predicted_classes - 1]

# Weighted accuracy = 1 - mean penalty.
weighted_accuracy = 1 - np.mean(penalties)

print("Coping Accuracy: ", weighted_accuracy)
valid_data

In [None]:
# 'SUM_SCORE' variable importances of the quintile model.
quintile_importances = learner1.variable_importances()
quintile_importances['SUM_SCORE']

In [None]:
# Display the most recently built prediction frame. `preds` is rebound by
# several cells above, so what is shown depends on which of them ran last.
preds