In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
import numpy as np

# Load the data
file_path = 'final_straddle_data_normalized.csv'
df = pd.read_csv(file_path)

# Remove the month columns
month_columns = [col for col in df.columns if 'month' in col.lower()]
df = df.drop(columns=month_columns)

# Define features and target
features = df.drop(columns=['date', 'delta_neutral_long_straddle_returns', 'mean_daily_delta_neutral_long_straddle_returns'])
target = df['mean_daily_delta_neutral_long_straddle_returns']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10]
}

# Define MAE as the scoring metric
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Initialize the Random Forest model
rf = RandomForestRegressor(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=30, scoring=mae_scorer, cv=3, random_state=42, n_jobs=1, verbose=2)

# Perform random search
random_search.fit(X_train, y_train)

# Get the best parameters and the corresponding MAE
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"\nBest Hyperparameters: {best_params}")
print(f"Best Average MAE on Validation Folds: {best_score:.4f}")

# Train the best model on the entire training set
best_rf = random_search.best_estimator_
best_rf.fit(X_train, y_train)

# Predict on the test set
y_pred = best_rf.predict(X_test)

# Calculate and print the MAE on the test set
test_mae = mean_absolute_error(y_test, y_pred)
print(f"Test Set MAE: {test_mae:.4f}")

# Print feature importances
feature_importances = pd.Series(best_rf.feature_importances_, index=features.columns).sort_values(ascending=False)
print('Feature Importances:')
print(feature_importances)


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END max_depth=20, max_features=log2, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=   2.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=   2.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=   1.8s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=100; total time=   1.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=100; total time=   1.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=100; total time=   1.5s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=10, min_samples_split=5, n_estimators=100; total time=   1.4s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=10, min_samples_split=5, n_estima

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, ParameterGrid
from sklearn.metrics import mean_absolute_error
import numpy as np

# Load the data
file_path = 'final_straddle_data_normalized.csv'
df = pd.read_csv(file_path)

# Define features and target
features = df.drop(columns=['date', 'delta_neutral_long_straddle_returns', 'mean_daily_delta_neutral_long_straddle_returns'])
target = df['mean_daily_delta_neutral_long_straddle_returns']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the expanded parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 15],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [5, 10]
}

# Define MAE as the scoring metric
def mae_scorer(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

# Custom cross-validation loop
kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_params = None
best_score = float('inf')

for params in ParameterGrid(param_grid):
    train_scores = []
    val_scores = []
    
    for train_index, val_index in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        rf = RandomForestRegressor(**params)
        rf.fit(X_tr, y_tr)
        
        y_tr_pred = rf.predict(X_tr)
        y_val_pred = rf.predict(X_val)
        
        train_mae = mae_scorer(y_tr, y_tr_pred)
        val_mae = mae_scorer(y_val, y_val_pred)
        
        train_scores.append(train_mae)
        val_scores.append(val_mae)
    
    mean_train_mae = np.mean(train_scores)
    std_train_mae = np.std(train_scores)
    mean_val_mae = np.mean(val_scores)
    std_val_mae = np.std(val_scores)
    
    print(f"Params: {params}")
    print(f"  Mean MAE on Train Folds: {mean_train_mae:.4f} (std: {std_train_mae:.4f})")
    print(f"  Mean MAE on Validation Folds: {mean_val_mae:.4f} (std: {std_val_mae:.4f})")
    
    if mean_val_mae < best_score:
        best_score = mean_val_mae
        best_params = params

print(f"\nBest Hyperparameters: {best_params}")
print(f"Best Average MAE on Validation Folds: {best_score:.4f}")

# Train the best model on the entire training set
best_rf = RandomForestRegressor(**best_params)
best_rf.fit(X_train, y_train)

# Predict on the test set
y_pred = best_rf.predict(X_test)

# Calculate and print the MAE on the test set
test_mae = mae_scorer(y_test, y_pred)
print(f"Test Set MAE: {test_mae:.4f}")

# Print feature importances
feature_importances = pd.Series(best_rf.feature_importances_, index=features.columns).sort_values(ascending=False)
print('Feature Importances:')
print(feature_importances)


Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 100}
  Mean MAE on Train Folds: 0.5415 (std: 0.0069)
  Mean MAE on Validation Folds: 0.6518 (std: 0.0191)
Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 200}
  Mean MAE on Train Folds: 0.5443 (std: 0.0051)
  Mean MAE on Validation Folds: 0.6561 (std: 0.0205)
Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 300}
  Mean MAE on Train Folds: 0.5410 (std: 0.0070)
  Mean MAE on Validation Folds: 0.6532 (std: 0.0200)
Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}
  Mean MAE on Train Folds: 0.5408 (std: 0.0071)
  Mean MAE on Validation Folds: 0.6514 (std: 0.0229)
Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 200}
  Me

KeyboardInterrupt: 

In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, ParameterGrid
from sklearn.metrics import mean_absolute_error
import numpy as np

# Load the data
file_path = 'final_straddle_data_normalized.csv'
df = pd.read_csv(file_path)

# Define features and target
features = df.drop(columns=['date', 'delta_neutral_long_straddle_returns', 'mean_daily_delta_neutral_long_straddle_returns'])
target = df['mean_daily_delta_neutral_long_straddle_returns']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the expanded parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 15],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [5, 10]
}

# Define MAE as the scoring metric
def mae_scorer(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

# Custom cross-validation loop
kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_params = None
best_score = float('inf')

for params in ParameterGrid(param_grid):
    train_scores = []
    val_scores = []
    
    for train_index, val_index in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        rf = RandomForestRegressor(**params)
        rf.fit(X_tr, y_tr)
        
        y_tr_pred = rf.predict(X_tr)
        y_val_pred = rf.predict(X_val)
        
        train_mae = mae_scorer(y_tr, y_tr_pred)
        val_mae = mae_scorer(y_val, y_val_pred)
        
        train_scores.append(train_mae)
        val_scores.append(val_mae)
    
    mean_train_mae = np.mean(train_scores)
    std_train_mae = np.std(train_scores)
    mean_val_mae = np.mean(val_scores)
    std_val_mae = np.std(val_scores)
    
    print(f"Params: {params}")
    print(f"  Mean MAE on Train Folds: {mean_train_mae:.4f} (std: {std_train_mae:.4f})")
    print(f"  Mean MAE on Validation Folds: {mean_val_mae:.4f} (std: {std_val_mae:.4f})")
    
    if mean_val_mae < best_score:
        best_score = mean_val_mae
        best_params = params

print(f"\nBest Hyperparameters: {best_params}")
print(f"Best Average MAE on Validation Folds: {best_score:.4f}")

# Train the best model on the entire training set
best_rf = RandomForestRegressor(**best_params)
best_rf.fit(X_train, y_train)

# Predict on the test set
y_pred = best_rf.predict(X_test)

# Calculate and print the MAE on the test set
test_mae = mae_scorer(y_test, y_pred)
print(f"Test Set MAE: {test_mae:.4f}")

# Print feature importances
feature_importances = pd.Series(best_rf.feature_importances_, index=features.columns).sort_values(ascending=False)
print('Feature Importances:')
print(feature_importances)


Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 100}
  Mean MAE on Train Folds: 0.5458 (std: 0.0063)
  Mean MAE on Validation Folds: 0.6570 (std: 0.0212)
Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 200}
  Mean MAE on Train Folds: 0.5427 (std: 0.0060)
  Mean MAE on Validation Folds: 0.6516 (std: 0.0226)



KeyboardInterrupt



In [11]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, KFold, ParameterGrid
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the data
file_path = 'final_straddle_data_normalized.csv'
df = pd.read_csv(file_path)

# Define features and target
features = df.drop(columns=['date', 'delta_neutral_long_straddle_returns', 'mean_daily_delta_neutral_long_straddle_returns'])
target = df['mean_daily_delta_neutral_long_straddle_returns']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [5, 10, 20]
}

# Define RMSE as the scoring metric
def rmse_scorer(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Custom cross-validation loop
kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_params = None
best_score = float('inf')

for params in ParameterGrid(param_grid):
    train_scores = []
    val_scores = []
    
    for train_index, val_index in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        gbr = GradientBoostingRegressor(**params, random_state=42)
        gbr.fit(X_tr, y_tr)
        
        y_tr_pred = gbr.predict(X_tr)
        y_val_pred = gbr.predict(X_val)
        
        train_rmse = rmse_scorer(y_tr, y_tr_pred)
        val_rmse = rmse_scorer(y_val, y_val_pred)
        
        train_scores.append(train_rmse)
        val_scores.append(val_rmse)
    
    mean_train_rmse = np.mean(train_scores)
    std_train_rmse = np.std(train_scores)
    mean_val_rmse = np.mean(val_scores)
    std_val_rmse = np.std(val_scores)
    
    print(f"Params: {params}")
    print(f"  Mean RMSE on Train Folds: {mean_train_rmse:.4f} (std: {std_train_rmse:.4f})")
    print(f"  Mean RMSE on Validation Folds: {mean_val_rmse:.4f} (std: {std_val_rmse:.4f})")
    
    if mean_val_rmse < best_score:
        best_score = mean_val_rmse
        best_params = params

print(f"\nBest Hyperparameters: {best_params}")
print(f"Best Average RMSE on Validation Folds: {best_score:.4f}")

# Train the best model on the entire training set
best_gbr = GradientBoostingRegressor(**best_params, random_state=42)
best_gbr.fit(X_train, y_train)

# Predict on the test set
y_pred = best_gbr.predict(X_test)

# Calculate and print the RMSE on the test set
test_rmse = rmse_scorer(y_test, y_pred)
print(f"Test Set RMSE: {test_rmse}")

# Print feature importances
feature_importances = pd.Series(best_gbr.feature_importances_, index=features.columns).sort_values(ascending=False)
print('Feature Importances:')
print(feature_importances)


Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 100}
  Mean RMSE on Train Folds: 1.5728 (std: 0.0198)
  Mean RMSE on Validation Folds: 1.6281 (std: 0.0785)
Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 200}
  Mean RMSE on Train Folds: 1.4984 (std: 0.0205)
  Mean RMSE on Validation Folds: 1.5943 (std: 0.0686)
Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 300}
  Mean RMSE on Train Folds: 1.4361 (std: 0.0176)
  Mean RMSE on Validation Folds: 1.5613 (std: 0.0598)
Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}
  Mean RMSE on Train Folds: 1.5728 (std: 0.0198)
  Mean RMSE on Validation Folds: 1.6281 (std: 0.0785)
Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 200}
  Mean

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1117d5a90>>
Traceback (most recent call last):
  File "/Users/matthewbellick/anaconda3/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1117d5a90>>
Traceback (most recent call last):
  File "/Users/matthewbellick/anaconda3/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
KeyboardInterrupt: 

KeyboardInterrupt



In [None]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Load the data
file_path = 'final_straddle_data_normalized.csv'
df = pd.read_csv(file_path)

# Define features and target
features = df.drop(columns=['date', 'delta_neutral_long_straddle_returns', 'mean_daily_delta_neutral_long_straddle_returns'])
target = df['mean_daily_delta_neutral_long_straddle_returns']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_dist = {
    'n_estimators': [200, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [5, 10, 20],
    'subsample': [0.8, 1.0]
}

# Define RMSE as the scoring metric
def rmse_scorer(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Custom scorer
scorer = make_scorer(rmse_scorer, greater_is_better=False)

# Initialize the RandomizedSearchCV with 20 iterations
gbr = GradientBoostingRegressor(random_state=42)
random_search = RandomizedSearchCV(estimator=gbr, param_distributions=param_dist, n_iter=30, scoring=scorer, cv=5, random_state=42, n_jobs=-1, verbose=1)

# Perform random search
random_search.fit(X_train, y_train)

# Get the best parameters and the corresponding RMSE
best_params = random_search.best_params_
best_score = -random_search.best_score_

print(f"\nBest Hyperparameters: {best_params}")
print(f"Best Average RMSE on Validation Folds: {best_score:.4f}")

# Train the best model on the entire training set
best_gbr = random_search.best_estimator_
best_gbr.fit(X_train, y_train)

# Predict on the test set
y_pred = best_gbr.predict(X_test)

# Calculate and print the RMSE on the test set
test_rmse = rmse_scorer(y_test, y_pred)
print(f"Test Set RMSE: {test_rmse:.4f}")

# Print feature importances
feature_importances = pd.Series(best_gbr.feature_importances_, index=features.columns).sort_values(ascending=False)
print('Feature Importances:')
print(feature_importances)


In [2]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
import numpy as np

# Load the data
file_path = 'final_straddle_data_normalized_condensed.csv'
df = pd.read_csv(file_path)
df['date'] = pd.to_datetime(df['date'])

# Remove rows with dates in 2023 for the holdout set
holdout_set = df[df['date'].dt.year == 2023]
train_set = df[df['date'].dt.year != 2023]

# Remove the month columns
month_columns = [col for col in train_set.columns if 'month' in col.lower()]
train_set = train_set.drop(columns=month_columns)
holdout_set = holdout_set.drop(columns=month_columns)

# Define features and target
features_train = train_set.drop(columns=['date', 'delta_neutral_long_straddle_returns', 'mean_daily_delta_neutral_long_straddle_returns'])
target_train = train_set['mean_daily_delta_neutral_long_straddle_returns']

features_holdout = holdout_set.drop(columns=['date', 'delta_neutral_long_straddle_returns', 'mean_daily_delta_neutral_long_straddle_returns'])
target_holdout = holdout_set['mean_daily_delta_neutral_long_straddle_returns']

# Define the parameter grid for hyperparameter tuning
param_dist = {
    'n_neighbors': list(range(1, 31)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # p=1 for Manhattan distance, p=2 for Euclidean distance
}

# Define MAE as the scoring metric
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Initialize the KNN model
knn = KNeighborsRegressor()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist, n_iter=30, scoring=mae_scorer, cv=5, random_state=42, n_jobs=1, verbose=0, return_train_score=True)

# Perform random search
random_search.fit(features_train, target_train)

# Print the average MAE on train folds and test folds for each hyperparameter set tested
results = random_search.cv_results_
for i in range(len(results['params'])):
    print(f"Hyperparameters: {results['params'][i]}")
    print(f"Mean Train MAE: {-results['mean_train_score'][i]:.4f}")
    print(f"Mean Test MAE: {-results['mean_test_score'][i]:.4f}\n")

# Get the best parameters and the corresponding MAE
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"\nBest Hyperparameters: {best_params}")
print(f"Best Average MAE on Validation Folds: {-best_score:.4f}")

Hyperparameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 12}
Mean Train MAE: 0.5088
Mean Test MAE: 0.8918

Hyperparameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 12}
Mean Train MAE: 0.0000
Mean Test MAE: 0.9209

Hyperparameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 2}
Mean Train MAE: 0.2339
Mean Test MAE: 1.0259

Hyperparameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 14}
Mean Train MAE: 0.0000
Mean Test MAE: 0.9111

Hyperparameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 7}
Mean Train MAE: 0.4660
Mean Test MAE: 0.9596

Hyperparameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 17}
Mean Train MAE: 0.5651
Mean Test MAE: 0.8712

Hyperparameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 19}
Mean Train MAE: -0.0000
Mean Test MAE: 0.8702

Hyperparameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 3}
Mean Train MAE: 0.3002
Mean Test MAE: 1.0350

Hyperparameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 11}
Mean Train MAE: 0.4927
Mean 

In [5]:
# Train the best model on the entire training set
best_knn = random_search.best_estimator_
best_knn.fit(features_train, target_train)

# Predict on the holdout set
y_holdout_pred = best_knn.predict(features_holdout)

# Calculate and print the MAE on the holdout set
holdout_mae = mean_absolute_error(target_holdout, y_holdout_pred)
print(f"Holdout Set MAE: {holdout_mae:.4f}")

Holdout Set MAE: 0.7656


In [6]:
# Get the best model
best_knn = random_search.best_estimator_

# Predict on the holdout set
features_holdout = holdout_set.drop(columns=['date', 'delta_neutral_long_straddle_returns', 'mean_daily_delta_neutral_long_straddle_returns'])
holdout_set['predicted_mean_daily_delta_neutral_long_straddle_returns'] = best_knn.predict(features_holdout)

# Save the updated holdout set to a new CSV file
holdout_set.to_csv('final_straddle_data_raw_with_predictions_knn.csv', index=False)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- predicted_mean_daily_delta_neutral_long_straddle_returns
