In [2]:
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.linear_model import Lasso,Ridge,LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_predict

In [19]:
def regressionMetrics(y, yhat):
    res = {
        'MSE': metrics.mean_squared_error(y, yhat),
        'RMSE': np.sqrt(metrics.mean_squared_error(y, yhat)),
        'MAE': metrics.mean_absolute_error(y, yhat),
        'R2': metrics.r2_score(y, yhat)
    }

    # Check if all values are > -1 to safely apply log
    if np.all(y > -1) and np.all(yhat > -1):
        rmsle = np.sqrt(metrics.mean_squared_log_error(y, yhat))
        res['RMSLE'] = rmsle
    else:
        res['RMSLE'] = 'Not applicable (values ≤ -1)'

    return res

In [70]:
df = pd.read_pickle('df_with_ev_ind.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2075 entries, 9 to 2083
Data columns (total 88 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   ds                             2075 non-null   datetime64[ns]
 1   SPY_return                     2075 non-null   float64       
 2   SPY_Volume                     2075 non-null   float64       
 3   AAPL                           2075 non-null   float64       
 4   MSFT                           2075 non-null   float64       
 5   GOOG                           2075 non-null   float64       
 6   GLD                            2075 non-null   float64       
 7   SLV                            2075 non-null   float64       
 8   ^TNX                           2075 non-null   float64       
 9   DX-Y.NYB                       2075 non-null   float64       
 10  JPY=X                          2075 non-null   float64       
 11  EUR=X                 

In [26]:
df_add = pd.read_pickle('combined_cleaned_add.pkl')
df_add.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2084 entries, 2017-01-03 to 2025-04-16
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   SPY_Close            2084 non-null   float64
 1   SPY_Volume           2084 non-null   int64  
 2   AAPL                 2084 non-null   float64
 3   MSFT                 2084 non-null   float64
 4   GOOG                 2084 non-null   float64
 5   GLD                  2084 non-null   float64
 6   SLV                  2084 non-null   float64
 7   ^TNX                 2084 non-null   float64
 8   DX-Y.NYB             2084 non-null   float64
 9   JPY=X                2084 non-null   float64
 10  EUR=X                2084 non-null   float64
 11  USO                  2084 non-null   float64
 12  UNG                  2084 non-null   float64
 13  BTC-USD              2084 non-null   float64
 14  CPER                 2084 non-null   float64
 15  ^VIX                

In [56]:
y = df['SPY_return']
x = df.drop(['SPY_return', 'ds','^VIX_rank'], axis=1)
x.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2075 entries, 9 to 2083
Data columns (total 85 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   SPY_Volume                     2075 non-null   float64
 1   AAPL                           2075 non-null   float64
 2   MSFT                           2075 non-null   float64
 3   GOOG                           2075 non-null   float64
 4   GLD                            2075 non-null   float64
 5   SLV                            2075 non-null   float64
 6   ^TNX                           2075 non-null   float64
 7   DX-Y.NYB                       2075 non-null   float64
 8   JPY=X                          2075 non-null   float64
 9   EUR=X                          2075 non-null   float64
 10  USO                            2075 non-null   float64
 11  UNG                            2075 non-null   float64
 12  BTC-USD                        2075 non-null   float6

In [31]:
split = int(len(y)*0.9)
x_train = x.iloc[:split]
x_test = x.iloc[split:]
y_train = y.iloc[:split]
y_test = y.iloc[split:]

In [53]:
from sklearn.metrics import mean_absolute_error, make_scorer

# Parameters
best_w = None
best_mae = float('inf')

num_samples = len(y_train)
results = []

# Test weight range from 1.0 to 3.0 in 0.1 steps
for w in np.arange(1.0, 3.1, 0.1):
    # Create sample weights linearly increasing from 1 to w
    weights = np.linspace(1, w, num_samples)
    weights = weights / np.mean(weights)  # Normalize to mean=1

    # Fit XGBoost with weights
    model = xgb.XGBRegressor(**{'gamma': 0.1, 'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 200, 'reg_alpha': 0, 'subsample': 0.6, 'n_jobs': -1})
    model.fit(x_train, y_train, sample_weight=weights)

    # Predict and evaluate on test set
    predictions = model.predict(x_test)
    mae = mean_absolute_error(y_test, predictions)

    results.append((w, mae))
    print(f"w={w:.1f}, MAE={mae:.4f}")

    # Track best
    if mae < best_mae:
        best_mae = mae
        best_w = w

print(f"\n✅ Best weight with added slope w={best_w:.2f} with MAE={best_mae:.4f}")


w=1.0, MAE=0.2711
w=1.1, MAE=0.2798
w=1.2, MAE=0.2728
w=1.3, MAE=0.2723
w=1.4, MAE=0.2696
w=1.5, MAE=0.2609
w=1.6, MAE=0.2601
w=1.7, MAE=0.2640
w=1.8, MAE=0.2687
w=1.9, MAE=0.2638
w=2.0, MAE=0.2725
w=2.1, MAE=0.2696
w=2.2, MAE=0.2711
w=2.3, MAE=0.2761
w=2.4, MAE=0.2700
w=2.5, MAE=0.2662
w=2.6, MAE=0.2674
w=2.7, MAE=0.2659
w=2.8, MAE=0.2735
w=2.9, MAE=0.2692
w=3.0, MAE=0.2720

✅ Best weight with added slope w=1.60 with MAE=0.2601


In [32]:
# Recreate sample weights using best_w = 3
num_samples = len(y_train)
weights = np.linspace(1, 2.2, num_samples)
weights = weights / np.mean(weights)  # Normalize to mean=1


xg_ind = xgb.XGBRegressor(**{'gamma': 0.1, 'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 200, 'reg_alpha': 0, 'subsample': 0.6, 'n_jobs': -1})
xg_ind.fit(x_train, y_train,sample_weight=weights)
predicted_xg_ind2 = xg_ind.predict(x_test)
regressionMetrics(y_test,predicted_xg_ind2)

{'MSE': 0.18803462398801446,
 'RMSE': np.float64(0.4336295930722608),
 'MAE': 0.26035164470630606,
 'R2': 0.8930192031822027,
 'RMSLE': 'Not applicable (values ≤ -1)'}

# Feture selection

In [57]:
model = xgb.XGBRegressor(n_estimators=200, random_state=42)
model.fit(x, y)

# Create DataFrame of importance
feature_importance = pd.DataFrame({
    'feature': x.columns,
    'importance': model.feature_importances_
}).sort_values(by='importance', ascending=False)

# Select top N
selected_features = feature_importance[feature_importance['importance'] > 0.005]['feature'].tolist()

In [65]:
!pip install --force-reinstall --no-deps matplotlib==3.8.4
!pip install --force-reinstall numpy==1.26.4



Collecting matplotlib==3.8.4
  Downloading matplotlib-3.8.4-cp310-cp310-win_amd64.whl.metadata (5.9 kB)
Downloading matplotlib-3.8.4-cp310-cp310-win_amd64.whl (7.7 MB)
   ---------------------------------------- 0.0/7.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/7.7 MB ? eta -:--:--
   --------- ------------------------------ 1.8/7.7 MB 7.7 MB/s eta 0:00:01
   ---------------------------------------- 7.7/7.7 MB 19.7 MB/s eta 0:00:00
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.10.1



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
error: uninstall-no-record-file

× Cannot uninstall matplotlib 3.10.1
╰─> The package's contents are unknown: no RECORD file was found for matplotlib.

hint: You might be able to recover from this via: pip install --force-reinstall --no-deps matplotlib==3.10.1


Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp310-cp310-win_amd64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp310-cp310-win_amd64.whl (15.8 MB)
Installing collected packages: numpy


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\OMER\\PycharmProjects\\Project\\.venv\\Lib\\site-packages\\numpy\\linalg\\_umath_linalg.cp310-win_amd64.pyd'
Check the permissions.


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [66]:
!pip install seaborn --upgrade


Collecting numpy!=1.24.0,>=1.20 (from seaborn)
  Using cached numpy-2.2.5-cp310-cp310-win_amd64.whl.metadata (60 kB)
Using cached numpy-2.2.5-cp310-cp310-win_amd64.whl (12.9 MB)
Installing collected packages: numpy


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\OMER\\PycharmProjects\\Project\\.venv\\Lib\\site-packages\\numpy\\fft\\_pocketfft_umath.cp310-win_amd64.pyd'
Check the permissions.


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import seaborn as sns
import matplotlib.pyplot as plt


ModuleNotFoundError: No module named 'matplotlib.backends.registry'

In [60]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance')
plt.xlabel('Importance')

ModuleNotFoundError: No module named 'matplotlib.backends.registry'

In [58]:
print(len(selected_features))

15


In [59]:
from sklearn.inspection import permutation_importance

result = permutation_importance(model, x, y, n_repeats=10, random_state=42)
sorted_idx = result.importances_mean.argsort()[::-1]
important_features = x.columns[sorted_idx[:30]]
print(important_features)

Index(['IWM', 'MSFT', 'AAPL', '^VIX', 'GOOG', 'stdev', 'EMA_20', 'high-low',
       '^GDAXI', 'EMA_50', '^VIX_dummy', 'avgreturn5d', 'std_price', '^N225',
       'USO', '^FTSE', 'SPY_Volume', 'CPER', 'SLV', 'BTC-USD', 'EMA_200',
       'before_high-low', 'JPY=X', 'UNG', '^TNX', 'EUR=X', 'GLD',
       'days_since_fomc', 'yield_curve', 'SPY_Streak'],
      dtype='object')


In [50]:
y_s = df['SPY_return']
x_s = df[selected_features]
x_s.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2075 entries, 9 to 2083
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   IWM              2075 non-null   float64
 1   MSFT             2075 non-null   float64
 2   stdev            2075 non-null   float64
 3   ^VIX             2075 non-null   float64
 4   ^GDAXI           2075 non-null   float64
 5   AAPL             2075 non-null   float64
 6   GOOG             2075 non-null   float64
 7   GDP_vs_forecast  2075 non-null   float64
 8   EMA_20           2075 non-null   float64
 9   EMA_50           2075 non-null   float64
 10  EMA_50_200       2075 non-null   int64  
 11  high-low         2075 non-null   float64
 12  ^VIX_dummy       2075 non-null   int64  
 13  Actual_FOMC      2075 non-null   float64
 14  SPY_Streak       2075 non-null   int64  
dtypes: float64(12), int64(3)
memory usage: 259.4 KB


In [51]:
split = int(len(y)*0.9)
x_train = x_s.iloc[:split]
x_test = x_s.iloc[split:]
y_train = y_s.iloc[:split]
y_test = y_s.iloc[split:]

In [54]:
# Recreate sample weights using best_w = 3
num_samples = len(y_train)
weights = np.linspace(1, 1.6, num_samples)
weights = weights / np.mean(weights)  # Normalize to mean=1


xg_ind = xgb.XGBRegressor(**{'gamma': 0.1, 'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 200, 'reg_alpha': 0, 'subsample': 0.6, 'n_jobs': -1})
xg_ind.fit(x_train, y_train,sample_weight=weights)
predicted_xg_ind2 = xg_ind.predict(x_test)
regressionMetrics(y_test,predicted_xg_ind2)

{'MSE': 0.17655572132912936,
 'RMSE': np.float64(0.4201853416400069),
 'MAE': 0.26008966302007364,
 'R2': 0.8995500331272225,
 'RMSLE': 'Not applicable (values ≤ -1)'}