In [27]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import statsmodels.api as sm

In [24]:
def load_scores():
  df = pd.read_csv('numerai_hyperparams_scores.csv')
  return df
df = load_scores()
df.columns

Index(['total_time', 'round_number', 'boosting_type', 'class_weight',
       'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth',
       'min_child_samples', 'min_child_weight', 'min_split_gain',
       'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state',
       'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin',
       'subsample_freq', 'feature_fraction', 'seed', 'correlation',
       'corr_sharpe', 'corr_mean', 'corr_std', 'max_drawdown',
       'feature_exposure', 'max_feature_exposure'],
      dtype='object')

In [72]:
df['learning_rate_n_estimators_product'] = df['learning_rate'] * df['n_estimators']
df['feature_fraction_learning_rate_ratio'] = df['learning_rate'] / df['feature_fraction']
independent_variables = ['feature_fraction','n_estimators','learning_rate', 'max_depth','learning_rate_n_estimators_product','feature_fraction_learning_rate_ratio']
dependent_variables = ['correlation'] # you might want the n_estimators/learning_rate column since that might matter. 

def normalize(df, variables):
    """
        Normalize so that you can properly interperate the coeffiencets impact on correlation and corr_sharpe
    """
    x = df[variables].values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return pd.DataFrame(x_scaled, columns=variables)
    
independent_variables_df = normalize(df,independent_variables)
dependent_variables_df =  normalize(df,dependent_variables)

In [73]:
def compute_regression(dependent_variables_df, independent_variables_df):
    model = sm.OLS(dependent_variables_df, independent_variables_df)
    results = model.fit()
    print(results.summary())
    
compute_regression(dependent_variables_df,independent_variables_df)

                                 OLS Regression Results                                
Dep. Variable:            correlation   R-squared (uncentered):                   0.895
Model:                            OLS   Adj. R-squared (uncentered):              0.894
Method:                 Least Squares   F-statistic:                              1026.
Date:                Wed, 21 Apr 2021   Prob (F-statistic):                        0.00
Time:                        09:40:04   Log-Likelihood:                         -6.5908
No. Observations:                 728   AIC:                                      25.18
Df Residuals:                     722   BIC:                                      52.72
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                                           coef    std err          t      P>|t|      [0.025      0.975]
---------------

In [75]:
summary_df = df[independent_variables + dependent_variables +['corr_sharpe']]
best_models = summary_df[np.logical_and(summary_df['correlation']>0.024,summary_df['corr_sharpe']>.9)]
# you need to bound learning rate * n_estimators (betwen 70 and 90)
best_models.describe()

Unnamed: 0,feature_fraction,n_estimators,learning_rate,max_depth,learning_rate_n_estimators_product,feature_fraction_learning_rate_ratio,correlation,corr_sharpe
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0
mean,0.111095,2900.0,0.031603,3.991379,82.206897,0.28379,0.024632,0.952266
std,0.035221,547.881295,0.023859,0.75176,17.272586,0.138363,0.000483,0.029382
min,0.09,200.0,0.013,3.0,18.0,0.066667,0.0241,0.9059
25%,0.095,3000.0,0.024,4.0,72.0,0.236364,0.0243,0.92755
50%,0.105,3000.0,0.028,4.0,84.0,0.263636,0.0245,0.9479
75%,0.115,3000.0,0.032,4.0,93.0,0.305263,0.0249,0.9712
max,0.38,5000.0,0.248,9.0,120.0,1.411111,0.0264,1.0552


In [76]:
best_models.sort_values(by=['correlation'],ascending=False)



Unnamed: 0,feature_fraction,n_estimators,learning_rate,max_depth,learning_rate_n_estimators_product,feature_fraction_learning_rate_ratio,correlation,corr_sharpe
135,0.095,3000,0.031,4,93.0,0.326316,0.0264,1.0080
65,0.095,3000,0.028,4,84.0,0.294737,0.0259,0.9844
120,0.095,3000,0.028,4,84.0,0.294737,0.0259,0.9844
389,0.255,200,0.248,4,49.6,0.972549,0.0259,1.0004
115,0.095,3000,0.027,4,81.0,0.284211,0.0258,1.0030
...,...,...,...,...,...,...,...,...
31,0.100,3000,0.034,3,102.0,0.340000,0.0241,0.9471
46,0.100,3000,0.040,3,120.0,0.400000,0.0241,1.0134
30,0.095,3000,0.034,3,102.0,0.357895,0.0241,0.9171
28,0.110,3000,0.032,3,96.0,0.290909,0.0241,0.9493
