In [1]:
import pandas as pd
import numpy as np
import csv


d_train=pd.read_csv("/Users/lucasmuzynoski/Projects/Numerai2/numerai_datasets/numerai_training_data.csv")

from catboost import Pool, CatBoostRegressor, cv

feature_names = [
        f for f in d_train.columns if f.startswith("feature")
    ]
from sklearn.model_selection import KFold
from random import sample
from scipy.stats import spearmanr


def split_generator(nfolds=4):
    era_range=list(np.unique(d_train["era"]))
    num_test_eras=round(1/nfolds*120)   
    for k in range(nfolds):
        test_eras=sample(era_range,num_test_eras)
        test_indices=[]
        train_indices=[]
        i=0
        for row in d_train["era"]:
            if row in test_eras:
                test_indices.append(i)
            else:
                train_indices.append(i)
            i=i+1        
        yield(train_indices,test_indices)


def ranks( x, break_ties=False ):
    n = len(x)
    t = np.arange(n)
    s = sorted( t, key=x.__getitem__ )

    if not break_ties:
        t[1:] = np.cumsum(x[s[1:]] != x[s[:-1]])

    r = t.copy()
    np.put( r, s, t )
    return r

class SpearmanRMetric(object):
    def get_final_error(self, error, weight):
        return (error / (weight + 1e-38))

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        approx_ranks=ranks(approx)
        target_ranks=ranks(target)
        mean_approx_rank=np.mean(approx_ranks)
        mean_target_rank=np.mean(target_ranks)

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += w * ((approx_ranks[i] - mean_approx_rank)*(target_ranks[i]-mean_target_rank))
        
        weight_sum=weight_sum*np.std(target_ranks)*np.std(approx_ranks)
        return error_sum, weight_sum

class SpearmanRMetric2(object):
    def get_final_error(self, error, weight):
        return (error / (weight + 1e-38))

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        approx_ranks=ranks(approx)
        target_ranks=ranks(target)
        mean_approx_rank=np.mean(approx_ranks)
        mean_target_rank=np.mean(target_ranks)

        if weight is None:
            error_sum=np.sum((approx_ranks-mean_approx_rank)*(target_ranks-mean_target_rank))
            weight_sum=len(approx)*np.std(target_ranks)*np.std(approx_ranks)
        else:
            error_sum=np.sum(weight*(approx_ranks-mean_approx_rank)*(target_ranks-mean_target_rank))
            weight_sum=np.sum(weight)*np.std(target_ranks)*np.std(approx_ranks)

        """
        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += w * ((approx_ranks[i] - mean_approx_rank)*(target_ranks[i]-mean_target_rank))
        
        weight_sum=weight_sum*np.std(target_ranks)*np.std(approx_ranks)
        """
        return error_sum, weight_sum


In [2]:

class EpsObjective(object):
    def calc_ders_range(self, approxes, targets, weights):
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)
        
        result = []
        for index in range(len(targets)):
            #der1 = targets[index] - approxes[index]
            #der2 = -1
            delta=targets[index]-approxes[index]
            if delta < -.125:
                der1=-1
            elif delta > .125:
                der1=1
            else:
                der1 =0
            
            der2=0

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

class EpsSquaredObjective(object):
    def calc_ders_range(self, approxes, targets, weights):
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)
        
        result = []
        for index in range(len(targets)):
            #der1 = targets[index] - approxes[index]
            #der2 = -1
            delta=targets[index]-approxes[index]
            if delta < -.125:
                der1= delta +.125
                der2=-1
            elif delta > .125:
                der1= delta-.125
                der2=-1
            else:
                der1 =0
                der2 =0

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

In [31]:
#feature_names.append("era")

In [3]:
d_validation=pd.read_csv("/Users/lucasmuzynoski/Projects/Numerai2/numerai_datasets/numerai_tournament_data.csv")
d_validation=d_validation[d_validation["data_type"]=="validation"]
d_total=d_train.append(d_validation)

In [4]:
import optuna
pool_data_train=Pool(data=d_total[feature_names],label=d_total["target"])#,cat_features=["era"])
def catboost_cv_result(trial):
    params={
            'loss_function':'RMSE',
            'eval_metric':SpearmanRMetric2(),
            'learning_rate':0.25,
            'subsample':trial.suggest_uniform('subsample', 0.7, .85),
            'random_strength':trial.suggest_uniform('random_strength',.05,.8),
            'rsm':trial.suggest_uniform('rsm', .4,.9),
            'max_depth':trial.suggest_categorical('max_depth',[3]),
            'grow_policy':trial.suggest_categorical('grow_policy',['SymmetricTree']),
            'l2_leaf_reg':trial.suggest_uniform('l2_leaf_reg',2,4.5)

        }
    cv_out=cv(params=params,pool=pool_data_train,nfold=3,shuffle=False,early_stopping_rounds=20,partition_random_seed=2021,verbose=False,type="TimeSeries")
    out=np.max(cv_out["test-SpearmanRMetric2-mean"])
    print(out)
    return(out)

In [37]:
study = optuna.create_study(direction='maximize')
study.optimize(catboost_cv_result, n_trials=10)

[32m[I 2021-05-15 09:31:30,491][0m A new study created in memory with name: no-name-16c23a1f-a29f-47c8-aa41-579d01c8cef9[0m
[32m[I 2021-05-15 09:35:48,974][0m Trial 0 finished with value: 0.030630818728474885 and parameters: {'subsample': 0.8311959104577901, 'random_strength': 0.11462248472899965, 'rsm': 0.47264550962458207, 'max_depth': 3, 'grow_policy': 'SymmetricTree', 'l2_leaf_reg': 3.746406275354606}. Best is trial 0 with value: 0.030630818728474885.[0m
Stopped by overfitting detector  (20 iterations wait)
0.030630818728474885
[32m[I 2021-05-15 09:40:20,641][0m Trial 1 finished with value: 0.030350890781832923 and parameters: {'subsample': 0.7188607256034955, 'random_strength': 0.5085106368154242, 'rsm': 0.8142436113716935, 'max_depth': 3, 'grow_policy': 'SymmetricTree', 'l2_leaf_reg': 3.513886958347865}. Best is trial 0 with value: 0.030630818728474885.[0m
Stopped by overfitting detector  (20 iterations wait)
0.030350890781832923
[32m[I 2021-05-15 09:46:23,377][0m Tria

In [33]:
study.best_params

{'subsample': 0.7446504209142342,
 'random_strength': 0.26701689362649106,
 'rsm': 0.7413899955520784,
 'max_depth': 3,
 'grow_policy': 'SymmetricTree',
 'l2_leaf_reg': 3.4934891057506308}

In [41]:
##cv with best params and lower learning rate
params={
            'loss_function':'RMSE',
            'eval_metric':SpearmanRMetric2(),
            'learning_rate':0.05,
            'subsample':.74,
            'rsm':.74,
            'random_strength':.267,
            'max_depth':3,
            'grow_policy':'SymmetricTree',
            'l2_leaf_reg':3.5

        }

cv_out=cv(params=params,pool=pool_data_train,nfold=4,shuffle=False,early_stopping_rounds=20,partition_random_seed=2021,verbose=True,metric_period=20,type="TimeSeries")

0:	learn: 0.0114380	test: 0.0046469	best: 0.0046469 (0)
20:	learn: 0.0358198	test: 0.0169220	best: 0.0169220 (20)	total: 17.4s	remaining: 13m 30s
40:	learn: 0.0448848	test: 0.0205971	best: 0.0212662 (30)
60:	learn: 0.0515680	test: 0.0232064	best: 0.0233937 (58)	total: 49.2s	remaining: 12m 38s
80:	learn: 0.0541878	test: 0.0241108	best: 0.0244095 (67)
100:	learn: 0.0583618	test: 0.0254634	best: 0.0256310 (99)	total: 1m 21s	remaining: 12m 7s
120:	learn: 0.0621319	test: 0.0265708	best: 0.0265708 (120)
140:	learn: 0.0656660	test: 0.0271234	best: 0.0271910 (137)
160:	learn: 0.0688177	test: 0.0282113	best: 0.0282113 (160)	total: 2m 10s	remaining: 11m 19s
180:	learn: 0.0719450	test: 0.0289686	best: 0.0289686 (180)
200:	learn: 0.0743997	test: 0.0293751	best: 0.0293894 (199)	total: 2m 42s	remaining: 10m 45s
220:	learn: 0.0770270	test: 0.0298162	best: 0.0298162 (220)
240:	learn: 0.0794876	test: 0.0301209	best: 0.0301478 (236)	total: 3m 14s	remaining: 10m 11s
260:	learn: 0.0821451	test: 0.0305387	

In [42]:
cv_out

Unnamed: 0,iterations,test-SpearmanRMetric2-mean,test-SpearmanRMetric2-std,train-SpearmanRMetric2-mean,train-SpearmanRMetric2-std,test-RMSE-mean,test-RMSE-std,train-RMSE-mean,train-RMSE-std
0,0,0.004647,0.003575,0.011438,0.002923,0.524866,0.00013,0.524868,4.4e-05
1,20,0.016922,0.006028,0.03582,0.007081,0.280802,0.000179,0.280797,4.3e-05
2,40,0.020597,0.006985,0.044885,0.009455,0.231469,0.000198,0.231427,3.1e-05
3,60,0.023206,0.006296,0.051568,0.010005,0.224333,0.000199,0.224253,2.4e-05
4,80,0.024111,0.006179,0.054188,0.008942,0.223385,0.000201,0.223256,4.2e-05
5,100,0.025463,0.006197,0.058362,0.010804,0.223252,0.000205,0.223071,6.6e-05
6,120,0.026571,0.007057,0.062132,0.011943,0.223227,0.000212,0.223002,8.7e-05
7,140,0.027123,0.007247,0.065666,0.013184,0.223219,0.000216,0.22295,0.000107
8,160,0.028211,0.007277,0.068818,0.014062,0.223211,0.000218,0.222906,0.000126
9,180,0.028969,0.007315,0.071945,0.015394,0.223206,0.000219,0.222864,0.000145


In [5]:
params_optim={
            'loss_function':'RMSE',
            'learning_rate':0.05,
            'subsample':.74,
            'rsm':.74,
            'random_strength':.267,
            'max_depth':3,
            'grow_policy':'SymmetricTree',
            'l2_leaf_reg':3.5,
            'iterations':416

        }
cb_mod=CatBoostRegressor(loss_function='RMSE',learning_rate=.05,subsample=.74,rsm=.74,random_strength=.267,max_depth=3,grow_policy='SymmetricTree',l2_leaf_reg=3.5,iterations=416)


In [6]:
cb_mod.fit(pool_data_train)

al: 2.5s	remaining: 34.6s
28:	learn: 0.2232383	total: 2.58s	remaining: 34.4s
29:	learn: 0.2232365	total: 2.66s	remaining: 34.2s
30:	learn: 0.2232344	total: 2.74s	remaining: 34s
31:	learn: 0.2232326	total: 2.82s	remaining: 33.9s
32:	learn: 0.2232310	total: 2.9s	remaining: 33.7s
33:	learn: 0.2232292	total: 2.99s	remaining: 33.6s
34:	learn: 0.2232276	total: 3.07s	remaining: 33.4s
35:	learn: 0.2232251	total: 3.15s	remaining: 33.2s
36:	learn: 0.2232235	total: 3.23s	remaining: 33.1s
37:	learn: 0.2232220	total: 3.32s	remaining: 33s
38:	learn: 0.2232204	total: 3.41s	remaining: 33s
39:	learn: 0.2232183	total: 3.5s	remaining: 32.9s
40:	learn: 0.2232168	total: 3.58s	remaining: 32.7s
41:	learn: 0.2232150	total: 3.65s	remaining: 32.5s
42:	learn: 0.2232134	total: 3.74s	remaining: 32.4s
43:	learn: 0.2232115	total: 3.82s	remaining: 32.3s
44:	learn: 0.2232102	total: 3.9s	remaining: 32.2s
45:	learn: 0.2232086	total: 3.98s	remaining: 32s
46:	learn: 0.2232070	total: 4.07s	remaining: 31.9s
47:	learn: 0.223

<catboost.core.CatBoostRegressor at 0x7f8e59bb2460>

In [7]:
feat_imp=cb_mod.get_feature_importance(prettified=True)
feat_imp.head()

Unnamed: 0,Feature Id,Importances
0,feature_dexterity6,4.547429
1,feature_charisma63,3.871688
2,feature_dexterity14,3.081201
3,feature_dexterity7,3.066723
4,feature_dexterity11,2.469868


In [33]:
i=5
feat_imp["Feature Id"][i]

'feature_wisdom35'

In [42]:
feature_select=[]
imp_threshold=.001

for i in range(feat_imp.shape[0]):
    if feat_imp.Importances[i] >imp_threshold:
        feature_select.append(feat_imp["Feature Id"][i])

In [43]:
len(feature_select)

228

In [44]:
##cv with best params and lower learning rate
pool_data_train_select=Pool(data=d_total[feature_select],label=d_total["target"])#,cat_features=["era"])
params={
            'loss_function':'RMSE',
            'eval_metric':SpearmanRMetric2(),
            'learning_rate':0.05,
            'subsample':.74,
            'rsm':.74,
            'random_strength':.267,
            'max_depth':3,
            'grow_policy':'SymmetricTree',
            'l2_leaf_reg':3.5

        }

cv_out=cv(params=params,pool=pool_data_train_select,nfold=4,shuffle=False,early_stopping_rounds=20,partition_random_seed=2021,verbose=True,metric_period=20,type="TimeSeries")

0:	learn: 0.0075625	test: 0.0021675	best: 0.0021675 (0)
20:	learn: 0.0346942	test: 0.0163128	best: 0.0163128 (20)	total: 17.1s	remaining: 13m 16s
40:	learn: 0.0451642	test: 0.0205936	best: 0.0205936 (40)
60:	learn: 0.0508378	test: 0.0229479	best: 0.0229479 (60)	total: 48.1s	remaining: 12m 20s
80:	learn: 0.0542406	test: 0.0244417	best: 0.0245455 (72)
100:	learn: 0.0584587	test: 0.0259193	best: 0.0259656 (99)	total: 1m 19s	remaining: 11m 43s
120:	learn: 0.0620501	test: 0.0268707	best: 0.0268707 (120)
140:	learn: 0.0650535	test: 0.0277068	best: 0.0277068 (140)	total: 1m 50s	remaining: 11m 10s
160:	learn: 0.0680640	test: 0.0286436	best: 0.0286436 (160)
180:	learn: 0.0707855	test: 0.0289453	best: 0.0289453 (180)	total: 2m 20s	remaining: 10m 37s
200:	learn: 0.0733384	test: 0.0293421	best: 0.0293421 (200)
220:	learn: 0.0761769	test: 0.0298040	best: 0.0298236 (219)	total: 2m 51s	remaining: 10m 5s
240:	learn: 0.0787150	test: 0.0302671	best: 0.0302671 (240)
260:	learn: 0.0810987	test: 0.0305750	

In [45]:
cb_mod3=CatBoostRegressor(loss_function='RMSE',learning_rate=.05,subsample=.74,rsm=.74,random_strength=.267,max_depth=3,grow_policy='SymmetricTree',l2_leaf_reg=3.5,iterations=563)
cb_mod3.fit(pool_data_train_select)

aining: 25.4s
176:	learn: 0.2230571	total: 11.6s	remaining: 25.3s
177:	learn: 0.2230563	total: 11.7s	remaining: 25.3s
178:	learn: 0.2230553	total: 11.8s	remaining: 25.2s
179:	learn: 0.2230544	total: 11.8s	remaining: 25.1s
180:	learn: 0.2230529	total: 11.9s	remaining: 25.1s
181:	learn: 0.2230522	total: 12s	remaining: 25s
182:	learn: 0.2230513	total: 12s	remaining: 24.9s
183:	learn: 0.2230501	total: 12.1s	remaining: 24.9s
184:	learn: 0.2230491	total: 12.2s	remaining: 24.8s
185:	learn: 0.2230485	total: 12.2s	remaining: 24.8s
186:	learn: 0.2230477	total: 12.3s	remaining: 24.7s
187:	learn: 0.2230468	total: 12.3s	remaining: 24.6s
188:	learn: 0.2230458	total: 12.4s	remaining: 24.6s
189:	learn: 0.2230451	total: 12.5s	remaining: 24.5s
190:	learn: 0.2230445	total: 12.5s	remaining: 24.4s
191:	learn: 0.2230438	total: 12.6s	remaining: 24.4s
192:	learn: 0.2230431	total: 12.7s	remaining: 24.3s
193:	learn: 0.2230423	total: 12.8s	remaining: 24.3s
194:	learn: 0.2230415	total: 12.8s	remaining: 24.2s
195:

<catboost.core.CatBoostRegressor at 0x7f8d9914d190>

In [47]:
cb_mod3.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,feature_dexterity6,4.199312
1,feature_charisma63,3.226303
2,feature_dexterity7,2.940585
3,feature_dexterity14,2.494980
4,feature_dexterity12,2.291343
...,...,...
223,feature_constitution14,0.000000
224,feature_charisma16,0.000000
225,feature_constitution59,0.000000
226,feature_wisdom9,0.000000


In [48]:
feat_imp=cb_mod3.get_feature_importance(prettified=True)
feature_select=[]
imp_threshold=.001

for i in range(feat_imp.shape[0]):
    if feat_imp.Importances[i] >imp_threshold:
        feature_select.append(feat_imp["Feature Id"][i])

In [50]:
len(feature_select)

217

In [52]:
pool_data_train_select=Pool(data=d_total[feature_select],label=d_total["target"])#,cat_features=["era"])
def catboost_cv_result(trial):
    params={
            'loss_function':'RMSE',
            'eval_metric':SpearmanRMetric2(),
            'learning_rate':0.2,
            'subsample':trial.suggest_uniform('subsample', 0.7, .85),
            'random_strength':trial.suggest_uniform('random_strength',.05,.5),
            'rsm':trial.suggest_uniform('rsm', .4,.9),
            'max_depth':trial.suggest_categorical('max_depth',[3]),
            'grow_policy':trial.suggest_categorical('grow_policy',['SymmetricTree']),
            'l2_leaf_reg':trial.suggest_uniform('l2_leaf_reg',2,4.5)

        }
    cv_out=cv(params=params,pool=pool_data_train,nfold=4,shuffle=False,early_stopping_rounds=20,partition_random_seed=2021,verbose=False,type="TimeSeries")
    out=np.max(cv_out["test-SpearmanRMetric2-mean"])
    print(out)
    return(out)
study = optuna.create_study(direction='maximize')
study.optimize(catboost_cv_result, n_trials=20)

[32m[I 2021-05-15 12:58:25,248][0m A new study created in memory with name: no-name-85ef25ea-8e02-4a6c-a371-1c7da1606552[0m
[32m[I 2021-05-15 13:05:46,082][0m Trial 0 finished with value: 0.032225114745059895 and parameters: {'subsample': 0.7256590581908812, 'random_strength': 0.12909651560314206, 'rsm': 0.7669782477945895, 'max_depth': 3, 'grow_policy': 'SymmetricTree', 'l2_leaf_reg': 3.4654229296673758}. Best is trial 0 with value: 0.032225114745059895.[0m
Stopped by overfitting detector  (20 iterations wait)
0.032225114745059895
[32m[I 2021-05-15 13:09:26,744][0m Trial 1 finished with value: 0.030444945774812215 and parameters: {'subsample': 0.7834055490285214, 'random_strength': 0.059000091015363056, 'rsm': 0.5272115449678446, 'max_depth': 3, 'grow_policy': 'SymmetricTree', 'l2_leaf_reg': 2.258623273342975}. Best is trial 0 with value: 0.032225114745059895.[0m
Stopped by overfitting detector  (20 iterations wait)
0.030444945774812215
[32m[I 2021-05-15 13:16:04,948][0m Tr

In [53]:
study.best_params

{'subsample': 0.8011901619595697,
 'random_strength': 0.4630861027411687,
 'rsm': 0.8345374012548077,
 'max_depth': 3,
 'grow_policy': 'SymmetricTree',
 'l2_leaf_reg': 2.053069874204704}

In [56]:
pool_data_train_select=Pool(data=d_total[feature_select],label=d_total["target"])#,cat_features=["era"])
params={
            'loss_function':'RMSE',
            'eval_metric':SpearmanRMetric2(),
            'learning_rate':0.03,
            'subsample':.8,
            'rsm':.83,
            'random_strength':.463,
            'max_depth':3,
            'grow_policy':'SymmetricTree',
            'l2_leaf_reg':2.05,
            'iterations':2500

        }

cv_out=cv(params=params,pool=pool_data_train_select,nfold=4,shuffle=False,early_stopping_rounds=100,partition_random_seed=2021,verbose=True,metric_period=20,type="TimeSeries")

0:	learn: 0.0078890	test: 0.0028891	best: 0.0028891 (0)
20:	learn: 0.0392517	test: 0.0195010	best: 0.0195010 (20)	total: 17.2s	remaining: 33m 53s
40:	learn: 0.0461622	test: 0.0222279	best: 0.0224140 (39)
60:	learn: 0.0485449	test: 0.0225391	best: 0.0227505 (42)	total: 48.4s	remaining: 32m 15s
80:	learn: 0.0493252	test: 0.0229372	best: 0.0230437 (79)
100:	learn: 0.0542291	test: 0.0247447	best: 0.0247867 (99)	total: 1m 19s	remaining: 31m 31s
120:	learn: 0.0556610	test: 0.0252821	best: 0.0256500 (115)
140:	learn: 0.0560563	test: 0.0247285	best: 0.0256500 (115)	total: 1m 50s	remaining: 30m 55s
160:	learn: 0.0570536	test: 0.0248593	best: 0.0256500 (115)
180:	learn: 0.0586231	test: 0.0258023	best: 0.0258023 (180)	total: 2m 22s	remaining: 30m 20s
200:	learn: 0.0607025	test: 0.0265155	best: 0.0265385 (198)
220:	learn: 0.0626772	test: 0.0270505	best: 0.0270505 (220)	total: 2m 53s	remaining: 29m 45s
240:	learn: 0.0644404	test: 0.0275816	best: 0.0275816 (240)
260:	learn: 0.0662029	test: 0.0282972

In [57]:
cb_mod4=CatBoostRegressor(loss_function='RMSE',learning_rate=.03,subsample=.8,rsm=.83,random_strength=.463,max_depth=3,grow_policy='SymmetricTree',l2_leaf_reg=2.05,iterations=1393)
cb_mod4.fit(pool_data_train_select)

1:	learn: 0.2227618	total: 1m 8s	remaining: 24.9s
1022:	learn: 0.2227615	total: 1m 8s	remaining: 24.9s
1023:	learn: 0.2227611	total: 1m 8s	remaining: 24.8s
1024:	learn: 0.2227608	total: 1m 8s	remaining: 24.7s
1025:	learn: 0.2227602	total: 1m 8s	remaining: 24.7s
1026:	learn: 0.2227599	total: 1m 9s	remaining: 24.6s
1027:	learn: 0.2227596	total: 1m 9s	remaining: 24.5s
1028:	learn: 0.2227593	total: 1m 9s	remaining: 24.5s
1029:	learn: 0.2227590	total: 1m 9s	remaining: 24.4s
1030:	learn: 0.2227586	total: 1m 9s	remaining: 24.3s
1031:	learn: 0.2227582	total: 1m 9s	remaining: 24.3s
1032:	learn: 0.2227579	total: 1m 9s	remaining: 24.2s
1033:	learn: 0.2227575	total: 1m 9s	remaining: 24.2s
1034:	learn: 0.2227571	total: 1m 9s	remaining: 24.1s
1035:	learn: 0.2227566	total: 1m 9s	remaining: 24s
1036:	learn: 0.2227562	total: 1m 9s	remaining: 24s
1037:	learn: 0.2227558	total: 1m 9s	remaining: 23.9s
1038:	learn: 0.2227553	total: 1m 9s	remaining: 23.8s
1039:	learn: 0.2227550	total: 1m 10s	remaining: 23.8s

<catboost.core.CatBoostRegressor at 0x7f8e3c1094f0>

In [66]:
cb_mod4.save_model(fname="/Users/lucasmuzynoski/Projects/Numerai2/cat_boost_model2.cbm")

In [77]:
pd.Series(feature_select).to_csv("/Users/lucasmuzynoski/Projects/Numerai2/cat_boost_model2_features.csv",index=False)

In [59]:
from sklearn import linear_model

lin_mod=linear_model.ElasticNetCV()
lin_mod.fit(X=d_total[feature_select],y=d_total["target"])

ElasticNetCV()

In [82]:
d_features=pd.read_csv("/Users/lucasmuzynoski/Projects/Numerai2/cat_boost_model2_features.csv")
d_features=d_features.iloc[:,0].to_list()

In [83]:
print(d_features)

['feature_dexterity6', 'feature_charisma63', 'feature_dexterity7', 'feature_dexterity14', 'feature_dexterity12', 'feature_wisdom35', 'feature_wisdom42', 'feature_dexterity4', 'feature_dexterity11', 'feature_charisma85', 'feature_charisma58', 'feature_intelligence4', 'feature_wisdom23', 'feature_charisma18', 'feature_strength34', 'feature_intelligence12', 'feature_strength4', 'feature_wisdom26', 'feature_wisdom13', 'feature_wisdom3', 'feature_charisma69', 'feature_strength1', 'feature_dexterity2', 'feature_strength14', 'feature_intelligence9', 'feature_charisma10', 'feature_charisma19', 'feature_wisdom4', 'feature_intelligence1', 'feature_strength19', 'feature_wisdom32', 'feature_intelligence3', 'feature_wisdom14', 'feature_constitution12', 'feature_strength10', 'feature_intelligence5', 'feature_charisma5', 'feature_wisdom22', 'feature_charisma45', 'feature_strength38', 'feature_constitution69', 'feature_intelligence2', 'feature_constitution58', 'feature_strength3', 'feature_charisma70'

In [60]:
lin_mod.coef_

array([-6.56612557e-03,  8.02855649e-03, -4.96029643e-03, -4.61443641e-03,
       -2.61751465e-03,  4.47212234e-03,  1.94304568e-03, -8.47714526e-04,
       -1.45892584e-03,  1.39602023e-03,  3.51100427e-03, -1.88133535e-03,
        4.43961305e-03,  2.95358171e-03,  4.89492636e-03,  4.85404006e-05,
        1.27938482e-03,  1.59925860e-03, -0.00000000e+00, -8.88052992e-04,
       -1.35283912e-03,  4.54188646e-03,  0.00000000e+00,  0.00000000e+00,
       -7.88736771e-04,  2.65082852e-03,  1.41672895e-03, -0.00000000e+00,
        3.50414072e-05,  3.16344636e-03,  0.00000000e+00, -4.18345895e-04,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  1.73334517e-03,
        0.00000000e+00,  1.14205533e-03,  8.86098867e-04,  0.00000000e+00,
       -0.00000000e+00, -2.07885203e-03, -3.88041822e-04,  0.00000000e+00,
        0.00000000e+00,  2.41226420e-03, -3.33210421e-03,  4.46050260e-04,
        0.00000000e+00,  5.85334000e-04,  3.05473156e-04,  3.76937345e-04,
        1.78368732e-03, -

In [50]:
from scipy.stats import spearmanr
##check validation correlation
pool_data_val=Pool(data=d_validation[feature_names])#,cat_features=["era"])
pred_validation=cat_model.predict(pool_data_val)
print(spearmanr(pred_validation,d_validation["target"]))

SpearmanrResult(correlation=0.020982005971389554, pvalue=6.753504860914537e-15)
