In [1]:
import yaml
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
import pandas as pd
import numpy as np

In [2]:
import os,sys
# Project root = parent of the current working directory ('run.py' is only
# used as a placeholder file name to anchor abspath on the cwd).
root_path = os.path.dirname(os.path.dirname(os.path.abspath('run.py')))
# Guarded so that re-running this cell does not pile up duplicate entries.
if root_path not in sys.path:
    sys.path.append(root_path)

from cfg import cfg
from numerapi.numerapi import NumerAPI
import models
from data_utils import get_data_era_balanced,data_files,get_data, write_to_csv
import opt

# Each entry is a 3-tuple: a short string label, a class from the `models`
# package, and a dict of keyword arguments (presumably passed to the class
# constructor -- confirm against the code that consumes this list).
model_list = [
    ('aecgan', models.aec_gan.AecAdvModel, {'istrain': False}),
    ('aec', models.aec.AecModel, {'istrain': False}),
    ('xg', models.xg.XgModel, {'istrain': False}),
    # depends on model from AecAdvModel
    ('aecganxg', models.aec_gan_xg.AecGanXgModel, {'istrain': False}),
    # disabled: ('aecgs', models.aec_gan_stack.AecAdvStackModel, dict(istrain=False)),
]



WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


Using TensorFlow backend.


In [3]:
# Both splits come from the last entry of data_files.
latest_files = data_files[-1]
X_train, y_train, X_val, y_val = get_data_era_balanced(latest_files['trainpath'])
# get_data yields five values; only the first two are kept here.
X_test, y_test, _, _, _ = get_data(latest_files['testpath'])

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
(354192, 50) (354192,) (39421, 50) (39421,)


In [6]:
# Stack train, validation and test rows into one matrix and label each row by
# origin: 0 for train/validation rows, 1 for test rows.
_X = np.concatenate([X_train, X_val, X_test], axis=0)
# Build the labels directly as integer arrays instead of materialising three
# large Python lists; this also keeps the dtype integral when a split is empty.
_y = np.concatenate(
    [
        np.zeros(X_train.shape[0], dtype=int),
        np.zeros(X_val.shape[0], dtype=int),
        np.ones(X_test.shape[0], dtype=int),
    ],
    axis=0,
)


In [8]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; model_selection exposes the same cross_val_score. Fall back to the old
# module only on installations that predate model_selection.
try:
    from sklearn import model_selection as CV
except ImportError:
    from sklearn import cross_validation as CV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import accuracy_score as accuracy

In [10]:
# Cross-validated ROC AUC of a random forest trained to tell the rows labelled
# 0 in _y from the rows labelled 1.
n_estimators = 100
RF_SEED = 0  # fixed seed so the forest (and the reported AUC) is reproducible
clf = RF(
    n_estimators=n_estimators,
    n_jobs=-1,
    verbose=True,
    random_state=RF_SEED,
)
scores = CV.cross_val_score(clf, _X, _y, scoring='roc_auc', cv=5, verbose=1)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jo

In [11]:
# Summarise the per-fold AUC values as mean and standard deviation.
mean_auc = scores.mean()
std_auc = scores.std()
print("mean AUC: {:.2%}, std: {:.2%} \n".format(mean_auc, std_auc))

mean AUC: 74.06%, std: 6.94% 



In [4]:
# Fit the DDiscrModel on the train/validation split; the test features are
# passed in as well.
clsf = models.datatype_discr.DDiscrModel()
clsf.fit(
    X_train=X_train,
    y_train=y_train,
    X_validation=X_val,
    y_validation=y_val,
    X_test=X_test
)
# NOTE(review): load() is called right after fit() -- presumably it restores
# saved weights; confirm against DDiscrModel.
clsf.load()

Train on 318649 samples, validate on 39421 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


  return getattr(obj, method)(*args, **kwds)


In [None]:
# Load the full train and test files (last data_files entry), this time
# keeping all five values returned by get_data.
latest_files = data_files[-1]
X, Y, ids, eras, datatypes = get_data(latest_files['trainpath'])
Xt, Yt, idst, erast, datatypest = get_data(latest_files['testpath'])

In [None]:
# predict() returns a pair; only its first element is used by later cells.
train_output = clsf.predict(X)
pred, _ = train_output
test_output = clsf.predict(Xt)
predt, _ = test_output

In [None]:
# Median prediction on the train file vs. the test file, shown as the cell output.
median_train = np.median(pred)
median_test = np.median(predt)
median_train, median_test

# Optional quick look at the two distributions:
#_=plt.hist(pred,label='train')
#_=plt.hist(predt,label='test')

In [None]:
# Figure 1: median prediction per era, for the train file and the test file.
plt.figure(figsize=(5, 5))

# Train eras: one scatter call per era, x = the numeric part of the era label.
for era in sorted(np.unique(eras)):
    era_median = np.median(pred[np.where(eras == era)])
    plt.scatter(int(era.strip('era')), era_median)

# Test-file eras: 'eraX' has no numeric part, so it is pinned at x=140 in blue;
# every other era is drawn in black at its own number.
for era in sorted(np.unique(erast)):
    era_median = np.median(predt[np.where(erast == era)])
    if era == 'eraX':
        plt.scatter(140, era_median, color='blue')
    else:
        plt.scatter(int(era.strip('era')), era_median, color='black')
        
# Figure 2, left panel: per-era median prediction, zoomed to y in [0.2, 0.5].
plt.figure(figsize=(10, 5))
plt.subplot(121)

# Train eras.
for era in sorted(np.unique(eras)):
    era_median = np.median(pred[np.where(eras == era)])
    plt.scatter(int(era.strip('era')), era_median)

# Test-file eras in black; 'eraX' is left out of this panel.
for era in sorted(np.unique(erast)):
    era_median = np.median(predt[np.where(erast == era)])
    if era != 'eraX':
        plt.scatter(int(era.strip('era')), era_median, color='black')
plt.ylim(0.2, 0.5)

# Figure 2, right panel: prediction densities drawn sideways (density on x,
# prediction value on y) so the y axis lines up with the left panel.
plt.subplot(122)


def _plot_vertical_density(values, linestyle, label, color):
    """Plot the normalised 100-bin histogram of `values` with bins on the y axis."""
    density, edges = np.histogram(values, bins=100, density=True)
    # Left bin edges are used as the y coordinate of each bin.
    plt.plot(density, edges[:-1], linestyle, label=label, color=color)


_plot_vertical_density(pred, '--', 'train', 'black')

# Validation: test-file rows whose era is not 'eraX'.
_plot_vertical_density(predt[np.where(erast != 'eraX')], '--', 'validation', 'red')

# Live: the 'eraX' rows (the ones the first figure draws in blue). The original
# selected erast != 'eraX' here too, so it re-plotted the validation curve.
live_pred = predt[np.where(erast == 'eraX')]
if len(live_pred):  # an empty selection would give a NaN density
    _plot_vertical_density(live_pred, '-.', 'live', 'blue')
plt.ylim(0.2, 0.5)
plt.legend()