In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import neuralNets
import time

def oneHotEncodeData3Classes(targets):
    """One-hot encode class labels 0, 1 and 2.

    Parameters
    ----------
    targets : array-like exposing ``shape``
        Labels, indexed along the first axis.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(targets.shape[0], 3)``. Row ``k`` carries a 1
        in the column equal to ``targets[k]``. A label outside {0, 1, 2}
        leaves its row all-zero and a message is printed.
    """
    n_rows = targets.shape[0]
    encoded = np.zeros((n_rows, 3))
    for row in range(n_rows):
        # Try the known classes in order; the column index equals the class id.
        for class_id in (0, 1, 2):
            if targets[row] == class_id:
                encoded[row, class_id] = 1
                break
        else:
            # No known class matched this label.
            print("something went wrong, new class", targets[row])
    return encoded

In [None]:
# Load the validation split and separate it into the feature matrix and the
# target vector. Feature columns are those whose name starts with 'feature'.
validation_data = pd.read_csv("data/numerai_datasets_04.04.21/numerai_validation_data.csv")
feature_cols = validation_data.columns[validation_data.columns.str.startswith('feature')]
X_val = validation_data.loc[:, feature_cols].to_numpy()
Y_val = validation_data["target"].to_numpy()

# Determine the coarse class with the 3-class model (0/1, 0.25/0.75, or 0.5)

In [None]:
# Rebuild the 3-class network for the number of feature columns and restore
# its trained weights from disk.
# NOTE(review): assumes neuralNets returns a Keras model; the
# compile/load_weights/predict calls match that API. Confirm.
modelNN_3classes = neuralNets.defineNN_3classes(X_val.shape[1])
# `metrics` is documented as a list; a bare string is not the documented form
# and is rejected by newer Keras releases, so wrap the metric name in a list.
modelNN_3classes.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
modelNN_3classes.load_weights("model_3class_99train_99val_4_4_21.h5")

In [None]:
# Run the 3-class model on the validation features and keep, per row, only the
# index of the largest output (converts the output vectors to class labels).
pred_3class = np.argmax(modelNN_3classes.predict(X_val), axis=1)

# Distinguish between targets 0 and 1


In [None]:
# Build the 2-class network sized to the number of feature columns and load
# the stored weights of the model that separates target 0 from target 1.
modelNN_01 = neuralNets.defineNN_2classes(X_val.shape[1])
modelNN_01.load_weights("model_01class_99train_99test_994val.h5")

# Round the network output to the nearest integer to obtain hard labels.
# NOTE(review): presumably one output per row in [0, 1], giving labels 0/1. Confirm
# against neuralNets.defineNN_2classes.
pred_01 = np.rint(modelNN_01.predict(X_val))

# Distinguish between targets 0.25 and 0.75

In [None]:
# Build the 2-class network sized to the number of feature columns and load
# the stored weights of the model that separates target 0.25 from target 0.75.
modelNN_025075 = neuralNets.defineNN_2classes(X_val.shape[1])
modelNN_025075.load_weights("model_025075_class_998train__998val.h5")

# Round the network output to the nearest integer; downstream code maps
# 0 -> 0.25 and 1 -> 0.75.
# NOTE(review): presumably one output per row in [0, 1]. Confirm against
# neuralNets.defineNN_2classes.
pred_025075 = np.rint(modelNN_025075.predict(X_val))

# Merge the results into the overall prediction and calculate the accuracy

In [None]:
# Combine the three models into one prediction per validation row:
#   pred_3class == 0 -> label is 0 or 1, taken from pred_01
#   pred_3class == 1 -> label is 0.25 or 0.75, chosen by pred_025075 (0 -> 0.25, 1 -> 0.75)
#   pred_3class == 2 -> label is 0.5
# Done with array operations instead of a Python loop over every row.
n_targets = pred_3class.shape[0]

# Flatten the binary model outputs so they align element-wise with pred_3class.
binary_01 = np.reshape(pred_01, n_targets)
binary_025075 = np.reshape(pred_025075, n_targets)

is_01_group = pred_3class == 0
is_025075_group = pred_3class == 1

# Sanity checks: every row belongs to one of the three groups, and the
# 0.25/0.75 model produced a clean 0 or 1 wherever its output is used.
assert np.all(is_01_group | is_025075_group | (pred_3class == 2))
assert np.all(np.isin(binary_025075[is_025075_group], (0, 1)))

# Column vector of shape (n_targets, 1), float64.
prediction = np.select(
    [is_01_group, is_025075_group],
    [binary_01, np.where(binary_025075 == 0, 0.25, 0.75)],
    default=0.5,
).reshape(n_targets, 1)
    

In [None]:
# Compare the merged prediction with the true targets and break the
# misclassifications down by kind:
#   * 0.25 <-> 0.75 confusion (the 0.25/0.75 model picked the wrong side)
#   * 0    <-> 1    confusion (the 0/1 model picked the wrong side)
#   * everything else is attributed to the 3-class model
pred_flat = np.reshape(prediction, n_targets)  # prediction is a (n_targets, 1) column

# Number of validation rows per true target value.
true_target = validation_data.target
n_0   = int((true_target == 0).sum())
n_025 = int((true_target == 0.25).sum())
n_05  = int((true_target == 0.5).sum())
n_075 = int((true_target == 0.75).sum())
n_1   = int((true_target == 1).sum())

is_correct = pred_flat == Y_val
is_025075_confusion = (((pred_flat == 0.25) & (Y_val == 0.75))
                       | ((pred_flat == 0.75) & (Y_val == 0.25)))
# Both directions must be checked: predicted 1 for a true 0 AND predicted 0
# for a true 1. Checking only one direction would push the other into the
# 3-class confusion count.
is_01_confusion = (((pred_flat == 1) & (Y_val == 0))
                   | ((pred_flat == 0) & (Y_val == 1)))

n_correctPred      = int(is_correct.sum())
n_025075_confusion = int(is_025075_confusion.sum())
n_01_confusion     = int(is_01_confusion.sum())
# The three error kinds are mutually exclusive, so the rest is 3-class confusion.
n_3class_confusin  = n_targets - n_correctPred - n_025075_confusion - n_01_confusion

# Row positions of the 0.25 <-> 0.75 confusions.
indexArray = np.flatnonzero(is_025075_confusion).tolist()

accuracy = n_correctPred/n_targets

print("accuracy ", accuracy, "                   total number of examples =", n_targets)
print("0.25 vs 0.75 confusion nunber = ", n_025075_confusion, "    vs. total number ", n_025 + n_075, "in per ", n_025075_confusion/(n_025+n_075))
print("0    vs 1    confusion number = ", n_01_confusion, "       vs. total number ", n_0 + n_1, "in per ", n_01_confusion/(n_0+n_1))
print("3class confusion = ", n_3class_confusin)

In [None]:
# Work on a copy so the loaded validation frame itself does not gain the extra column.
validation_data_pred = validation_data.copy()
# Attach the merged model output as the "prediction" column.
validation_data_pred["prediction"] = prediction

In [None]:
from scipy.stats import spearmanr
# Name of the DataFrame column that holds the model prediction.
PREDICTION_NAME = "prediction"
def feature_exposures(df):
    """Spearman correlation between the prediction and every feature column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``PREDICTION_NAME`` column; every column whose name
        starts with "feature" is treated as a feature.

    Returns
    -------
    numpy.ndarray
        One correlation coefficient per feature column, in column order.
    """
    correlations = [
        spearmanr(df[PREDICTION_NAME], df[column])[0]
        for column in df.columns
        if column.startswith("feature")
    ]
    return np.array(correlations)


def max_feature_exposure(featureExposure):
    """Largest absolute feature exposure.

    A single definition replaces two same-named functions, of which only the
    later one (taking precomputed exposures) was ever reachable.

    Parameters
    ----------
    featureExposure : array-like or pandas.DataFrame
        Either precomputed per-feature exposures, or a DataFrame from which
        they are computed with ``feature_exposures``.

    Returns
    -------
    float
        ``max(|exposure|)`` over all features.
    """
    if isinstance(featureExposure, pd.DataFrame):
        featureExposure = feature_exposures(featureExposure)
    return np.max(np.abs(featureExposure))


def feature_exposure(featureExposure):
    """Root-mean-square of the per-feature exposures.

    A single definition replaces two same-named functions, of which only the
    later one (taking precomputed exposures) was ever reachable.

    Parameters
    ----------
    featureExposure : array-like or pandas.DataFrame
        Either precomputed per-feature exposures, or a DataFrame from which
        they are computed with ``feature_exposures``.

    Returns
    -------
    float
        ``sqrt(mean(exposure ** 2))`` over all features.
    """
    if isinstance(featureExposure, pd.DataFrame):
        featureExposure = feature_exposures(featureExposure)
    return np.sqrt(np.mean(np.square(featureExposure)))

In [None]:
# Spearman correlation of the prediction column with each feature column (one value per feature).
featureExposures = feature_exposures(validation_data_pred)

In [None]:
# Summarise the per-feature exposures: the largest absolute value and the
# root-mean-square (despite its name, featureExposureArray holds a single number).
maxExposure = max_feature_exposure(featureExposures)
featureExposureArray = feature_exposure(featureExposures)

# One value per line, max exposure first.
print(maxExposure, featureExposureArray, sep="\n")