In [None]:
%matplotlib notebook
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.utils import shuffle
from sklearn.utils import class_weight
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Conv1D, Flatten, MaxPooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from datetime import date, timedelta
import os

import neuralNets

def oneHotEncodeData3Classes(targets):
    """One-hot encode class labels 0, 1 and 2.

    Args:
        targets: indexable object with ``shape[0]`` entries (e.g. a 1-D numpy
            array) holding one class label per entry.

    Returns:
        Float array of shape ``(targets.shape[0], 3)``. Row ``k`` has a 1 in
        the column equal to ``targets[k]``; rows whose label is not 0, 1 or 2
        stay all-zero and a message is printed for each of them.
    """
    num_samples = targets.shape[0]
    encoded = np.zeros((num_samples, 3))
    for row in range(num_samples):
        label = targets[row]
        for class_id in (0, 1, 2):
            if label == class_id:
                encoded[row, class_id] = 1
                break
        else:
            # No known class matched: leave the row empty and report it.
            print("something went wrong, new class", label)
    return encoded

# Five evenly spaced label values; not referenced by any other visible cell.
# NOTE(review): presumably the discrete target values of the dataset — confirm.
labelArray = [0, 0.25, 0.5, 0.75, 1]

# Check that training and validation data are identical across all downloaded datasets


In [None]:
# Load the training CSV from two differently dated download folders so that
# later cells can compare the two copies.
training_data_1 = pd.read_csv("data/numerai_datasets_02.05.21/numerai_training_data.csv")
training_data_2 = pd.read_csv("data/numerai_datasets_31.01.21/numerai_training_data.csv")

In [None]:
# Baseline for the folder-by-folder comparison: the training frame is an alias
# of training_data_2 (no copy is made), the validation frame is read from the
# same dated folder.
training_data_0 = training_data_2
validation_data_0 = pd.read_csv("data/numerai_datasets_31.01.21/numerai_validation_data.csv")  

In [None]:
# Baseline arrays (every column except id/era/data_type) that each dated
# dataset folder is compared against.
data0     = training_data_0.drop(columns = ["id", "era", "data_type"]).to_numpy()
# Remove index-artifact columns by prefix, the same way the per-folder
# validation files are handled below, so a missing "Unnamed: 0" does not raise.
unnamed_0 = validation_data_0.columns[validation_data_0.columns.str.startswith('Unna')]
val_data0 = validation_data_0.drop(columns = unnamed_0).drop(columns = ["id", "era", "data_type"]).to_numpy()

# Dataset folders are named "<path><dd.mm.yy>" and are visited in 7-day steps.
start_date = date(2021,3, 28)
path = "data/numerai_datasets_"
delta = timedelta(days=7)
test_date = start_date
test_path = path + test_date.strftime("%d.%m.%y")

epochsPerData = 1
epochs = 1

for epoch in range(epochs):
    # Restart the folder walk from start_date on every pass.
    test_date = start_date
    test_path = path + test_date.strftime("%d.%m.%y")

    # Stop at the first dated folder that does not exist on disk.
    while os.path.exists(test_path):
        path_to_csv = test_path + "/numerai_training_data.csv"
        path_to_csv_val = test_path + "/numerai_validation_data.csv"

        tic = time.time()
        training_data = pd.read_csv(str(path_to_csv))
        validation_data = pd.read_csv(str(path_to_csv_val))

        data_comp       = training_data.drop(columns = ["id", "era", "data_type"]).to_numpy()
        unnamed         = validation_data.columns[validation_data.columns.str.startswith('Unna')]
        validation_data = validation_data.drop(columns = unnamed)
        data_comp_val   = validation_data.drop(columns = ["id", "era", "data_type"]).to_numpy()
        toc = time.time()
        print("loaded data ",toc-tic, "date = ", test_path)

        # Use the absolute difference: a signed maximum would report 0 (or a
        # negative number) when the snapshot only deviates downwards.
        max_diff_train = np.max(np.abs(data0 - data_comp))
        max_diff_val = np.max(np.abs(val_data0 - data_comp_val))
        print("max difference train= ", max_diff_train, "max difference val =", max_diff_val)

        test_date = test_date + delta
        test_path = path + test_date.strftime("%d.%m.%y")

In [None]:
# Ordered list of era labels: walk the 'era' column and record every value
# that differs from the most recently recorded one (seeded with 'era1').
eraArray = ['era1']
for era in training_data_1['era']:
    if era != eraArray[-1]:
        eraArray.append(era)

In [None]:
# Rows of the first era ("era1") from each of the two training snapshots.
era1_1 = training_data_1[training_data_1["era"] == "era1"]
era1_2 = training_data_2[training_data_2["era"] == "era1"]

In [None]:
# Full training snapshots as plain arrays, without the id/era/data_type columns.
data1 = training_data_1.drop(["id", "era", "data_type"], axis = 1).to_numpy()
data2 = training_data_2.drop(["id", "era", "data_type"], axis = 1).to_numpy()

In [None]:
# Largest element-wise deviation between the two snapshots. The absolute value
# is required: a signed maximum misses entries where data2 is larger than data1.
np.max(np.abs(data1 - data2))

In [None]:
# Stack both training snapshots and count how many distinct rows remain once
# the id/era/data_type columns are ignored.
concat_training_data = pd.concat((training_data_1, training_data_2), ignore_index = True)
processed_training_data = (
    concat_training_data
    .drop(["id", "era", "data_type"], axis = 1)
    .drop_duplicates()
)
print(len(processed_training_data), len(concat_training_data), training_data_1.shape)

In [None]:
# For every era: for each row of training_data_1, find the row of
# training_data_2 (same era) with the smallest sum of absolute differences.
# Each per-era result is an (n_rows, 2) array of [index in data2, distance].
eraNearestNeighbor = []

for era in eraArray:
    tic = time.time()
    era1_1 = training_data_1[training_data_1["era"] == era]
    era1_2 = training_data_2[training_data_2["era"] == era]
    data1 = era1_1.drop(["id", "era", "data_type"], axis = 1).to_numpy()
    data2 = era1_2.drop(["id", "era", "data_type"], axis = 1).to_numpy()

    # Report eras whose row counts differ between the two snapshots.
    if data1.shape[0] != data2.shape[0]:
        print(data1.shape, data2.shape, era)

    nearestNeighbor = np.zeros((data1.shape[0], 2))
    for i, vector1 in enumerate(data1):
        absSumVector = np.abs(vector1 - data2).sum(axis = 1)
        index = absSumVector.argmin()
        nearestNeighbor[i] = (index, absSumVector[index])
    toc = time.time()
    print("max difference", max(nearestNeighbor[:,1]), "min similarity", min(nearestNeighbor[:,1]), "elapsed time", toc-tic)
    eraNearestNeighbor.append(nearestNeighbor)
    
#     break
# print(((toc-tic) * data1.shape[0])/3600)

# check tournament data

- training data is always exactly the same
- no duplicates in the training data

Tournament Data:

- validation data is identical across all tournament datasets
- no duplicates in the validation data

- test data appears to be identical between at least two tournament datasets, BUT the lengths differ (= new examples were added)
- test data contains duplicates

- live data is similar between tournament datasets, but NOT identical
- one data point is identical


In [None]:
def findNearestNeighborWihtoutTarget(pd_dataFrame1, pd_dataFrame2):
    """Nearest-neighbour search between two frames, ignoring non-feature columns.

    Drops the ``id``, ``era``, ``data_type`` and ``target`` columns from both
    frames, converts what is left to numpy arrays and passes them to
    ``findNearestNeighborNP``.

    Args:
        pd_dataFrame1: frame whose rows are the query vectors.
        pd_dataFrame2: frame whose rows are the candidates.

    Returns:
        Whatever ``findNearestNeighborNP`` returns for the two feature arrays.
    """
    non_feature_columns = ["id", "era", "data_type", "target"]
    feature_arrays = [
        frame.drop(columns = non_feature_columns).to_numpy()
        for frame in (pd_dataFrame1, pd_dataFrame2)
    ]
    return findNearestNeighborNP(*feature_arrays)
    
def findNearestNeighborNP(data1, data2):
    """For each row of ``data1`` find the closest row of ``data2`` (L1 distance).

    Args:
        data1: 2-D numeric array, one query vector per row.
        data2: 2-D numeric array with the same number of columns as ``data1``.

    Returns:
        Array of shape ``(data1.shape[0], 2)``: column 0 holds the index of the
        nearest ``data2`` row, column 1 the sum of absolute differences to it.

    Raises:
        ValueError: from ``np.argmin`` when ``data2`` has no rows but ``data1`` does.
    """
    nearestNeighbor = np.zeros((data1.shape[0], 2))
    if data1.shape[0] != data2.shape[0]:
        # Only report what this function actually knows. The previous version
        # also printed `era`, which is not defined here and either raised a
        # NameError or showed a stale notebook global.
        print(data1.shape, data2.shape)
    for i in range(data1.shape[0]):
        vector1 = data1[i, :]
        # Broadcast the query row against every candidate row.
        absSumVector = np.sum(np.absolute(vector1 - data2), axis = 1)
        index = np.argmin(absSumVector)
        nearestNeighbor[i,0] = index
        nearestNeighbor[i,1] = absSumVector[index]
    return nearestNeighbor

In [None]:
# Load the tournament CSV from two differently dated download folders so that
# the cells below can compare them.
tournament_data_1 = pd.read_csv("data/numerai_datasets_02.05.21/numerai_tournament_data.csv")
tournament_data_2 = pd.read_csv("data/numerai_datasets_25.04.21/numerai_tournament_data.csv")

In [None]:
# Ordered list of data_type values in the tournament file (seeded with
# "validation"); prints the row position at which each new type starts.
typeArray = ["validation"]
counter = 0
for datType in tournament_data_1['data_type']:
    if datType != typeArray[-1]:
        print(counter)
        typeArray.append(datType)
    counter = counter + 1
print(typeArray)

In [None]:
# Split both tournament snapshots by data_type; all columns are kept here and
# dropped later, where the comparisons are made.
data_1_val  = tournament_data_1[tournament_data_1["data_type"] == "validation"]
data_2_val  = tournament_data_2[tournament_data_2["data_type"] == "validation"]

data_1_test = tournament_data_1[tournament_data_1["data_type"] == "test"]
data_2_test = tournament_data_2[tournament_data_2["data_type"] == "test"]

# The live rows are not the same in the two snapshots.
data_1_live = tournament_data_1[tournament_data_1["data_type"] == "live"]
data_2_live = tournament_data_2[tournament_data_2["data_type"] == "live"]

In [None]:
# Largest element-wise deviation between the validation rows of both snapshots.
# The absolute value is required: a signed maximum misses negative deviations.
np.max(np.abs(
    data_1_val.drop(columns = ["id", "era", "data_type"]).to_numpy()
    - data_2_val.drop(columns = ["id", "era", "data_type"]).to_numpy()
))

In [None]:
# Distinct feature rows among the test rows of snapshot 1; printing both shapes
# shows how many rows were duplicates.
data_1_test_processed = (
    data_1_test
    .drop(["id", "era", "data_type", "target"], axis = 1)
    .drop_duplicates()
)
print(data_1_test_processed.shape, data_1_test.shape)

In [None]:
# Distinct feature rows among the test rows of snapshot 2; printing both shapes
# shows how many rows were duplicates.
data_2_test_processed = (
    data_2_test
    .drop(["id", "era", "data_type", "target"], axis = 1)
    .drop_duplicates()
)
print(data_2_test_processed.shape, data_2_test.shape)

In [None]:
# Show the first ten de-duplicated test rows of each snapshot, one after the other.
print()
print(
    data_1_test_processed.to_numpy()[:10],
    data_2_test_processed.to_numpy()[:10],
)

In [None]:
# Compare only the first `index` test rows of both snapshots.
# NOTE(review): 1560702 is presumably the row count of the shorter test set — confirm.
index = 1560702
test_features_1 = data_1_test.drop(columns = ["id", "era", "data_type", "target"]).to_numpy()[0:index,:]
test_features_2 = data_2_test.drop(columns = ["id", "era", "data_type", "target"]).to_numpy()[0:index,:]
# Absolute difference: a signed maximum would hide negative deviations.
max_test = np.max(np.abs(test_features_1 - test_features_2))
print(max_test)

In [None]:
# For every live row of snapshot 1, find the closest live row of snapshot 2
# (id, era, data_type and target columns are ignored by the helper).
nearestNeighbor = findNearestNeighborWihtoutTarget(data_1_live, data_2_live)

In [None]:
# Scatter the nearest-neighbour distance against the matched index, then count
# how many rows have an exact match (distance 0).
fig = plt.figure()
plt.plot(nearestNeighbor[:,0], nearestNeighbor[:,1], "x")
print(nearestNeighbor, np.min(nearestNeighbor[:,1]))
counter = np.count_nonzero(nearestNeighbor[:,1] == 0)
print(counter)

In [None]:
# Every column of data1 except the last one.
training_data_without_target = data1[:, :-1]
print(training_data_without_target.shape)

In [None]:
# NOTE(review): this slice drops the last TWO columns (the previous cell drops
# one) and is not used by the search below — confirm which was intended.
training_data_without_target = data1[:,0:data1.shape[1]-2]

# Nearest row of data2 for every row of data1 (sum of absolute differences).
# The statements below were indented under a plain assignment, which made the
# whole cell fail with an IndentationError; they now sit at cell level.
nearestNeighbor = np.zeros((data1.shape[0], 2))
if data1.shape[0] != data2.shape[0]:
    print(data1.shape, data2.shape, era)
for i in range(data1.shape[0]):
    vector1 = data1[i, :]
    absSumVector = np.sum(np.absolute(vector1 - data2), axis = 1)
    index = np.argmin(absSumVector)
    nearestNeighbor[i,0] = index
    nearestNeighbor[i,1] = absSumVector[index]