In [7]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
%matplotlib inline

----------

# FE_v1

The original features plus basic row-wise feature engineering (mean, median, sum, standard deviation, min and max of each row).

In [3]:
# FE_v1: original features plus six row-wise summary statistics.
# Reads the two raw CSVs from CleanData/ and writes four files to Data/:
# train_y.csv, t_id.csv, FE_v1_train.csv and FE_v1_test.csv.

# Number of leading columns the row statistics are computed over.
# NOTE(review): assumes the first 21 columns of both frames are the raw
# features once target / t_id are removed -- confirm against the CSV layout.
N_FEATURES = 21

# Load original data
train = pd.read_csv('CleanData/numerai_training_data.csv')
test = pd.read_csv('CleanData/numerai_tournament_data.csv')

# Separate the target column from the train set
train_y = train[['target']].reset_index(drop=True)
train = train.drop('target', axis=1)

# Separate t_id from the test set
t_id = test[['t_id']].reset_index(drop=True)
test = test.drop('t_id', axis=1)

# Write out t_id and train_y dataframes
train_y.to_csv('Data/train_y.csv', index=False)
t_id.to_csv('Data/t_id.csv', index=False)

# Calculate FE_v1 statistics, identically for train and test.
# The slice is taken once, before any statistic column is appended, so every
# statistic is computed from the same N_FEATURES leading columns.
for frame in (train, test):
    raw_features = frame.iloc[:, 0:N_FEATURES]
    frame['feature_mean'] = raw_features.mean(axis=1)
    frame['feature_median'] = raw_features.median(axis=1)
    frame['feature_sum'] = raw_features.sum(axis=1)
    frame['feature_sd'] = raw_features.std(axis=1)
    frame['feature_min'] = raw_features.min(axis=1)
    frame['feature_max'] = raw_features.max(axis=1)

# Write out train and test files, and add t_id back as the first test column.
# concat(axis=1) aligns on the index; t_id was reset to a 0..n-1 index above.
test = pd.concat([t_id, test], axis=1)
train.to_csv('Data/FE_v1_train.csv', index=False)
test.to_csv('Data/FE_v1_test.csv', index=False)

----------

# FE_V2

Polynomial feature engineering, with degree=2 

The expansion uses interaction terms only, so it contains no squared columns (feature1_feature1, for instance). The output still includes the original features, and the basic row statistics are added on top.

In [18]:
# FE_v2: degree-2 interaction features plus six row-wise summary statistics.
# Reads the two raw CSVs from CleanData/ and writes FE_v2_train.csv and
# FE_v2_test.csv to Data/.
from sklearn import preprocessing

# Number of leading columns the row statistics are computed over.
# NOTE(review): 231 = 21 + C(21, 2), i.e. the width of a degree-2,
# interaction-only, no-bias expansion of 21 input columns -- confirm that the
# input frames really have 21 feature columns.
N_POLY_COLUMNS = 231

# Load clean data
train = pd.read_csv('CleanData/numerai_training_data.csv')
test = pd.read_csv('CleanData/numerai_tournament_data.csv')

# Basic transformation - split the target off into its own frame
# (train_y is only kept in memory here; this cell does not write it out)
train_y = train[['target']].reset_index(drop=True)
train = train.drop('target', axis=1)

# Split t_id off the test set. The index is reset so that the axis=1 concat
# further down lines up with test_poly, which gets a fresh 0..n-1 index.
t_id = test[['t_id']].reset_index(drop=True)
test = test.drop('t_id', axis=1)

poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Transformation: fit on train only, then apply the fitted transformer to
# test, so both frames get the same expansion and a column mismatch between
# them raises instead of passing silently.
train_poly = pd.DataFrame(poly.fit_transform(train))
test_poly = pd.DataFrame(poly.transform(test))

# Feature engineering: the same row statistics for train and test.
# The slice is taken once, before any statistic column is appended.
for frame in (train_poly, test_poly):
    poly_features = frame.iloc[:, 0:N_POLY_COLUMNS]
    frame['feature_mean'] = poly_features.mean(axis=1)
    frame['feature_median'] = poly_features.median(axis=1)
    frame['feature_sum'] = poly_features.sum(axis=1)
    frame['feature_sd'] = poly_features.std(axis=1)
    frame['feature_min'] = poly_features.min(axis=1)
    frame['feature_max'] = poly_features.max(axis=1)

# Add t_id column back to test dataset (as the first column)
test_poly = pd.concat([t_id, test_poly], axis=1)

# Write out results
train_poly.to_csv('Data/FE_v2_train.csv', index=False)
test_poly.to_csv('Data/FE_v2_test.csv', index=False)