In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.metrics import accuracy_score
from kaggle.competitions import twosigmanews

In [None]:
# Build the competition environment and pull the two training frames it
# provides (market data and news data).
env = twosigmanews.make_env()
market_train, news_train = env.get_training_data()

In [None]:
# Show the column labels of the market training frame.
market_train.columns

In [None]:
# Show the time/assetCode pairs of every row whose assetCode is 'A.N'.
market_train.loc[market_train['assetCode'] == 'A.N', ['time', 'assetCode']]


In [None]:
# Column roles used by the preprocessing and modelling cells.
target_col = ['returnsOpenNextMktres10']  # column the label is derived from
cat_cols = ['assetCode']                  # integer-encoded categorical columns

# Numeric feature columns: price/volume first, then the 1-period and
# 10-period return columns.
num_cols = [
    'volume',
    'close',
    'open',
    'returnsClosePrevRaw1',
    'returnsOpenPrevRaw1',
    'returnsClosePrevMktres1',
    'returnsOpenPrevMktres1',
    'returnsClosePrevRaw10',
    'returnsOpenPrevRaw10',
    'returnsClosePrevMktres10',
    'returnsOpenPrevMktres10',
]

In [None]:
from sklearn.model_selection import train_test_split

# Keep only rows dated 2009-01-01 (UTC) or later. The explicit .copy() makes
# the filtered frame independent of the frame it was sliced from, so the
# column assignments in the following cells operate on a real frame and do
# not trigger pandas' SettingWithCopyWarning.
cutoff = pd.to_datetime('2009-01-01').tz_localize('UTC')
market_train = market_train.loc[pd.to_datetime(market_train['time']) >= cutoff].copy()

# Random 75/25 split over the row labels, seeded for reproducibility.
# NOTE(review): rows from the same day can end up in both sets; a time-based
# split may be a more honest validation for this data -- confirm intent.
train_indices, val_indices = train_test_split(market_train.index.values, test_size = 0.25, random_state = 23)

In [None]:
# Handles categorical variables

def encode(encoder, x):
    """Return the integer id stored for ``x`` in ``encoder``.

    Args:
        encoder: mapping from category value to integer id.
        x: category value to look up.

    Returns:
        ``encoder[x]`` when ``x`` is a key; otherwise ``len(encoder)``,
        which serves as the shared id for every unknown value.
    """
    return encoder.get(x, len(encoder))

# One {category string -> integer id} dict per categorical column. Ids are
# assigned in order of first appearance among the training rows only.
encoders = [{} for cat in cat_cols]

for i, cat in enumerate(cat_cols):
    print('encoding %s ...' % cat, end = ' ')
    train_values = market_train.loc[train_indices, cat].astype(str).unique()
    encoders[i] = {label: code for code, label in enumerate(train_values)}
    # Vectorized lookup instead of a per-row Python call: values absent from
    # the encoder come back as NaN from .map() and are then given the id
    # len(encoder), the same fallback encode() uses.
    unknown_id = len(encoders[i])
    market_train[cat] = (
        market_train[cat]
        .astype(str)
        .map(encoders[i])
        .fillna(unknown_id)
        .astype(int)
    )
    print('Done')

# One extra slot per column for the "unknown value" id.
embed_sizes = [len(encoder) + 1 for encoder in encoders]

In [None]:
# Display the fitted category -> id dictionaries (one per entry of cat_cols).
encoders

In [None]:
# Handles numerical variables
from sklearn.preprocessing import StandardScaler
from datetime import datetime

# Missing numeric values are replaced with 0 before scaling.
market_train[num_cols] = market_train[num_cols].fillna(0)
print('scaling numerical columns')

scaler = StandardScaler()
print(market_train['time'].dtypes)
# Fit the scaler on the training rows only, so that validation rows do not
# contribute to the mean/variance estimates (the categorical encoders are
# likewise built from train_indices only), then transform every row.
scaler.fit(market_train.loc[train_indices, num_cols])
market_train[num_cols] = scaler.transform(market_train[num_cols])

In [None]:
# Prepare data and get variables to calculate scoring metric
def get_input(market_train, indices):
    """Slice model inputs, labels and scoring helpers for the given rows.

    Reads the module-level ``num_cols`` and ``cat_cols`` lists to decide
    which columns to extract.

    Args:
        market_train: DataFrame holding the feature columns plus
            'returnsOpenNextMktres10', 'universe' and 'time'.
        indices: row labels (as accepted by ``.loc``) to select.

    Returns:
        A tuple ``(X, y, r, u, d)``:
            X: dict with key 'num' (2-D array of the numeric columns) and one
               key per categorical column, each an array of shape (n, 1).
            y: boolean array, True where 'returnsOpenNextMktres10' >= 0.
            r: array of the 'returnsOpenNextMktres10' values.
            u: Series with the 'universe' column.
            d: Series with the calendar date of 'time'.
        r, u and d are used to calculate the scoring metric.
    """
    X_num = market_train.loc[indices, num_cols].values
    X = {'num': X_num}
    for cat in cat_cols:
        # Select only this column; [cat] keeps the (n, 1) shape. Indexing
        # with the whole cat_cols list would hand every key all categorical
        # columns as soon as there is more than one.
        X[cat] = market_train.loc[indices, [cat]].values
    y = (market_train.loc[indices, 'returnsOpenNextMktres10'] >= 0).values
    r = market_train.loc[indices, 'returnsOpenNextMktres10'].values
    u = market_train.loc[indices, 'universe']
    d = market_train.loc[indices, 'time'].dt.date
    return X, y, r, u, d # r, u, and d are used to calculate the scoring metric

# Build the model inputs and scoring helpers for the training rows and for
# the validation rows.
X_train, y_train, r_train, u_train, d_train = get_input(market_train, train_indices)
X_valid, y_valid, r_valid, u_valid, d_valid = get_input(market_train, val_indices)

In [None]:
# Magic XG Boost Model
from xgboost import XGBClassifier
import warnings
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Fit a boosted-tree classifier on the numeric features only; the label is
# the boolean y_train cast to 0/1.
model = XGBClassifier(n_estimators=47, max_depth=6, n_jobs=4)
model.fit(X_train['num'], y_train.astype(int))

# Rescale the predictions from {0, 1} to {-1, +1}.
# NOTE(review): predict() yields class labels, so the "confidence" is always
# exactly -1 or +1; predict_proba would give a graded value -- confirm intent.
confidence_valid = 2 * model.predict(X_valid['num']) - 1

# Validation accuracy of the predicted direction (y_true first, y_pred second).
score = accuracy_score(y_valid, confidence_valid > 0)
print(score)

In [None]:
# Calculation of actual metric that is used for final score
r_valid = r_valid.clip(-1, 1)  # get rid of outliers by capping returns to [-1, 1]

# Per-row contribution: confidence * return * universe column.
x_t_i = confidence_valid * r_valid * u_valid
data = {'day': d_valid, 'x_t_i': x_t_i}
df = pd.DataFrame(data)

# Daily totals, then the ratio of their mean to their (population) standard
# deviation.
x_t = df.groupby('day')['x_t_i'].sum().values
mean = x_t.mean()
std = x_t.std()
score_valid = mean / std
print(score_valid)

In [None]:
# Display the validation confidence values computed from the model output.
confidence_valid

In [None]:
# Display the boolean validation labels for comparison with the predictions.
y_valid

In [None]:
# Histogram of the validation confidence values, drawn through an explicit
# Axes object rather than the pyplot state machine.
fig, ax = plt.subplots()
ax.hist(confidence_valid, bins='auto')
ax.set_title("predicted confidence")
plt.show()