In [5]:
import pandas as pd
%load_ext autoreload
%autoreload 2

In [None]:
import gc

import numpy as np

In [2]:
# Run-mode switch: True reads the sample CSVs under ./data, False pulls the
# training data from the Kaggle `twosigmanews` environment.
LOCAL = True

In [16]:
# Load the market and news frames, either from local sample CSVs or from the
# Kaggle competition environment, depending on LOCAL.
if LOCAL:
    import sys
    # sys.path entries are plain directories and are never glob-expanded, so
    # the former './utils/*.*' entry had no effect. Add the directory that
    # contains the `utils` package instead (only once, so that re-running the
    # cell does not keep growing sys.path).
    if '.' not in sys.path:
        sys.path.append('.')
    # NOTE(review): star import — presumably supplies the helpers used by later
    # cells (preprocess_news, group_news, process_ma, ...); confirm in utils.
    from utils.preprocess import *
    market_train_df = pd.read_csv('./data/marketdata_sample.csv')
    news_train = pd.read_csv('./data/news_sample.csv')
else:
    from kaggle.competitions import twosigmanews
    from utils import *
    # NOTE: `env` is only bound in this branch; it does not exist on local runs.
    env = twosigmanews.make_env()
    market_train_df, news_train = env.get_training_data()

# Both sources get `time` converted to datetime; the conversion is identical
# for the two branches, so it is done once here.
market_train_df['time'] = pd.to_datetime(market_train_df.time)
news_train['time'] = pd.to_datetime(news_train.time)

In [19]:
# Inspect the column dtypes of the loaded market frame (`time` should now be datetime64).
market_train_df.dtypes

time                        datetime64[ns]
assetCode                           object
assetName                           object
universe                           float64
volume                             float64
close                              float64
open                               float64
returnsClosePrevRaw1               float64
returnsOpenPrevRaw1                float64
returnsClosePrevMktres1            float64
returnsOpenPrevMktres1             float64
returnsClosePrevRaw10              float64
returnsOpenPrevRaw10               float64
returnsClosePrevMktres10           float64
returnsOpenPrevMktres10            float64
returnsOpenNextMktres10            float64
dtype: object

In [20]:
# Keep only rows at or after the start date in both frames, renumbering the
# index of each filtered frame from zero.
start = pd.to_datetime('2009-01-01')
market_train = market_train_df[market_train_df['time'].ge(start)].reset_index(drop=True)
news_train = news_train[news_train['time'].ge(start)].reset_index(drop=True)

In [22]:
# Preprocess the news frame and join it with its per-asset-code index.
# NOTE(review): the exact row layout produced by unstack_asset_codes /
# merge_news_on_index is defined in the utils module — confirm there.
news_train = preprocess_news(news_train)
index_df = unstack_asset_codes(news_train)
# (A mid-cell `index_df.head()` was removed: only a cell's last expression is
# displayed, so it had no visible effect.)
news_unstack = merge_news_on_index(news_train, index_df)

# Free the intermediates. Because `news_train` is deleted here, this cell can
# only be re-run after re-running the loading and filtering cells.
del news_train, index_df
gc.collect()
news_unstack.head(3)

Unnamed: 0,assetCode,time,sourceId,urgency,takeSequence,provider,bodySize,companyCount,headlineTag,marketCommentary,...,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D


In [23]:
# Aggregate the unstacked news with group_news, then release the (no longer
# needed) unstacked frame before previewing the result.
news_agg = group_news(news_unstack)
del news_unstack
gc.collect()
news_agg.head(3)

Unnamed: 0,assetCode,date,sourceId_mean,urgency_mean,takeSequence_mean,provider_mean,bodySize_mean,companyCount_mean,headlineTag_mean,marketCommentary_mean,...,noveltyCount12H_mean,noveltyCount24H_mean,noveltyCount3D_mean,noveltyCount5D_mean,noveltyCount7D_mean,volumeCounts12H_mean,volumeCounts24H_mean,volumeCounts3D_mean,volumeCounts5D_mean,volumeCounts7D_mean


In [24]:
# Run the market frame through process_date and then process_ma.
# The recorded warnings show process_ma adding per-assetCode rolling
# mean/std columns.
market_train = process_ma(process_date(market_train))

Defaulting to column, but this will raise an ambiguity error in a future version
  df[std_column] = df.groupby('assetCode')[col].apply(lambda x: x.rolling(window).std())
Defaulting to column, but this will raise an ambiguity error in a future version
  df[ma_column] = df.groupby('assetCode')[col].apply(lambda x: x.rolling(window).mean())


In [25]:
# Left-join the aggregated news onto the market rows by asset and date.
# NOTE(review): the recorded output of this cell is a ValueError about merging
# object and float64 key columns — check that `assetCode` / `date` have the
# same dtype in both frames before relying on this cell.
df = pd.merge(
    market_train,
    news_agg,
    how='left',
    on=['assetCode', 'date'],
)
del market_train, news_agg
gc.collect()
df.head(3)

Defaulting to column, but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.


ValueError: You are trying to merge on object and float64 columns. If you wish to proceed you should use pd.concat

In [None]:
# Pull out the columns that are needed separately from the feature matrix:
# the date, the raw (float32) target, its sign as a binary (int8) target, and
# the universe flag.
date = df['date']
num_target = df['returnsOpenNextMktres10'].astype('float32')
bin_target = df['returnsOpenNextMktres10'].ge(0).astype('int8')
universe = df['universe'].astype('int8')

# Everything that is left after removing the non-feature columns is cast to
# float32 and used as the feature matrix.
df = df.drop(
    columns=['returnsOpenNextMktres10', 'date', 'universe', 'assetCode', 'assetName', 'time']
).astype('float32')
gc.collect()

# Hold out 10% of the row labels for evaluation.
train_index, test_index = train_test_split(df.index.values, test_size=0.1)

In [None]:
# Candidate values per hyper-parameter; the random search below draws one
# value from each list for every trial.
param_grid = dict(
    learning_rate=[0.05, 0.02, 0.01],
    num_leaves=[25, 38, 63],
    n_estimators=[100, 200, 400],
    min_child_samples=[5, 10, 20, 40, 100],
    colsample_bytree=[0.8, 0.9, 1],
    subsample=[0.8, 0.9, 1],
    reg_alpha=[0.1, 0.2, 0.4, 0.6, 0.8],
    reg_lambda=[0.1, 0.2, 0.4, 0.6, 0.8],
)

In [None]:
# Random search: draw 50 configurations from param_grid and keep the one with
# the lowest evaluation score (lower is better; reported as logloss below).
# An infinite sentinel is used instead of 0 so that a genuine score of 0 is
# not mistaken for "no score yet", and best_params is bound even before the
# first trial completes.
best_eval_score = float('inf')
best_params = None
for i in range(50):
    # One independent draw per hyper-parameter.
    params = {k: np.random.choice(v) for k, v in param_grid.items()}
    score = evaluate_model(df, bin_target, train_index, test_index, params)
    if score < best_eval_score:
        best_eval_score = score
        best_params = params
    # Running best after each trial.
    print(best_eval_score)
print("Best evaluation logloss", best_eval_score)

In [None]:
# Train model with full data, using the best configuration found by the search.
clf = LGBMClassifier(**best_params)
clf.fit(df, bin_target)

# Market columns kept for the history frame: the training columns without
# `universe` and the target `returnsOpenNextMktres10`.
test_df_columns = ['time', 'assetCode', 'assetName', 'volume', 'close', 'open',
                   'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
                   'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
                   'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
                   'returnsClosePrevMktres10', 'returnsOpenPrevMktres10']
# Select rows and columns in a single .loc call and take an explicit copy, so
# the `id` assignment below writes to an independent frame instead of a chained
# slice of market_train_df (which raises SettingWithCopyWarning).
base_df = market_train_df.loc[
    market_train_df['time'] >= '2016-10-01', test_df_columns
].copy()
# id == -1 tags these rows as the seed history; write_submission drops rows
# with this id once day 59 has been predicted.
base_df['id'] = -1
base_df.shape

In [None]:
def write_submission(model, env):
    """Predict every day served by ``env`` and write the submission file.

    For each prediction day the news frame is preprocessed, unstacked and
    aggregated; the day's market rows are tagged with the day counter and
    appended to a history frame (seeded from the module-level ``base_df``)
    before ``process_date`` / ``process_ma`` are applied to the whole history.
    The rows of the current day are then joined with the aggregated news and
    ``model.predict_proba`` (positive class) is rescaled with ``p * 2 - 1``
    to produce the confidence value.

    After day 59 the seed rows (``id == -1``) are dropped from the history,
    and from day 60 on the batch with ``id == day_id - 60`` is dropped.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        env: environment object providing ``get_prediction_days``,
            ``predict`` and ``write_submission_file``.
    """
    non_feature_cols = ['date', 'assetCode', 'assetName', 'time', 'id']
    history = None
    day_id = 0
    for (market_day, news_day, template) in env.get_prediction_days():
        # News side: preprocess, unstack, then aggregate (mean).
        news_day = preprocess_news(news_day)
        codes_index = unstack_asset_codes(news_day)
        news_flat = merge_news_on_index(news_day, codes_index)
        news_day_agg = group_news(news_flat)

        # Market side: tag today's rows and append them to the history.
        market_day['id'] = day_id
        if history is None:
            history = base_df
        history = pd.concat([history, market_day], ignore_index=True, sort=False)

        # Features are computed over the full history; only today's rows are kept.
        processed = process_ma(process_date(history))
        market_day = processed[processed['id'] == day_id]

        # Join market and news frames
        obs_df = market_day.merge(news_day_agg, how='left', on=['assetCode', 'date'])
        del market_day, news_day_agg, news_day, news_flat, codes_index
        gc.collect()

        # Only score assets that the template asks for.
        obs_df = obs_df[obs_df.assetCode.isin(template.assetCode)]
        feats = [c for c in obs_df.columns if c not in non_feature_cols]

        preds = model.predict_proba(obs_df[feats])[:, 1] * 2 - 1
        sub = pd.DataFrame({'assetCode': obs_df['assetCode'], 'confidence': preds})
        # Assets without a prediction fall back to a confidence of 0.
        template = (
            template.merge(sub, how='left')
            .drop('confidenceValue', axis=1)
            .fillna(0)
            .rename(columns={'confidence': 'confidenceValue'})
        )
        env.predict(template)

        # Trim the history: seed rows go after day 59, then one old day per step.
        if day_id >= 59:
            expired_id = -1 if day_id == 59 else day_id - 60
            history.drop(history.index[history['id'] == expired_id], inplace=True)

        day_id += 1
        del obs_df, template, preds, sub
        gc.collect()
    env.write_submission_file()
    print('day_count', day_id)

# `env` is only created in the Kaggle (non-LOCAL) branch of the loading cell,
# so calling this unconditionally raises NameError on a local run.
if not LOCAL:
    write_submission(clf, env)

# Gain-based importance of every training column, largest first, drawn as a
# horizontal bar chart.
feat_importance = pd.DataFrame({
    "feature": df.columns,
    "gain": clf.booster_.feature_importance(importance_type='gain'),
}).sort_values(by='gain', ascending=False)
plt.figure(figsize=(8,10))
ax = sns.barplot(y="feature", x="gain", data=feat_importance)