In [1]:
import pandas as pd
%load_ext autoreload
%autoreload 2

In [2]:
import gc
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import numpy as np

In [3]:
# Run-mode switches read by the cells below.
# LOCAL: True loads the sample CSVs from ./data; False uses the Kaggle
#        twosigmanews environment instead.
# TEST:  True makes the next cell subsample the market data; the flag is also
#        forwarded to prenews/predata (its effect inside them is not visible here).
LOCAL = True
TEST = False

In [13]:
# Load the training data and the project helpers (prenews, predata,
# evaluate_model, ...) either from the local checkout or from the Kaggle
# kernel environment, depending on LOCAL.
import sys

if LOCAL:
    # sys.path entries must be real directories: a glob such as './utils/*.*'
    # is never expanded by the import system, so add the folder itself.
    sys.path.append('./utils')
    from utils.preprocess import *
    from utils.model import *
    market_train_df = pd.read_csv('./data/marketdata_sample.csv')
    news_train = pd.read_csv('./data/news_sample.csv')
else:
    # On Kaggle the helper modules are imported flat (`preprocess`, `model`),
    # which only works when their containing directory is on sys.path.
    sys.path.append('../input/title-111/repository/modvala-2sigma-43ff6da/utils')
    from kaggle.competitions import twosigmanews
    from preprocess import *
    from model import *
    # `env` is created only in this branch; later cells that use it need LOCAL == False.
    env = twosigmanews.make_env()
    market_train_df, news_train = env.get_training_data()

# Both sources get the same treatment: make sure 'time' is a datetime column so
# the `.dt` accessors used further down work.
market_train_df['time'] = pd.to_datetime(market_train_df.time)
news_train['time'] = pd.to_datetime(news_train.time)

In [None]:
if TEST:
    # Quick-iteration mode: keep a random subset of the market rows.
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so cap n at the frame length; the fixed
    # random_state makes the subset identical from run to run.
    market_train_df = market_train_df.sample(
        n=min(100000, len(market_train_df)),
        random_state=42,
    )

In [6]:
# Keep only rows whose timestamp falls in `start` or a later calendar year,
# renumbering the index from zero afterwards.
start = 2009
market_train = market_train_df[market_train_df['time'].dt.year.ge(start)].reset_index(drop=True)
news_train = news_train[news_train['time'].dt.year.ge(start)].reset_index(drop=True)

In [None]:
# Build the modelling frame `df` from the filtered news and market data.
# prenews/predata are not defined in this notebook; presumably they come from
# the star imports in the loading cell (utils.preprocess) — confirm.
# TEST is forwarded to both helpers; what it changes inside them is not visible here.
news_agg = prenews(news_train, TEST)
df = predata(news_agg, market_train, TEST)

In [None]:
# Pull the bookkeeping columns and both target encodings out of `df` before it
# is reduced to a pure feature matrix.
date = df['date']
num_target = df['returnsOpenNextMktres10'].astype('float32')
# Binary label: 1 when the 10-day market-residualised return is non-negative.
bin_target = df['returnsOpenNextMktres10'].ge(0).astype('int8')
universe = df['universe'].astype('int8')

# Everything that is not a model input is removed in place, then the remaining
# columns are cast to float32.
df.drop(
    columns=['returnsOpenNextMktres10', 'date', 'universe', 'assetCode', 'assetName', 'time'],
    inplace=True,
)
df = df.astype('float32')
gc.collect()

In [11]:
# Hold out 10% of the rows for evaluating candidate hyper-parameters.
# A fixed random_state keeps the split (and therefore the scores compared in
# the search below) the same on every run.
# NOTE(review): this is a random row split; if rows are time-ordered a
# date-based split may be more appropriate — confirm.
train_index, test_index = train_test_split(df.index.values, test_size=0.1, random_state=42)

NameError: name 'df' is not defined

In [None]:
# Candidate values for each hyper-parameter; the random search below draws one
# value per key for every trial. Key order is kept as-is because it fixes the
# order in which values are drawn.
param_grid = dict(
    learning_rate=[0.05, 0.02, 0.01],
    num_leaves=[25, 38, 63],
    n_estimators=[100, 200, 400],
    min_child_samples=[5, 10, 20, 40, 100],
    colsample_bytree=[0.8, 0.9, 1],
    subsample=[0.8, 0.9, 1],
    reg_alpha=[0.1, 0.2, 0.4, 0.6, 0.8],
    reg_lambda=[0.1, 0.2, 0.4, 0.6, 0.8],
)

In [None]:
# Random search: draw 50 parameter combinations from `param_grid`, score each
# one on the held-out indices and keep the combination with the lowest score
# (the final message reports the score as a logloss, so lower is better).
search_rng = np.random.RandomState(42)  # seeded so the sampled trials are reproducible

# Start from +inf instead of a 0 sentinel: any finite score then wins the first
# comparison, and a NaN score (NaN < x is always False) can never be recorded
# as the best result.
best_eval_score = float('inf')
best_params = None
for trial in range(50):
    params = {name: search_rng.choice(values) for name, values in param_grid.items()}
    score = evaluate_model(df, bin_target, train_index, test_index, params)
    if score < best_eval_score:
        best_eval_score = score
        best_params = params
    print(best_eval_score)

if best_params is None:
    # Every trial returned NaN (or nothing comparable); training on arbitrary
    # parameters would hide that, so stop here.
    raise ValueError("Random search produced no valid evaluation score")
print("Best evaluation logloss", best_eval_score)

In [None]:
# Train model with full data
clf = LGBMClassifier(**best_params)
clf.fit(df, bin_target)

test_df_columns = ['time', 'assetCode', 'assetName', 'volume', 'close', 'open',
                   'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
                   'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
                   'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
                   'returnsClosePrevMktres10', 'returnsOpenPrevMktres10']
base_df = market_train_df[market_train_df['time'].dt.year >= 2016]
base_df = base_df[test_df_columns]
base_df['id'] = -1
base_df.shape

In [None]:
# Reset the flag before prediction: write_submission forwards the global TEST
# to prenews/predata, so this makes those calls run with TEST == False
# regardless of the value used during training.
TEST=False

In [None]:
def write_submission(model, env):
    """Predict every day served by ``env`` and write the submission file.

    For each prediction day the day's market rows are tagged with a running
    ``id``, appended to a rolling history frame (seeded from the global
    ``base_df``), turned into features with the global helpers ``prenews`` and
    ``predata`` (both receive the global ``TEST`` flag), and scored with
    ``model``.

    Parameters
    ----------
    model : object exposing ``predict_proba(X)``; column 1 of the result is
        rescaled from [0, 1] to [-1, 1] and submitted as the confidence.
    env : object exposing ``get_prediction_days()``, ``predict(df)`` and
        ``write_submission_file()``.

    Assets in the day's template that receive no prediction are submitted
    with a confidence of 0.
    """
    non_feature_cols = ['date', 'assetCode', 'assetName', 'time', 'id']
    days = env.get_prediction_days()
    day_id = 0
    market_obs_df_append = None
    for (market_obs_df, news_obs_df, predictions_template_df) in days:
        news_agg = prenews(news_obs_df, TEST)

        # Tag today's rows so they can be evicted from the history later on.
        market_obs_df['id'] = day_id
        if market_obs_df_append is None:
            market_obs_df_append = base_df

        market_obs_df_append = pd.concat([market_obs_df_append, market_obs_df],
                                         ignore_index=True,
                                         sort=False)
        # The result must be bound to `obs_df`: it was previously stored under
        # the misspelled name `ods_df`, so the next line failed with
        # UnboundLocalError on the very first day.
        obs_df = predata(news_agg, market_obs_df_append, TEST)

        # NOTE(review): assumes predata yields at most one row per assetCode for
        # the current day; otherwise the merge below duplicates template rows — confirm.
        obs_df = obs_df[obs_df.assetCode.isin(predictions_template_df.assetCode)]
        # Drop cols that are not features
        feats = [c for c in obs_df.columns if c not in non_feature_cols]

        # Map P(class 1) from [0, 1] onto the [-1, 1] confidence scale.
        preds = model.predict_proba(obs_df[feats])[:, 1] * 2 - 1
        sub = pd.DataFrame({'assetCode': obs_df['assetCode'], 'confidence': preds})
        predictions_template_df = predictions_template_df.merge(sub, how='left').drop(
            'confidenceValue', axis=1).fillna(0).rename(columns={'confidence': 'confidenceValue'})

        env.predict(predictions_template_df)

        # Keep the history bounded: after day 59 the seed rows (id == -1) are
        # removed, and from day 60 on the rows tagged 60 days earlier are removed.
        if day_id == 59:
            market_obs_df_append.drop(
                market_obs_df_append.index[market_obs_df_append['id'] == -1],
                inplace=True)
        elif day_id >= 60:
            market_obs_df_append.drop(
                market_obs_df_append.index[market_obs_df_append['id'] == day_id - 60],
                inplace=True)
        day_id += 1
        del obs_df, predictions_template_df, preds, sub
        gc.collect()
    env.write_submission_file()
    print('day_count', day_id)

# The Kaggle `env` object is only created when LOCAL is False; calling
# write_submission unconditionally raised NameError in local runs and stopped
# the feature-importance plot below from being drawn.
if not LOCAL:
    write_submission(clf, env)

# Rank the training features by the gain importance reported by the fitted
# booster and plot them, most important first.
# NOTE(review): `plt` and `sns` are not imported in the visible cells;
# presumably they arrive through the star imports of the helper modules — confirm.
feat_importance = pd.DataFrame()
feat_importance["feature"] = df.columns
feat_importance["gain"] = clf.booster_.feature_importance(importance_type='gain')
feat_importance.sort_values(by='gain', ascending=False, inplace=True)
plt.figure(figsize=(8,10))
ax = sns.barplot(y="feature", x="gain", data=feat_importance)