In [None]:
## This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from kaggle.competitions import twosigmanews
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection  import train_test_split
import lightgbm as lgb
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

### Import the environment and datasets.

In [None]:
# Build the competition environment and pull the market and news training
# frames that it hands back as a pair.
env = twosigmanews.make_env()

market_train_df, news_train_df = env.get_training_data()

### Clean and reshape the data for use in model.

#### Remove data from the financial crisis.

In [None]:
def split_df_by_year(df, start='2010-01-01', end='2016-12-31'):
    """Keep only the rows whose calendar day falls inside ``[start, end]``.

    The day is derived by flooring the ``time`` column to midnight. The
    returned frame carries that day as its first column, named ``date``,
    followed by the input's columns, and has a fresh 0..n-1 index.

    The input frame is not modified, so the call can be repeated safely
    (e.g. when a notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like ``time`` column.
    start, end : str or datetime-like, optional
        Inclusive bounds of the window. The defaults reproduce the
        previously hard-coded 2010-01-01 .. 2016-12-31 range.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame.
    """
    dates = df['time'].dt.floor('D')
    in_window = (dates >= start) & (dates <= end)

    # Drop any pre-existing 'date' column so the fresh one can be inserted
    # in first position, as the set_index/reset_index round trip used to do.
    out = df.loc[in_window].drop(columns=['date'], errors='ignore')
    out.insert(0, 'date', dates[in_window])
    return out.reset_index(drop=True)

# Restrict both raw frames to the modelling window, then release the
# unfiltered originals.
market, news = (
    split_df_by_year(raw_frame)
    for raw_frame in (market_train_df, news_train_df)
)

del market_train_df, news_train_df

#### Market dataframe:
#### 1. Drop rows with NANs in the market dataframe.
#### 2. To be able to merge on the column later, rename the 'assetCode' column of the market dataframe to 'assetCodes'. (This rename is currently disabled in the code; the merge uses 'assetName' and 'date' instead.)

In [None]:
def market_shape(df, dropna=True):
    """Prepare the market frame for merging.

    Parameters
    ----------
    df : pandas.DataFrame
        Market data.
    dropna : bool, optional
        When true, rows containing any missing value are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the incomplete rows when ``dropna`` is true;
        otherwise ``df`` itself, unchanged.

    Notes
    -----
    The caller's frame is never modified in place; use the return value.
    """
    if not dropna:
        return df
    return df.dropna()

#### News dataframe:
#### 1. Drop columns that won't be used in the model to save memory.
#### 2. The news dataframe has a column with multiple asset codes in one cell. Reshape the 'assetCodes' column of the news dataframe to expand each asset code into its own cell. (This step is currently disabled in the code; the news data is keyed by 'assetName' instead.)
#### 3. To be able to merge on this column later, strip extra characters from the news dataframe 'assetCodes' column. (This step is also currently disabled in the code.)
#### 4. Engineer features for proportion of words relevant to the asset and the position of the first mention of the asset in the news item.
#### 5. Aggregate the data from the news dataframe on assetName and date, and create columns for statistics of the aggregated data.
#### 6. Format the aggregated columns.

In [None]:
def news_shape(df):
    """Collapse per-article news rows into one feature row per asset and day.

    Two ratio features are derived per article before aggregation:

    * ``proportionRelevant``   = ``sentimentWordCount / wordCount``
    * ``firstMentionPosition`` = ``firstMentionSentence / sentenceCount``

    Rows are then grouped by ``assetName`` and ``date`` and summarised with
    the statistics listed in the aggregation table below.

    Parameters
    ----------
    df : pandas.DataFrame
        News data containing ``assetName``, ``date`` and every numeric
        column named in the aggregation table, plus ``sentimentWordCount``,
        ``firstMentionSentence`` and ``sentenceCount``.

    Returns
    -------
    pandas.DataFrame
        float32 frame indexed by (``assetName``, ``date``) whose columns
        are named ``<source column>_<statistic>``, e.g. ``urgency_min``.

    Notes
    -----
    * ``df`` is not modified: only the columns that are needed are copied,
      so calling the function twice on the same frame works.
    * Statistics are given by name rather than as NumPy callables. pandas
      resolves both to the same group-by reductions today, but the names
      keep the column labels independent of the NumPy version and avoid
      the pandas deprecation warning for NumPy callables.
    * A ratio with a zero denominator is stored as NaN instead of +/-inf so
      that a single bad article cannot make a group mean infinite.
    """
    keys = ['assetName', 'date']
    mean_std = ['mean', 'std']

    # Insertion order of this table fixes the column order of the result.
    aggregations = {
        'urgency': ['min', 'sum'],
        'takeSequence': ['max'],
        'companyCount': ['min', 'max', 'mean', 'std'],
        'bodySize': mean_std,
        'wordCount': mean_std,
        'marketCommentary': mean_std,
        'sentimentClass': ['sum', 'mean', 'std'],
        'sentimentNegative': mean_std,
        'sentimentNeutral': mean_std,
        'sentimentPositive': mean_std,
        'relevance': mean_std,
        'proportionRelevant': mean_std,
        'firstMentionPosition': mean_std,
    }
    for prefix in ('noveltyCount', 'volumeCounts'):
        for window in ('12H', '24H', '3D', '5D', '7D'):
            aggregations[prefix + window] = mean_std

    def ratio(numerator, denominator):
        # Division by zero produces +/-inf; treat those as missing.
        values = df[numerator] / df[denominator]
        return values.replace([np.inf, -np.inf], np.nan)

    derived = ('proportionRelevant', 'firstMentionPosition')
    raw_columns = [name for name in aggregations if name not in derived]

    # Narrow working copy: leaves the caller's frame untouched and skips
    # the text-heavy columns that are never aggregated.
    work = df[keys + raw_columns].copy()
    work['proportionRelevant'] = ratio('sentimentWordCount', 'wordCount')
    work['firstMentionPosition'] = ratio('firstMentionSentence', 'sentenceCount')

    stats = work.groupby(keys).agg(aggregations).astype(np.float32)

    # Flatten the (column, statistic) MultiIndex into single-level names.
    stats.columns = ['_'.join(pair) for pair in stats.columns]
    return stats

In [None]:
# Preview the date-filtered news frame; news_shape has not been applied to it yet.
news.head()

#### Merge the market and news dataframes on asset name and date.

In [None]:
def merge(df1, df2):
    """Left-join ``df2`` onto ``df1`` by ``assetName`` and ``date``.

    Every row of ``df1`` is kept; columns coming from ``df2`` are missing
    where no matching key exists.
    """
    join_keys = ['assetName', 'date']
    return df1.merge(df2, on=join_keys, how='left')

#### Shape and merge the data.
#### Encode the assetName column as a unique numerical key to be used as a categorical feature in the model.
#### Encode the date as categorical features (day of week and month) to be used in the model.
#### Remove all non-numerical data from the X dataframe.
#### Separate the target feature (10 day market residualized returns) into the y dataframe.

In [None]:
def get_data(market, news):
    """Build the feature frame, the clipped target and the full merged frame.

    Parameters
    ----------
    market, news : pandas.DataFrame
        Frames passed straight to ``shape_and_merge``.

    Returns
    -------
    X : pandas.DataFrame
        The merged frame without the target column. It is a separate
        object from ``data``, so later in-place edits of ``X`` do not
        change ``data``.
    y : pandas.Series
        ``returnsOpenNextMktres10`` clipped to ``[-1, 1]``.
    data : pandas.DataFrame
        The complete merged frame, target column included.
    """
    target = 'returnsOpenNextMktres10'
    data = shape_and_merge(market, news)
    y = data[target].clip(-1, 1)
    # Non-in-place drop: previously X was an alias of data, so dropping the
    # target from X silently removed it from the returned data as well.
    X = data.drop(columns=[target])
    return X, y, data

def label_encode(series, min_count):
    """Map each sufficiently frequent value of ``series`` to an integer code.

    Values are ranked in the order produced by ``value_counts`` and only
    those occurring at least ``min_count`` times receive a code, numbered
    from 0 upwards. Returns the mapping as a ``dict``.
    """
    counts = series.value_counts()
    frequent = counts[counts >= min_count].index
    return dict(zip(frequent, range(len(frequent))))

def shape_and_merge(market, news, dropna=True, le=None):
    """Clean both inputs, join them and add the encoded/calendar features.

    Parameters
    ----------
    market : pandas.DataFrame
        Market data; must contain ``time``, ``assetName`` and ``date``.
    news : pandas.DataFrame
        News data, aggregated by ``news_shape`` before the join.
    dropna : bool, optional
        Forwarded to ``market_shape``.
    le : dict, optional
        Existing ``assetName`` -> integer mapping to reuse. When omitted, a
        mapping is fitted on the merged frame with ``label_encode``
        (``min_count=10``). This replaces a local ``le = None`` that made
        the reuse branch unreachable.

    Returns
    -------
    pandas.DataFrame
        Merged frame in which ``assetName`` holds integer codes (-1 for
        names missing from the mapping) and ``dayofweek`` / ``month`` are
        derived from ``time``.
    """
    market_shaped = market_shape(market, dropna)
    news_shaped = news_shape(news)
    df = merge(market_shaped, news_shaped)

    if le is None:
        le_assetCode = label_encode(df['assetName'], min_count=10)
    else:
        le_assetCode = le

    # Names without a code become NaN under map(); encode them as -1.
    df['assetName'] = df['assetName'].map(le_assetCode).fillna(-1).astype(int)
    df['dayofweek'] = df['time'].dt.dayofweek
    df['month'] = df['time'].dt.month
    return df





In [None]:
# Build the feature frame X, the clipped target y and the merged frame data.
X, y, data = get_data(market, news)


In [None]:
# Sanity check: row counts of the three objects should agree.
X.shape, y.shape, data.shape

In [None]:
# Preview the merged feature frame.
X.head()

### Preprocess the data into training and validation sets and scale features.

In [None]:
# Share of rows (taken from the top of the frame) used for training; the
# remainder is the validation set.
TRAIN_FRACTION = 0.95

# Keep the universe flag and timestamps aside before they leave the features.
universe = data['universe']
time = data['time']

# Rebind X instead of dropping in place: `data` then keeps these columns and
# the cell can be re-run without a KeyError.
X = X.drop(columns=['time', 'date', 'assetCode', 'universe'], errors='ignore')
n_train = int(X.shape[0] * TRAIN_FRACTION)

X_train, y_train = X.iloc[:n_train], y.iloc[:n_train]

# Validation rows are restricted to those flagged as part of the universe.
in_universe = universe.iloc[n_train:] > 0
X_valid = X.iloc[n_train:][in_universe]
y_valid = y.iloc[n_train:][in_universe]
t_valid = time.iloc[n_train:][in_universe]
del in_universe



In [None]:
# Feature scaling: learn the statistics on the training rows only, then apply
# the same transform to both splits.
# NOTE(review): this also standardizes the columns that are later declared
# categorical for LightGBM (assetName, dayofweek, month) - confirm intended.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_valid = sc.transform(X_valid)



### Convert the datasets to LGB format, train the model, and calculate the score.

In [None]:
# Wrap the splits in LightGBM datasets.
train_cols = list(X.columns)
categorical_cols = ['assetName', 'dayofweek', 'month']

# Note: labels are passed as pandas Series; the custom `score` metric reads
# them back through `get_label()`.
dtrain = lgb.Dataset(
    X_train, y_train,
    feature_name=train_cols,
    categorical_feature=categorical_cols,
    free_raw_data=False,
)
dvalid = lgb.Dataset(
    X_valid, y_valid,
    feature_name=train_cols,
    categorical_feature=categorical_cols,
    free_raw_data=False,
)

# Attach one integer id per distinct validation timestamp so that `score`
# can group predictions by it.
dvalid.params = {'extra_time': t_valid.factorize()[0]}

# The built-in metric is disabled ('None'); evaluation relies on `score`.
lgb_params = {
    'objective': 'regression_l1',
    'learning_rate': 0.05,
    'num_leaves': 125,
    'max_depth': -1,
    'min_data_in_leaf': 1000,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'feature_fraction': 0.5,
    'lambda_l1': 0.0,
    'lambda_l2': 1.0,
    'max_bin': 125,
    'metric': 'None',
    'seed': 0,
}

def score(preds, valid_data):
    """Custom LightGBM metric: mean over std of the per-period sums of pred * label.

    Parameters
    ----------
    preds : array-like
        Model predictions, one per validation row.
    valid_data : object
        Dataset exposing ``get_label()`` and a ``params`` dict whose
        ``'extra_time'`` entry holds one group id per row.

    Returns
    -------
    tuple
        ``('score', value, True)``; the final ``True`` marks the metric as
        higher-is-better.

    Notes
    -----
    Labels are converted to a NumPy array first, so the metric works
    whether ``get_label()`` returns a pandas Series or a plain array.
    When the spread of the per-period sums is zero or undefined (for
    example with a single period) the score is reported as 0.0 rather
    than inf/NaN.
    """
    period_ids = valid_data.params['extra_time']
    labels = np.asarray(valid_data.get_label())

    # Position-based product; any index carried by the labels is ignored.
    products = pd.Series(np.asarray(preds) * labels)
    period_sums = products.groupby(period_ids).sum()

    spread = period_sums.std()
    value = period_sums.mean() / spread if spread > 0 else 0.0

    return 'score', value, True

# Train with early stopping driven by the custom validation metric; the
# per-round metric values are collected in `evals_result`.
evals_result = {}
m = lgb.train(
    lgb_params,
    dtrain,
    num_boost_round=1000,
    valid_sets=(dvalid,),
    valid_names=('valid',),
    verbose_eval=25,
    early_stopping_rounds=100,
    feval=score,
    evals_result=evals_result,
)

# One row per boosting round for the 'valid' set.
df_result = pd.DataFrame(evals_result['valid'])

In [None]:
# Validation score per boosting round, with the best round marked in red.
score_curve = df_result['score']
ax = df_result.plot(figsize=(12, 8))
ax.scatter(score_curve.idxmax(), score_curve.max(), marker='+', color='red')

In [None]:
import matplotlib.pyplot as plt

# Feature importance side by side: default importance type on the left,
# gain on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 14))
left_panel, right_panel = ax[0], ax[1]
lgb.plot_importance(m, ax=left_panel)
lgb.plot_importance(m, ax=right_panel, importance_type='gain')
fig.tight_layout()

### Make predictions and submit.

In [None]:
# Iterable of (market, news, prediction template) triples consumed by the submission loop below.
days = env.get_prediction_days()

In [None]:
def make_predictions(market_obs_df, news_obs_df, predictions_template_df):
    """Fill ``predictions_template_df.confidenceValue`` for one prediction day.

    The observation frames go through the same ``shape_and_merge`` pipeline
    as the training data. The features are then put in the training column
    order (``train_cols``) and standardized with the fitted scaler ``sc``
    before being passed to the model ``m``. The model was trained on
    scaled features, so predicting on raw values would give it inputs on a
    different scale.

    Parameters
    ----------
    market_obs_df, news_obs_df : pandas.DataFrame
        Market and news observations for the day. A ``date`` column is
        added to both in place.
    predictions_template_df : pandas.DataFrame
        Template whose ``confidenceValue`` column is overwritten.

    Returns
    -------
    pandas.Series
        The updated ``confidenceValue`` column.
    """
    market_obs_df['date'] = market_obs_df['time'].dt.floor('D')
    news_obs_df['date'] = news_obs_df['time'].dt.floor('D')

    # NOTE(review): shape_and_merge derives the assetName codes from the frame
    # it is given, so per-day codes are not guaranteed to match the codes the
    # model saw during training - confirm and reuse the training mapping.
    x = shape_and_merge(market_obs_df, news_obs_df, False)

    # Select exactly the training features, in training order; this also
    # removes the non-feature columns (time, date, assetCode).
    x = x[train_cols]

    y = np.clip(m.predict(sc.transform(x)), -1, 1)
    print(y.shape, x.shape, predictions_template_df.shape)
    predictions_template_df.confidenceValue = y
    return predictions_template_df.confidenceValue

In [None]:
# Predict day by day; the environment expects env.predict() to be called with
# the filled template before the next day is yielded.
for i, (market_obs_df, news_obs_df, predictions_template_df) in enumerate(days, start=1):
    print(i)
    make_predictions(market_obs_df, news_obs_df, predictions_template_df)
    env.predict(predictions_template_df)
print('Done!')

env.write_submission_file()

#### Resources:
[A Simple Model Using the Market and News Data](http://www.kaggle.com/bguberfain/a-simple-model-using-the-market-and-news-data/notebook)

[What is LightGBM, How to implement it? How to fine tune the parameters?](https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc)

[Light GBM docs](http://media.readthedocs.org/pdf/lightgbm/latest/lightgbm.pdf)
