# Load Libraries for use in Binary Classification
First let's import the module and create an environment.

In [None]:
from kaggle.competitions import twosigmanews
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import datetime as dt
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
stop = set(stopwords.words('english'))


import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score
# You can only call make_env() once, so don't lose it!
# `env` is reused by every later cell that calls env.get_training_data().
env = twosigmanews.make_env()

In [None]:
# Pull the market and news training tables from the competition environment.
market_df, news_df = env.get_training_data()

# Simple Data Preprocessing: Assume no feature is useless and remove any NaN

In [None]:
# Derived features, added to the raw tables before market and news are merged.

# --- Market data ---
# returnsOpenPrevRaw1 divided by volume, so that heavily traded assets do not
# dominate purely because of their size.
market_df['returnsOpenPrevRaw1_to_volume'] = market_df['returnsOpenPrevRaw1'].div(market_df['volume'])
# Close/open ratio: how far the close moved from the open matters more than
# either price on its own.
market_df['close_to_open'] = market_df['close'].div(market_df['open'])
# Volume relative to the mean volume of the whole table (every asset, every row).
overall_mean_volume = market_df['volume'].mean()
market_df['volume_to_mean'] = market_df['volume'].div(overall_mean_volume)

# --- News data ---
# Number of news rows that share the same (assetName, sentimentClass) pair.
news_df['asset_sentiment_count'] = news_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')

#Merges Market Data and News Data into a single Training set
def data_prep(market_df, news_df):
    """Join per-day aggregated news features onto the market rows.

    Both input frames are modified in place (their time-like columns are
    overwritten and helper columns are added), so pass freshly loaded frames.

    Args:
        market_df: market table with a datetime ``time`` column and an
            ``assetCode`` column.
        news_df: news table with datetime ``time``, ``sourceTimestamp`` and
            ``firstCreated`` columns plus ``assetCodes``, ``assetName``,
            ``headline``, ``headlineTag`` and ``sentenceCount``.

    Returns:
        A new DataFrame holding only the market rows that found news for the
        same date and asset code and that contain no NaN in any column.
    """
    import ast  # local import: only needed to parse the assetCodes literals

    # Market rows are keyed by calendar date; news keeps only the hour of day.
    market_df['time'] = market_df.time.dt.date
    news_df['time'] = news_df.time.dt.hour
    news_df['sourceTimestamp'] = news_df.sourceTimestamp.dt.hour
    news_df['firstCreated'] = news_df.firstCreated.dt.date

    # assetCodes holds the text of a Python set literal. Parse it with
    # ast.literal_eval instead of eval (no arbitrary code execution) and take
    # min() so the chosen code does not depend on set iteration order.
    news_df['assetCodes'] = news_df['assetCodes'].map(lambda codes: min(ast.literal_eval(codes)))
    news_df['headlineLen'] = news_df['headline'].apply(len)
    # NOTE: computed after the line above, so this is the character length of
    # the single chosen code, not the number of codes in the original set.
    news_df['assetCodesLen'] = news_df['assetCodes'].apply(len)
    # Mean news hour within each (assetName, sentenceCount) group.
    news_df['asset_sentence_mean'] = news_df.groupby(['assetName', 'sentenceCount'])['time'].transform('mean')
    # Integer-encode the headline tag in order of first appearance.
    lbl = {k: v for v, k in enumerate(news_df['headlineTag'].unique())}
    news_df['headlineTagT'] = news_df['headlineTag'].map(lbl)

    # Collapse news to one row per (date, asset code). numeric_only=True
    # averages the numeric columns only; without it pandas >= 2.0 raises on
    # the text columns instead of silently dropping them.
    kcol = ['firstCreated', 'assetCodes']
    news_df = news_df.groupby(kcol, as_index=False).mean(numeric_only=True)

    market_df = pd.merge(market_df, news_df, how='left',
                         left_on=['time', 'assetCode'], right_on=kcol)

    # Integer-encode the asset code in order of first appearance.
    lbl = {k: v for v, k in enumerate(market_df['assetCode'].unique())}
    market_df['assetCodeT'] = market_df['assetCode'].map(lbl)

    # The left join leaves NaN where no news matched; drop every row with a NaN.
    return market_df.dropna(axis=0)

# Build the merged training table, then the feature matrix and targets from it.
market_train = data_prep(market_df, news_df)

# Binary label: True where returnsOpenNextMktres10 is non-negative.
up = (market_train.returnsOpenNextMktres10 >= 0).values
# The raw target values are kept alongside the labels.
r = market_train.returnsOpenNextMktres10.values

# Every column is a feature except the target and the asset identifier columns.
non_feature_cols = ('returnsOpenNextMktres10', 'assetCode', 'assetName', 'assetCodesLen', 'assetCodes')
fcol = [c for c in market_train.columns if c not in non_feature_cols]
X = market_train[fcol].values

# Min-max scale each column of X using its own minimum and maximum.
mins = X.min(axis=0)
maxs = X.max(axis=0)
rng = maxs - mins
X = 1 - ((maxs - X) / rng)

In [None]:
# Preview the first five rows of the merged training table.
market_train.head()

In [None]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, up_train, up_test, r_train, r_test = model_selection.train_test_split(
    X, up, r,
    test_size=0.2,
    random_state=99,
)

In [None]:
# LightGBM binary classifier with GOSS boosting, evaluated by binary log loss.
params = dict(
    learning_rate=0.025,
    boosting='goss',
    objective='binary',
    metric='binary_logloss',
    is_training_metric=True,
    seed=42,
)
train_data = lgb.Dataset(X_train, label=up_train)
# Evaluation uses two further Dataset objects: the training rows and the held-out rows.
eval_sets = [lgb.Dataset(X_train, label=up_train), lgb.Dataset(X_test, label=up_test)]
# Up to 3000 rounds, logging every 100 and stopping after 50 rounds without improvement.
model = lgb.train(
    params,
    train_set=train_data,
    num_boost_round=3000,
    valid_sets=eval_sets,
    verbose_eval=100,
    early_stopping_rounds=50,
)

In [None]:
# Test the model on the held-out rows.
# With objective='binary', predict() returns the probability of the positive
# class (a value in [0, 1]), so the class boundary is 0.5. Thresholding at 0
# would label every row as "up" and report the share of positives as accuracy.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_bool = y_pred >= 0.5
print(y_bool)
print(up_test)
# Element-wise agreement between the predicted and the true labels.
acc = y_bool == up_test
# Accuracy = fraction of rows where prediction and label agree.
print(np.sum(acc) / len(acc))

In [None]:
def generate_color():
    """Return a random colour as a ``'#rrggbb'`` hex string.

    Each of the three channels is drawn from 0..255 inclusive using NumPy's
    global random state.
    """
    # randint's upper bound is exclusive, so 256 is needed to reach 255 (ff).
    red, green, blue = (np.random.randint(0, 256) for _ in range(3))
    return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

# Rank the features by LightGBM importance and draw a horizontal bar chart.
df = pd.DataFrame({'imp': model.feature_importance(), 'col': fcol})
df = df.sort_values(['imp', 'col'], ascending=[True, False])

# One random colour per bar.
colors = [generate_color() for _ in range(len(df))]

data = [
    go.Bar(
        orientation='h',
        x=df.imp,
        y=df.col,
        name='Features',
        textfont=dict(size=12),
        marker=dict(
            color=colors,
            line=dict(color='#000000', width=0.5),
            opacity=0.87,
        ),
    )
]
layout = go.Layout(
    title='Feature Importance of LGB',
    # The bars are horizontal: x carries the importance value, y the column name.
    xaxis=dict(title='Importance', ticklen=5, zeroline=False, gridwidth=2),
    yaxis=dict(title='Columns'),
    showlegend=True,
)

py.iplot(dict(data=data, layout=layout), filename='horizontal-bar')
print(df)

# How does the model change if we remove data from the financial crash and correct clerical errors?

In [None]:
# Reload untouched market and news tables for the second experiment.
market_train_df, news_train_df = env.get_training_data()

In [None]:
# From the data-analysis step: the mean price change is extreme during the
# financial crash, and some rows are faulty (the move between open and close is
# far too large for a single day). Other kernels report the same occasional
# errors, typically an opening price far away from the asset's average.

# Ratio of close to open; values far from 1 flag a suspect row.
market_train_df['close_to_open'] = np.abs(market_train_df['close'] / market_train_df['open'])

# Per-asset mean prices, used as the replacement values.
market_train_df['assetName_mean_open'] = market_train_df.groupby('assetName')['open'].transform('mean')
market_train_df['assetName_mean_close'] = market_train_df.groupby('assetName')['close'].transform('mean')
threshold = 0.7

# A row is suspect when close is at least (1 + threshold) times open or at
# most (1 - threshold / 2) times open.
ratio = market_train_df['close_to_open']
suspect = (ratio >= (1 + threshold)) | (ratio <= 1 - (threshold / 2))

# For a suspect row, replace the open price when it lies further from its
# per-asset mean than the close price does; otherwise replace the close price.
open_dev = np.abs(market_train_df['assetName_mean_open'] - market_train_df['open'])
close_dev = np.abs(market_train_df['assetName_mean_close'] - market_train_df['close'])
open_is_worse = open_dev > close_dev
fix_open = suspect & open_is_worse
fix_close = suspect & ~open_is_worse

# Columns are addressed by name and rows by mask, so the correction does not
# depend on column order or on the index being a plain 0..n-1 range.
market_train_df.loc[fix_open, 'open'] = market_train_df.loc[fix_open, 'assetName_mean_open']
market_train_df.loc[fix_close, 'close'] = market_train_df.loc[fix_close, 'assetName_mean_close']

# Recompute the ratio so the feature reflects the corrected prices rather than
# the faulty ones.
market_train_df['close_to_open'] = np.abs(market_train_df['close'] / market_train_df['open'])

# Throw out data from the financial crash.
crash_end = '2010-01-01 22:00:00+0000'
market_train_df = market_train_df.loc[market_train_df['time'] >= crash_end]
news_train_df = news_train_df.loc[news_train_df['time'] >= crash_end]

In [None]:
#Merges Market Data and News Data into a single Training set
# Derived features for the filtered tables (close_to_open was already added to
# market_train_df in the outlier-correction cell).

# --- Market data ---
# returnsOpenPrevRaw1 divided by volume, so that heavily traded assets do not
# dominate purely because of their size.
market_train_df['returnsOpenPrevRaw1_to_volume'] = market_train_df['returnsOpenPrevRaw1'].div(market_train_df['volume'])
# Volume relative to the mean volume of the whole filtered table.
filtered_mean_volume = market_train_df['volume'].mean()
market_train_df['volume_to_mean'] = market_train_df['volume'].div(filtered_mean_volume)

# --- News data ---
# Number of news rows that share the same (assetName, sentimentClass) pair.
news_train_df['asset_sentiment_count'] = news_train_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')

def data_prep(market_df, news_df):
    """Join per-day aggregated news features onto the market rows.

    Both input frames are modified in place (their time-like columns are
    overwritten and helper columns are added), so pass freshly loaded frames.

    Args:
        market_df: market table with a datetime ``time`` column and an
            ``assetCode`` column.
        news_df: news table with datetime ``time``, ``sourceTimestamp`` and
            ``firstCreated`` columns plus ``assetCodes``, ``assetName``,
            ``headline``, ``headlineTag`` and ``sentenceCount``.

    Returns:
        A new DataFrame holding only the market rows that found news for the
        same date and asset code and that contain no NaN in any column.
    """
    import ast  # local import: only needed to parse the assetCodes literals

    # Market rows are keyed by calendar date; news keeps only the hour of day.
    market_df['time'] = market_df.time.dt.date
    news_df['time'] = news_df.time.dt.hour
    news_df['sourceTimestamp'] = news_df.sourceTimestamp.dt.hour
    news_df['firstCreated'] = news_df.firstCreated.dt.date

    # assetCodes holds the text of a Python set literal. Parse it with
    # ast.literal_eval instead of eval (no arbitrary code execution) and take
    # min() so the chosen code does not depend on set iteration order.
    news_df['assetCodes'] = news_df['assetCodes'].map(lambda codes: min(ast.literal_eval(codes)))
    news_df['headlineLen'] = news_df['headline'].apply(len)
    # NOTE: computed after the line above, so this is the character length of
    # the single chosen code, not the number of codes in the original set.
    news_df['assetCodesLen'] = news_df['assetCodes'].apply(len)
    # Mean news hour within each (assetName, sentenceCount) group.
    news_df['asset_sentence_mean'] = news_df.groupby(['assetName', 'sentenceCount'])['time'].transform('mean')
    # Integer-encode the headline tag in order of first appearance.
    lbl = {k: v for v, k in enumerate(news_df['headlineTag'].unique())}
    news_df['headlineTagT'] = news_df['headlineTag'].map(lbl)

    # Collapse news to one row per (date, asset code). numeric_only=True
    # averages the numeric columns only; without it pandas >= 2.0 raises on
    # the text columns instead of silently dropping them.
    kcol = ['firstCreated', 'assetCodes']
    news_df = news_df.groupby(kcol, as_index=False).mean(numeric_only=True)

    market_df = pd.merge(market_df, news_df, how='left',
                         left_on=['time', 'assetCode'], right_on=kcol)

    # Integer-encode the asset code in order of first appearance.
    lbl = {k: v for v, k in enumerate(market_df['assetCode'].unique())}
    market_df['assetCodeT'] = market_df['assetCode'].map(lbl)

    # The left join leaves NaN where no news matched; drop every row with a NaN.
    return market_df.dropna(axis=0)

# Build the merged table for the second experiment, then features and targets.
market_train_2 = data_prep(market_train_df, news_train_df)

# Binary label: True where returnsOpenNextMktres10 is non-negative.
up_2 = (market_train_2.returnsOpenNextMktres10 >= 0).values
# The raw target values are kept alongside the labels.
r_2 = market_train_2.returnsOpenNextMktres10.values

# Every column is a feature except the target and the asset identifier columns.
non_feature_cols = ('returnsOpenNextMktres10', 'assetCode', 'assetName', 'assetCodesLen', 'assetCodes')
fcol = [c for c in market_train_2.columns if c not in non_feature_cols]
X_2 = market_train_2[fcol].values

# Min-max scale each column of X_2 using its own minimum and maximum.
mins = X_2.min(axis=0)
maxs = X_2.max(axis=0)
rng = maxs - mins
X_2 = 1 - ((maxs - X_2) / rng)

In [None]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train_2, X_test_2, up_train_2, up_test_2, r_train_2, r_test_2 = model_selection.train_test_split(
    X_2, up_2, r_2,
    test_size=0.2,
    random_state=99,
)

# LightGBM binary classifier with plain GBDT boosting, evaluated by log loss.
params_2 = dict(
    learning_rate=0.025,
    boosting='gbdt',
    objective='binary',
    metric='binary_logloss',
    is_training_metric=True,
    seed=42,
)
train_data_2 = lgb.Dataset(X_train_2, label=up_train_2)
# Evaluation uses two further Dataset objects: the training rows and the held-out rows.
eval_sets_2 = [lgb.Dataset(X_train_2, label=up_train_2), lgb.Dataset(X_test_2, label=up_test_2)]
# Up to 3000 rounds, logging every 50 and stopping after 50 rounds without improvement.
model_2 = lgb.train(
    params_2,
    train_set=train_data_2,
    num_boost_round=3000,
    valid_sets=eval_sets_2,
    verbose_eval=50,
    early_stopping_rounds=50,
)

In [None]:
# Test the second model on its held-out rows.
# With objective='binary', predict() returns the probability of the positive
# class (a value in [0, 1]), so the class boundary is 0.5. Thresholding at 0
# would label every row as "up" and report the share of positives as accuracy.
y_pred_2 = model_2.predict(X_test_2, num_iteration=model_2.best_iteration)
y_bool_2 = y_pred_2 >= 0.5
# Element-wise agreement between the predicted and the true labels.
acc_2 = y_bool_2 == up_test_2
# Accuracy = fraction of rows where prediction and label agree.
print(np.sum(acc_2) / len(acc_2))

In [None]:
def generate_color():
    """Return a random colour as a ``'#rrggbb'`` hex string.

    Each of the three channels is drawn from 0..255 inclusive using NumPy's
    global random state.
    """
    # randint's upper bound is exclusive, so 256 is needed to reach 255 (ff).
    red, green, blue = (np.random.randint(0, 256) for _ in range(3))
    return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

# Rank the second model's features by LightGBM importance and draw a
# horizontal bar chart.
df = pd.DataFrame({'imp': model_2.feature_importance(), 'col': fcol})
df = df.sort_values(['imp', 'col'], ascending=[True, False])

# One random colour per bar.
colors = [generate_color() for _ in range(len(df))]

data = [
    go.Bar(
        orientation='h',
        x=df.imp,
        y=df.col,
        name='Features',
        textfont=dict(size=12),
        marker=dict(
            color=colors,
            line=dict(color='#000000', width=0.5),
            opacity=0.87,
        ),
    )
]
layout = go.Layout(
    title='Feature Importance of LGB',
    # The bars are horizontal: x carries the importance value, y the column name.
    xaxis=dict(title='Importance', ticklen=5, zeroline=False, gridwidth=2),
    yaxis=dict(title='Columns'),
    showlegend=True,
)

py.iplot(dict(data=data, layout=layout), filename='horizontal-bar')
print(df)

# Model with lowest importance features removed

In [None]:
# Reload untouched market and news tables for the third experiment.
market_reduced_df, news_reduced_df = env.get_training_data()

# Derived features, added to the raw tables before market and news are merged.

# --- Market data ---
# returnsOpenPrevRaw1 divided by volume, so that heavily traded assets do not
# dominate purely because of their size.
market_reduced_df['returnsOpenPrevRaw1_to_volume'] = market_reduced_df['returnsOpenPrevRaw1'].div(market_reduced_df['volume'])
# Close/open ratio: how far the close moved from the open matters more than
# either price on its own.
market_reduced_df['close_to_open'] = market_reduced_df['close'].div(market_reduced_df['open'])
# Volume relative to the mean volume of the whole table (every asset, every row).
reduced_mean_volume = market_reduced_df['volume'].mean()
market_reduced_df['volume_to_mean'] = market_reduced_df['volume'].div(reduced_mean_volume)

# --- News data ---
# Number of news rows that share the same (assetName, sentimentClass) pair.
news_reduced_df['asset_sentiment_count'] = news_reduced_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')

#Merges Market Data and News Data into a single Training set
def data_prep(market_df, news_df):
    """Join per-day aggregated news features onto the market rows.

    Both input frames are modified in place (their time-like columns are
    overwritten and helper columns are added), so pass freshly loaded frames.

    Args:
        market_df: market table with a datetime ``time`` column and an
            ``assetCode`` column.
        news_df: news table with datetime ``time``, ``sourceTimestamp`` and
            ``firstCreated`` columns plus ``assetCodes``, ``assetName``,
            ``headline``, ``headlineTag`` and ``sentenceCount``.

    Returns:
        A new DataFrame holding only the market rows that found news for the
        same date and asset code and that contain no NaN in any column.
    """
    import ast  # local import: only needed to parse the assetCodes literals

    # Market rows are keyed by calendar date; news keeps only the hour of day.
    market_df['time'] = market_df.time.dt.date
    news_df['time'] = news_df.time.dt.hour
    news_df['sourceTimestamp'] = news_df.sourceTimestamp.dt.hour
    news_df['firstCreated'] = news_df.firstCreated.dt.date

    # assetCodes holds the text of a Python set literal. Parse it with
    # ast.literal_eval instead of eval (no arbitrary code execution) and take
    # min() so the chosen code does not depend on set iteration order.
    news_df['assetCodes'] = news_df['assetCodes'].map(lambda codes: min(ast.literal_eval(codes)))
    news_df['headlineLen'] = news_df['headline'].apply(len)
    # NOTE: computed after the line above, so this is the character length of
    # the single chosen code, not the number of codes in the original set.
    news_df['assetCodesLen'] = news_df['assetCodes'].apply(len)
    # Mean news hour within each (assetName, sentenceCount) group.
    news_df['asset_sentence_mean'] = news_df.groupby(['assetName', 'sentenceCount'])['time'].transform('mean')
    # Integer-encode the headline tag in order of first appearance.
    lbl = {k: v for v, k in enumerate(news_df['headlineTag'].unique())}
    news_df['headlineTagT'] = news_df['headlineTag'].map(lbl)

    # Collapse news to one row per (date, asset code). numeric_only=True
    # averages the numeric columns only; without it pandas >= 2.0 raises on
    # the text columns instead of silently dropping them.
    kcol = ['firstCreated', 'assetCodes']
    news_df = news_df.groupby(kcol, as_index=False).mean(numeric_only=True)

    market_df = pd.merge(market_df, news_df, how='left',
                         left_on=['time', 'assetCode'], right_on=kcol)

    # Integer-encode the asset code in order of first appearance.
    lbl = {k: v for v, k in enumerate(market_df['assetCode'].unique())}
    market_df['assetCodeT'] = market_df['assetCode'].map(lbl)

    # The left join leaves NaN where no news matched; drop every row with a NaN.
    return market_df.dropna(axis=0)

# Build the merged table for the third experiment, then features and targets.
market_train = data_prep(market_reduced_df, news_reduced_df)
# Binary label: True where returnsOpenNextMktres10 is non-negative.
up = market_train.returnsOpenNextMktres10 >= 0

# Besides the target and the asset identifier columns, also leave out the
# features chosen for removal in this experiment. 'volume_to_mean' was
# previously misspelled 'volumn_to_mean', so it was never actually excluded.
excluded_cols = {
    'returnsOpenNextMktres10',
    'assetCode', 'assetName', 'assetCodesLen', 'assetCodes',
    'volume_to_mean', 'returnsOpenPrevRaw1_to_volume', 'firstCreated',
    'sourceTimestamp', 'universe', 'marketCommentary',
}
fcol = [c for c in market_train.columns if c not in excluded_cols]

X_3 = market_train[fcol].values
up_3 = up.values
r_3 = market_train.returnsOpenNextMktres10.values

# Min-max scale each column, as the first two experiments do, so that only the
# feature set differs between runs. This also turns the date objects that
# data_prep stores in the market 'time' column into plain numbers.
# NOTE(review): without this step the matrix holds raw date objects, which
# LightGBM presumably cannot ingest - confirm.
mins = np.min(X_3, axis=0)
maxs = np.max(X_3, axis=0)
rng = maxs - mins
X_3 = 1 - ((maxs - X_3) / rng)

In [None]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train_3, X_test_3, up_train_3, up_test_3, r_train_3, r_test_3 = model_selection.train_test_split(
    X_3, up_3, r_3,
    test_size=0.2,
    random_state=99,
)

In [None]:
# LightGBM binary classifier with plain GBDT boosting, evaluated by log loss.
params_3 = dict(
    learning_rate=0.025,
    boosting='gbdt',
    objective='binary',
    metric='binary_logloss',
    is_training_metric=True,
    seed=42,
)
train_data_3 = lgb.Dataset(X_train_3, label=up_train_3)
# Evaluation uses two further Dataset objects: the training rows and the held-out rows.
eval_sets_3 = [lgb.Dataset(X_train_3, label=up_train_3), lgb.Dataset(X_test_3, label=up_test_3)]
# Up to 3000 rounds, logging every 100 and stopping after 50 rounds without improvement.
model_3 = lgb.train(
    params_3,
    train_set=train_data_3,
    num_boost_round=3000,
    valid_sets=eval_sets_3,
    verbose_eval=100,
    early_stopping_rounds=50,
)

In [None]:
# Test the third model on its held-out rows.
# With objective='binary', predict() returns the probability of the positive
# class (a value in [0, 1]), so the class boundary is 0.5. Thresholding at 0
# would label every row as "up" and report the share of positives as accuracy.
y_pred_3 = model_3.predict(X_test_3, num_iteration=model_3.best_iteration)
y_bool_3 = y_pred_3 >= 0.5
# Element-wise agreement between the predicted and the true labels.
acc_3 = y_bool_3 == up_test_3
# Accuracy = fraction of rows where prediction and label agree.
print(np.sum(acc_3) / len(acc_3))