In [1]:
import pandas as pd
import numpy as np
import datetime

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten, Convolution2D, Convolution1D, Reshape, Lambda, AveragePooling1D, AveragePooling2D, MaxPooling1D
from keras.layers import LSTM, SimpleRNN, GRU
from keras.regularizers import l1, l2, activity_l2, l1, activity_l1
from keras.preprocessing.text import Tokenizer
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import Merge
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping


import pickle
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split


Using TensorFlow backend.


In [2]:
# --- Forecast / split configuration ---
# Number of years between the feature year and the target year.
forecast_year = 1
# Rows whose (target) fiscal year is before this value are used for training,
# the rest for testing.
test_train_split_year = 2011

# --- Text configuration ---
# Vocabulary size: only the most frequent words are kept.
max_features = 5000
# Sequence length: texts are cut after this many words.
maxlen = 5000

In [3]:
def n_year_before(df, n = 1):
    """Pair every target ``Y`` with the feature row observed ``n`` years earlier.

    The feature columns (everything except ``Y``) have their ``fyear`` moved
    forward by ``n`` and are then inner-joined on ``['fyear', 'gvkey']`` with
    the untouched ``(fyear, gvkey, Y)`` triples of ``df``.  In the result,
    ``fyear`` is therefore the year of ``Y`` while the remaining columns come
    from ``n`` years before.  Rows without a partner are dropped by the inner
    join.  ``df`` itself is not modified.
    """
    targets = df[['fyear', 'gvkey', 'Y']]
    # Drop the contemporaneous target and relabel the features with the year
    # they are supposed to predict.
    shifted_features = df.drop('Y', axis=1).assign(fyear=df['fyear'] + n)
    return shifted_features.merge(targets, how='inner', on=['fyear', 'gvkey'])


def load_data(X_padded_text):
    """Align padded 10-K text with lagged numeric features and the target.

    Reads ``data/10k/10k_index.csv`` and ``data/final_variables.csv``, joins
    them on ``(gvkey, fyear)`` and then uses ``n_year_before`` with the
    module-level ``forecast_year`` so that each row's features precede its
    ``Y`` by that many years.

    Parameters
    ----------
    X_padded_text : array-like
        Padded word-index sequences, indexed by the row number of the 10-K
        index CSV (assumes the rows are in the same order as that CSV --
        TODO confirm against the code that produced the pickle).

    Returns
    -------
    X_text : rows of ``X_padded_text`` selected for the matched observations
    X_num : numpy array of the remaining numeric columns
    train_index, test_index : lists of row positions; a row is in the training
        set when its ``fyear`` is below the module-level
        ``test_train_split_year``
    y : numpy array of the ``Y`` column
    """
    index_10k = pd.read_csv('data/10k/10k_index.csv',usecols=['gvkey','fyear'])
    # Remember each filing's row number so the text matrix can be indexed later.
    index_10k['index_10k'] = index_10k.index
    final_variable = pd.read_csv('data/final_variables.csv')
    # Keyword `axis=` instead of a positional 1: the positional form is not
    # accepted by current pandas releases.
    final_variable = final_variable.drop('Unnamed: 0', axis=1)
    # Infinite values are replaced by 0.
    final_variable = final_variable.replace([np.inf,-np.inf],0)

    # Same-year match between the text index and the numeric variables / Y.
    index_10k_y = pd.merge(left=index_10k, right=final_variable, how='inner', on=['gvkey','fyear'])
    print("Total number of observations with no forecasting: ")
    print(index_10k_y.shape)

    # Shift features so that they are `forecast_year` years older than Y.
    index_10k_y_n_year = n_year_before(index_10k_y, n = forecast_year)
    print("Total number of observations: ")
    print(index_10k_y_n_year.shape)

    y = np.array(index_10k_y_n_year['Y'])
    # Pick the text rows belonging to the matched observations.
    X_text = X_padded_text[index_10k_y_n_year['index_10k'].tolist()]
    X_num = index_10k_y_n_year.drop(
        ['gvkey', 'fyear', 'datadate', 'cusip', 'PERMCO', 'Y', 'PERMNO', 'index_10k'],
        axis=1)
    # `.values` replaces `DataFrame.as_matrix()`, which no longer exists in
    # current pandas; both yield the same numpy array.
    X_num = X_num.values
    # Split train/test by year (fyear here is the year of Y after the shift).
    index_10k_y_n_year = index_10k_y_n_year.reset_index(drop=True)
    train_index = index_10k_y_n_year[index_10k_y_n_year['fyear'] < test_train_split_year].index.tolist()
    test_index = index_10k_y_n_year[index_10k_y_n_year['fyear'] >= test_train_split_year].index.tolist()
    print(X_text.shape, X_num.shape, y.shape)
    return X_text, X_num, train_index, test_index, y

In [5]:
# Load the pre-computed padded word sequences.  The file is opened in a
# `with` block so the handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only load a file
# that was produced by this project.
with open("data/10k/X_padded.pickle", 'rb') as padded_file:
    X_padded = pickle.load(padded_file)
X_text, X_num, train_index, test_index, y = load_data(X_padded_text = X_padded)

Total number of observations with no forecasting: 
(79222, 44)
Total number of observations: 
(64999, 44)
(64999, 5000) (64999, 36) (64999,)


In [20]:
# Width of the word-embedding vectors used by the text branch.
embedding_dims = 50

# Optimisation settings (defaults of train_and_evaluate_model).
nb_epoch = 8
batch_size = 32

# Per-class weights handed to model.fit: class 1 counts 50 times as much as class 0.
class_weight = {0: 1, 1: 50}

def create_model_text(no_merge = False):
    """Build the (uncompiled) text branch: a Deep Averaging Network.

    Reference: "Deep Unordered Composition Rivals Syntactic Methods for Text
    Classification", Iyyer et al. 2015.

    Layers: embedding -> average over the whole sequence -> flatten ->
    Dense(8, relu) with L2 activity regularisation -> Dropout(0.1).
    No output layer is added here.  ``no_merge`` is accepted but not used.
    """
    vector_width = embedding_dims
    print('Build model...')

    text_branch = Sequential()
    text_branch.add(Embedding(max_features, vector_width, input_length=maxlen))

    # A Dropout(0.1) could be inserted at this point for the full DAN model.

    # Pool across the full sequence length reported by the embedding output.
    sequence_length = text_branch.output_shape[1]
    text_branch.add(AveragePooling1D(pool_length=sequence_length))
    text_branch.add(Flatten())

    # Plain fully-connected hidden layer followed by dropout.
    hidden_layer = Dense(8, activation='relu', activity_regularizer=activity_l2(0.1))
    text_branch.add(hidden_layer)
    text_branch.add(Dropout(0.1))

    return text_branch


def create_model_num(no_merge = False):
    """Build the (uncompiled) numeric branch: one Dense(4, relu) layer.

    The layer takes 36 input features and carries an L2 activity
    regulariser of 0.00001.  No output layer is added here.
    ``no_merge`` is accepted but not used.
    """
    numeric_branch = Sequential()
    hidden_layer = Dense(4, input_dim=36, activation='relu',
                         activity_regularizer=activity_l2(0.00001))
    numeric_branch.add(hidden_layer)
    return numeric_branch

def create_model_merge_layer():
    """Build and compile the combined text + numeric classifier.

    The outputs of the text and numeric branches are concatenated along
    axis 1, passed through Dense(4, relu) and a single sigmoid unit, and the
    model is compiled with binary cross-entropy, the 'nadam' optimizer and
    the accuracy metric.  The concatenated output shape is printed.
    """
    text_branch = create_model_text()
    numeric_branch = create_model_num()
    joined = Merge([text_branch, numeric_branch], mode='concat', concat_axis = 1)

    final_model = Sequential()
    final_model.add(joined)
    merge_layer = final_model.layers[-1]
    print(merge_layer.output_shape)

    final_model.add(Dense(4, activation='relu', activity_regularizer=activity_l2(0.00001)))
    final_model.add(Dense(1, activation='sigmoid'))
    final_model.compile(
        loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['accuracy'],
    )
    return final_model


def get_date_time():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


class IntervalEvaluation(Callback):
    """Keras callback that reports ROC AUC on a fixed data set during training.

    Every ``interval`` epochs the model's predicted probabilities for
    ``X_val`` are scored against ``y_val`` with ``roc_auc_score``; the result
    is printed and stored in the epoch ``logs`` under the key ``'score'``.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)``.  It is unpacked into exactly two values, so the
        empty default raises ``ValueError``; callers must supply the pair.
    interval : int
        Evaluate on epochs where ``epoch % interval == 0``.
    """
    def __init__(self, validation_data=(), interval=1):
        # Name this class (not Callback) in super(): super(Callback, self)
        # starts the lookup *after* Callback and so skips Callback.__init__.
        super(IntervalEvaluation, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute the AUC for this epoch and record it in ``logs['score']``."""
        # Avoid a shared mutable default: this method writes into `logs`.
        if logs is None:
            logs = {}
        if epoch % self.interval == 0:
            y_pred = self.model.predict_proba(self.X_val, verbose=0)
            score = metrics.roc_auc_score(self.y_val, y_pred)
            logs['score'] = score
            print(" epoch:{:d} AUC: {:.4f}".format(epoch, score))
            
            
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, 
                             model_output, model_config_output,
                             batch_size = batch_size, nb_epoch = nb_epoch, verbose = 1,
                            ):
    """Fit ``model``, score it on the test data, and persist model and metrics.

    Training uses the module-level ``class_weight`` and two callbacks: an
    ``IntervalEvaluation`` that computes the AUC on ``(X_test, y_test)`` after
    every epoch, and an ``EarlyStopping`` that monitors that ``'score'``
    (mode 'max', patience 2).  Note that the test data therefore also drives
    early stopping.

    After fitting, the model is saved to ``model_output + <timestamp>``, its
    JSON config to ``model_config_output + <timestamp>``, and a line with the
    metrics is appended to ``model/dl_log.txt``.

    Returns ``[accuracy_ratio, roc, brier]`` where ``accuracy_ratio`` is
    ``2 * (roc - 0.5)``.
    """
    auc_tracker = IntervalEvaluation(validation_data=(X_test, y_test), interval=1)
    stopper = EarlyStopping(monitor='score', min_delta=0, patience=2, mode = 'max')
    # The AUC callback must come first so 'score' is in the logs when the
    # early-stopping callback looks for it.
    model.fit(
        X_train, y_train,
        batch_size=batch_size,
        nb_epoch=nb_epoch,
        validation_data=None,
        class_weight=class_weight,
        verbose=verbose,
        callbacks=[auc_tracker, stopper],
    )

    # Out-of-sample metrics.
    test_scores = model.predict(X_test)
    roc = metrics.roc_auc_score(y_test, test_scores)
    accuracy_ratio = 2 * (roc - 0.5)
    brier = metrics.brier_score_loss(y_test, test_scores)
    results = [accuracy_ratio, roc, brier]

    # Persist the fitted model and its architecture, tagged with a timestamp.
    date_time_stamp = get_date_time()
    model.save(model_output + date_time_stamp)
    with open(model_config_output + date_time_stamp, 'w') as config_file:
        config_file.write(model.to_json())

    print("AUC " + str(roc))
    print("Accuracy_ratio " + str(accuracy_ratio))
    print("Brier Score " + str(brier))

    # Append the run to the shared log.
    with open("model/dl_log.txt", 'a') as log_file:
        log_file.write(date_time_stamp + "\n" + str(results) + "\n")
    return results



def forecast_performace(X_text, X_num, train_index, test_index, y):
    """Build the merged model, then train and score it on the given split.

    ``train_index`` / ``test_index`` select the rows of ``X_text``, ``X_num``
    and ``y`` for each side of the split.  The text and numeric inputs are
    passed to the model as a two-element list.  The metrics are printed and
    logged by ``train_and_evaluate_model``; nothing is returned.
    """
    print("Running Model")
    merged_model = create_model_merge_layer()

    train_inputs = [X_text[train_index], X_num[train_index]]
    test_inputs = [X_text[test_index], X_num[test_index]]

    train_and_evaluate_model(
        merged_model,
        train_inputs, y[train_index],
        test_inputs, y[test_index],
        model_output = 'model/DL_merge_.mod',
        model_config_output = "model/DL_merge.json",
    )



In [None]:
# Train the merged text + numeric model and report its out-of-sample metrics.
forecast_performace(
    X_text=X_text,
    X_num=X_num,
    train_index=train_index,
    test_index=test_index,
    y=y,
)