#  Bankruptcy Prediction using Keras

2016-12-20, Feng Mai  
Use Keras merge layer to combine textual and numerical features for bankruptcy prediction

In [1]:
import pandas as pd
import numpy as np
import datetime
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Input, Conv1D, Dense, Dropout, Activation, Embedding, Flatten, Convolution2D, Convolution1D, Reshape, Lambda, AveragePooling1D, AveragePooling2D, MaxPooling1D
from keras.layers import LSTM, SimpleRNN, GRU
from keras.regularizers import l1, l2
from keras.preprocessing.text import Tokenizer
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, GlobalMaxPooling2D
from keras.layers import Merge
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping
from keras.models import Model


import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import sklearn

from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold, cross_val_score

Using TensorFlow backend.


## Data Preparation

Parameters

In [2]:
max_features = 10000 # max number of words to include (remove lower frequency words)
maxlen = 5000  # cut texts after this number of words 

In [3]:
def n_year_before(df, n = 1):
    """input x,y df, return df with y and n year before x"""
    dat_tmp = df.copy()
    dat_tmp['fyear'] = dat_tmp['fyear'] + n
    dat_tmp = dat_tmp.drop('Y',axis =1)
    Ys = df[['fyear','gvkey','Y']]
    n_year = pd.merge(dat_tmp,Ys,how = 'inner',on=['fyear','gvkey'])
    return n_year


def pad_text_data():
    """ Load tokenized word sequence and pad it for deep learning"""
    print('Loading data')
    X = np.load("data/10k/X_keras_unigram.npy")
    lengths = [len(x) for x in X]
    pd.Series(lengths).describe()
    # pad sequence for deep learning
    print('Pad sequences')
    X = sequence.pad_sequences(X, maxlen=maxlen)
    
    return X


def construct_aligned_num_matrix(forecast_year = 1):
    index_10k = pd.read_csv('/shared/data/10k_2017/processed_corpus/10k_index.csv',usecols=['gvkey','fyear'])
    index_10k['index_10k'] = index_10k.index
    final_variable = pd.read_csv('data/final_variables.csv')
    final_variable = final_variable.drop('Unnamed: 0',1)
    final_variable = final_variable.replace([np.inf,-np.inf],0)

    final_variable = final_variable.query('fyear <= 2014 & fyear >= 1993')
    # match text index with one year after y, index_10k_y has the index of text data has one year after Y matched
    index_10k_y = pd.merge(left=index_10k, right=final_variable, how='inner', on=['gvkey','fyear'])
    print("Total number of observations with no forecasting: ")
    print(index_10k_y.shape)

    index_10k_y_n_year = n_year_before(index_10k_y, n = forecast_year)
    print("Total number of observations: ")
    print(index_10k_y_n_year.shape)
    # split train-test by year
    index_10k_y_n_year = index_10k_y_n_year.sort_values(['gvkey', 'fyear'], ascending=[1, 1])
    index_10k_y_n_year = index_10k_y_n_year.reset_index(drop=True)
    return index_10k_y_n_year
    
    
def load_data(X_padded_text, forecast_year = 1, random_split = False, random_state = 1001,
              test_train_split_year = None):
    """ Load tokenized word sequence and pad it for deep learning
    If random_split, then treat it as a classification problem
    If random_split is false, use test_train_split_year to split the dataset into training and spliting
    """   
    index_10k_y_n_year = construct_aligned_num_matrix(forecast_year=1)

    if random_split:
        all_index = index_10k_y_n_year.index.tolist()
        split = sklearn.model_selection.train_test_split(all_index, train_size = 0.8, random_state=random_state)
        pickle.dump(split, file=open("data/split.pickle_" + str(forecast_year) + "r" + str(random_state), "wb"))
        train_index = split[0]
        test_index = split[1]
    else:
        train_index = index_10k_y_n_year[index_10k_y_n_year['fyear'] < test_train_split_year].index.tolist()
        test_index = index_10k_y_n_year[index_10k_y_n_year['fyear'] >= test_train_split_year].index.tolist()
    
    y = np.array(index_10k_y_n_year['Y'])
    X_text = X_padded_text[index_10k_y_n_year['index_10k'].tolist()] # get all text X which has matched one year after Y
    X_num = index_10k_y_n_year.drop(['gvkey', 'fyear', 'datadate', 'cusip', 'PERMCO', 'Y', 'PERMNO', 'index_10k'], 1)
    X_num = X_num.as_matrix()
    print(len(train_index), len(test_index))
    return X_text, X_num, train_index, test_index, y

In [32]:
def dump_ten_by_ten_split(X_padded_text, forecast_year = 1, random_state = 1001):
    '''
    Dump the indicies (list of kfold splits) for 10x10 cv
    '''
    index_10k_y_n_year = construct_aligned_num_matrix(forecast_year=forecast_year)

    ten_by_ten_splits = []
    all_index = index_10k_y_n_year.index.tolist()
    for i in range(10):
        all_index = shuffle(all_index, random_state=1001)
        kf = KFold(n_splits = 10)
        splits = kf.split(all_index)
        ten_by_ten_splits.append(list(splits))
        pickle.dump(ten_by_ten_splits, file=open("data/ten_by_ten_splits.pickle_" + str(forecast_year) + "r" + str(random_state), "wb"))

In [5]:
def pad_text():
    X_padded = pad_text_data()
    pickle.dump(X_padded, open("data/10k/X_padded.pickle", 'wb'), protocol=4)

In [None]:
# pad_text()

In [6]:
X_padded = pickle.load(open("data/10k/X_padded.pickle", 'rb'))

In [None]:
dump_ten_by_ten_split(X_padded, forecast_year = 1)

In [33]:
dump_ten_by_ten_split(X_padded, forecast_year = 2)

Total number of observations with no forecasting: 
(95987, 44)
Total number of observations: 
(73112, 44)


In [34]:
dump_ten_by_ten_split(X_padded, forecast_year = 3)

Total number of observations with no forecasting: 
(95987, 44)
Total number of observations: 
(63762, 44)


## Summary Stats

In [None]:
forecast_year = 1 
index_10k = pd.read_csv('/shared/data/10k_2017/processed_corpus/10k_index.csv',usecols=['gvkey','fyear'])
index_10k['index_10k'] = index_10k.index
final_variable = pd.read_csv('data/final_variables.csv')
final_variable = final_variable.drop('Unnamed: 0',1)
final_variable = final_variable.replace([np.inf,-np.inf],0)
final_variable = final_variable.query('fyear <= 2014 & fyear >=1994')

# match text index with one year after y, index_10k_y has the index of text data has one year after Y matched
index_10k_y = pd.merge(left=index_10k, right=final_variable, how='inner', on=['gvkey','fyear'])
print("Total number of observations with no forecasting: ")
print(index_10k_y.shape)

index_10k_y_n_year = n_year_before(index_10k_y, n = forecast_year)

print("Total number of observations with forecasting: ")
print(index_10k_y_n_year.shape)

In [None]:
sum(final_variable['Y'])

In [None]:
sum(index_10k_y['Y'])

In [None]:
index_10k_y.to_csv("data/final_sample.csv", index=None)

In [None]:
index_10k_y_n_year.head(5)

In [None]:
index_10k_y[['fyear', 'Y']].groupby('fyear').agg('count')

## Train and Evaluate a Deep Learning Model

Define different deep learning models

In [24]:
embedding_dims = 20

# def create_model_text(no_merge = False):
#     # this is the Deep Averaging Network Moodel
#     # see "Deep Unordered Composition Rivals Syntactic Methods for Text Classification", Iyyer et al. 2015

#     embedding_size = embedding_dims
#     print('Build model...')
#     model = Sequential()
#     model.add(Embedding(max_features, embedding_size, input_length=maxlen))
# #     print(model.layers[-1].output_shape)

#     model.add(GlobalAveragePooling1D())

#     model.add(Dense(4, activity_regularizer=l2(0.01)))
#     model.add(Activation('relu'))
#     model.add(Dense(4, activity_regularizer=l2(0.01)))
#     model.add(Activation('relu'))
    
#     if no_merge:
#         model.add(Dense(1, activation='sigmoid'))
# #         print(model.layers[-1].output_shape)

#         model.compile(loss='binary_crossentropy',
#                   optimizer='nadam',
#                   metrics=['accuracy'])
    
#     return model


def create_model_text(no_merge = False, archi = "embedding"):
    if archi == "embedding":
        # this is the fast-text model
        model = Sequential()
        # we start off with an efficient embedding layer which maps
        # our vocab indices into embedding_dims dimensions
        model.add(Embedding(max_features,
                            embedding_dims,
                            input_length=maxlen))
        # we add a GlobalAveragePooling1D, which will average the embeddings
        # of all words in the document
        model.add(GlobalAveragePooling1D())
        if no_merge:
            # We project onto a single unit output layer, and squash it with a sigmoid:
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

    elif archi == "CNN":
        # this is a CNN from Keras example
        nb_filter = 100
        filter_length = 4
        hidden_dims = 100
        model = Sequential()

        model.add(Embedding(max_features,
                            embedding_dims,
                            input_length=maxlen))
        model.add(Dropout(0.2))

        # we add a Convolution1D, which will learn filters
        # word group filters of size filter_length:
        model.add(Conv1D(nb_filter,
                         filter_length,
                         padding='valid',
                         activation='relu',
                         strides=1))
        # we use max pooling:
        model.add(GlobalMaxPooling1D())

        # We add a vanilla hidden layer:
        model.add(Dense(hidden_dims))
        model.add(Dropout(0.2))
        model.add(Activation('relu'))

        if no_merge:
            model.add(Dense(1))
            model.add(Activation('sigmoid'))
            model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
    return model


# def create_model_text(no_merge = False):
#     # this is a 2D CNN
#     nb_filter = 50
#     filter_length = 3
#     hidden_dims = 4
#     model = Sequential()

#     model.add(Embedding(max_features,
#                         embedding_dims,
#                         input_length=maxlen))
#     print(model.layers[-1].output_shape)
#     model.add(Reshape((maxlen, embedding_dims, 1)))
#     print(model.layers[-1].output_shape)
#     model.add(Convolution2D(nb_filter = nb_filter, nb_col= embedding_dims,  nb_row= filter_length, 
#                             border_mode='same'))
#     model.add(Activation("relu"))
#     print(model.layers[-1].output_shape)

#     # we use max pooling:
#     model.add(GlobalMaxPooling2D())
#     print(model.layers[-1].output_shape)

#     # We add a vanilla hidden layer:
#     model.add(Dense(hidden_dims))
#     model.add(Activation('relu'))

#     if no_merge:
#         model.add(Dense(1))
#         model.add(Activation('sigmoid'))
#         model.compile(loss='binary_crossentropy',
#                       optimizer='adam',
#                       metrics=['accuracy'])
#     return model

        
def create_model_num(no_merge = False):
    # simple model
    model = Sequential()
    model.add(Dense(8, input_dim=36, activation='relu', activity_regularizer=l1(0.0001)))   
    if no_merge:
        model.add(Dense(1, activation='sigmoid',init='zero'))
        model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
    return model

# def create_model_num(no_merge = False):
#     # deep model
#     model = Sequential()
#     model.add(Dense(4, input_dim=36, activation='relu'))  
#     model.add(Dense(4, activation='relu'))   
#     model.add(Dense(4, activation='relu'))   
#     if no_merge:
#         model.add(Dense(1, activation='sigmoid',init='zero'))
#         model.compile(loss='binary_crossentropy',
#                           optimizer='adam',
#                           metrics=['accuracy'])
#     return model
              
              
# def create_model_num(no_merge = False):
#     # wider model
#     model = Sequential()
#     model.add(Dense(16, input_dim=36, activation='relu', activity_regularizer=l2(0.0001)))
#     if no_merge:
#         model.add(Dense(1, activation='sigmoid',init='zero'))
#         model.compile(loss='binary_crossentropy',
#                           optimizer='adam',
#                           metrics=['accuracy'])
#     return model


In [22]:
batch_size = 32
nb_epoch = 6
class_weight = {0:1,1:1}

def get_date_time():
    d_date = datetime.datetime.now()
    reg_format_date = d_date.strftime("%Y-%m-%d %H:%M:%S")
    return(reg_format_date)


class IntervalEvaluation(Callback):
    """ Show AUC after interval number of epoches """
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict_proba(self.X_val, verbose=0)
            score = metrics.roc_auc_score(self.y_val, y_pred)
            logs['score'] = score
            print(" epoch:{:d} AUC: {:.4f}".format(epoch, score))
            
            
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, 
                             model_output, model_config_output,
                             batch_size = batch_size, nb_epoch = nb_epoch, verbose = 1,
                            ):
    """Calculate and print ROC score from a set of X and y"""
    ival = IntervalEvaluation(validation_data=(X_test, y_test), interval=1)
    early_stopping = EarlyStopping(monitor='score', min_delta=0.0, patience=500, mode = 'max')
    model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, 
             validation_data= None, class_weight = class_weight, verbose = verbose, callbacks=[ival, early_stopping])
    pred_yp = model.predict(X_test)
    roc = metrics.roc_auc_score(y_test , pred_yp)
    accuracy_ratio = (roc-0.5)*2 
    brier = metrics.brier_score_loss(y_test , pred_yp)
    date_time_stamp = get_date_time()
    model.save(model_output + date_time_stamp)
    with open(model_config_output + date_time_stamp, 'w') as file:
        file.write(model.to_json())
    print("AUC " + str(roc))
    print("Accuracy_ratio " + str(accuracy_ratio))
    print("Brier Score " + str(brier))
    with open("model/dl_log.txt", 'a') as file:
        file.write(date_time_stamp + "\n" + str([accuracy_ratio, roc, brier]) + "\n")
    return [accuracy_ratio, roc, brier]


def forecast_performace(X_text, X_num, train_index, test_index, y, model_output_path, 
                        model_type = "text", archi = "embedding"):
    """Train_test split using year; Calculate and print performance score"""
    print("Running Model")
    model = None # Clearing the NN.
    
    if model_type == "text":
        model = create_model_text(no_merge=True, archi = archi)
        X_train = X_text[train_index]
        y_train = y[train_index]
        X_test = X_text[test_index]
        y_test = y[test_index]
        return train_and_evaluate_model(model, X_train, y_train, X_test, y_test, 
                                        model_output = model_output_path + ".mod",
                                        model_config_output = model_output_path + ".json", verbose = 0)
        
    if model_type == "num":
        model = create_model_num(no_merge=True)
        X_train = X_num[train_index]
        y_train = y[train_index]
        X_test = X_num[test_index]
        y_test = y[test_index]
        return train_and_evaluate_model(model, X_train, y_train, X_test, y_test,
                                 model_output = model_output_path + ".mod",
                                 model_config_output = model_output_path + ".json",
                                 verbose = 0)

    if model_type == "merge":
        model = create_model_merge_layer()
        X_train = [X_text[train_index], X_num[train_index]]
        y_train = y[train_index]
        X_test = [X_text[test_index], X_num[test_index]]
        y_test =  y[test_index]
        return train_and_evaluate_model(model, X_train, y_train, X_test, y_test)

    

def kfold_performace(X_text, X_num, y, n_folds = 2):
    """Calculate and print average stratified K-fold performance score"""
    skf = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
    performance_scores = []
    for i, (train_index, test_index) in enumerate(skf):
        print("Running Fold", i+1, "/", n_folds)
        model = None # Clearing the NN.
        model = create_model_merge_layer()
        performance_scores.append(train_and_evaluate_model(model, X_text, X_num, y, train_index, test_index, 
                                                          just_text=False))
    print(sum(performance_scores)/n_folds)

#kfold_performace(X, y, n_folds = 2)

In [None]:
for random_state in range(1000, 1010):
    X_text, X_num, train_index, test_index, y = load_data(X_padded_text = X_padded, 
                                                          forecast_year = 1, 
                                                          random_split=True, random_state = random_state)
    print(sum(y[train_index]), sum(y[test_index]))
    AUC = forecast_performace(X_text, X_num, train_index, test_index, y, "text")[1]
    with open("model/text_by_random.txt", 'a') as file:
        file.write("CNN-50"+ ", "+ str(random_state) + ", " + str(1) + ", " + str(AUC) + "\n")
    AUC = forecast_performace(X_text, X_num, train_index, test_index, y, "num")[1]
#     with open("model/num_by_random.txt", 'a') as file:
#         file.write("DL"+ ", "+ str(random_state) + ", " + str(1) + ", " + str(AUC) + "\n")
        

In [None]:
X_text, X_num, train_index, test_index, y = load_data(X_padded_text = X_padded, 
                                                      forecast_year = 1, 
                                                      random_split=True, random_state = 1004)
print(sum(y[train_index]), sum(y[test_index]))

In [None]:
forecast_performace(X_text, X_num, train_index, test_index, y, "model/text_cnn_1004_1y", "text")

In [None]:
forecast_performace(X_text, X_num, train_index, test_index, y, "model/num_wider_1004_1y", "num")

In [None]:
for year in range(2005, 2013):
    print(year)
    X_text, X_num, train_index, test_index, y = load_data(X_padded_text = X_padded, forecast_year = 1, random_split=False, test_train_split_year=year)
    print(sum(y[train_index]), sum(y[test_index]))
    AUC = forecast_performace(X_text, X_num, train_index, test_index, y, "text")[1]
    with open("model/text_by_year.txt", 'a') as file:
        file.write("Embedding_avg"+ ", "+ str(year) + ", " + str(1) + ", " + str(AUC) + "\n")
    AUC = forecast_performace(X_text, X_num, train_index, test_index, y, "num")[1]
    with open("model/num_by_year.txt", 'a') as file:
        file.write("DL"+ ", "+ str(year) + ", " + str(1) + ", " + str(AUC) + "\n")
        

## 10 x 10 cv

Longer forecasting horizon y =2

In [None]:
forecast_year = 2
ten_by_ten_split = pickle.load(open("data/ten_by_ten_splits.pickle_2r1001", 'rb'))
index_10k_y_n_year = construct_aligned_num_matrix(forecast_year=forecast_year)


Total number of observations with no forecasting: 
(95987, 44)
Total number of observations: 
(73112, 44)


In [None]:
nb_epoch = 6

for outer_loop, outer_split in enumerate(ten_by_ten_split):
    for inner_loop, fold in enumerate(outer_split):
        train_index = fold[0]
        test_index = fold[1]
        y = np.array(index_10k_y_n_year['Y'])
        X_text = X_padded[index_10k_y_n_year['index_10k'].tolist()] # get all text X which has matched one year after Y
        X_num = index_10k_y_n_year.drop(['gvkey', 'fyear', 'datadate', 'cusip', 'PERMCO', 'Y', 'PERMNO', 'index_10k'], 1)
        X_num = X_num.as_matrix()
        print(len(train_index), len(test_index))
        print(sum(y[train_index]), sum(y[test_index]))
        performance = forecast_performace(X_text, X_num, train_index, test_index, y, "model/tenbyten_"+
                                          str(outer_loop)+"_"+str(inner_loop), model_type = "text", archi = "embedding")
        with open("model/dl_text_embedding_10by10_y"+str(forecast_year)+".txt", 'a') as file:
            for item in performance:
                file.write("%s," % item)
            file.write("\n")

65800 7312
272 26
Running Model
<keras.models.Sequential object at 0x7f59ff2673c8>
 epoch:0 AUC: 0.6290
 epoch:1 AUC: 0.6919
 epoch:2 AUC: 0.7100
 epoch:3 AUC: 0.7301


In [None]:
for outer_loop, outer_split in enumerate(ten_by_ten_split):
    for inner_loop, fold in enumerate(outer_split):
        train_index = fold[0]
        test_index = fold[1]
        y = np.array(index_10k_y_n_year['Y'])
        X_text = X_padded[index_10k_y_n_year['index_10k'].tolist()] # get all text X which has matched one year after Y
        X_num = index_10k_y_n_year.drop(['gvkey', 'fyear', 'datadate', 'cusip', 'PERMCO', 'Y', 'PERMNO', 'index_10k'], 1)
        X_num = X_num.as_matrix()
        print(len(train_index), len(test_index))
        print(sum(y[train_index]), sum(y[test_index]))
        performance = forecast_performace(X_text, X_num, train_index, test_index, y, "model/tenbyten_"+
                                          str(outer_loop)+"_"+str(inner_loop), model_type = "text", archi = "CNN")
        with open("model/dl_text_cnn_10by10_y"+str(forecast_year)+".txt", 'a') as file:
            for item in performance:
                file.write("%s," % item)
            file.write("\n")

In [None]:
for outer_loop, outer_split in enumerate(ten_by_ten_split):
    for inner_loop, fold in enumerate(outer_split):
        train_index = fold[0]
        test_index = fold[1]
        y = np.array(index_10k_y_n_year['Y'])
        X_text = X_padded[index_10k_y_n_year['index_10k'].tolist()] # get all text X which has matched one year after Y
        X_num = index_10k_y_n_year.drop(['gvkey', 'fyear', 'datadate', 'cusip', 'PERMCO', 'Y', 'PERMNO', 'index_10k'], 1)
        X_num = X_num.as_matrix()
        print(len(train_index), len(test_index))
        print(sum(y[train_index]), sum(y[test_index]))
        performance = forecast_performace(X_text, X_num, train_index, test_index, y, "model/tenbyten_"+
                                          str(outer_loop)+"_"+str(inner_loop), "num")
        with open("model/dl_num_10by10_y"+str(forecast_year)+".txt", 'a') as file:
            for item in performance:
                file.write("%s," % item)
            file.write("\n")

Longer forecasting horizon y =3

In [None]:
forecast_year = 3
ten_by_ten_split = pickle.load(open("data/ten_by_ten_splits.pickle_3r1001", 'rb'))
index_10k_y_n_year = construct_aligned_num_matrix(forecast_year=forecast_year)
nb_epoch = 6

for outer_loop, outer_split in enumerate(ten_by_ten_split):
    for inner_loop, fold in enumerate(outer_split):
        train_index = fold[0]
        test_index = fold[1]
        y = np.array(index_10k_y_n_year['Y'])
        X_text = X_padded[index_10k_y_n_year['index_10k'].tolist()] # get all text X which has matched one year after Y
        X_num = index_10k_y_n_year.drop(['gvkey', 'fyear', 'datadate', 'cusip', 'PERMCO', 'Y', 'PERMNO', 'index_10k'], 1)
        X_num = X_num.as_matrix()
        print(len(train_index), len(test_index))
        print(sum(y[train_index]), sum(y[test_index]))
        performance = forecast_performace(X_text, X_num, train_index, test_index, y, "model/tenbyten_"+
                                          str(outer_loop)+"_"+str(inner_loop), model_type = "text", archi = "embedding")
        with open("model/dl_text_embedding_10by10_y"+str(forecast_year)+".txt", 'a') as file:
            for item in performance:
                file.write("%s," % item)
            file.write("\n")
            
            
for outer_loop, outer_split in enumerate(ten_by_ten_split):
    for inner_loop, fold in enumerate(outer_split):
        train_index = fold[0]
        test_index = fold[1]
        y = np.array(index_10k_y_n_year['Y'])
        X_text = X_padded[index_10k_y_n_year['index_10k'].tolist()] # get all text X which has matched one year after Y
        X_num = index_10k_y_n_year.drop(['gvkey', 'fyear', 'datadate', 'cusip', 'PERMCO', 'Y', 'PERMNO', 'index_10k'], 1)
        X_num = X_num.as_matrix()
        print(len(train_index), len(test_index))
        print(sum(y[train_index]), sum(y[test_index]))
        performance = forecast_performace(X_text, X_num, train_index, test_index, y, "model/tenbyten_"+
                                          str(outer_loop)+"_"+str(inner_loop), model_type = "text", archi = "CNN")
        with open("model/dl_text_cnn_10by10_y"+str(forecast_year)+".txt", 'a') as file:
            for item in performance:
                file.write("%s," % item)
            file.write("\n")

for outer_loop, outer_split in enumerate(ten_by_ten_split):
    for inner_loop, fold in enumerate(outer_split):
        train_index = fold[0]
        test_index = fold[1]
        y = np.array(index_10k_y_n_year['Y'])
        X_text = X_padded[index_10k_y_n_year['index_10k'].tolist()] # get all text X which has matched one year after Y
        X_num = index_10k_y_n_year.drop(['gvkey', 'fyear', 'datadate', 'cusip', 'PERMCO', 'Y', 'PERMNO', 'index_10k'], 1)
        X_num = X_num.as_matrix()
        print(len(train_index), len(test_index))
        print(sum(y[train_index]), sum(y[test_index]))
        performance = forecast_performace(X_text, X_num, train_index, test_index, y, "model/tenbyten_"+
                                          str(outer_loop)+"_"+str(inner_loop), model_type = "text", archi = "CNN")
        with open("model/dl_text_cnn_10by10_y"+str(forecast_year)+".txt", 'a') as file:
            for item in performance:
                file.write("%s," % item)
            file.write("\n")

## Using pretrained embedding

In [None]:
# Example from : https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
import os
word_index = pickle.load(open("data/10k/word_map.pickle", "rb"))
GLOVE_DIR = "/shared/data/word_embedding/"
EMBEDDING_DIM = 100

In [None]:
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
embedding_matrix = np.zeros((max(word_index.values()) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [None]:
index_word = dict(zip(word_index.values(),word_index.keys()))

In [None]:
maxlen = 7500

In [None]:
def create_model_text(no_merge = True):
    # this is the fast-text model
    model = Sequential()
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    
    embedding_layer = Embedding(max(word_index.values()) + 1,
                            100,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=True)
    model.add(embedding_layer)
    model.add(GlobalAveragePooling1D())
    # We add a vanilla hidden layer:
    model.add(Dense(4))
    model.add(Dropout(0.0))
    model.add(Activation('relu'))


    if no_merge:
        model.add(Dense(1))
        model.add(Activation('sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    return model


In [None]:
for year in range(2006, 2013):
    print(year)
    X_text, X_num, train_index, test_index, y = load_data(X_padded_text = X_padded, forecast_year = 1, random_split=False, test_train_split_year=year)
    print(sum(y[train_index]), sum(y[test_index]))
    forecast_performace(X_text, X_num, train_index, test_index,o y, "text")