In [1]:
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import sklearn.metrics 


import pandas as pd
import numpy as np

import datetime
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten, Convolution2D, Convolution1D, Reshape, Lambda, AveragePooling1D, AveragePooling2D, MaxPooling1D
from keras.layers import LSTM, SimpleRNN, GRU
from keras.regularizers import l1, l2, l1
from keras.preprocessing.text import Tokenizer
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import Merge
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping
from keras.models import load_model
from keras.utils.vis_utils import plot_model


Using TensorFlow backend.


2017-02-17  
Feng Mai  

Report test-set results for the best selected models.

# Deep Learning Models

## Helper Functions

In [2]:
# Year boundary intended for the year-based train/test split. load_data() declares its
# own `test_train_split_year` parameter (default None), so this module-level value is
# only used when a caller passes it explicitly.
test_train_split_year = 2011



def n_year_before(df, n = 1):
    """Pair each (gvkey, fyear) outcome Y with the features recorded n years earlier.

    The feature rows have their 'fyear' advanced by ``n`` and their own 'Y' removed;
    they are then inner-joined on ('fyear', 'gvkey') with the original outcomes, so
    only firm-years that have both an outcome and an n-year-older feature row remain.
    The input frame is not modified.
    """
    join_keys = ['fyear', 'gvkey']
    outcomes = df[join_keys + ['Y']]
    # Shift the features forward so that they line up with the outcome n years later.
    shifted_features = df.drop('Y', axis=1).assign(fyear=lambda frame: frame['fyear'] + n)
    return shifted_features.merge(outcomes, how='inner', on=join_keys)


def load_data(X_padded_text, forecast_year = 1, random_split = False, random_state = 1001,
              test_train_split_year = None,
              index_path = '/shared/data/10k_2017/processed_corpus/10k_index.csv',
              variables_path = 'data/final_variables.csv'):
    """Align padded text sequences with firm-year variables and build a train/test split.

    Parameters
    ----------
    X_padded_text : array-like indexed by row position
        Padded token sequences. Rows are selected with the row positions of the
        index CSV (assumes both are in the same order -- TODO confirm with the
        preprocessing step that produced them).
    forecast_year : int
        Number of years between the features/text and the outcome Y.
    random_split : bool
        If True, draw a random 80/20 split (seeded by ``random_state``) and pickle
        it to ``data/split.pickle_<forecast_year>r<random_state>``.
        If False, split by year using ``test_train_split_year``.
    random_state : int
        Seed for the random split.
    test_train_split_year : int or None
        Outcome years strictly below this value go to training, the rest to testing.
        Required when ``random_split`` is False.
    index_path, variables_path : str
        Locations of the 10-K index CSV and of the numerical variables CSV.

    Returns
    -------
    X_text, X_num, train_index, test_index, y
        Text rows, numerical feature matrix, positional train/test indices into
        those arrays, and the outcome vector.

    Raises
    ------
    ValueError
        If a year split is requested without ``test_train_split_year``.
    """
    if not random_split and test_train_split_year is None:
        raise ValueError("test_train_split_year is required when random_split is False")

    index_10k = pd.read_csv(index_path, usecols=['gvkey','fyear'])
    # Remember each filing's row position; it is used below to pick rows of X_padded_text.
    index_10k['index_10k'] = index_10k.index

    final_variable = pd.read_csv(variables_path)
    final_variable = final_variable.drop('Unnamed: 0', axis=1)
    # Infinite ratios are replaced by 0 rather than dropped.
    final_variable = final_variable.replace([np.inf,-np.inf],0)
    final_variable = final_variable.query('fyear <= 2014 & fyear >= 1993')

    # Same-year match between text rows and numerical variables (no forecasting lag yet).
    index_10k_y = pd.merge(left=index_10k, right=final_variable, how='inner', on=['gvkey','fyear'])
    print("Total number of observations with no forecasting: ")
    print(index_10k_y.shape)

    # Pair each outcome with the text/features from `forecast_year` years earlier.
    index_10k_y_n_year = n_year_before(index_10k_y, n = forecast_year)
    print("Total number of observations: ")
    print(index_10k_y_n_year.shape)

    # Deterministic row order, then a clean 0..n-1 index so that the train/test
    # indices are valid positions in the returned arrays.
    index_10k_y_n_year = index_10k_y_n_year.sort_values(['gvkey', 'fyear'], ascending=[True, True])
    index_10k_y_n_year = index_10k_y_n_year.reset_index(drop=True)

    if random_split:
        all_index = index_10k_y_n_year.index.tolist()
        split = sklearn.model_selection.train_test_split(all_index, train_size = 0.8, random_state=random_state)
        split_path = "data/split.pickle_" + str(forecast_year) + "r" + str(random_state)
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(split_path, "wb") as split_file:
            pickle.dump(split, split_file)
        train_index = split[0]
        test_index = split[1]
    else:
        is_train = index_10k_y_n_year['fyear'] < test_train_split_year
        train_index = index_10k_y_n_year[is_train].index.tolist()
        test_index = index_10k_y_n_year[~is_train].index.tolist()

    y = np.array(index_10k_y_n_year['Y'])
    # Text rows that have a matched outcome `forecast_year` years later.
    X_text = X_padded_text[index_10k_y_n_year['index_10k'].tolist()]
    X_num = index_10k_y_n_year.drop(
        ['gvkey', 'fyear', 'datadate', 'cusip', 'PERMCO', 'Y', 'PERMNO', 'index_10k'], axis=1)
    # `.values` replaces DataFrame.as_matrix(), which no longer exists in current pandas.
    X_num = X_num.values
    print(len(train_index), len(test_index))
    return X_text, X_num, train_index, test_index, y



def performance_measure(pred_yp, x, y):
    '''
    Summarise ranking quality of predicted probabilities.

    pred_yp: predicted probabilities of the positive class (any array-like; a
             column vector such as a Keras output is flattened).
    x:       unused; kept so existing callers do not break.
    y:       true binary labels, same length as pred_yp.

    Returns a DataFrame with columns 'Group' and 'Rate': first the rows
    'Accuracy_ratio' and 'AUC', then the share of all positives captured in
    each of the top five deciles (observations ranked by descending
    probability) and one combined row '6-10' for the remaining deciles.
    The Brier score is not reported.
    '''
    scores = np.asarray(pred_yp).ravel()
    labels = np.asarray(y).ravel()

    roc_auc = roc_auc_score(labels, scores)
    accuracy_ratio = (roc_auc - 0.5) * 2

    # Rank observations from the highest to the lowest predicted probability.
    ranked = pd.DataFrame({'y_true': labels, 'probability': scores})
    ranked = ranked.sort_values('probability', axis=0, ascending=False).reset_index(drop=True)
    ranked_y = ranked['y_true']

    # Ten equally sized buckets; whatever is left after the tenth bucket
    # (because of rounding) is counted in the last one.
    bucket_size = int(round(float(len(ranked)) / 10))
    positives_per_bucket = [
        ranked_y.iloc[i * bucket_size:(i + 1) * bucket_size].sum() for i in range(10)
    ]
    positives_per_bucket[9] += ranked_y.iloc[10 * bucket_size:].sum()

    total_positives = ranked_y.sum()
    rate = np.array(positives_per_bucket, dtype=float) / total_positives
    tencile_result = pd.DataFrame({'Group': list(range(1, 11)), 'Rate': rate})

    # Keep deciles 1-5 and collapse deciles 6-10 into a single row.
    in_top_half = tencile_result['Group'] <= 5
    sum_6_to_10 = tencile_result.loc[~in_top_half, 'Rate'].sum()
    tencile_result = pd.concat(
        [tencile_result.loc[in_top_half],
         pd.DataFrame({'Group': ['6-10'], 'Rate': [sum_6_to_10]})],
        ignore_index=True)

    overall_results = pd.DataFrame({'Group':['Accuracy_ratio', 'AUC'],'Rate':[accuracy_ratio, roc_auc]})

    return pd.concat([overall_results, tencile_result])


def score_tencile_table_keras(model,x,y):
    '''
    Score a model exposing ``predict`` on (x, y).

    input: a model and X, y
    output: the table produced by performance_measure for the model's predictions
    '''
    return performance_measure(model.predict(x), x, y)


def compare_multiple_models_keras(model_list, x, y):
    '''
    Column-bind the performance tables of several models.

    Each model contributes one 'Rate' column (in the order of model_list);
    the 'Group' label columns are dropped from the combined table.
    '''
    per_model_tables = []
    for model in model_list:
        per_model_tables.append(score_tencile_table_keras(model, x, y))
    side_by_side = pd.concat(per_model_tables, axis=1)
    return side_by_side.drop(['Group'], axis=1)
    

In [3]:
# Load the padded text sequences; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("data/10k/X_padded.pickle", 'rb') as padded_file:
    X_padded = pickle.load(padded_file)

## Keras results for textual data

### 1-year ahead

Split by year

In [None]:
# Year-based split, as announced by the heading: outcome years before
# `test_train_split_year` are used for training, the remaining years for testing.
X_text, X_num, train_index, test_index, y = load_data(X_padded_text = X_padded,
                                                      forecast_year = 1,
                                                      random_split = False,
                                                      test_train_split_year = test_train_split_year)

Random Split

In [4]:
# Random 80/20 split with seed 1004, 1-year-ahead outcome.
X_text, X_num, train_index, test_index, y = load_data(
    X_padded_text = X_padded,
    forecast_year = 1,
    random_split = True,
    random_state = 1004,
)

Total number of observations with no forecasting: 
(95987, 44)
Total number of observations: 
(83746, 44)
66996 16750


In [5]:
# Saved text models, in the column order of the resulting table.
text_model_paths = [
    'model/final/y1/1004/text_embedding_1004_1y.mod2017-06-09 15:51:15',
    'model/final/y1/1004/text_cnn_1004_1y.mod2017-06-13 13:47:36',
]
compare_multiple_models_keras([load_model(model_path) for model_path in text_model_paths],
                              X_text[test_index], y[test_index])

Unnamed: 0,Rate,Rate.1
0,0.567878,0.427811
1,0.783939,0.713906
0,0.357143,0.25
1,0.202381,0.190476
2,0.154762,0.119048
3,0.107143,0.178571
4,0.059524,0.071429
5,0.119048,0.190476


## Keras results for numerical data

In [6]:
# Same random split (seed 1004) as in the text section; reloaded here so that this
# section can be run on its own.
X_text, X_num, train_index, test_index, y = load_data(
    X_padded_text = X_padded,
    forecast_year = 1,
    random_split = True,
    random_state = 1004,
)

Total number of observations with no forecasting: 
(95987, 44)
Total number of observations: 
(83746, 44)
66996 16750


DL Model

In [8]:
# Saved numerical-feature models, in the column order of the resulting table.
num_model_paths = [
    'model/final/y1/1004/num_simple_1004_1y.mod2017-06-09 17:57:28',
    'model/final/y1/1004/num_deeper_1004_1y.mod2017-06-09 17:59:13',
    'model/final/y1/1004/num_wider_1004_1y.mod2017-06-09 18:11:02',
]
compare_multiple_models_keras([load_model(model_path) for model_path in num_model_paths],
                              X_num[test_index], y[test_index])

Unnamed: 0,Rate,Rate.1,Rate.2
0,0.633211,0.603058,0.596655
1,0.816606,0.801529,0.798328
0,0.404762,0.345238,0.238095
1,0.25,0.202381,0.321429
2,0.142857,0.238095,0.238095
3,0.119048,0.095238,0.119048
4,0.035714,0.083333,0.035714
5,0.047619,0.035714,0.047619


# Sklearn Models

## Helper functions

In [None]:
# All numerical data: read the variables file and list the feature columns, i.e.
# everything except identifiers, dates and the outcome Y.
final_variable = pd.read_csv('data/final_variables.csv')
# `axis` is passed by keyword; current pandas rejects it as a positional argument.
final_variable = final_variable.drop('Unnamed: 0', axis=1)
drop_list = ['gvkey','datadate','fyear','cusip','PERMNO','PERMCO', 'Y']
all_x_var = list(final_variable.drop(drop_list, axis=1))
print(all_x_var)

test_train_split_year = 2011
forecast_year = 1

def n_year_before(df, n = 1):
    """Pair each (gvkey, fyear) outcome Y with the features recorded n years earlier.

    Same logic as the helper of the same name defined in the deep-learning section
    of this notebook; this later definition replaces it when the cell is run.
    The input frame is not modified.
    """
    join_keys = ['fyear', 'gvkey']
    outcomes = df[join_keys + ['Y']]
    # Shift the features forward so that they line up with the outcome n years later.
    shifted_features = df.drop('Y', axis=1).assign(fyear=lambda frame: frame['fyear'] + n)
    return shifted_features.merge(outcomes, how='inner', on=join_keys)

def load_aligned_data(how='text', svd=False, forecast_year = 1, test_train_split_year = None, random_state = None):
    """Load a pickled (train_x, train_y, test_x, test_y) bundle from data/aligned/.

    The file name is ``sklearn_<kind>_<forecast_year><suffix>`` where

    * ``<kind>`` is ``num``, ``text``, ``total``, or ``textsvd`` / ``totalsvd``
      when ``svd is True`` (svd is ignored for ``how="num"``);
    * ``<suffix>`` is ``_<test_train_split_year>`` when a split year is given,
      otherwise ``r<random_state>``.

    Any other value of ``how`` returns ``(None, None, None, None)`` without
    touching the file system.

    NOTE: pickle.load can execute arbitrary code -- only load files you trust.
    """
    if test_train_split_year is not None:
        suffix = "_" + str(test_train_split_year)
    else:
        suffix = "r" + str(random_state)

    if how == "num":
        stem = "sklearn_num_"
    elif how in ("text", "total"):
        stem = "sklearn_" + how + ("svd_" if svd is True else "_")
    else:
        return None, None, None, None

    # Context manager so the file handle is closed after loading.
    with open("data/aligned/" + stem + str(forecast_year) + suffix, 'rb') as aligned_file:
        train_x, train_y, test_x, test_y = pickle.load(aligned_file)
    return train_x, train_y, test_x, test_y


def tencile_table_sklearn(model,x,y, fit_x=None, fit_y=None):
    '''
    input: a model and X y
    output: the table produced by performance_measure for the model's scores on x

    Models exposing ``predict_proba`` are scored directly. Other models are
    wrapped in a CalibratedClassifierCV, which has to be fitted first:
    ``fit_x`` / ``fit_y`` supply that training data. When they are omitted the
    notebook-level ``train_x`` / ``train_y`` are used, as before.
    NOTE(review): fitting the calibrator trains on the fit data, so in this
    branch the scores may not come from the pickled model as-is -- see the
    CalibratedClassifierCV documentation.
    '''
    if hasattr(model, 'predict_proba'):
        pred_yp = model.predict_proba(x)[:,1]
    else:
        from sklearn.calibration import CalibratedClassifierCV
        # Backward-compatible fallback to the notebook globals.
        if fit_x is None:
            fit_x = train_x
        if fit_y is None:
            fit_y = train_y
        clf = CalibratedClassifierCV(model)
        clf.fit(fit_x, fit_y)
        # Score the data that was passed in, so predictions always line up with `y`.
        pred_yp = clf.predict_proba(x)[:,1]

    return performance_measure(pred_yp, x, y)


def f1_sklearn(model,x,y, class_ratio = 20):
    '''
    UNFINISHED: no F1 score is computed yet and ``class_ratio`` is not used.

    input: a model with predict_proba and X y
    output: the table produced by performance_measure for the positive-class
            probabilities
    '''
    positive_class_proba = model.predict_proba(x)[:,1]
    return performance_measure(positive_class_proba, x, y)

## Sklearn results for textual data

### 1-year ahead

Random Split

In [None]:
# One performance table per text model is appended by the cells below.
results = list()

In [None]:
# Text features for the random split with seed 1004 (no SVD, no split year).
train_x, train_y, test_x, test_y = load_aligned_data(
    how="text",
    svd=None,
    test_train_split_year= None,
    random_state = 1004,
)

In [None]:
# Load the pickled model (the context manager closes the file handle) and score it
# on the test split.
with open("model/final/y1/1004/text_lg.pickle", 'rb') as model_file:
    model = pickle.load(model_file)
results.append(tencile_table_sklearn(model, test_x, test_y))

In [None]:
# Load the pickled model (the context manager closes the file handle) and score it
# on the test split.
with open("model/final/y1/1004/text_SVM-text.pickle", 'rb') as model_file:
    model = pickle.load(model_file)
results.append(tencile_table_sklearn(model, test_x, test_y))

In [None]:
# Load the pickled model (the context manager closes the file handle) and score it
# on the test split.
with open("model/final/y1/1004/text_random_forest.pickle", 'rb') as model_file:
    model = pickle.load(model_file)
results.append(tencile_table_sklearn(model, test_x, test_y))

In [None]:
# One 'Rate' column per scored model, in the order the models were appended.
(
    pd.concat(results, axis=1)
    .drop(['Group'], axis=1)
)

## Sklearn results for numerical data

### 1-year ahead

Random Split

In [None]:
# Start a fresh list: one performance table per numerical model is appended below.
results = list()

In [None]:
# Numerical features for the random split with seed 1004 (no split year).
train_x, train_y, test_x, test_y = load_aligned_data(
    how= "num",
    svd=None,
    test_train_split_year= None,
    random_state = 1004,
)

In [None]:
# Load the pickled model (the context manager closes the file handle) and score it
# on the test split.
with open("model/final/y1/1004/num_lg.pickle", 'rb') as model_file:
    model = pickle.load(model_file)
results.append(tencile_table_sklearn(model, test_x, test_y))

In [None]:
# Load the pickled model (the context manager closes the file handle) and score it
# on the test split.
with open("model/final/y1/1004/num_SVM-num.pickle", 'rb') as model_file:
    model = pickle.load(model_file)
results.append(tencile_table_sklearn(model, test_x, test_y))

In [None]:
# Load the pickled model (the context manager closes the file handle) and score it
# on the test split.
with open("model/final/y1/1004/num_random_forest-num.pickle", 'rb') as model_file:
    model = pickle.load(model_file)
results.append(tencile_table_sklearn(model, test_x, test_y))

In [None]:
# One 'Rate' column per scored model, in the order the models were appended.
(
    pd.concat(results, axis=1)
    .drop(['Group'], axis=1)
)