In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
import sklearn

Use traditional methods in sklearn to forecast bankruptcy

# Bankruptcy prediction using sklearn

## Load and align data

SVD for text data

In [2]:
# Use SVD to reduce the dimension of the 10-K TF-IDF text features
svd_dim = 50  # number of SVD components kept (passed as n_components to TruncatedSVD)
max_features = 10000  # number of TF-IDF text columns; merge_data uses it to build the text column labels
def generate_save_SVD():
    """
    Note: Deprecated. The SVD should be fit on the train set and only applied
    (transform) to the test set; this helper fits it on the full matrix.

    Reads the TF-IDF matrix from data/10k/X_tfidf.npy, prints its shape and the
    summed explained-variance ratio of the fitted components, and saves the
    reduced matrix to data/10k/svd_X_tfidf.npy.
    """
    tfidf = np.load('data/10k/X_tfidf.npy')
    print(tfidf.shape)
    reducer = TruncatedSVD(n_components=svd_dim)
    reduced = reducer.fit_transform(pd.DataFrame(data=tfidf))
    print(reducer.explained_variance_ratio_.sum())
    np.save("data/10k/svd_X_tfidf.npy", reduced)

Load Numerical Data

In [3]:
# All numerical (accounting and market) data
final_variable = pd.read_csv('data/final_variables.csv')
# `columns=` instead of a positional axis: pandas 2.x no longer accepts drop(label, 1)
final_variable = final_variable.drop(columns='Unnamed: 0')
# identifier, date and label columns; everything else is a candidate feature
drop_list = ['gvkey','datadate','fyear','cusip','PERMNO','PERMCO', 'Y']
all_x_var = list(final_variable.drop(columns=drop_list))
print(all_x_var)
lasso_x = ['PRICE','OIADPAT','NIMTA','FAT','LCTCHAT','EXCESS_RETURN','LCTAT','EBITDPAT'] # lasso selected features
# raw (non-decomposed) TF-IDF matrix of the 10-K text, one row per filing
non_svd_text = pd.DataFrame(data=np.load('data/10k/X_tfidf.npy'))

['NIAT', 'NISALE', 'OIADPAT', 'OIADPSALE', 'EBITAT', 'EBITDPAT', 'EBITSALE', 'SEQAT', 'REAT', 'LCTAT', 'LCTCHAT', 'LTAT', 'LOGSALE', 'CHAT', 'CHLCT', 'QALCT', 'ACTLCT', 'WCAPAT', 'LCTLT', 'INVTSALE', 'SALEAT', 'APSALE', 'LOGAT', 'INVCHINVT', 'CASHAT', 'LCTSALE', 'RELCT', 'FAT', 'SIGMA', 'NIMTA', 'LTMTA', 'CASHMTA', 'PRICE', 'RSIZE', 'EXCESS_RETURN', 'MBE']


Helper functions to load and align data

In [4]:
def n_year_before(df, n = 1):
    """Pair each label Y with the features observed n fiscal years earlier.

    The features' ``fyear`` is shifted forward by ``n`` and inner-joined with the
    (fyear, gvkey, Y) labels of ``df``, so only firm-years that have both a label
    and features from n years before are kept. ``df`` itself is not modified.
    """
    lagged_features = df.drop('Y', axis=1).assign(fyear=lambda frame: frame['fyear'] + n)
    labels = df.loc[:, ['fyear', 'gvkey', 'Y']]
    return lagged_features.merge(labels, how='inner', on=['fyear', 'gvkey'])


def merge_data(svd=False, forecast_year = 1):
    """Join the numerical panel with the 10-K TF-IDF features and lag the features.

    Uses the notebook-level ``non_svd_text`` frame and the ``n_year_before`` helper.

    :param svd: accepted for backward compatibility. The raw TF-IDF columns are
        returned either way: the former ``svd=True`` branch read a global
        ``svd_text`` that this notebook never defines (NameError on a fresh
        kernel), and a decomposition fit on the whole corpus would leak test
        data. ``load_data`` fits the SVD on the train split instead.
    :type svd: boolean
    :param forecast_year: number of fiscal years between the features and the label
    :type forecast_year: int
    :return: (merged frame sorted by gvkey and fyear with a fresh 0..n-1 index,
        list of the text feature column labels)
    :rtype: tuple
    """
    print('Loading data')

    # load accounting and market data
    final_variable = pd.read_csv('data/final_variables.csv')
    # `columns=` instead of a positional axis: pandas 2.x no longer accepts drop(label, 1)
    final_variable = final_variable.drop(columns='Unnamed: 0')
    final_variable = final_variable.replace([np.inf,-np.inf],0)
    final_variable = final_variable.query('fyear <= 2014 & fyear >=1993')

    # NOTE(review): absolute path to a shared volume; presumably row-aligned with X_tfidf.npy — confirm
    index_10k = pd.read_csv('/shared/data/10k_2017/processed_corpus/10k_index.csv',usecols=['gvkey','fyear'])
    print("concating")
    # concat is positional: row i of the TF-IDF matrix gets row i of the index file
    text = pd.concat([non_svd_text, index_10k], axis=1)
    text_idx = list(non_svd_text.columns)

    print("merging")
    # combine text and numerical data
    text_num = pd.merge(left=final_variable, right=text, how='inner', on=['gvkey','fyear'])
    print("Total number of observations with no forecasting: ")
    print(text_num.shape)
    print("n year before")
    text_num_n_year = n_year_before(text_num, n = forecast_year)
    print("Total number of observations: ")
    print(text_num_n_year.shape)
    # stable ordering + fresh index: the pickled splits address rows by position
    text_num_n_year = text_num_n_year.sort_values(['gvkey', 'fyear'], ascending=[True, True])
    text_num_n_year = text_num_n_year.reset_index(drop=True)
    return text_num_n_year, text_idx


def load_data(how='text', svd=False, forecast_year = 1, 
              random_split = None, test_train_split_year = None):
    """Build train_x, train_y, test_x, test_y for the requested feature set and split.

    :param how: one of ['text', 'num', 'total']
    :type how: str
    :param svd: reduce the TF-IDF text columns with TruncatedSVD (fit on train,
        applied to test); ignored for how='num'
    :type svd: boolean
    :param forecast_year: forwarded to merge_data and used in the split file name
    :param random_split: seed identifying the pre-computed split stored in
        `data/split.pickle_<forecast_year>r<random_split>`; None or False
        selects the year-based split instead
    :param test_train_split_year: Train data = less than this year; Test data = greater than or equal to this year
    :return: train_x, train_y, test_x, test_y split by test_train_split_year or the split pickle file
    :rtype: pandas objects; for how='text' with svd the x parts are numpy arrays
    :raises ValueError: on an unknown `how`, or when neither split criterion is given
    """
    if how not in ('text', 'num', 'total'):
        raise ValueError("how must be 'text', 'num' or 'total', got %r" % (how,))
    # Callers of the year split pass random_split=False; treat it like None so the
    # year branch is taken instead of looking for a "...rFalse" split file.
    use_random_split = random_split is not None and random_split is not False
    if not use_random_split and test_train_split_year is None:
        raise ValueError("pass either random_split or test_train_split_year")

    text_num_n_year, text_idx = merge_data(svd = svd, forecast_year = forecast_year)
    print("spliting")
    if use_random_split:
        # NOTE: pickle executes code on load; only use split files produced by this project.
        split_path = "data/split.pickle_" + str(forecast_year) + "r" + str(random_split)
        with open(split_path, "rb") as handle:
            split = pickle.load(handle)
        # split holds positional row indices: [train positions, test positions]
        train = text_num_n_year.iloc[split[0]]
        test = text_num_n_year.iloc[split[1]]
    else:
        train = text_num_n_year[text_num_n_year['fyear'] < test_train_split_year]
        test = text_num_n_year[text_num_n_year['fyear'] >= test_train_split_year]

    train_y = train['Y']
    test_y = test['Y']

    if how == 'num':
        # use all numerical features
        train_x = train[all_x_var]
        test_x = test[all_x_var]
    else:
        train_x = train.loc[:,text_idx]
        test_x = test.loc[:,text_idx]
        if svd:
            print("Generating SVD")
            # fit on train only so no information from the test rows leaks into the components
            reducer = TruncatedSVD(n_components=svd_dim)
            train_x = reducer.fit_transform(train_x)
            test_x = reducer.transform(test_x)
        if how == 'total':
            if svd:
                # the SVD output is a bare ndarray; restore the row index so the
                # column-wise concat with the numerical features lines up
                svd_cols = ['svd_' + str(i) for i in range(train_x.shape[1])]
                train_x = pd.DataFrame(train_x, index=train.index, columns=svd_cols)
                test_x = pd.DataFrame(test_x, index=test.index, columns=svd_cols)
            train_x = pd.concat([train_x, train[all_x_var]], axis=1)
            test_x = pd.concat([test_x, test[all_x_var]], axis=1)
    print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)
    return train_x, train_y, test_x, test_y


def get_prob_auc(model,x,y):
    '''
    Score a fitted binary classifier on (x, y).

    input: a fitted model exposing predict_proba or decision_function, features x, labels y
    output: (AUC, Accuracy Ratio, Brier Score), where Accuracy Ratio = 2 * (AUC - 0.5)

    Models without predict_proba are scored on their decision_function output
    min-max scaled to [0, 1]; that is a monotone rescaling, not a calibrated
    probability, so the Brier score is only indicative in that case.
    '''
    if hasattr(model, 'predict_proba'):
        pred_yp = model.predict_proba(x)[:,1]
    else:
        scores = model.decision_function(x)
        span = scores.max() - scores.min()
        if span > 0:
            pred_yp = (scores - scores.min()) / span
        else:
            # constant scores carry no ranking information; avoid 0/0 -> NaN
            pred_yp = np.full_like(scores, 0.5, dtype=float)

    # only `metrics` is imported by the notebook, so go through the module
    roc_auc = metrics.roc_auc_score(y, pred_yp)
    accuracy_ratio = (roc_auc-0.5)*2
    brier = metrics.brier_score_loss(y, pred_yp)
    return roc_auc, accuracy_ratio, brier


In [5]:
def dump_aligned_data_by_random_seed(random_state):
    """Build and pickle the aligned train/test sets for one pre-computed random split.

    Writes `data/aligned/sklearn_<tag>_1r<random_state>` for the numerical,
    raw-text and SVD-text feature sets, each holding
    [train_x, train_y, test_x, test_y].

    :param random_state: seed identifying the split file loaded by load_data
    """
    # (how, svd, file tag). The "total" / "totalsvd" variants are not dumped.
    variants = [("num", None, "num"), ("text", False, "text"), ("text", True, "textsvd")]
    for how, use_svd, tag in variants:
        aligned = load_data(how=how, svd=use_svd, random_split = random_state)
        # context manager so the file is flushed and closed before the next variant
        with open("data/aligned/sklearn_" + tag + "_1r" + str(random_state), 'wb') as handle:
            pickle.dump(list(aligned), handle, protocol=4)

In [None]:
# Dump the aligned train/test sets for the ten random-split seeds 1000..1009
for random_state in range(1000, 1010):
    dump_aligned_data_by_random_seed(random_state)

In [6]:
def dump_aligned_data_by_year(year):
    """Build and pickle the aligned train/test sets for a split at fiscal year `year`.

    Writes `data/aligned/sklearn_<tag>_1_<year>` for the raw-text and SVD-text
    feature sets, each holding [train_x, train_y, test_x, test_y].

    :param year: rows with fyear < year go to train, the rest to test
    """
    # (how, svd, file tag). The "num", "total" and "totalsvd" variants are not dumped.
    variants = [("text", False, "text"), ("text", True, "textsvd")]
    for how, use_svd, tag in variants:
        # random_split must be None (not False): load_data only takes the year
        # branch when no random split is requested
        aligned = load_data(how=how, svd=use_svd, random_split = None, test_train_split_year = year)
        with open("data/aligned/sklearn_" + tag + "_1_" + str(year), 'wb') as handle:
            pickle.dump(list(aligned), handle, protocol=4)

In [None]:
# Dump the year-based splits for every test start year 2005..2012
for split_year in range(2005, 2013):
    dump_aligned_data_by_year(split_year)

In [7]:
def load_aligned_data(how='text', svd=False, forecast_year = 1, test_train_split_year = None, random_state = None):
    """Load a previously dumped [train_x, train_y, test_x, test_y] from data/aligned.

    The file name is `sklearn_<tag>_<forecast_year><suffix>` where the suffix is
    `_<test_train_split_year>` when a split year is given and `r<random_state>`
    otherwise, and the tag is `num`, `text`, `textsvd`, `total` or `totalsvd`.

    :param how: 'text', 'num' or 'total'; any other value returns four Nones
    :param svd: only the literal True selects the SVD variant (ignored for 'num')
    :return: train_x, train_y, test_x, test_y
    """
    if test_train_split_year is not None:
        suffix = "_" + str(test_train_split_year)
    else:
        suffix = "r" + str(random_state)

    if how == "num":
        tag = "num"
    elif how in ("text", "total"):
        tag = how + ("svd" if svd is True else "")
    else:
        # unknown feature set: keep the historical "nothing loaded" result
        return None, None, None, None

    path = "data/aligned/sklearn_" + tag + "_" + str(forecast_year) + suffix
    # NOTE: pickle executes code on load; only read files produced by this notebook.
    with open(path, 'rb') as handle:
        train_x, train_y, test_x, test_y = pickle.load(handle)
    return train_x, train_y, test_x, test_y

## Create and Compare Models

Search model hyper-parameters using random search  
see example: http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html

Factory method for final models

In [8]:
# Errors on the positive class (label 1) are weighted 100x to offset class imbalance.
class_weight = {0:1,1:100}

def create_model(model_type):
    """Factory for the tuned final models.

    :param model_type: one of 'MNB', 'lg-num', 'lg-text', 'lg-text-nosvd', 'lg',
        'SVM-text', 'SVC-text', 'SVM-num', 'SVM-total', 'random_forest',
        'random_forest-num', 'boost-text'
    :return: a new, unfitted scikit-learn estimator
    :raises ValueError: if model_type is not one of the names above
    """
    # Builders are lambdas so only the requested estimator is constructed.
    builders = {
        'MNB': lambda: MultinomialNB(),
        # L1 penalties need an L1-capable solver; liblinear was the default when
        # these C values were tuned, newer sklearn defaults to lbfgs (L2 only).
        'lg-num': lambda: LogisticRegression(C = 0.18, penalty='l1', solver='liblinear', class_weight=class_weight),
        'lg-text': lambda: LogisticRegression(C = 0.001, penalty='l2', class_weight=class_weight),
        'lg-text-nosvd': lambda: LogisticRegression(C = 0.1, penalty='l1', solver='liblinear', class_weight=class_weight),
        'lg': lambda: LogisticRegressionCV(Cs=10, class_weight=class_weight),
        'SVM-text': lambda: LinearSVC(C=0.001, class_weight=class_weight),
        'SVC-text': lambda: SVC(C=0.5, kernel='linear', class_weight=class_weight, probability = True),
        'SVM-num': lambda: LinearSVC(C=0.002, class_weight=class_weight),
        'SVM-total': lambda: LinearSVC(C= 0.0005, class_weight=class_weight),
        'random_forest': lambda: RandomForestClassifier(max_depth = 8, n_estimators= 200, class_weight=class_weight),
        'random_forest-num': lambda: RandomForestClassifier(max_depth = 2, n_estimators= 50, class_weight=class_weight),
        # GradientBoostingClassifier has no class_weight argument (passing it raises
        # TypeError); class weighting would have to go through fit(sample_weight=...).
        'boost-text': lambda: GradientBoostingClassifier(min_samples_leaf = 0.1, n_estimators=50),
    }
    if model_type not in builders:
        raise ValueError("unknown model_type %r; expected one of %s" % (model_type, sorted(builders)))
    return builders[model_type]()

### Only look at single models

In [None]:
# Logistic regression (CV-selected C) on the raw TF-IDF text, random split 1004
method = ('text','lg', False)
print('method is: ', method)
how, model_type, use_svd = method
train_x, train_y, test_x, test_y = load_aligned_data(
    how=how, svd=use_svd, test_train_split_year=None, random_state=1004)
print(sum(test_y))
model = create_model(model_type=model_type)
model.fit(train_x, train_y)
pred_yp = model.predict_proba(test_x)[:, 1]
roc = metrics.roc_auc_score(test_y, pred_yp)
print('out sample AUC:', roc, sep='\n')

In [None]:
# Random forest on the raw TF-IDF text, random split 1004
method = ('text','random_forest', False)
print('method is: ', method)
how, model_type, use_svd = method
train_x, train_y, test_x, test_y = load_aligned_data(
    how=how, svd=use_svd, test_train_split_year=None, random_state=1004)
print(sum(test_y))
model = create_model(model_type=model_type)
model.fit(train_x, train_y)
pred_yp = model.predict_proba(test_x)[:, 1]
roc = metrics.roc_auc_score(test_y, pred_yp)
print('out sample AUC:', roc, sep='\n')

In [None]:
# Multinomial naive Bayes on the raw TF-IDF text, random split 1004
method = ('text', 'MNB', False)
print('method is: ', method)
how, model_type, use_svd = method
train_x, train_y, test_x, test_y = load_aligned_data(
    how=how, svd=use_svd, test_train_split_year=None, random_state=1004)
print(sum(test_y))
model = create_model(model_type=model_type)
model.fit(train_x, train_y)
pred_yp = model.predict_proba(test_x)[:, 1]
roc = metrics.roc_auc_score(test_y, pred_yp)
print('out sample AUC:', roc, sep='\n')

In [None]:
# Linear SVM on the raw TF-IDF text, random split 1004.
# LinearSVC has no predict_proba, so it is wrapped in CalibratedClassifierCV.
method = ('text', 'SVM-text', False)
print('method is: ', method)
how, model_type, use_svd = method
train_x, train_y, test_x, test_y = load_aligned_data(
    how=how, svd=use_svd, test_train_split_year=None, random_state=1004)
print(sum(test_y))
model = create_model(model_type=model_type)
clf = CalibratedClassifierCV(model)
clf.fit(train_x, train_y)
pred_yp = clf.predict_proba(test_x)[:, 1]
roc = metrics.roc_auc_score(test_y, pred_yp)
print('out sample AUC:', roc, sep='\n')

### Split Randomly

In [None]:
# One entry per experiment: (method, results file, label written in front of each result line).
# The experiments run in this order, each over the ten random splits 1000..1009.
random_split_runs = [
    (('num','lg', None), "model/num_by_random.txt", "lg"),
    (('text','lg', None), "model/text_by_random.txt", "lg_nosvd"),
    (('text','lg', True), "model/text_by_random.txt", "lg_svd"),
]

for method, results_path, label in random_split_runs:
    for random_state in range(1000, 1010):
        print('method is: ', method)
        train_x, train_y, test_x, test_y = load_aligned_data(
            how=method[0], svd=method[2], random_state=random_state)
        print(train_x.shape, test_x.shape, sum(train_y), sum(test_y))
        model = create_model(model_type=method[1])
        model.fit(train_x, train_y)
        pred_yp = model.predict_proba(test_x)[:, 1]
        roc = metrics.roc_auc_score(test_y, pred_yp)
        print('out sample AUC:', roc, sep='\n')
        # results are appended, one "label, seed, horizon, AUC" line per split
        with open(results_path, 'a') as file:
            file.write(", ".join([label, str(random_state), str(1), str(roc)]) + "\n")

### Split by year - LG

Numeric

In [None]:
# Logistic regression on the numerical features, one year-based split per test start year
for year in range(2006, 2013):
    print(year)
    method = ('num','lg', None)
    print('method is: ', method)
    how, model_type, use_svd = method
    train_x, train_y, test_x, test_y = load_aligned_data(
        how=how, svd=use_svd, test_train_split_year=year)
    print(train_x.shape, test_x.shape, sum(train_y), sum(test_y))
    model = create_model(model_type=model_type)
    model.fit(train_x, train_y)
    pred_yp = model.predict_proba(test_x)[:, 1]
    roc = metrics.roc_auc_score(test_y, pred_yp)
    print('out sample AUC:', roc, sep='\n')
    # append "label, year, horizon, AUC"
    with open("model/num_by_year.txt", 'a') as file:
        file.write(", ".join(["lg", str(year), str(1), str(roc)]) + "\n")

Text - No SVD

In [None]:
# Logistic regression on the raw TF-IDF text, one year-based split per test start year
for year in range(2005, 2013):
    print(year)
    method = ('text','lg', False)
    print('method is: ', method)
    how, model_type, use_svd = method
    train_x, train_y, test_x, test_y = load_aligned_data(
        how=how, svd=use_svd, test_train_split_year=year)
    print(train_x.shape, test_x.shape, sum(train_y), sum(test_y))
    model = create_model(model_type=model_type)
    model.fit(train_x, train_y)
    pred_yp = model.predict_proba(test_x)[:, 1]
    roc = metrics.roc_auc_score(test_y, pred_yp)
    print('out sample AUC:', roc, sep='\n')
    # append "label, year, horizon, AUC"
    with open("model/text_by_year.txt", 'a') as file:
        file.write(", ".join(["lg_nosvd", str(year), str(1), str(roc)]) + "\n")

Text - SVD

In [None]:
# Logistic regression on the SVD-reduced text, one year-based split per test start year.
# Unlike the sibling cells, the AUC is only written to the results file, not printed.
for year in range(2006, 2013):
    print(year)
    method = ('text','lg', True)
    print('method is: ', method)
    how, model_type, use_svd = method
    train_x, train_y, test_x, test_y = load_aligned_data(
        how=how, svd=use_svd, test_train_split_year=year)
    print(train_x.shape, test_x.shape, sum(train_y), sum(test_y))
    model = create_model(model_type=model_type)
    model.fit(train_x, train_y)
    pred_yp = model.predict_proba(test_x)[:, 1]
    roc = metrics.roc_auc_score(test_y, pred_yp)
    # append "label, year, horizon, AUC"
    with open("model/text_by_year.txt", 'a') as file:
        file.write(", ".join(["lg_svd", str(year), str(1), str(roc)]) + "\n")

In [None]:
# Refit the model selected by `method` on the train/test data left by the previous
# cell, pickle it, and report its out-of-sample AUC.
model = create_model(model_type=method[1])
model.fit(train_x, train_y)
model_path = 'model/' + method[0] + "_" + method[1] + ".pickle"
with open(model_path, 'wb') as handle:
    pickle.dump(model, handle)
if not hasattr(model, 'predict_proba'):
    # margin-only classifier: calibrate to obtain probabilities
    # (CalibratedClassifierCV is already imported in the first cell)
    clf = CalibratedClassifierCV(model)
    clf.fit(train_x, train_y)
    pred_yp = clf.predict_proba(test_x)[:, 1]
else:
    pred_yp = model.predict_proba(test_x)[:, 1]
roc = metrics.roc_auc_score(test_y, pred_yp)
print('out sample AUC:', roc, sep='\n')

In [None]:
# NOTE(review): uses the `model` left over from the cell above and needs an estimator
# that exposes feature_importances_ — confirm `method` selects such a model before running
model.feature_importances_

In [None]:
# Vocabulary mapping of the 10-K corpus.
# NOTE: pickle executes code on load; only read files produced by this project.
with open("data/10k/word_map.pickle", 'rb') as handle:  # close the handle deterministically
    word_map = pickle.load(handle)

In [None]:
# Pair the first 10000 vocabulary entries (ordered by their word_map value) with the
# model's feature importances. Presumably word_map values are TF-IDF column indices — TODO confirm.
vocab_in_value_order = sorted(word_map, key=word_map.get)[:10000]
features = dict(zip(vocab_in_value_order, model.feature_importances_))

In [None]:
# Ten most important words: sort descending, since an ascending sort would list
# the *least* important features.
sorted(features, key=features.get, reverse=True)[:10]

# Benchmark multiple models

In [None]:
# Text benchmark on random split 1004: fit each model, pickle it, report the out-of-sample AUC.
methods = [('text','lg', False), ('text', 'SVM-text', False), ('text','random_forest', False)]

for method in methods:
    print('method is: ', method)
    train_x, train_y, test_x, test_y = load_aligned_data(
        how=method[0], svd=method[2], test_train_split_year=None, random_state=1004)
    print(sum(test_y))
    model = create_model(model_type=method[1])
    model.fit(train_x, train_y)
    model_path = 'model/' + method[0] + "_" + method[1] + ".pickle"
    with open(model_path, 'wb') as handle:
        pickle.dump(model, handle)

    if not hasattr(model, 'predict_proba'):
        # margin-only classifier: calibrate to obtain probabilities
        # (CalibratedClassifierCV is already imported in the first cell)
        clf = CalibratedClassifierCV(model)
        clf.fit(train_x, train_y)
        pred_yp = clf.predict_proba(test_x)[:, 1]
    else:
        pred_yp = model.predict_proba(test_x)[:, 1]
    roc = metrics.roc_auc_score(test_y, pred_yp)
    print('out sample AUC:', roc, sep='\n')

In [None]:
# Numerical benchmark on random split 1004.
# Full list: [('num','lg', None),('num','random_forest-num', None),('num','SVM-num', None)];
# currently narrowed to the random forest only.
methods = [('num','random_forest-num', None)]


for method in methods:
    print('method is: ', method)
    train_x, train_y, test_x, test_y = load_aligned_data(
        how=method[0], svd=method[2], test_train_split_year=None, random_state=1004)
    model = create_model(model_type=method[1])
    model.fit(train_x, train_y)
    model_path = 'model/' + method[0] + "_" + method[1] + ".pickle"
    with open(model_path, 'wb') as handle:
        pickle.dump(model, handle)

    if not hasattr(model, 'predict_proba'):
        # margin-only classifier: calibrate to obtain probabilities
        # (CalibratedClassifierCV is already imported in the first cell)
        clf = CalibratedClassifierCV(model)
        clf.fit(train_x, train_y)
        pred_yp = clf.predict_proba(test_x)[:, 1]
    else:
        pred_yp = model.predict_proba(test_x)[:, 1]
    roc = metrics.roc_auc_score(test_y, pred_yp)
    print('out sample AUC:', roc, sep='\n')

In [None]:
# Combined (SVD text + numerical) benchmark on the 2008 year split.
# NOTE(review): needs data/aligned/sklearn_totalsvd_1_2008, whose dump is commented out
# in dump_aligned_data_by_year — confirm the file exists.
methods = [('total','SVM-total', True)]

for method in methods:
    print('method is: ', method)
    train_x, train_y, test_x, test_y = load_aligned_data(
        how=method[0], svd=method[2], test_train_split_year=2008)
    model = create_model(model_type=method[1])
    model.fit(train_x, train_y)
    model_path = 'model/' + method[0] + "_" + method[1] + ".pickle"
    with open(model_path, 'wb') as handle:
        pickle.dump(model, handle)

    if not hasattr(model, 'predict_proba'):
        # margin-only classifier: calibrate to obtain probabilities
        # (CalibratedClassifierCV is already imported in the first cell)
        clf = CalibratedClassifierCV(model)
        clf.fit(train_x, train_y)
        pred_yp = clf.predict_proba(test_x)[:, 1]
    else:
        pred_yp = model.predict_proba(test_x)[:, 1]
    roc = metrics.roc_auc_score(test_y, pred_yp)
    print('out sample AUC:', roc, sep='\n')

#  10 x 10 cv

In [13]:
def cv_10_by_10(method):
    """Run the repeated cross-validation for one (how, model_type, ...) method.

    Reads the notebook globals ``ten_by_ten_split`` (iterable of repetitions, each
    an iterable of (train positions, test positions) folds), ``text_num_n_year``,
    ``text_idx``, ``all_x_var`` and ``forecast_year``. For every fold a fresh model
    is fit and its out-of-sample AUC is appended as one line to
    ``model/<how>_<model_type>_10by10_y<forecast_year>.txt``.

    :param method: tuple whose first item is 'text' or 'num' and whose second item
        is a create_model model type
    :raises ValueError: if the feature set is neither 'text' nor 'num'
    """
    how, model_type = method[0], method[1]
    # Fail fast: otherwise train_x would be unbound on the first fold.
    if how == 'text':
        feature_cols = text_idx
    elif how == 'num':
        feature_cols = all_x_var
    else:
        raise ValueError("cv_10_by_10 supports how='text' or how='num', got %r" % (how,))

    results_path = "model/"+ how + "_" + model_type + "_" + "10by10_y"+str(forecast_year)+".txt"

    for outer_split in ten_by_ten_split:
        for fold in outer_split:
            # folds hold positional row indices into text_num_n_year
            train = text_num_n_year.iloc[fold[0]]
            test = text_num_n_year.iloc[fold[1]]
            train_x = train.loc[:, feature_cols]
            test_x = test.loc[:, feature_cols]
            train_y = train['Y']
            test_y = test['Y']

            model = create_model(model_type=model_type)
            model.fit(train_x, train_y)
            if hasattr(model, 'predict_proba'):
                pred_yp = model.predict_proba(test_x)[:,1]
            else:
                # margin-only classifier: calibrate to obtain probabilities
                clf = CalibratedClassifierCV(model)
                clf.fit(train_x, train_y)
                pred_yp = clf.predict_proba(test_x)[:,1]
            roc = metrics.roc_auc_score(test_y, pred_yp)
            # append per fold so partial results survive an interrupted run
            with open(results_path, 'a') as file:
                file.write(str(roc))
                file.write("\n")

In [10]:
# Pre-computed 10x10 CV folds for the 1-year horizon.
# NOTE: pickle executes code on load; only read files produced by this project.
with open("data/ten_by_ten_splits.pickle_1r1001", 'rb') as handle:  # close the handle deterministically
    ten_by_ten_split = pickle.load(handle)

In [11]:
# Build the merged frame for the 1-year horizon; cv_10_by_10 reads these globals.
forecast_year = 1
merged = merge_data(svd=False, forecast_year=forecast_year)
text_num_n_year, text_idx = merged[0], merged[1]

Loading data
concating
merging
Total number of observations with no forecasting: 
(95987, 10043)
n year before
Total number of observations: 
(83746, 10043)


In [None]:
# 10x10 CV on the raw text features for the forecast horizon currently loaded
methods = [('text','lg-text-nosvd', False), ('text','random_forest', False)]
for method in methods:
    cv_10_by_10(method)

In [None]:
# 10x10 CV on the numerical features. Both entries use how='num': the forest tuned
# for numerical data was previously run on the text columns by mistake.
methods = [('num','lg-num', False), ('num','random_forest-num', False)]
for method in methods:
    cv_10_by_10(method)

In [None]:
# Pre-computed 10x10 CV folds for the 2-year horizon.
# NOTE: pickle executes code on load; only read files produced by this project.
with open("data/ten_by_ten_splits.pickle_2r1001", 'rb') as handle:  # close the handle deterministically
    ten_by_ten_split = pickle.load(handle)

In [None]:
# 2-year horizon: rebuild the merged frame, then run the text and numerical 10x10 CV.
forecast_year = 2
merged = merge_data(svd = False, forecast_year = forecast_year)
text_num_n_year = merged[0]
text_idx = merged[1]
methods = [('text','lg-text-nosvd', False), ('text','random_forest', False)]
for method in methods:
    cv_10_by_10(method)

# Both numerical entries use how='num': the forest tuned for numerical data was
# previously run on the text columns by mistake.
methods = [('num','lg-num', False), ('num','random_forest-num', False)]
for method in methods:
    cv_10_by_10(method)

In [None]:
# Pre-computed 10x10 CV folds for the 3-year horizon.
# NOTE: pickle executes code on load; only read files produced by this project.
with open("data/ten_by_ten_splits.pickle_3r1001", 'rb') as handle:  # close the handle deterministically
    ten_by_ten_split = pickle.load(handle)

In [None]:
# 3-year horizon: rebuild the merged frame, then run the text and numerical 10x10 CV.
forecast_year = 3
merged = merge_data(svd = False, forecast_year = forecast_year)
text_num_n_year = merged[0]
text_idx = merged[1]
methods = [('text','lg-text-nosvd', False), ('text','random_forest', False)]
for method in methods:
    cv_10_by_10(method)

# Both numerical entries use how='num': the forest tuned for numerical data was
# previously run on the text columns by mistake.
methods = [('num','lg-num', False), ('num','random_forest-num', False)]
for method in methods:
    cv_10_by_10(method)