# Import Dataset and Data Splitting

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# for Google Colab only
# Mounts Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used by later cells (dataset pickle, NNs.py) resolve.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the full panel from a pickle on the mounted Drive and report its (rows, columns).
# The commented-out .sample(...) at the end of the line can be re-enabled to work on a
# random subset of rows.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
df = pd.read_pickle("/content/drive/MyDrive/Big_Data_Fin2/data_sources/correct dataset.pkl")#.sample(n=100000)
print(df.shape)

(3739449, 190)


In [None]:
# Cap the rendered rows at 6 so that displaying the frame shows only its head and tail.
pd.options.display.max_rows = 6
df

Unnamed: 0,date,permno,excess_ret,ret,rfree,mvel1,beta,betasq,chmom,dolvol,...,ind_84,ind_85,ind_86,ind_87,ind_88,ind_89,ind_90,ind_91,ind_92,ind_93
0,1986-02-01,10,-0.262610,-0.257143,0.005467,-0.375440,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
1,1986-03-01,10,0.360335,0.365385,0.005050,-0.496811,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
2,1986-04-01,10,-0.103717,-0.098592,0.005125,-0.401783,0.000000,0.000000,0.000000,-0.521182,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3739446,2016-10-01,93,-0.031253,-0.030878,0.000375,0.936380,0.650269,0.647670,-0.310701,0.993629,...,0,0,0,0,0,0,0,0,0,1
3739447,2016-11-01,93,-0.042553,-0.042128,0.000425,0.936096,0.638693,0.636094,-0.806963,0.993638,...,0,0,0,0,0,0,0,0,0,1
3739448,2016-12-01,93,0.127822,0.128247,0.000425,0.929911,0.622969,0.620384,-0.540670,0.993655,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# features + targets
y = df['excess_ret']

date = df['date']
nonfeatures = ['ret','excess_ret','rfree','permno','date']
# Derive the feature names from the same list that is dropped from X below, so the
# names always line up with the columns of X_train / X_val / X_test regardless of
# the column order in df (the previous positional slice [5:] assumed the five
# non-feature columns came first).
features = df.columns.drop(nonfeatures).values

# NOTE(review): the string bounds below assume df['date'] is a datetime column
# (pandas then compares against the parsed month start); if the dates were stored
# as strings the comparison would be lexicographic -- confirm the dtype.

# make 18 years of training data from 1957~1974
training = (date <= '1974-12') # boolean row mask for the training period
X_train, y_train = df.loc[training].drop(columns=nonfeatures).values, y.loc[training].values 

# make 12 years of validation data from 1975~1986
validation = (date > '1974-12') & (date <= '1986-12') 
X_val, y_val = df.loc[validation].drop(columns=nonfeatures).values, y.loc[validation].values 

# make the rest of 30 years as the test data
test = (date > '1986-12') 
X_test, y_test = df.loc[test].drop(columns=nonfeatures).values, y.loc[test].values 

# The numpy arrays above are all that the later cells use; drop the frame and the
# masks, then ask the garbage collector to reclaim what it can.
del df, training, validation, test
import gc
gc.collect()

0

In [None]:
print(X_train.shape, X_val.shape, X_test.shape)

(475849, 185) (770215, 185) (2493385, 185)


# Model Fitting

In [None]:
!cp /content/drive/MyDrive/Big_Data_Fin2/NNs.py . # copy the file to the current directory, for colab only
from NNs import *
from tensorflow.keras.callbacks import EarlyStopping


# Build the Model2Layers network (expected to come from the NNs star import above)
# and train it with Adam on mean squared error.
nn2 = Model2Layers()
nn2.compile(optimizer='Adam', loss='mse', metrics=['mse'])

# Train for at most 50 epochs in batches of 2000 rows, scoring each epoch on the
# validation split. EarlyStopping (default monitor) halts after 5 epochs without
# improvement and restores the best weights seen.
history2 = nn2.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=2000,
    epochs=50,
    verbose=True,
    callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Out-of-sample predictions of the fitted nn2 model on the test split.
ypred_nn2 = nn2.predict(X_test)

# MSE and R2 could be computed directly from their formulas, but the test set is
# very large, so the sklearn helpers are used to avoid creating unnecessary
# intermediate arrays. Equivalent manual forms, for reference:
#   mse = np.mean((y_test - ypred_nn2.ravel())**2)
#   rss = np.sum((y_test - ypred_nn2.ravel())**2)
#   tss = np.sum((y_test - np.mean(y_test))**2)
#   r2  = 1 - rss/tss
mse_nn2 = mean_squared_error(y_test, ypred_nn2)
r2_nn2 = r2_score(y_test, ypred_nn2)

print("MSE of the NN2 model is:",mse_nn2)
print("R2 of the NN2 model is:",r2_nn2)

In [None]:
# Build and compile the Model3Layers network with the same loss/optimizer as nn2.
# It is only compiled here; no fit call for nn3 appears in this notebook.
nn3 = Model3Layers()
nn3.compile(optimizer='Adam', loss='mse', metrics=['mse'])

In [None]:
# Build and compile the Model4Layers network with the same loss/optimizer as nn2.
# It is only compiled here; no fit call for nn4 appears in this notebook.
nn4 = Model4Layers()
nn4.compile(optimizer='Adam', loss='mse', metrics=['mse'])

# Calculating Feature Importances

In [None]:
from sklearn.metrics import r2_score

# Feature importance by ablation: zero out one feature column at a time and record
# the test-set R2 of the already-fitted nn2 model (rounded to 5 decimals).
#
# runtime warning: this runs one full predict pass over the test set per feature.

r2_list = []
# One working copy of the test matrix is reused for every feature; each column is
# zeroed, scored and then restored, instead of copying the whole matrix per iteration.
X_test_copy = X_test.copy()
for i in range(X_test.shape[1]):
  saved_column = X_test_copy[:, i].copy()
  X_test_copy[:, i] = 0

  try:
    # np.round works whether r2_score returns a numpy scalar or a plain Python
    # float (a plain float has no .round method).
    r2_new = np.round(r2_score(y_test, nn2.predict(X_test_copy)), 5)
  finally:
    # Put the original values back so the next feature is ablated on its own.
    X_test_copy[:, i] = saved_column
  r2_list.append(r2_new)

  if i%10 == 0:
    print(i,"iterations have finished...")

r2_scores = np.array(r2_list)



In [None]:
from joblib import Parallel, delayed
import multiprocessing

# Parallel version of the feature-importance loop. In practice the run time did not improve significantly,
# and progress is harder to track than with the plain for loop.

def get_r2_score(Xtest,i):
  """Return the R2 of nn2 on a copy of Xtest with column i set to zero.

  Args:
    Xtest: 2-D feature array; it is copied, so the caller's array is not modified.
    i: index of the feature column to zero out.

  Returns:
    The R2 score against the module-level y_test, rounded to 5 decimals.

  Relies on the module-level nn2, y_test and r2_score.
  """
  Xtest_copy = Xtest.copy()
  Xtest_copy[:, i] = 0

  y_pred = nn2.predict(Xtest_copy)
  # np.round works whether r2_score returns a numpy scalar or a plain Python float
  # (a plain float has no .round method).
  r2_new = np.round(r2_score(y_test, y_pred), 5)
  return r2_new


def apply_parrallel(Xtest,func):
    """Evaluate func(Xtest, i) for every column index i of Xtest in parallel.

    Args:
        Xtest: 2-D array; its second dimension determines how many calls are made.
        func: callable taking (Xtest, i) and returning one value per column.

    Returns:
        numpy array of the per-column results, in column order.
    """
    # Iterate over the columns of the array that was passed in, not the global X_test.
    n_columns = Xtest.shape[1]
    retList = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(Xtest,i) for i in range(n_columns))
    return np.array(retList)

# r2_scores= apply_parrallel(X_test, get_r2_score)

In [None]:
# normalize the r2 difference to make it sum up to 1
r2_diff = r2_nn2-r2_scores

r2_diff = (r2_diff-np.min(r2_diff))/(np.max(r2_diff)-np.min(r2_diff))
r2_diff = r2_diff/np.sum(r2_diff)

pd.Series(r2_diff,index=features).sort_values(ascending=False)



mvel1      0.062017
agr        0.057615
idiovol    0.055564
             ...   
beta       0.050267
mom12m     0.048484
turn       0.000000
Length: 20, dtype: float64

2

0

# Old Code

In [None]:
# Gradient descent algorithm: SGD optimizer with momentum 0.9.
# The learning rate is created as 0.0.
# NOTE(review): presumably a LearningRateScheduler callback is meant to set the
# actual rate each epoch -- confirm before using this optimizer on its own.
sgd = tf.keras.optimizers.SGD(learning_rate=0.0, momentum=0.9)

In [None]:
# Callbacks for training: a learning-rate schedule driven by step_decay plus early
# stopping on validation loss (patience of 5 epochs).
# LearningRateScheduler is referenced through tf.keras.callbacks, like EarlyStopping
# below, because the bare name is not imported anywhere in this notebook.
# NOTE: step_decay is defined in the "Drop-Based Learning Rate Decay" cell further
# down; on a fresh kernel that cell has to run before this one.
lrate = tf.keras.callbacks.LearningRateScheduler(step_decay)
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
callbacks_list = [lrate,es]

In [None]:
import numpy
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor

In [None]:
# Seed numpy's global random generator.
# NOTE(review): this seeds numpy only; TensorFlow's own randomness is not seeded here.
seed = 7
numpy.random.seed(seed)
# Module-level batch size and epoch count.
batch_size = 128
epochs = 10
def CrossValandResults(model,x_train,y_train):
  """Grid-search initializer, batch size and epoch count with 3-fold cross-validation.

  Args:
    model: model-building callable handed to the Keras scikit-learn wrapper as
      build_fn. It presumably has to accept an init_mode keyword, since init_mode
      is part of the search grid -- verify against the builder.
    x_train: training features passed to GridSearchCV.fit.
    y_train: training targets passed to GridSearchCV.fit.

  Prints the best score and parameters, followed by the mean and standard deviation
  of the test score for every parameter combination. Returns None.
  """
  # The target is a continuous return, so the regressor wrapper (the one imported in
  # this notebook) is used. The wrapper defaults come from the module-level
  # batch_size / epochs; the grids below use distinct local names so those globals
  # are not shadowed (shadowing them made this line raise UnboundLocalError).
  model_CV = KerasRegressor(build_fn=model, epochs=epochs, 
                            batch_size=batch_size, verbose=1)
  # define the grid search parameters
  init_mode_grid = ['uniform', 'lecun_uniform', 'normal', 'zero', 
                    'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
  batch_size_grid = [10, 20, 40, 60, 80, 100]
  epochs_grid = [10, 50, 100]

  param_grid = dict(init_mode=init_mode_grid,batch_size=batch_size_grid,epochs=epochs_grid)
  grid = GridSearchCV(estimator=model_CV, param_grid=param_grid, n_jobs=-1, cv=3)
  grid_result = grid.fit(x_train, y_train)

  # Reported as a generic score: with a regressor this is not an accuracy.
  print(f'Best score {grid_result.best_score_} using {grid_result.best_params_}')
  means = grid_result.cv_results_['mean_test_score']
  stds = grid_result.cv_results_['std_test_score']
  params = grid_result.cv_results_['params']
  for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')

In [None]:
# Drop-Based Learning Rate Decay
import math

# learning rate schedule
def step_decay(epoch):
    """Step-decay learning-rate schedule.

    Starts at 0.1 and multiplies the rate by 0.5 once for every 10 epochs, where
    the number of drops is floor((1 + epoch) / 10).

    Args:
        epoch: epoch index the rate is requested for.

    Returns:
        The learning rate for that epoch as a float.
    """
    base_rate, decay_factor, epochs_per_drop = 0.1, 0.5, 10.0
    drops_so_far = math.floor((epoch + 1) / epochs_per_drop)
    return base_rate * math.pow(decay_factor, drops_so_far)

# Keras callback that uses step_decay as the learning-rate schedule.
lrate = tf.keras.callbacks.LearningRateScheduler(step_decay)