In [2]:
import pickle as plk
import string
import sys

import numpy as np
import pandas as pd
from numpy import array
from keras.callbacks import ModelCheckpoint
from keras.layers import Activation
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
# --- Hyper-parameters -------------------------------------------------------
SEQUENCE_LEN = 60        # characters per training window
BATCH_SIZE = 512         # windows per batch
EPOCHS = 10              # passed as `epochs` to fit_generator
HIDDEN_LAYERS_DIM = 512  # units per LSTM layer
LAYER_COUNT = 4          # number of stacked recurrent layers
DROPOUT = 0.2            # dropout rate applied after each recurrent layer

# --- Data -------------------------------------------------------------------
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists.
df = pd.read_excel('/Users/luyin/Desktop/project/Q&A.xlsx', header=0)

# Distinct values of the 'Breakout' column (79 at the time of the author's run).
l = df['Breakout'].unique()

# Map every 'Breakout' value to the list of its 'Question' entries.
dic = {
    category: list(df.loc[df['Breakout'] == category]['Question'])
    for category in l
}

In [4]:
# Character vocabulary: every character of string.printable except vertical
# tab ('\x0b') and form feed ('\x0c').
characters = [c for c in string.printable if c not in ('\x0b', '\x0c')]
VOCABULARY_SIZE = len(characters)
characters_to_ix = {c: i for i, c in enumerate(characters)}
print("vocabulary len = %d" % VOCABULARY_SIZE)

# The unknown-character token occupies the slot right after the real
# vocabulary. Deriving the index (instead of hard-coding 98) keeps it in sync
# with the character list; '<unk>' is deliberately absent from characters_to_ix.
UNK_IDX = VOCABULARY_SIZE
characters.append('<unk>')
print(characters)

vocabulary len = 98
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ' ', '\t', '\n', '\r', '<unk>']


In [5]:
# Work on the 'Balance sheet' questions only: the first 600 entries are used
# for training, everything after them for validation.
all_data = dic['Balance sheet']
dic_len = len(all_data)
train_data, val_data = all_data[:600], all_data[600:dic_len]

In [6]:
def letter_generator(data, separator=''):
    """Concatenate every entry of ``data`` into a single string.

    Each entry is itself joined piece by piece, so an entry may be a string or
    any iterable of strings.

    Args:
        data: iterable of entries (for example a list of question strings).
        separator: text inserted between consecutive entries. The default ''
            glues entries together directly, which is the historical
            behaviour; pass e.g. '\n' to keep entries distinguishable.

    Returns:
        The concatenated text, or '' when ``data`` is empty.

    Raises:
        TypeError: if an entry is not iterable or contains non-string pieces.
    """
    # str.join builds the result in one pass instead of growing a string one
    # character at a time.
    return separator.join(''.join(entry) for entry in data)

def describe_batch(X, y, samples=3):
    """Print a human-readable preview of the first ``samples`` items of a batch.

    For every previewed item the one-hot rows of ``X`` are decoded back to
    characters (arg-max over the last axis) and the last 20 of them are
    printed together with the decoded target character from ``y``.

    Args:
        X: array indexed as X[sample, step, :] with one-hot rows.
        y: array indexed as y[sample, :] with one-hot rows.
        samples: how many leading items of the batch to print.
    """
    for idx in range(samples):
        decoded = "".join(
            characters[X[idx, step, :].argmax()] for step in range(SEQUENCE_LEN)
        )
        target = characters[y[idx, :].argmax()]
        print("sample #%d: ...%s -> '%s'" % (idx, decoded[-20:], target))

def batch_generator(text, count):
    """Endlessly yield one-hot encoded ``(X, y)`` batches cut from ``text``.

    Batch ``b`` (0 <= b < count) contains BATCH_SIZE windows; window ``k`` of
    that batch starts at character ``BATCH_SIZE * b + k``, spans SEQUENCE_LEN
    characters, and is labelled with the character that immediately follows
    it. After ``count`` batches the generator starts over from the beginning.

    Args:
        text: indexable sequence of characters.
        count: number of distinct batches per pass over ``text``.

    Yields:
        X: array of shape (BATCH_SIZE, SEQUENCE_LEN, VOCABULARY_SIZE + 1).
        y: array of shape (BATCH_SIZE, VOCABULARY_SIZE + 1).

    Raises:
        IndexError: if ``text`` is too short to provide the target character
            of a window.
    """
    feature_dim = VOCABULARY_SIZE + 1  # one extra slot for the <unk> token

    while True:
        for batch_ix in range(count):
            X = np.zeros((BATCH_SIZE, SEQUENCE_LEN, feature_dim))
            y = np.zeros((BATCH_SIZE, feature_dim))

            batch_offset = BATCH_SIZE * batch_ix

            for sample_ix in range(BATCH_SIZE):
                sample_start = batch_offset + sample_ix
                target_pos = sample_start + SEQUENCE_LEN

                # Characters outside the vocabulary share the <unk> slot.
                for step, char in enumerate(text[sample_start:target_pos]):
                    X[sample_ix, step, characters_to_ix.get(char, UNK_IDX)] = 1

                # The label is the character right after the window. It is
                # indexed explicitly rather than through a loop variable
                # leaking out of the loop above.
                y[sample_ix, characters_to_ix.get(text[target_pos], UNK_IDX)] = 1
            yield X, y


In [7]:
# Flatten each list of questions into one continuous training / validation text.
train_text, val_text = letter_generator(train_data), letter_generator(val_data)

In [8]:
# Persist both texts to disk. The context managers guarantee the files are
# closed even if a write fails part-way.
with open('train_text.txt', 'w') as f:
    f.write(train_text)

with open('val_text.txt', 'w') as f:
    f.write(val_text)

In [9]:
# Read back the texts written by the previous cell.
with open('train_text.txt', "r") as f:
    text_train = f.read()
with open('val_text.txt', "r") as f:
    text_val = f.read()

text_train_len, text_val_len = len(text_train), len(text_val)
print("Total of %d characters" % (text_train_len + text_val_len))

# Pull only the first batch from the (endless) generator and describe some of
# its samples as a sanity check.
ix, (X, y) = next(enumerate(batch_generator(text_train, count=1)))
describe_batch(X, y, samples=5)

Total of 284224 characters
sample #0: ...utlook for how the n -> 'e'
sample #1: ...tlook for how the ne -> 't'
sample #2: ...look for how the net -> ' '
sample #3: ...ook for how the net  -> 'i'
sample #4: ...ok for how the net i -> 'n'


In [10]:
def build_model():
    """Build and compile the Keras sequential model used to train the char-rnn.

    The network stacks LAYER_COUNT bidirectional LSTM layers of
    HIDDEN_LAYERS_DIM units, each followed by Dropout(DROPOUT), and ends with
    a softmax over VOCABULARY_SIZE + 1 classes (the vocabulary plus the
    unknown-character slot). Every recurrent layer except the last returns
    its full output sequence so that the next recurrent layer can consume it.

    Requires ``Bidirectional`` to be imported from ``keras.layers`` alongside
    the other layer imports at the top of the notebook.

    Returns:
        The compiled model (categorical cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    model = Sequential()
    for i in range(LAYER_COUNT):
        is_last = i == LAYER_COUNT - 1
        recurrent = LSTM(HIDDEN_LAYERS_DIM, return_sequences=not is_last)
        if i == 0:
            # Only the first layer of a Sequential model declares the input
            # shape, and for a wrapped layer it is given on the wrapper.
            wrapped = Bidirectional(
                recurrent,
                input_shape=(SEQUENCE_LEN, VOCABULARY_SIZE + 1),
            )
        else:
            wrapped = Bidirectional(recurrent)
        model.add(wrapped)
        model.add(Dropout(DROPOUT))

    model.add(Dense(VOCABULARY_SIZE + 1))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
    return model

In [11]:
training_model = build_model()

# Number of whole batches each text can provide: every window needs
# SEQUENCE_LEN characters, and the integer division drops the partial batch.
train_batch_count = (text_train_len - SEQUENCE_LEN) // BATCH_SIZE
val_batch_count = (text_val_len - SEQUENCE_LEN) // BATCH_SIZE
for label, batch_count in (("training", train_batch_count),
                           ("validation", val_batch_count)):
    print("%s batch count: %d" % (label, batch_count))

# --- Callbacks --------------------------------------------------------------
from keras.callbacks import ModelCheckpoint, EarlyStopping

# A single checkpoint file is overwritten whenever the monitored loss improves.
# (The author previously experimented with a per-epoch file-name template and
# a weights-only checkpoint.)
filepath = "char_RNN_2.hdf5"
# NOTE(review): the checkpoint tracks the *training* loss while early stopping
# tracks the *validation* loss with patience 0 -- confirm this is intended.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
early_stopping = EarlyStopping(monitor='val_loss', patience=0)

callbacks_list = [checkpoint, early_stopping]

training batch count: 442
validation batch count: 112


In [12]:
# history = training_model.fit_generator(
#     batch_generator(text_train, count=train_batch_count),
#     train_batch_count,
#     max_queue_size=1, # no more than one queued batch in RAM
#     epochs=EPOCHS,
#     callbacks=callbacks_list,
#     validation_data=batch_generator(text_val, count=val_batch_count),
#     validation_steps=val_batch_count,
#     initial_epoch=0)

In [13]:
# Build a fresh network and continue training from the checkpoint file.
training_model_2 = build_model()
filepath = "char_RNN_2.hdf5"

# Same callbacks as before: save on improved training loss, stop as soon as
# the validation loss fails to improve.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
early_stopping = EarlyStopping(monitor='val_loss', patience=0)
callbacks_list = [checkpoint, early_stopping]

# NOTE(review): this requires "char_RNN_2.hdf5" to already exist on disk.
training_model_2.load_weights(filepath)

train_batches = batch_generator(text_train, count=train_batch_count)
val_batches = batch_generator(text_val, count=val_batch_count)
history_2 = training_model_2.fit_generator(
    train_batches,
    steps_per_epoch=train_batch_count,
    max_queue_size=1,  # no more than one queued batch in RAM
    epochs=EPOCHS,
    callbacks=callbacks_list,
    validation_data=val_batches,
    validation_steps=val_batch_count,
    initial_epoch=0,
)

Epoch 1/10

Epoch 00001: loss improved from inf to 1.18133, saving model to char_RNN_2.hdf5
Epoch 2/10
 37/442 [=>............................] - ETA: 1:19:45 - loss: 1.1335 - acc: 0.6613

KeyboardInterrupt: 

In [14]:
# Save training_model_2 to 'char_rnn_model_new3.h5'. In the recorded run this
# cell was executed after training had been interrupted during epoch 2 (see
# the KeyboardInterrupt in the previous cell's output).
training_model_2.save('char_rnn_model_new3.h5')

In [15]:
# Inference network: a stateful LSTM stack that is fed one character at a
# time (batch of 1, a single time step), so the recurrent state carries the
# context between successive predict() calls.
# NOTE(review): build_model() wraps its LSTM layers in Bidirectional, whereas
# this network uses plain LSTM layers -- confirm that the weights loaded into
# it come from a matching architecture.
test_model = Sequential()
for i in range(LAYER_COUNT):
    # Every layer but the last hands its full sequence to the next layer.
    test_model.add(LSTM(
        HIDDEN_LAYERS_DIM,
        return_sequences=(i != LAYER_COUNT - 1),
        batch_input_shape=(1, 1, VOCABULARY_SIZE + 1),
        stateful=True,
    ))
test_model.add(Dense(VOCABULARY_SIZE + 1))
test_model.add(Activation('softmax'))
test_model.compile(
    loss='categorical_crossentropy',
    optimizer="adam",
    metrics=['accuracy'],
)

In [16]:
# Load the weights from "char_RNN_2.hdf5" (the file name used by the
# ModelCheckpoint callback during training) into the inference model.
# NOTE(review): build_model() uses Bidirectional-wrapped LSTM layers while
# test_model stacks plain LSTM layers -- verify the saved weights actually
# match this architecture.
test_model.load_weights("char_RNN_2.hdf5")

In [21]:
def sample(preds, temperature=1.0):
    """Draw one class index from ``preds`` after temperature re-scaling.

    The probabilities are moved to log space, divided by ``temperature``,
    exponentiated and re-normalised; a single multinomial draw is then made
    from the resulting distribution.

    Args:
        preds: array-like of class probabilities.
        temperature: divisor applied in log space before re-normalising.

    Returns:
        Index of the drawn class.
    """
    scaled_log_probs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_log_probs)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

def predict_next_char(model, current_char, diversity=1.0):
    """Feed one character to ``model`` and sample the character that follows.

    Args:
        model: object exposing ``predict(x, batch_size=1)``; it receives a
            (1, 1, VOCABULARY_SIZE + 1) one-hot array.
        current_char: the character to feed. Anything that is not in the
            vocabulary -- including the '<unk>' token this function can itself
            return -- is encoded in the unknown-character slot, the same way
            batch_generator encodes it for training.
        diversity: sampling temperature forwarded to ``sample``.

    Returns:
        The sampled entry of ``characters`` (possibly '<unk>').
    """
    x = np.zeros((1, 1, VOCABULARY_SIZE + 1))
    # Previously an out-of-vocabulary character skipped the prediction
    # entirely and the function failed on the unbound return value.
    x[:, :, characters_to_ix.get(current_char, UNK_IDX)] = 1
    y = model.predict(x, batch_size=1)
    next_char_ix = sample(y[0, :], temperature=diversity)
    return characters[next_char_ix]



def generate_text(model, seed, count=300):
    """Write ``seed`` followed by generated characters to standard output.

    The model state is reset, every seed character except the last is fed
    through the model to prime it (those predictions are discarded), and then
    ``count - len(seed)`` characters are generated with diversity 0.5, each
    generated character becoming the next input.

    Args:
        model: network exposing ``reset_states()`` and usable by
            ``predict_next_char``.
        seed: non-empty starting text; printed between square brackets.
        count: total length budget, seed included.
    """
    model.reset_states()

    # Prime the model on the seed prefix.
    for primer in seed[:-1]:
        predict_next_char(model, primer)

    sys.stdout.write("[" + seed + "]")

    current_char = seed[-1]
    remaining = count - len(seed)
    for _ in range(remaining):
        current_char = predict_next_char(model, current_char, diversity=0.5)
        sys.stdout.write(current_char)
    print("...\n")

In [22]:
# Generate five independent samples from the same seed text.
for i in range(5):
    generate_text(test_model, seed="think about")

[think about] the securities being the deposit betas when we saw a little bit about the bank to account that you are some of the consumer balance sheet relative to the asset side, you're aghitional expecting the point in terms of the loan growth about the consumer this quarter and the security in the ...

[think about] the consumer and do you think the outlook loan growth in the bank to your consumer and the loan base has of the consumer deposit betas that so far, and it was you have to a back to increase in the deposit rate hikes? Or a little bit about the deposits of the callable for balance sheet th...

[think about] the retail sense of the consumer banks, are you asset side of the bong that you have it a a little bit on the concern of the bank higher things out the balance sheet related to the short rate environment, and I have in terms of the outlook of the auto a that with the past consumer of the...

[think about] the bank and interest income is the people related to mater the b

In [24]:
# Generate five independent samples from the same seed text.
for i in range(5):
    generate_text(test_model, seed="What")

[What] do you think that are the industry is about deposit betas are you particular and the banks to get that you want to have to make in the customers and that concernion the balance sheet roll of the loan growth concers to what have a lot of the consumer being sense of the deposit side of the balanc...

[What] the deposit betas where you are in the bank deposit changes and have your bank into the outlook for that says a little bit of the balance sheet in terms of your did loan growth your deposit growth in loan growth in the consumer particular to the part of the customers and where you think is a li...

[What] we should we think is a bit continue to retail growth of that from a sense of your corporate and the positioned from the concertion in the loan growth in the part of the consumer banks and so the outlook for the retail side?Good morning. And if you all a little bit about your competitors from h...

[What] your takes and the deposit betas that we saw a little bit about the part