In [10]:
import os 
import time
import copy
import math 
import re

import torch
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim

# from torchviz import make_dot

# Select the first CUDA device when PyTorch reports one, otherwise fall back to the CPU,
# and print the choice so it is visible in the cell output.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")
print(f'''using device {device}''')

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pickle
import random 
import pandas as pd

using device cuda:0


In [2]:
%%javascript
// Sets the notebook front-end's output auto-scroll threshold to 9999
// (presumably so long cell outputs are shown in full instead of a scroll box).
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [3]:
def cuda(input):
    """Return `input` moved to the GPU when CUDA is available, else `input` unchanged.

    `input` is anything exposing a `.cuda()` method (tensors, modules).
    """
    return input.cuda() if torch.cuda.is_available() else input

In [4]:
# Working directory of the notebook; all data paths below are built from it.
# os.getcwd() replaces the `!pwd` shell magic, which is not valid Python and
# only works inside IPython on systems that provide `pwd`.
path = os.getcwd()
print(path)

/home/r2/Documents/RNNexp


In [5]:
# data = pd.read_pickle(path+"/data/step3_DAT_MT_USDJPY_M1_2018_merged_pickled") 
# data.head()

### functions

In [216]:
class Struct:
    """Empty attribute container; fields are attached ad hoc by the code that uses it."""

def load_trumpdata(datapath, pad_tok='£', start_tok='^', end_tok='€', filenames=None):
    """Load the tweet archives and build the character vocabulary.

    Args:
        datapath:  directory that contains the JSON archives.
        pad_tok:   padding symbol; always gets index 0 in the vocabulary.
        start_tok: symbol prepended to every tweet.
        end_tok:   symbol appended to every tweet.
        filenames: archive file names to read, in order. Defaults to the
                   2016-2018 archives, each listed once.

    Returns:
        (tws, tw_str, decoder, encoder) where `tws` is the list of tweets wrapped
        in start/end tokens, `tw_str` is their concatenation, `decoder` maps
        index -> symbol and `encoder` maps symbol -> index.

    Raises:
        AssertionError: if one of the special tokens already occurs in the tweets.
    """
    import json
    if filenames is None:
        # The original list named condensed_2018.json twice, which duplicated those
        # tweets and let identical text end up in both the training and the
        # validation/test splits.
        filenames = ('condensed_2016.json', 'condensed_2017.json', 'condensed_2018.json')

    van_tws = []
    for fname in filenames:
        # json.load parses the whole file, so archives need not be single-line JSON.
        with open(os.path.join(datapath, fname), "r", encoding="utf-8") as f:
            data_tr = json.load(f)
        van_tws.extend(entry["text"].rstrip('\\') for entry in data_tr)

    # Sorted so that symbol indices are identical on every run; set iteration order
    # for strings changes between interpreter sessions, which would make a saved
    # checkpoint disagree with a freshly built encoder.
    symbols = sorted(set("".join(van_tws)))
    assert pad_tok   not in symbols, "pad_tok already occurs in the tweets"
    assert start_tok not in symbols, "start_tok already occurs in the tweets"
    assert end_tok   not in symbols, "end_tok already occurs in the tweets"

    tws    = [start_tok + tweet + end_tok for tweet in van_tws]
    tw_str = "".join(tws)

    symbols = [pad_tok] + symbols + [start_tok, end_tok]
    decoder = dict(enumerate(symbols))
    encoder = {symbol: idx for idx, symbol in decoder.items()}
    return tws, tw_str, decoder, encoder

def pp_trumpdata(filename, prop, bsize=1):
    """Load the tweets and split them into train / validation / test parts.

    Args:
        filename: directory passed on to `load_trumpdata`.
        prop:     two fractions `[a, b]`; train is `[0, a)`, validation `[a, b)` and
                  test `[b, end]`, applied both to the tweet list and to the
                  concatenated tweet string.
        bsize:    number of parallel character streams the training string is cut into.

    Returns:
        A `Struct` with `decoder`, `encoder`, `train`, `valid`, `test` and `bsize`.
        `train.batch_str` holds `bsize` equally long slices of the training string;
        `valid.batch_str` holds the validation string as a single stream.
    """
    Data, train, valid, test = Struct(), Struct(), Struct(), Struct()
    tweets, tweet_str, Data.decoder, Data.encoder = load_trumpdata(filename)

    tw_lo, tw_hi = round(prop[0]*len(tweets)),    round(prop[1]*len(tweets))
    ch_lo, ch_hi = round(prop[0]*len(tweet_str)), round(prop[1]*len(tweet_str))

    train.tweets    = tweets[:tw_lo]
    # Cut at prop[0] like train.tweets; cutting at prop[1] made the training
    # string contain the whole validation string.
    train.tweet_str = tweet_str[:ch_lo]
    valid.tweets    = tweets[tw_lo:tw_hi]
    valid.tweet_str = tweet_str[ch_lo:ch_hi]
    # Open-ended slices: a `:-1` bound silently dropped the last tweet / character.
    test.tweets     = tweets[tw_hi:]
    test.tweet_str  = tweet_str[ch_hi:]

    stepsize = round(len(train.tweet_str)/bsize-1)
    train.batch_str = [train.tweet_str[i*stepsize:(i+1)*stepsize] for i in range(bsize)]
    valid.batch_str = [valid.tweet_str]

    Data.train, Data.valid, Data.test, Data.bsize = train, valid, test, bsize
    return Data

def onehencode(symbol, encoder):
    """One-hot encode a single symbol.

    Args:
        symbol:  the character to encode; must be a key of `encoder`.
        encoder: mapping symbol -> index.

    Returns:
        A `(1, len(encoder))` float tensor with a 1.0 at the symbol's index,
        moved to the GPU when one is available.

    Raises:
        KeyError: if `symbol` is not in `encoder`.
    """
    x = torch.zeros(1, len(encoder))
    x[0, encoder[symbol]] = 1.0
    # `cuda` is the notebook's device helper; `usecuda` is not defined anywhere.
    return cuda(x)




def encodeYstr(string, encoder):
    """Encode `string` as a 1-D float tensor of symbol indices.

    Args:
        string:  characters to encode; each must be a key of `encoder`.
        encoder: mapping symbol -> index.

    Returns:
        Tensor of length `len(string)`, moved to the GPU when one is available.
    """
    # Iterate over the `string` argument; the previous body read an unrelated
    # name `y_str` that is not defined in this function.
    return cuda(torch.Tensor([encoder[char] for char in string]))

def generate_seq(model, hidden, symbol, seq_len, m, seed, encoder=None, decoder=None):
    """Sample a character sequence from the model.

    Args:
        model:   network whose call returns `(log_probabilities, new_hidden)`.
        hidden:  initial hidden state, one row.
        symbol:  first character; it is also the first character of the result.
        seq_len: number of characters to sample after `symbol`.
        m:       unused; kept so existing calls keep working.
        seed:    seed for the sampler, so a given seed reproduces the same text.
        encoder: symbol -> index mapping; defaults to the notebook-level `Data.encoder`.
        decoder: index -> symbol mapping; defaults to the notebook-level `Data.decoder`.

    Returns:
        The generated string of length `seq_len + 1`.
    """
    # The body used bare `encoder` / `decoder` names that no cell defines.
    if encoder is None: encoder = Data.encoder
    if decoder is None: decoder = Data.decoder
    # Private generator: honours `seed` without touching the global `random` state.
    rng = random.Random(seed)

    result_str = symbol
    with torch.no_grad():
        for _ in range(seq_len):
            x = onehencode(symbol, encoder)
            output, hidden = model(x, hidden)

            # The model emits log-probabilities; exponentiate and sample by inverse CDF.
            prob     = np.exp(output.cpu().numpy()).ravel()
            cum_prob = np.cumsum(prob)
            idx = int(np.searchsorted(cum_prob, rng.random(), side='right'))
            # Float round-off can leave cum_prob[-1] slightly below 1; clamp so a
            # draw above it selects the last symbol instead of raising.
            idx = min(idx, len(cum_prob) - 1)

            symbol = decoder[idx]
            result_str += symbol
    return result_str
    
def save_checkpoint(state, filename='models/checkpoint.pth.tar'):
    """Serialise `state` with `torch.save`.

    Args:
        state:    object to store (the notebook passes a dict of metadata and weights).
        filename: destination path, or any file-like object `torch.save` accepts.
    """
    if isinstance(filename, (str, os.PathLike)):
        folder = os.path.dirname(filename)
        if folder:
            # The default target lives in `models/`, which may not exist yet.
            os.makedirs(folder, exist_ok=True)
    torch.save(state, filename)

def load_checkpoint(filename='models/checkpoint.pth.tar'):
    """Rebuild an `RNN` from a checkpoint written by `save_checkpoint`.

    Args:
        filename: path of the checkpoint file.

    Returns:
        (model, epoch, loss): the restored model (on the CPU; move it yourself)
        and the `epoch` / `loss` entries stored in the checkpoint.

    The checkpoint keys are printed as a quick sanity check.
    """
    # torch.load unpickles the file: only open checkpoints from a trusted source.
    # Without a GPU, remap stored CUDA tensors to the CPU; otherwise loading a
    # checkpoint saved on a GPU machine fails.
    map_location = None if torch.cuda.is_available() else 'cpu'
    checkpoint = torch.load(filename, map_location=map_location)
    for item in checkpoint:
        print(item)
    model = RNN(checkpoint['in_sz'],checkpoint['hd_sz'],checkpoint['out_sz'])
    model.load_state_dict(checkpoint['state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    return model, epoch, loss

def get_valid_loss(model,Data,Params,seq_len,ntweet):
    """Average per-character loss over the first validation tweets.

    Args:
        model:   network whose call returns `(output, hidden)`; must expose `hd_sz`.
        Data:    dataset struct; `Data.valid.tweets` is read.
        Params:  unused; kept so existing calls keep working.
        seq_len: maximum number of characters evaluated per tweet.
        ntweet:  number of validation tweets to evaluate (capped at the number available).

    Returns:
        Mean over tweets of the per-character loss, computed with the
        module-level `criterion`.

    Side effects: puts the model in eval mode and prints the elapsed time.
    The hidden state starts at zero and is carried from one tweet to the next.
    """
    start = time.time()
    loss_valid = 0
    # `cuda` is the notebook's device helper; `usecuda` is not defined anywhere.
    hidden = cuda(torch.zeros(1,model.hd_sz))
    # Asking for more tweets than exist used to raise IndexError.
    ntweet = min(ntweet, len(Data.valid.tweets))
    with torch.no_grad():
        model.eval()
        for tweet in Data.valid.tweets[:ntweet]:
            xv, yv = generate_valid(Data,tweet,seq_len)
            n_char = xv.size()[1]
            loss = 0
            for char in range(n_char):
                output, hidden = model(xv[:,char,:],hidden)
                loss += criterion(output,yv[:,char,:].reshape(xv.shape[0]))
            # Normalise by the number of characters (axis 1). Axis 2 is the
            # vocabulary size, which made the value depend on the alphabet and
            # not on how many characters were summed.
            loss_valid += loss/n_char
    print(f"calculating validation loss took {time.time()-start:.2f} seconds")
    return loss_valid/ntweet

def parse_hidden(x,hidden,Data,symbol='*'):
    """Zero the hidden-state rows whose current input character equals `symbol`.

    `x` holds one one-hot row per stream and `hidden` one state row per stream.
    The reset is written through `.data` so the tensor's link to the graph is
    kept. `hidden` is modified in place and also returned.
    """
    # NOTE(review): every caller passes '*', while the tweets are wrapped with
    # '^' / '€' by default — confirm which symbol is meant to trigger the reset.
    for row in range(x.shape[0]):
        if onehdecode(x[row,:],Data.decoder) != symbol:
            continue
        hidden.data[row,:] = torch.zeros(1,hidden.shape[1])
    return hidden

def train_batch(model,X,Y,Data,hidden,lr,optimizer,use_opt,update_hidden):
    """Run one truncated-BPTT step over a batch of character windows.

    Args:
        model:         network whose call returns `(output, hidden)`.
        X:             `(batch, seq_len, n_symbols)` one-hot inputs.
        Y:             `(batch, seq_len, 1)` target indices.
        Data:          dataset struct, forwarded to `parse_hidden`.
        hidden:        hidden state carried over from the previous batch.
        lr:            learning rate for the manual update (`use_opt` False).
        optimizer:     optimizer stepped when `use_opt` is True.
        use_opt:       choose between `optimizer.step()` and plain SGD.
        update_hidden: when True, `parse_hidden` may reset rows of the hidden state.

    Returns:
        (loss, hidden): the detached per-character loss of this batch and the
        detached final hidden state. The loss comes from the module-level `criterion`.
    """
    model.train()
    if use_opt: optimizer.zero_grad()
    else: model.zero_grad()
    n_char = X.size()[1]
    loss = 0
    for char in range(n_char):
        x = X[:,char,:]
        if update_hidden: hidden = parse_hidden(x,hidden,Data,symbol='*')
        output, hidden = model(x,hidden)
        loss += criterion(output,Y[:,char,:].reshape(X.shape[0]))
    loss.backward()
    if use_opt: optimizer.step()
    else:
        # Plain SGD. `add_(tensor, alpha=...)` replaces the deprecated
        # `add_(scalar, tensor)` overload; parameters without a gradient are skipped.
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is not None:
                    p.add_(p.grad, alpha=-lr)
    # Report the loss per character (axis 1); axis 2 is the vocabulary size.
    # Both return values are detached so callers that accumulate them do not
    # keep this batch's graph alive.
    return loss.detach()/n_char, hidden.detach()

def generate_valid(Data, tweet, seq_len):
    """Build one validation example from a single tweet.

    Args:
        Data:    dataset struct; `Data.encoder` is read.
        tweet:   the tweet string.
        seq_len: maximum window length; shortened to `len(tweet) - 1` when the
                 tweet is too short to supply a target for every input.

    Returns:
        (X, Y): `X` is `(1, seq_len, n_symbols)` one-hot inputs for
        `tweet[0:seq_len]`; `Y` is `(1, seq_len, 1)` long indices of the next
        characters `tweet[1:seq_len+1]`. Both are on the GPU when available.
    """
    seq_len = min(seq_len, len(tweet)-1)
    n_sym   = len(Data.encoder)
    # `encodestr` takes (string, encoder); the old three-argument call raised TypeError.
    x = encodestr(tweet[0:seq_len],Data.encoder)
    y = torch.Tensor([Data.encoder[char] for char in tweet[1:seq_len+1]])
    X = x.reshape(1,seq_len,n_sym)
    Y = y.reshape(1,seq_len,1)
    # `cuda` is the notebook's device helper; `usecuda` is not defined anywhere.
    return cuda(X),cuda(Y.long())

def generate_batch(e, Data, seq_len, get_valid=False):
    """Cut one window of `seq_len` characters out of every character stream.

    Args:
        e:         start offset of the window inside each stream.
        Data:      dataset struct; reads `encoder`, `bsize` and the `batch_str` lists.
        seq_len:   window length; each stream must hold `seq_len + 1` characters from `e`.
        get_valid: use the single validation stream instead of the training streams.

    Returns:
        (X, Y): `X` is `(bsize, seq_len, n_symbols)` one-hot inputs; `Y` is
        `(bsize, seq_len, 1)` long indices of the respective next characters.
    """
    if get_valid:
        batch_str, bsize = Data.valid.batch_str, 1
    else: batch_str, bsize = Data.train.batch_str, Data.bsize
    n_sym = len(Data.encoder)
    # Allocate on the target device up front, the same device `encodestr` returns on.
    X = cuda(torch.zeros(bsize,seq_len,n_sym))
    Y = torch.zeros(bsize,seq_len,1)
    for i in range(0,bsize):
        # `encodestr` takes (string, encoder); the old three-argument call raised TypeError.
        x = encodestr(batch_str[i][e:e+seq_len],Data.encoder)
        y = torch.Tensor([Data.encoder[char] for char in batch_str[i][e+1:e+seq_len+1]])
        X[i,:,:] = x.reshape(seq_len,n_sym)
        Y[i,:,:] = y.reshape(seq_len,1)
    # `cuda` is the notebook's device helper; `usecuda` is not defined anywhere.
    return X,cuda(Y.long())

def do_training(model,Data,Params,optimizer,update_hidden,Plots=0):
    """Train `model` on the training streams and record loss curves.

    Args:
        model:         network to train; must expose `hd_sz`.
        Data:          dataset struct produced by `pp_trumpdata`.
        Params:        hyper-parameter struct (`n_e`, `n_i`, `sql`, `iv_pl`, `lr`, `bsize`).
        optimizer:     optimizer used for every step.
        update_hidden: forwarded to `train_batch`.
        Plots:         existing struct with `loss_train` / `loss_valid` lists to
                       append to; the default `0` creates a new one.

    Returns:
        The `Plots` struct. Every `Params.iv_pl` iterations one validation loss
        and one averaged training loss are appended, both as Python floats.
    """
    if Plots==0:
        Plots = Struct()
        Plots.loss_train, Plots.loss_valid = [], []
    # `init_params` stores the counts as n_e / n_i; the old `ne` / `ni` spelling
    # is still honoured for structs that were filled in by hand.
    n_epoch = Params.n_e if hasattr(Params,'n_e') else Params.ne
    n_iter  = Params.n_i if hasattr(Params,'n_i') else Params.ni

    loss_train = 0.0
    # `cuda` is the notebook's device helper; `usecuda` is not defined anywhere.
    hidden     = cuda(torch.zeros(Params.bsize,model.hd_sz))
    last_start = len(Data.train.batch_str[0])-Params.sql-1
    for epoch in range(n_epoch):
        start    = time.time()      # restarted per epoch so the message reports one epoch
        char_idx = 0
        i = 0
        while i < n_iter and char_idx < last_start:
            X,Y          = generate_batch(char_idx, Data, Params.sql,False)
            loss, hidden = train_batch(model,X,Y,Data,hidden,Params.lr,optimizer,True,update_hidden)
            # Accumulate a plain float: summing tensors would keep device memory
            # (and possibly the graph) referenced until the next reset.
            loss_train  += float(loss)
            if i%Params.iv_pl  == 0:
                # Stored as floats so the curves can be handed to matplotlib directly.
                Plots.loss_valid.append(float(get_valid_loss(model,Data,Params,30,50)))
                print(Plots.loss_valid[-1])
                Plots.loss_train.append(loss_train/Params.iv_pl)
                loss_train = 0.0
            char_idx += Params.sql + 1
            i        += 1
        print(f"""\n epoch {epoch+1} took {time.time() - start:.2f} seconds""")
    return Plots





def change_char(s, p, r):
    """Return `s` with the character at position `p` replaced by the string `r`."""
    head, tail = s[:p], s[p+1:]
    return "".join((head, r, tail))

def init_params(in_sz, bs, hd_sz=150):
    """Create the hyper-parameter struct used throughout the notebook.

    Args:
        in_sz: input size (the notebook passes the vocabulary size).
        bs:    batch size, stored as `bsize`.
        hd_sz: hidden-state size.

    Returns:
        A `Struct` carrying the values below as attributes.
    """
    settings = {
        'hd_sz':   hd_sz,
        'in_sz':   in_sz,
        'sql':     10,       # characters per training window
        'iv_pr':   200,
        'iv_pl':   100,      # iterations between recorded loss points
        'n_e':     1,
        'n_i':     1000,
        'use_opt': True,
        'lr':      0.0005,
        'bsize':   bs,
    }
    Params = Struct()
    for key, value in settings.items():
        setattr(Params, key, value)
    return Params

## my RNN module

In [217]:
class RNN(nn.Module):
    """Single-layer character RNN built from two linear maps.

    Both maps read the concatenation `[input, hidden]`: `h1` produces the next
    hidden state (through tanh) and `o1` produces log-probabilities over the
    input alphabet.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Store the sizes and create the two linear layers.

        NOTE(review): `output_size` is only recorded as `out_sz`; the output
        layer has `input_size` units. The notebook constructs the model with
        `output_size=1`, so wiring it into `o1` would change behaviour — confirm.
        """
        super().__init__()
        self.hd_sz  = hidden_size
        self.in_sz  = input_size
        self.out_sz = output_size

        self.h1 = nn.Linear(input_size + hidden_size, hidden_size)
        self.o1 = nn.Linear(input_size + hidden_size, input_size)

        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance one time step.

        Args:
            input:  `(batch, in_sz)` inputs for this step.
            hidden: `(batch, hd_sz)` previous hidden state.

        Returns:
            (output, hidden): `(batch, in_sz)` log-probabilities and the
            `(batch, hd_sz)` new hidden state.
        """
        combined = torch.cat((input, hidden), 1)
        hidden = torch.tanh(self.h1(combined))
        output = self.softmax(self.o1(combined))
        return output, hidden

    def initHidden(self,bs):
        """Return a zero hidden state for `bs` parallel sequences.

        The tensor is created on the device (and with the dtype) of the model's
        own weights, so it matches the model wherever the model lives instead
        of following global CUDA availability.
        """
        weight = self.h1.weight
        return torch.zeros(bs, self.hd_sz, device=weight.device, dtype=weight.dtype)

### DataLoaders, Iterators, DataSets

In [231]:
def make_parentbatch(tweets, bs, sql, symbol='£'):
    """Turn tweets into a list of per-batch segment loaders.

    The tweets are grouped `bs` at a time and padded to a common length by
    `batch_strings`; every group is then cut into segments of `sql` characters.
    Each parent batch therefore has a different number of sub-batches depending
    on how long its tweets are.

    Args:
        tweets: list of tweet strings.
        bs:     tweets per batch.
        sql:    characters per segment.
        symbol: padding symbol.

    Returns:
        A list with one `SBDataLoader` per batch. Its inputs have shape
        `(bs, n_segment, sql, n_symbols)` and its targets `(bs, n_segment, sql)`.

    Reads the notebook-level `Data` for the encoder / decoder tables.
    """
    n_sym = len(Data.decoder)
    parent_batches = []
    for bch in batch_strings(tweets,bs,sql):
        if isinstance(bch, str):
            # `pad` hands back a bare string for a one-element batch.
            bch = [bch]
        n_tweet   = bs
        n_segment = math.ceil(len(bch[0])/sql)
        sbx = cuda(torch.zeros(n_tweet,n_segment,sql,n_sym))
        # NOTE(review): the targets are stored as floats; a loss such as NLLLoss
        # needs integer class indices, so convert with `.long()` before use.
        sby = cuda(torch.zeros(n_tweet,n_segment,sql))

        for tweet in range(n_tweet):
            # Plain substring search: `symbol` is a literal, not a pattern, so a
            # regex metacharacter as padding symbol can no longer mis-match.
            position = bch[tweet].find(symbol)
            if position < 0:
                position = len(bch[tweet])
            # Input: the last real character is replaced by padding.
            # Target: the same string shifted left by one, padded at the end.
            x_str = change_char(bch[tweet],position-1,symbol)
            y_str = bch[tweet][1:]+symbol

            for segment in range(n_segment):
                x = x_str[sql*segment:sql*(segment+1)]
                y = y_str[sql*segment:sql*(segment+1)]
                sbx[tweet,segment] = encodestr(x,Data.encoder)
                sby[tweet,segment] = torch.Tensor([Data.encoder[char] for char in y])

        parent_batches.append(SBDataLoader(sbx, sby))
    return parent_batches


def batch_strings(tweets,bs,sql=1):
    """Group tweets into full batches of `bs` equally long strings.

    Within every batch the strings are ordered longest first and padded by
    `pad` to a common length that is a multiple of `sql`.

    Args:
        tweets: list of strings; left unmodified.
        bs:     strings per batch.
        sql:    segment length the padded strings are aligned to.

    Returns:
        A list of `len(tweets) // bs` batches, each a list of `bs` strings.
        Leftover tweets that do not fill a batch are dropped.
    """
    bch_strs = []
    # Only complete batches: the former round()-plus-parity rule could either
    # drop a full batch or emit a short final one, which consumers that index
    # `bs` rows cannot handle.
    for i in range(len(tweets)//bs):
        strings = sorted(tweets[i*bs:(i+1)*bs], key=len, reverse=True)
        pad_strings = pad(strings,sql)
        if isinstance(pad_strings, str):
            # `pad` unwraps one-element lists; keep every batch a list.
            pad_strings = [pad_strings]
        bch_strs.append(pad_strings)
    return bch_strs

def pad(str_list,sql=1,token='£'):
    """Pad every string to a common length that is a multiple of `sql`.

    Args:
        str_list: list of strings; it is not modified.
        sql:      the common length is the longest string rounded up to a
                  multiple of this value.
        token:    padding symbol appended on the right.

    Returns:
        A new list of padded strings. For a one-element list the single padded
        string itself is returned (existing behaviour); an empty list yields `[]`.
    """
    if not str_list:
        return []
    max_len = math.ceil(max(len(row) for row in str_list)/sql)*sql
    # Build a fresh list instead of overwriting the caller's list in place.
    padded = [row + token*(max_len-len(row)) for row in str_list]
    if len(padded) == 1: return padded[0]
    return padded

def encodestr(string, encoder):
    """One-hot encode `string`, one row per character.

    Returns a `(len(string), len(encoder))` tensor, moved to the GPU when one
    is available.
    """
    n_char = len(string)
    onehot = torch.zeros((n_char, len(encoder)))
    rows   = list(range(n_char))
    cols   = [encoder[ch] for ch in string]
    onehot[rows, cols] = 1
    return cuda(onehot)

### start coding 

In [232]:
class ParentDataLoader():
    """Chain several sub-loaders and flag the first item of each one.

    Iterating yields `(item, is_first)` pairs, where `is_first` is True for the
    first item coming out of a sub-loader and False for the rest. The notebook
    uses the flag to decide when to reset the hidden state.
    """

    def __init__(self, ds):
        # ds: indexable collection of iterables (the sub-loaders).
        self.ds = ds

    def __iter__(self):
        for i in range(len(self.ds)):
            # enumerate() also copes with an empty sub-loader; calling next()
            # on one raised StopIteration inside this generator, which Python
            # turns into a RuntimeError.
            for position, item in enumerate(self.ds[i]):
                yield item, position == 0

class SBDataLoader():
    """Iterate over the segments (axis 1) of a pair of batch tensors.

    `sbx` and `sby` share their first two axes, `(n_tweet, n_segment, ...)`.
    Iterating yields `(sbx[:, j], sby[:, j])` for every segment `j`.
    """

    def __init__(self, sbx, sby):
        self.sbx, self.sby = sbx, sby

    def __iter__(self):
        # The loop indexes axis 1, so it must also count axis 1. Counting axis 0
        # (the tweets) stopped after `n_tweet` segments, or indexed out of range
        # when there were fewer segments than tweets.
        for j in range(self.sbx.shape[1]):
            yield self.sbx[:,j], self.sby[:,j]
            

In [233]:
# Batch size shared by the data split, the parent batches and the hidden state below.
bs        = 3 
# Load the tweets: 90% train, the next 5% validation, the remainder test.
Data      = pp_trumpdata(path+"/data/trump/", [0.9,0.95], bs)
# Hyper-parameters; the input size is the number of symbols in the vocabulary.
Params    = init_params(len(Data.encoder),bs)

In [234]:
# Work on the first 100 training tweets only.
tweets      = Data.train.tweets[0:100]
# Iterator over (sub-batch, is-first-of-parent-batch) pairs.
dataloader = iter(ParentDataLoader(make_parentbatch(tweets,bs,sql=Params.sql)))

In [235]:
# Model and optimizer for the dataloader experiment; the third argument
# (output_size) is only stored on the model as `out_sz`.
rnn       = cuda(RNN(Params.in_sz, Params.hd_sz, 1))
optimizer = optim.RMSprop(rnn.parameters(), lr=Params.lr)

In [236]:
# Pull one sub-batch; `usezerostate` is True when it is the first of its parent batch.
(X,Y), usezerostate = next(dataloader)




In [237]:
# NOTE(review): `sql` is not assigned anywhere in the visible notebook (only
# `Params.sql` is), so this cell presumably relies on leftover kernel state — confirm or remove.
sql

3

In [238]:
# Display the first-of-parent-batch flag returned with the sub-batch.
usezerostate

True

In [239]:
# A sub-batch that opens a new parent batch starts from a zero hidden state.
if usezerostate:
    hidden = rnn.initHidden(bs)
# X is (n_tweet, sql, n_symbols): characters run along axis 1. Looping over
# X.shape[0] only visited as many characters as there are tweets in the batch.
for char in range(X.shape[1]):
    x,y = X[:,char],Y[:,char]
    output,hidden = rnn(x,hidden)
# TODO: the loss / backward step for this loop is still missing.

torch.Size([3, 349])
torch.Size([3])
torch.Size([3, 150])
tensor([[0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.]], device='cuda:0') tensor([ 81., 316.,  81.], device='cuda:0')
True
True
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0') tensor([317., 316., 207.], device='cuda:0')
True
True
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0') tensor([ 93., 316.,  99.], device='cuda:0')
True
True


In [None]:
# NOTE(review): unrunnable scratch fragment — the indentation is inconsistent and it
# refers to names (model, x, loss, use_opt) that only exist inside train_batch.
output, hidden = model.forward(x,hidden)
        y = Y[:,char,:]
        loss += criterion(output,y.reshape(X.shape[0]))
    loss.backward()
    if use_opt: optimizer.step()

In [None]:

# Two identically shaped models, intended for comparing training with and
# without the hidden-state reset (`update_hidden`).
# `cuda` is the notebook's device helper; `usecuda` is not defined anywhere.
# NOTE(review): the seed is set after rnn1 is created, so only rnn2's initial
# weights are reproducible — confirm whether both should share a seed.
rnn1 = cuda(RNN(Params.in_sz, Params.hd_sz, 1))
torch.manual_seed(24)
rnn2 = cuda(RNN(Params.in_sz, Params.hd_sz, 1))

# Loss used by train_batch / get_valid_loss through the module-level name.
criterion = nn.NLLLoss()

optimizer1 = optim.RMSprop(rnn1.parameters(), lr=Params.lr)
optimizer2 = optim.RMSprop(rnn2.parameters(), lr=Params.lr)

Params1 = copy.deepcopy(Params)
Params2 = copy.deepcopy(Params)
Params1.bsize = 10
Params2.bsize = 10

Data1 = pp_trumpdata(path+"/data/trump/", [0.9,0.95], Params1.bsize)
Data2 = pp_trumpdata(path+"/data/trump/", [0.9,0.95], Params2.bsize)

# Plots1 = do_training(rnn1,Data1,Params1,optimizer1,True)
Plots2 = do_training(rnn2,Data2,Params2,optimizer2,False)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# NOTE(review): Plots1 only exists once the commented-out
# `do_training(rnn1, ...)` run in the training cell is re-enabled.
# The entries may be 0-dim tensors on the GPU, which matplotlib cannot convert;
# float() works for tensors and plain numbers alike.
plt.figure()
plt.plot([float(v) for v in Plots1.loss_valid[1:-1]])
plt.xlabel("validation checkpoint")
plt.ylabel("validation loss")
plt.title("rnn1 (update_hidden=True)");

In [None]:
# The entries may be 0-dim tensors on the GPU, which matplotlib cannot convert;
# float() works for tensors and plain numbers alike.
plt.figure()
plt.plot([float(v) for v in Plots2.loss_valid[1:-1]])
plt.xlabel("validation checkpoint")
plt.ylabel("validation loss")
plt.title("rnn2 (update_hidden=False)");

In [None]:
def onehdecode(vector, decoder):
    """Map a one-hot (or score) vector back to its symbol.

    Looks up, in `decoder`, the index of the largest entry along axis 0.
    """
    best = vector.max(0)
    return decoder[best[1].item()]


In [None]:
# Store the weights of `rnn` together with the sizes load_checkpoint needs to rebuild it.
# NOTE(review): `epoch` and `loss` are not assigned at notebook level before this
# cell (only the load cell further down assigns them) — confirm where they should come from.
save_checkpoint({
            'epoch': epoch,
            'arch': "1_RNN",
            'state_dict': rnn.state_dict(),
            'hd_sz': rnn.hd_sz,
            'in_sz': rnn.in_sz,
            'out_sz': rnn.out_sz,
            'loss': loss,
            'best_prec1': None,
            'optimizer' : None,
        })

In [None]:
# Restore the model saved above.
rnn2, epoch, loss = load_checkpoint(filename='models/checkpoint.pth.tar')
# Move it to the GPU only when one is available; an unconditional `.cuda()`
# fails on a CPU-only machine.
cuda(rnn2)

In [None]:
# Sample 100 characters from the restored and from the in-memory model, starting at 'T'.
# NOTE(review): `hsize` and `m` are not defined anywhere in the visible notebook, and the
# unconditional `.cuda()` requires a GPU — presumably leftover kernel state; confirm.
print(generate_seq(rnn2, torch.zeros(1,hsize).cuda(),'T',100,m,42))
print(generate_seq(rnn, torch.zeros(1,hsize).cuda(),'T',100,m,42))