In [1]:

import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
import pickle as pkl
from collections import Counter

%load_ext autoreload
%autoreload 2

from utils import *
from cpc_transformer import *
from cpc_tokenizer import *

In [2]:

# Load the pickled training collection. Later cells index into `collection`
# and pass its elements to encode(), which treats them as strings.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
path = 'F:/$daten/datasets/pythoncode/train/'
with open(path+'collection.dat', 'rb') as f:
    collection = pkl.load(f)

In [3]:
# Shared tokenizer instance used by encode(), onehot(), the loss functions and predict().
# NOTE(review): Tokenizer arrives via a star import (presumably from cpc_tokenizer) -- confirm.
tokenizer = Tokenizer()

In [25]:
MAX_LEN = 280 # maximum number of tokens

In [5]:

def encode(string, rnd=True):
    """Tokenize a string and encode it as a fixed-length token array for the network.

    Parameters
    ----------
    string : str
        Raw text of one sample.
    rnd : bool, default True
        If True and the text is longer than MAX_LEN characters, a random
        window of MAX_LEN characters is cut out before tokenizing. If False,
        the text is tokenized from its beginning and truncated afterwards.

    Returns
    -------
    numpy.ndarray
        Integer array of length MAX_LEN+1. The text is prefixed with the
        marker character '`' before tokenizing; unused trailing positions hold
        the token of the '´' character.
    """
    if rnd and len(string) > MAX_LEN:
        # Valid window starts are 0 .. len-MAX_LEN inclusive. randint's upper
        # bound is exclusive, hence the +1 -- otherwise the end of a sample
        # could never be part of a crop.
        start = np.random.randint(0, len(string) - MAX_LEN + 1)
        string = string[start:start + MAX_LEN]

    # Prefix marker + text, cut to the fixed network input length.
    tokens = tokenizer.tokenize('`' + string)[:MAX_LEN + 1]

    # Pre-fill with the '´' token so everything after the text counts as padding.
    x = np.full(MAX_LEN + 1, tokenizer.c2t['´'], dtype=int)
    x[:len(tokens)] = tokens
    return x


def onehot(ys):
    """One-hot encode a 2-D array of token ids.

    Parameters
    ----------
    ys : array-like of int, shape (bsize, maxlen)
        Token ids, each used as an index into an axis of size tokenizer.NTOK.

    Returns
    -------
    numpy.ndarray
        Float array of shape (bsize, maxlen, tokenizer.NTOK) with a 1.0 at
        index ys[i, j] along the last axis and 0.0 elsewhere.
    """
    ys = np.asarray(ys)
    bsize, maxlen = ys.shape
    yt = np.zeros((bsize, maxlen, tokenizer.NTOK))
    # Broadcast row/column indices against ys so that all entries are set in
    # one vectorised assignment instead of a Python loop per element.
    rows = np.arange(bsize)[:, None]
    cols = np.arange(maxlen)[None, :]
    yt[rows, cols, ys] = 1.
    return yt


def batchgen(bsize=16):
    """Endless generator of training minibatches.

    Samples with index >= 100 in `collection` are shuffled anew every epoch and
    cut into full batches of `bsize` samples (an incomplete last batch is
    dropped). The first 100 samples are never yielded here; valloss() and
    predict() read from that range.

    Parameters
    ----------
    bsize : int, default 16
        Number of samples per batch.

    Yields
    ------
    xs : numpy.ndarray of int, shape (bsize, MAX_LEN+1)
        Encoded samples as produced by encode().
    ohs : numpy.ndarray of float, shape (bsize, MAX_LEN+1, tokenizer.NTOK)
        Label-smoothed one-hot targets: 0.9 * onehot + 0.1 * uniform.

    Raises
    ------
    ValueError
        If not even one full batch can be formed. Without this check the
        generator would loop forever, printing epoch banners and never
        yielding.
    """
    ep = 0
    while True:
        inds = np.random.permutation(np.arange(100, len(collection)))
        n_batches = len(inds) // bsize
        if n_batches < 1:
            raise ValueError(
                f'batchgen needs at least {bsize} training samples beyond the '
                f'first 100, but only {len(inds)} are available'
            )
        for b in range(n_batches):
            mb = inds[b*bsize:(b+1)*bsize]
            xs = np.zeros((bsize, MAX_LEN+1), dtype=int)
            for row, sample_idx in enumerate(mb):
                xs[row] = encode(collection[sample_idx])
            ohs = onehot(xs)
            # Label smoothing: keep 90% of the mass on the true token and
            # spread the remaining 10% uniformly over the vocabulary.
            ohs = ohs * 0.9 + np.ones_like(ohs)/tokenizer.NTOK * 0.1
            yield xs, ohs
        print(f'========== EPOCH {ep} COMPLETED ==========')
        ep += 1
    
# Smoke test: create a batch generator, draw one batch and print the first
# sample converted back to text by the tokenizer.
bg = batchgen()
xs, oh = next(bg)
print(tokenizer.detokenize(xs[0]))

`om 2016 & 2017 for the test set.  
<M>
Remove rows for which target column is em


In [6]:
# Build the model and attach the optimizer plus training bookkeeping directly
# to the net object so that all training state lives in one place.
# NOTE(review): the meaning of n / nh is defined by Net (not visible here);
# presumably model width and number of attention heads -- confirm.
net = Net(n=128, nh=4, ntok=tokenizer.NTOK)
# NOTE(review): import placed mid-notebook; consider moving it to the import cell.
from torch_optimizer import Lookahead, Yogi
net.optim = Lookahead(Yogi(net.parameters(), lr=1e-3, betas=(0.9, 0.99)))
# Number of training iterations performed so far.
net.iters = 0
# History of (iteration, mean training loss) pairs, appended by the training loop.
net.losses = []
# History of (iteration, validation loss) pairs, appended by the training loop.
net.vlosses = []
# Best validation loss seen so far; starts at a large sentinel value.
net.vmin = 9999
# Fresh batch generator; loss() pulls its batches from this global.
bg = batchgen()

In [58]:
#net.load_state_dict(torch.load('cpc_weights_231223.dat'), strict=False)

_IncompatibleKeys(missing_keys=['enc5.mha.q.weight', 'enc5.mha.q.bias', 'enc5.mha.k.weight', 'enc5.mha.k.bias', 'enc5.mha.v.weight', 'enc5.mha.v.bias', 'enc5.mha.p.weight', 'enc5.mha.p.bias', 'enc5.ln.gamma', 'enc5.ln.beta', 'enc5.ff.dense1.weight', 'enc5.ff.dense1.bias', 'enc5.ff.dense2.weight', 'enc5.ff.dense2.bias', 'enc5.ff.ln.gamma', 'enc5.ff.ln.beta', 'enc6.mha.q.weight', 'enc6.mha.q.bias', 'enc6.mha.k.weight', 'enc6.mha.k.bias', 'enc6.mha.v.weight', 'enc6.mha.v.bias', 'enc6.mha.p.weight', 'enc6.mha.p.bias', 'enc6.ln.gamma', 'enc6.ln.beta', 'enc6.ff.dense1.weight', 'enc6.ff.dense1.bias', 'enc6.ff.dense2.weight', 'enc6.ff.dense2.bias', 'enc6.ff.ln.gamma', 'enc6.ff.ln.beta'], unexpected_keys=[])

In [7]:
def valloss():
    net.eval()
    bsize = 64
    xs = np.zeros((bsize, MAX_LEN+1), dtype=int)
    for i in range(bsize):
        x = collection[i]
        xs[i] = encode(x)
    ohs = onehot(xs)
    ohs = ohs * 0.9 + np.ones_like(ohs)/tokenizer.NTOK * 0.1
    xs, ohs = np2t(xs, ohs)
    xp = net(xs.long())
    xp = rearrange(xp[:,:-1], 'b p n -> (b p) n')
    ohs = rearrange(ohs[:,1:], 'b p n -> (b p) n')
    return torch.mean(-torch.log_softmax(xp, dim=1) * ohs) * tokenizer.NTOK
        

def loss():
    """Compute the training loss on the next batch from the global generator `bg`.

    Switches the network to train mode, draws one (tokens, smoothed targets)
    batch and returns a differentiable scalar: the cross-entropy between the
    network output at position t and the target distribution at position t+1,
    averaged over all predicted positions.
    """
    net.train()
    tokens, targets = np2t(*next(bg))
    logits = net(tokens.long())

    # Shift by one for next-token prediction: the last output has no target
    # and the first target has no preceding output.
    flat_logits = rearrange(logits[:, :-1], 'b p n -> (b p) n')
    flat_targets = rearrange(targets[:, 1:], 'b p n -> (b p) n')

    # Mean over positions and vocabulary, rescaled by the vocabulary size.
    weighted_nll = -torch.log_softmax(flat_logits, dim=1) * flat_targets
    return torch.mean(weighted_nll) * tokenizer.NTOK

# Sanity check: evaluate the validation and the training loss once on the
# current weights and display both values as the cell result.
valloss(), \
loss()

(tensor(5.7206, device='cuda:0', grad_fn=<MulBackward0>),
 tensor(5.7204, device='cuda:0', grad_fn=<MulBackward0>))

In [None]:
# Training loop. The iteration bound is effectively infinite; the cell is meant
# to be interrupted by hand. The best weights (by validation loss) are saved to disk.
# NOTE(review): predict() is defined in a later cell and must have been run
# before this cell, otherwise the first iteration raises NameError.
# Running buffer of per-step training losses, averaged and reset every 100 steps.
losses = []
# NOTE(review): vlosses is not used in this cell; validation history goes to net.vlosses.
vlosses = []

for k in trange(99999999):
    # One optimisation step on the next training batch.
    l = loss()
    l.backward()
    losses.append(l.item())
    net.optim.step()
    net.optim.zero_grad()
    
    # Once 100 step losses are collected: validate, checkpoint on improvement,
    # record the averaged training loss and start a new averaging window.
    if len(losses) == 100:
        vloss = valloss().item()
        if vloss < net.vmin:
            net.vmin = vloss
            torch.save(net.state_dict(), 'cpc_weights_261223.dat')
        net.losses.append((net.iters, np.mean(losses)))
        net.vlosses.append((net.iters, vloss))
        losses = []
    net.iters += 1
    
    # Every 100 loop iterations (including k == 0): redraw the loss curves
    # and print a generated sample.
    if k % 100 == 0:
        plt.plot(*zip(*net.losses), zorder=+20)
        plt.plot(*zip(*net.vlosses))
        #plt.ylim([0,50])
        plt.grid()
        plt.show()
        s, p = predict()
        print('------')
        print(s)

In [None]:

# Plot the recorded loss histories: net.losses and net.vlosses hold
# (iteration, value) pairs, which zip(*...) splits into x and y sequences.
# The training curve is drawn on top (higher zorder).
plt.plot(*zip(*net.losses), zorder=+20)
plt.plot(*zip(*net.vlosses))
#plt.ylim([0,50])
plt.grid()
plt.show()

In [29]:
import bpe

@torch.no_grad()
def predict(string=None, tau=0, length=MAX_LEN):
    """Autoregressively generate text with the network.

    Parameters
    ----------
    string : str or None, default None
        Prompt text. It is prefixed with the marker character '`' and
        tokenized without cropping or padding. If None, one of the first 100
        samples of `collection` is picked at random and encoded with encode().
    tau : float, default 0
        Sampling temperature. 0 (or any non-positive value) selects the most
        probable token at every step; a positive value samples from the
        softmax of logits / tau.
    length : int, default MAX_LEN
        Maximum number of tokens to generate. Generation stops early when the
        token of the '´' character is produced.

    Returns
    -------
    s : str
        Detokenized content of the final context window: the prompt followed
        by all generated tokens, limited to the most recent MAX_LEN tokens. A
        terminating '´' token produced by the model is not included.
    p : float
        Product of the probabilities (taken from the untempered softmax) of
        all tokens chosen during generation; 1.0 if nothing was generated.

    Side effects: prints the detokenized prompt and puts the network into
    eval mode.
    """
    end_token = tokenizer.c2t['´']

    if string is None:
        x = encode(collection[np.random.randint(100)])
    else:
        # Keep the prompt exactly as given: start marker + text, no padding.
        x = np.array(list(tokenizer.tokenize('`' + string)), dtype=int)

    ys = np2t([x]).long()
    print(tokenizer.detokenize(x))

    net.eval()
    probs = []
    ended = False
    for _ in range(length):
        logits = net(ys)[0, -1]
        dist = t2np(F.softmax(logits, dim=0))
        if tau > 0:
            k = np.random.choice(range(tokenizer.NTOK), p=t2np(F.softmax(logits/tau, dim=0)))
        else:
            k = dist.argmax()
        ys = torch.cat([ys, np2t([[k]])], dim=1).long()
        # Sliding context window: the network only ever sees the last MAX_LEN tokens.
        ys = ys[:, -MAX_LEN:]
        probs.append(dist[k])
        if k == end_token:
            ended = True
            break

    p = np.prod(probs)

    # Return the whole window. Slicing by the loop counter (as done before)
    # ignored the prompt length and silently dropped generated tokens.
    out = t2np(ys[0])
    if ended:
        out = out[:-1]
    s = tokenizer.detokenize(out)

    return s, p

In [63]:
# Demo: continue a short prompt (a newline followed by "net.") with
# temperature 0.5 for at most 80 generated tokens and print the resulting text.
string = """
net."""
print(predict(string, tau=0.5, length=80)[0])

`
net.
`
net.reset_index(drop=True, inplace=True)
net.reset_index(drop=True, inplace=T


In [45]:
# Demo: generate from a randomly chosen sample of the collection at temperature 0.5.
print(predict(tau=0.5)[0])

`>
import numpy as np
<C>
import matplotlib.pyplot as plt
test_df=pd.read_csv("../input/testPrice.csv")
<C>
cols_to_use = ["supply_area", "exclusive_use_area", "floor", "room_count", "bathroom_count","total_household_count_in_sites","total_parking_capacity_in_site","apartment_buil
d_capacity_in_site","total_parking_capacity_in_site","total_parking_capacity_in_site","total_parking_capacity_in_site","total_parking_capacity_in_site","total_parking_capacity_in_site","total_parking_capacity_in_site","total_parking_capacity_in_site","total_parking_capacity_in_s
