<a href="https://colab.research.google.com/github/msh2481/CodeStyler/blob/main/Torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!rm -rf ./*
!git clone https://github.com/msh2481/CodeStyler.git && mv CodeStyler/* . && rm -rf CodeStyler
!ls

Cloning into 'CodeStyler'...
remote: Enumerating objects: 7936, done.[K
remote: Counting objects: 100% (7936/7936), done.[K
remote: Compressing objects: 100% (6536/6536), done.[K
remote: Total 7936 (delta 1405), reused 7917 (delta 1399), pack-reused 0[K
Receiving objects: 100% (7936/7936), 9.12 MiB | 10.14 MiB/s, done.
Resolving deltas: 100% (1405/1405), done.
Baseline.ipynb	   filenames.txt  README.md	   Recurrent.ipynb
Feedforward.ipynb  files	  Reccurent.ipynb


In [3]:
from random import shuffle, choices, choice
from collections import deque, defaultdict, Counter
from itertools import islice
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [14]:
CHUNK_SIZE = 6
BATCH_SIZE = 256
BATCHES_IN_TRAIN = 10
BATCHES_IN_TEST = 2
TRAIN_SIZE = BATCH_SIZE * BATCHES_IN_TRAIN
TEST_SIZE = BATCH_SIZE * BATCHES_IN_TEST
MIN_OCCURENCES = 10
MEMORY = 100

In [15]:
def fmt(number):
    return '{:.5f}'.format(number)

In [16]:
rawTexts = []
alphabet = Counter()
import string
printable = set(string.printable)

for filename in open('filenames.txt'):
    if len(rawTexts) > TRAIN_SIZE + TEST_SIZE:
        break
    text = open(filename.strip(), encoding='utf-8').read()
    text = ''.join([x for x in text if  x in printable])
    if 'debug' in text or 'DEBUG' in text or '000' in text:
        continue
    for c in text:
        assert c in printable
    print(text)
    alphabet.update(text)
    for pos in range(0, len(text) - CHUNK_SIZE + 1):
        rawTexts.append(text[pos : pos + CHUNK_SIZE])
alphabetCount = Counter()
alphabetCount['%'] = 0
for x, y in alphabet.items():
    if y >= MIN_OCCURENCES:
        alphabetCount[x] += y
    else:
        alphabetCount['%'] += y
alphabet = [x for x, y in alphabetCount.items()]
ALPHABET_SIZE = len(alphabet)
print(f'alphabet of length {len(alphabet)}: {alphabetCount}')

shuffle(rawTexts)
print(f'{len(rawTexts)} texts in total')

// !LANGUAGE: +NewInference +ProhibitInvisibleAbstractMethodsInSuperclasses
// !DIAGNOSTICS: -UNUSED_VARIABLE -ASSIGNED_BUT_NEVER_ACCESSED_VARIABLE -UNUSED_VALUE -UNUSED_PARAMETER -UNUSED_EXPRESSION
// SKIP_TXT
// FULL_JDK

// MODULE: base
// FILE: AbstractClassCase1.kt
package base

// TESTCASE NUMBER: 1
abstract class AbstractClassCase1() {
    <!INCOMPATIBLE_MODIFIERS!>private<!> <!INCOMPATIBLE_MODIFIERS!>abstract<!> fun priv()
    protected abstract fun prot()
    internal abstract fun int()
    public abstract fun pub()

    <!INCOMPATIBLE_MODIFIERS!>private<!> <!INCOMPATIBLE_MODIFIERS!>abstract<!> val priv1: String
    protected abstract val prot1: String
    internal abstract val int1: String
    public abstract val pub1: String
}

<!INVISIBLE_ABSTRACT_MEMBER_FROM_SUPER_ERROR!>class Case1<!> : AbstractClassCase1(){
    override fun prot() {}

    override fun int() {
        prot()
    }

    override fun pub() {}

    override val prot1: String
        get() = ""
    override v

In [17]:
charToIndexMap = { c : i for i, c in enumerate(alphabet) }
def charToIndex(c):
    return torch.as_tensor(charToIndexMap.get(c, ALPHABET_SIZE - 1), dtype=torch.long)

def stringToTensor(cur):
    x = torch.zeros(size=(len(cur), ALPHABET_SIZE))
    for j in range(len(cur)):
        x[j][charToIndex(cur[j])] = 1
    return x

class StringDataset(Dataset):
    def __init__(self, strings):
        super(StringDataset, self).__init__()
        self.strings = strings
    def __len__(self):
        return len(self.strings)
    def __getitem__(self, i):
        return stringToTensor(self.strings[i])

trainSet = DataLoader(StringDataset(rawTexts[: TRAIN_SIZE]), batch_size=BATCH_SIZE, shuffle=True)
testSet = DataLoader(StringDataset(rawTexts[TRAIN_SIZE : TRAIN_SIZE + TEST_SIZE]), batch_size=BATCH_SIZE, shuffle=False)


In [18]:
print(len(trainSet), len(testSet))
print('---')
# print(next(iter(trainSet)))
print('---')

10 2
---
---


In [62]:
lossFunction = nn.CrossEntropyLoss()

def parametersTensor(predictor):
    return torch.cat(tuple(elem.view(-1) for elem in predictor.parameters()))

def gradientsTensor(predictor):
    return torch.cat(tuple(elem.grad.view(-1) for elem in predictor.parameters()))

X_ORT = None
Y_ORT = None

def tensorTo2D(v):
    global X_ORT, Y_ORT
    if X_ORT is None:
        assert Y_ORT is None
        X_ORT = torch.rand(v.shape, dtype=torch.double)
        Y_ORT = torch.rand(v.shape, dtype=torch.double)
        X_ORT = F.normalize(X_ORT, dim=0)
        Y_ORT = F.normalize(Y_ORT, dim=0)
    print(X_ORT.sum(), X_ORT.mean(), X_ORT.std())
    vx = torch.mul(v, X_ORT)
    print(vx.mean(), vx.std())
    vy = torch.mul(v, Y_ORT)
    return vx.sum(), vy.sum()

def evaluateOnBatch(predictor, batch):
    N = batch.shape[0]
    assert batch.shape == (N, CHUNK_SIZE, ALPHABET_SIZE)
    h0 = torch.randn((1, N, ALPHABET_SIZE + MEMORY))
    data = batch[:, :-1, :]
    answer = batch[:, -1, :].argmax(dim=-1)
    assert answer.shape == (N, )
    output = predictor(data, h0)[0][:, -1, :]
    output = output[:, :ALPHABET_SIZE]
    assert output.shape == (N, ALPHABET_SIZE)
    loss = lossFunction(output, answer)
    accuracy = (output.argmax(dim=-1) == answer).float().mean()
    return accuracy, loss
        
def train(predictor, optimizer, startEpoch):
    predictor.train()
    trainAccuracy = 0
    trainLogLoss = 0
    trainSize = 0
    for batch in islice(trainSet, BATCHES_IN_TRAIN):
        optimizer.zero_grad()
        accuracy, loss = evaluateOnBatch(predictor, batch)
        loss.backward()
        optimizer.step()
        trainAccuracy += accuracy
        trainLogLoss += loss.item()
    trainAccuracy /= BATCHES_IN_TRAIN
    trainLogLoss /= BATCHES_IN_TRAIN

    with torch.no_grad():
        predictor.eval()
        testAccuracy = 0
        testLogLoss = 0
        testSize = 0
        for batch in islice(testSet, BATCHES_IN_TEST):
            accuracy, logLoss = evaluateOnBatch(predictor, batch)
            testAccuracy += accuracy
            testLogLoss += loss.item()
        testAccuracy /= BATCHES_IN_TEST
        testLogLoss /= BATCHES_IN_TEST

        p = parametersTensor(predictor)
        g = gradientsTensor(predictor)
        print(f'State: parameters = ({fmt(p.mean())}, {fmt(p.std())}) gradients = ({fmt(g.mean())},  {fmt(g.std())})')
        print(f'#{startEpoch}: {fmt(trainAccuracy)} {fmt(trainLogLoss)} {fmt(testAccuracy)} {fmt(testLogLoss)}')
        print(flush=True)

In [63]:
def guessNext(predictor, text):
    data = stringToTensor(text).view(1, -1, ALPHABET_SIZE)
    h0 = torch.randn((1, 1, ALPHABET_SIZE + MEMORY))
    output = predictor(data, h0)[0][:, -1, :]
    output = output[0, :ALPHABET_SIZE]
    return output

def guessNextK(predictor, prefix, k):
    for it in range(k):
        p = guessNext(predictor, prefix)
        i = p.argmax(dim=0).item()
        c = alphabet[i]
        prefix += c
    return prefix

In [None]:
predictor = nn.GRU(input_size=ALPHABET_SIZE,
                   hidden_size=ALPHABET_SIZE+MEMORY,
                   num_layers=1,
                   bias=True,
                   batch_first=True,
                   dropout=0,
                   bidirectional=False)
optimizer = torch.optim.Adam(predictor.parameters())

for i in range(10 ** 9):
    train(predictor, optimizer, i)
    print(guessNextK(predictor, choice(rawTexts), 100))

State: parameters = (-0.00039, 0.04620) gradients = (0.00000,  0.00080)
#0: 0.02617 4.03544 0.04297 4.01374

// !DI >  r                                                                                               
State: parameters = (-0.00055, 0.04655) gradients = (0.00000,  0.00074)
#1: 0.08711 3.99345 0.11523 3.96973

NUSED_                                                                                                    
State: parameters = (-0.00068, 0.04712) gradients = (0.00000,  0.00072)
#2: 0.13438 3.94480 0.14648 3.92562

()
                                                                                                       
State: parameters = (-0.00082, 0.04804) gradients = (0.00000,  0.00067)
#3: 0.14609 3.86899 0.14648 3.83229

FROM_S                                                                                                    
State: parameters = (-0.00101, 0.04942) gradients = (0.00000,  0.00053)
#4: 0.14766 3.76935 0.14453 3.75009

: 2
cl                     

In [None]:
optimizer = torch.optim.SGD(predictor.parameters(), lr=0.1, momentum=0.9)

for i in range(10 ** 9):
    train(predictor, optimizer, i)
    print(i)
    samplePrediction(predictor, 64)