In [208]:
import string
import random
import time
import math

import torch.nn as nn
import shapefile
from tqdm import tqdm_notebook as tqdm
import sklearn.feature_extraction
import numpy
import torch
import torch.random
import matplotlib.pyplot as plt

In [200]:
class RNN(nn.Module):
    """Character-level LSTM: embedding -> LSTM -> linear projection.

    The network is driven one time step at a time: each call consumes one
    symbol index per batch element and returns the un-normalised scores for
    the next symbol together with the updated LSTM state.

    Args:
        input_size: Number of distinct input symbols (rows of the embedding).
        hidden_size: Width of both the embedding and the LSTM state.
        output_size: Number of scores produced per step.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, inp, hidden):
        """Advance the model by a single time step for a whole batch.

        Args:
            inp: Integer tensor of symbol indices; its first dimension is
                taken as the batch size.
            hidden: ``(h, c)`` LSTM state, e.g. from ``init_hidden`` or from
                the previous call.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            ``(batch_size, output_size)`` and ``hidden`` is the new state.
        """
        batch_size = inp.size(0)
        encoded = self.encoder(inp)
        # The LSTM is fed a sequence of length 1: (seq=1, batch, features).
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def forward2(self, inp, hidden):
        """Single-sequence variant of ``forward``.

        The reshapes to ``(1, 1, -1)`` and ``(1, -1)`` hard-code a batch size
        of one, so ``inp`` is expected to hold exactly one symbol index.
        """
        encoded = self.encoder(inp.view(1, -1))
        output, hidden = self.rnn(encoded.view(1, 1, -1), hidden)
        output = self.decoder(output.view(1, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h_0, c_0)`` state for ``batch_size`` sequences.

        The tensors are allocated from one of the model's own parameters so
        they share its device and dtype; the state therefore stays usable
        after the model has been moved or cast (``.cuda()``, ``.double()``,
        ``.to(...)``) instead of always being float32 on the CPU.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))

In [10]:
# Open the NSW locality-polygon shapefile. The path is relative to the
# notebook's working directory and is given without a file extension
# (presumably the reader locates the .shp/.dbf files itself - TODO confirm).
suburb_shape = shapefile.Reader('../NSW_LOCALITY_POLYGON_shp/NSW_LOCALITY_POLYGON_shp')

In [14]:
# Collect the NSW_LOCA_2 field of every shapefile record; these values are
# used as the locality names the model is trained on.
names = [
    record.NSW_LOCA_2
    for record in suburb_shape.records()
]

In [100]:
# Model alphabet, in sorted order: A-Z plus apostrophe, hyphen, space and
# '*' (the padding/separator symbol used when building the corpus).
# A character's position in this list is its integer class id.
letters = sorted(list(string.ascii_uppercase) + list("'- *"))

In [270]:
def char_tensor(string):
    """Encode ``string`` as a 1-D ``LongTensor`` of indices into ``letters``.

    Args:
        string: The text to encode, one output element per character.

    Returns:
        A tensor of ``len(string)`` int64 values where element ``i`` is the
        position of ``string[i]`` in the module-level ``letters`` list.
        Characters that are not in ``letters`` keep the initial value 0
        (i.e. they are encoded as ``letters[0]``).

    Only the expected "character not in the alphabet" lookup miss is
    tolerated. Any other failure (for example ``letters`` being undefined or
    of the wrong type) propagates instead of silently yielding zeros.
    """
    tensor = torch.zeros(len(string)).long()
    for position, char in enumerate(string):
        try:
            tensor[position] = letters.index(char)
        except ValueError:
            # Unknown character: leave the zero placeholder in place.
            continue
    return tensor

# Readable time elapsed

def time_since(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: A reference timestamp as returned by ``time.time()``.

    Returns:
        A string with whole minutes and the remaining whole seconds.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def random_training_set(corpus, chunk_len, batch_size):
    """Sample a batch of random windows from ``corpus`` for next-char training.

    Every row comes from an independently drawn window of ``chunk_len + 1``
    consecutive characters: the first ``chunk_len`` characters form the input
    and the last ``chunk_len`` characters (the input shifted by one) form the
    target. Characters are encoded with ``char_tensor``.

    Args:
        corpus: The training text.
        chunk_len: Number of time steps per row.
        batch_size: Number of rows to sample.

    Returns:
        ``(inp, target)``, two ``(batch_size, chunk_len)`` long tensors
        wrapped in ``torch.autograd.Variable``.
    """
    window = chunk_len + 1
    last_start = len(corpus) - window  # randint's upper bound is inclusive
    inp = torch.LongTensor(batch_size, chunk_len)
    target = torch.LongTensor(batch_size, chunk_len)
    for row in range(batch_size):
        begin = random.randint(0, last_start)
        piece = corpus[begin:begin + window]
        inp[row] = char_tensor(piece[:-1])
        target[row] = char_tensor(piece[1:])
    return torch.autograd.Variable(inp), torch.autograd.Variable(target)

def train(inp, target, batch_size, chunk_len, decoder, optimiser):
    """Run one optimisation step on a batch of character windows.

    The model is stepped through the window one column at a time, the
    per-step losses from the module-level ``criterion`` are summed, and a
    single backward pass / optimiser update is performed on the total.

    Args:
        inp: Input indices, read column by column as ``inp[:, c]``.
        target: Target indices, read column by column as ``target[:, c]``.
        batch_size: Number of rows in ``inp`` / ``target``.
        chunk_len: Number of columns (time steps) to process.
        decoder: Model exposing ``init_hidden(batch_size)``, ``zero_grad()``
            and ``decoder(step_input, hidden) -> (output, hidden)``.
        optimiser: Optimiser over the decoder's parameters.

    Returns:
        The summed loss divided by ``chunk_len``, as a Python float.
    """
    hidden = decoder.init_hidden(batch_size)
    decoder.zero_grad()
    loss = 0

    for c in range(chunk_len):
        output, hidden = decoder(inp[:, c], hidden)
        loss += criterion(output.view(batch_size, -1), target[:, c])

    loss.backward()
    optimiser.step()

    # The loss is a 0-dim tensor; indexing it with [0] raises on current
    # PyTorch, so extract the scalar with .item() instead.
    return loss.item() / chunk_len

In [271]:
def generate(decoder, prime_str='****', predict_len=100, temperature=0.8):
    """Sample ``predict_len`` characters from ``decoder`` after priming it.

    Args:
        decoder: Model exposing ``init_hidden(1)`` and
            ``decoder(step_input, hidden) -> (output, hidden)``.
        prime_str: Text fed to the model first; it is also the prefix of the
            returned string.
        predict_len: Number of characters to sample.
        temperature: Divisor applied to the scores before exponentiation;
            lower values make sampling more conservative.

    Returns:
        ``prime_str`` followed by the sampled characters.
    """
    state = decoder.init_hidden(1)
    primer = torch.autograd.Variable(char_tensor(prime_str).unsqueeze(0))

    # Warm up the recurrent state on every priming character but the last.
    for pos in range(len(prime_str) - 1):
        _, state = decoder(primer[:, pos], state)

    # The last priming character is the first input of the sampling loop.
    current = primer[:, -1]
    pieces = [prime_str]

    for _ in range(predict_len):
        scores, state = decoder(current, state)

        # Temperature-scaled, exponentiated scores act as unnormalised
        # weights of a multinomial distribution over the alphabet.
        weights = scores.data.view(-1).div(temperature).exp()
        choice = torch.multinomial(weights, 1)[0]

        # Emit the sampled character and feed it back in as the next input.
        next_char = letters[choice]
        pieces.append(next_char)
        current = torch.autograd.Variable(char_tensor(next_char).unsqueeze(0))

    return ''.join(pieces)

In [272]:
# inputs = []
# outputs = []
# window_size = 4
# for name in tqdm(names):
#     name = '*' * window_size + name + '****'
#     for i in range(len(name) - window_size):
#         inputs.append(name[i:i + window_size])
#         outputs.append(name[i + window_size])

In [273]:
# Build one long training string: every name is preceded by four '*'
# separator characters, and a final run of four '*' closes the corpus.
input_corpus = ''.join(
    '*' * 4 + suburb_name
    for suburb_name in names
) + '****'

In [274]:
# letter_to_int = {l:i for i, l in enumerate(letters)}

In [275]:
# input_arr = numpy.zeros((len(inputs), 4, len(letters)))
# for n, inp in enumerate(inputs):
#     for i, l in enumerate(inp):
#         input_arr[n, i, letter_to_int[l]] = 1

In [276]:
# output_arr = numpy.zeros((len(inputs), len(letters)))
# for n, out in enumerate(outputs):
#     output_arr[n, letter_to_int[l]] = 1

In [277]:
# One-layer LSTM with a 16-wide hidden state; the input and output
# vocabularies are both the `letters` alphabet.
decoder = RNN(
    input_size=len(letters),
    hidden_size=16,
    output_size=len(letters),
    n_layers=1,
)

In [278]:
# Loss used by train() (looked up as a module-level name) and the Adam
# optimiser that updates the decoder's parameters.
criterion = torch.nn.CrossEntropyLoss()
decoder_optimiser = torch.optim.Adam(params=decoder.parameters(), lr=0.001)

In [279]:
# Training loop. Note that an "epoch" here is a single optimisation step on
# one randomly sampled batch, not a full pass over the corpus.
start = time.time()
all_losses = []    # loss of every step, in order, for later inspection/plotting
loss_avg = 0       # running sum of the per-step losses
n_epochs = 10000
batch_size = 128
chunk_len = 4
print_every = 100  # progress report (and sample generation) interval, in steps
for epoch in tqdm(range(1, n_epochs + 1)):
    loss = train(*random_training_set(input_corpus, chunk_len, batch_size), batch_size, chunk_len,
                 decoder, decoder_optimiser)
    loss_avg += loss
    # Record the history; previously all_losses was created but never filled.
    all_losses.append(loss)

    # Report on steps 1, 101, 201, ... together with a freshly sampled string.
    if epoch % print_every == 1:
        print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 100, loss))
        print(generate(decoder, '****', 100), '\n')

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))



[0m 0s (1 0%) 3.4014]
****OGDK-BUPUIOXHEUWQOEAONAKFMUUJK-UDFKHHWAQE HRIQDRJHENFEETQDDQ'RQKHHTGYNCHUNTYDPBCCCWOEOCIG*JP-NZFLCXZ 

[0m 2s (101 1%) 2.7168]
****P*R*M**S*CW********L******O***MG*****E *****R******E*A*G*E**SZRA*OME*O*O************OU GY*HXEWHXMTA* 

[0m 3s (201 2%) 2.5910]
*********OHITOSNUODFEGTERH********LDIEXLG*****TEORN*****UPL AEG****UGHTLLA *DOEXERA******RIA****AWAHRAEJ 

[0m 5s (301 3%) 2.3597]
****BOEGOT'JLLERAT****AGTBA****UEL*****SUN*****ZOA****ERILLE****CRODLONOR*TS CHLIR*IAA*****ALRR******AT* 

[0m 7s (401 4%) 2.3739]
*****UPY***TERARILLERR******LURAN******UAGAHT BOENS*****OOE****BOIWVE KBRREONINE***OREG****EEREAR******* 

[0m 9s (501 5%) 2.2300]
****CAY*****NOGEHEEL*****PAT-*****BEEON*****SBTAOLLARRANGARA******EOARAR******U*RINORIT******SIE*****ARO 

[0m 11s (601 6%) 2.2035]
****TARE*******NOONLEE ******EINRAHWOT******LAMIRNA*******BORAT***XOE*****LALER****EEL******WALS RALLA** 

[0m 13s (701 7%) 2.1982]
****MBURLINLEK*****FIN*****RAEN*****LEA*****SORLE******PIAN

[1m 53s (6201 62%) 1.8655]
****TARESTY VALLLAH***************THEENDALLEYBRA***********WERICKN**********NS SOUNTI CREEK POYANDOSEN V 

[1m 55s (6301 63%) 1.9512]
****CUREELLINGOONDA**************THEREY****************MANERINGASTEN***************CONGIMBUE************ 

[1m 57s (6401 64%) 1.9568]
*****WANDA*****************MARSTN*****************HEEK**************HILA WEA CREEK***********BAMBURRONS  

[1m 59s (6501 65%) 1.9452]
****BAPERINGE***************STRIRWANGAR*************CONOAR***************PONPLERS TROOD***************YA 

[2m 1s (6601 66%) 1.9031]
****EMOINAMOOKE*****************GUPBUNGIS******************BOWACH****************TING******************C 

[2m 2s (6701 67%) 1.8811]
****BURA COEN*******************CRELLE******************HIWPA***************SOON********************MORA 

[2m 4s (6801 68%) 1.8183]
****TARAC***************BALLEMBARRADA*****************GLASTOAT*********************WARLI**************** 

[2m 6s (6901 69%) 1.8794]
****COOL***************WAREEK**********

In [280]:
import pdb

In [None]:
# NOTE(review): leftover interactive debugging. pdb.pm() opens the post-mortem
# debugger on the most recent uncaught exception, so this cell is only
# meaningful immediately after another cell has failed; consider removing it
# before sharing the notebook.
pdb.pm()