In [208]:
import string
import random
import time
import math

import torch.nn as nn
import shapefile
from tqdm import tqdm_notebook as tqdm
import sklearn.feature_extraction
import numpy
import torch
import torch.random
import matplotlib.pyplot as plt

In [200]:
class RNN(nn.Module):
    """Character-level LSTM model: embedding -> LSTM -> linear decoder.

    Each call consumes a single time step (one symbol index per batch element)
    and returns unnormalised scores over the output classes together with the
    updated LSTM state.

    Args:
        input_size: number of distinct input symbols (rows of the embedding table).
        hidden_size: width of both the embedding vectors and the LSTM state.
        output_size: number of classes scored by the linear decoder.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, inp, hidden):
        """Advance the model by one time step for a whole batch.

        Args:
            inp: integer tensor whose first dimension is the batch size and
                which holds exactly one symbol index per batch element.
            hidden: ``(h, c)`` LSTM state, e.g. from ``init_hidden(batch_size)``.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            ``(batch_size, output_size)`` and ``hidden`` is the new LSTM state.
        """
        batch_size = inp.size(0)
        encoded = self.encoder(inp)
        # The LSTM expects (seq_len, batch, features); this is a sequence of length 1.
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def forward2(self, inp, hidden):
        """Single-example variant of ``forward`` (batch size fixed to 1).

        ``inp`` must hold exactly one symbol index; ``hidden`` must have been
        created for a batch of one. Returns ``(output, hidden)`` with ``output``
        of shape ``(1, output_size)``.
        """
        encoded = self.encoder(inp.view(1, -1))
        output, hidden = self.rnn(encoded.view(1, 1, -1), hidden)
        output = self.decoder(output.view(1, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h, c)`` LSTM state for ``batch_size`` sequences.

        The tensors are allocated from one of the model's own parameters so
        they share its dtype and device; a plain ``torch.zeros`` would always
        be float32 on the CPU and break once the model is moved or cast.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_size)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [10]:
# Open the NSW locality polygon shapefile; the path is relative to the
# notebook's working directory, so the data folder must sit one level up.
suburb_shape = shapefile.Reader('../NSW_LOCALITY_POLYGON_shp/NSW_LOCALITY_POLYGON_shp')

In [14]:
# Collect the NSW_LOCA_2 attribute of every shapefile record; these values are
# the training names (presumably the locality-name field -- confirm against the
# shapefile schema).
names = [r.NSW_LOCA_2 for r in suburb_shape.records()]

In [100]:
# Model alphabet as a sorted list, so a character's list position is its integer
# class index: A-Z plus apostrophe, hyphen, space and '*'.
letters = sorted(string.ascii_uppercase + "'- *")

In [270]:
def char_tensor(string, alphabet=None):
    """Encode a string as a 1-D LongTensor of alphabet indices.

    Args:
        string: text to encode (note: the parameter name shadows the ``string``
            module inside this function only).
        alphabet: sequence (list or str) whose positions define the indices.
            Defaults to the notebook-level ``letters`` list.

    Returns:
        A ``torch.long`` tensor of length ``len(string)``. Characters that are
        not in the alphabet are left at index 0 rather than raising.
    """
    if alphabet is None:
        alphabet = letters
    tensor = torch.zeros(len(string), dtype=torch.long)
    for position, char in enumerate(string):
        try:
            tensor[position] = alphabet.index(char)
        except ValueError:
            # Unknown character: keep the pre-filled 0. Only the lookup failure
            # is tolerated; any other error should surface.
            continue
    return tensor

# Readable time elapsed

def time_since(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    ``since`` is a timestamp as returned by ``time.time()``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def random_training_set(corpus, chunk_len, batch_size):
    """Sample a batch of random chunks from ``corpus`` for next-character training.

    Each row is a window of ``chunk_len + 1`` consecutive characters: the first
    ``chunk_len`` form the input and the last ``chunk_len`` (the same window
    shifted by one) form the target.

    Returns:
        ``(inp, target)``, two ``(batch_size, chunk_len)`` long tensors.
    """
    last_start = len(corpus) - chunk_len - 1
    inputs = torch.LongTensor(batch_size, chunk_len)
    targets = torch.LongTensor(batch_size, chunk_len)
    for row in range(batch_size):
        begin = random.randint(0, last_start)
        window = corpus[begin:begin + chunk_len + 1]
        inputs[row] = char_tensor(window[:-1])
        targets[row] = char_tensor(window[1:])
    return torch.autograd.Variable(inputs), torch.autograd.Variable(targets)

def train(inp, target, batch_size, chunk_len, decoder, optimiser, loss_fn=None):
    """Run one optimisation step on a batch of character chunks.

    Args:
        inp: ``(batch_size, chunk_len)`` tensor of input symbol indices.
        target: ``(batch_size, chunk_len)`` tensor of next-symbol indices.
        batch_size: number of rows in ``inp`` / ``target``.
        chunk_len: number of time steps to unroll.
        decoder: model providing ``init_hidden(batch_size)`` and a call
            ``decoder(step_input, hidden) -> (output, hidden)``.
        optimiser: optimiser holding ``decoder``'s parameters.
        loss_fn: callable ``(scores, targets) -> scalar loss``. Defaults to the
            notebook-level ``criterion`` so existing call sites keep working.

    Returns:
        The mean per-step loss for this batch as a Python float.
    """
    if loss_fn is None:
        loss_fn = criterion
    hidden = decoder.init_hidden(batch_size)
    decoder.zero_grad()
    loss = 0

    # Feed one column (time step) at a time and sum the per-step losses.
    for c in range(chunk_len):
        output, hidden = decoder(inp[:, c], hidden)
        loss += loss_fn(output.view(batch_size, -1), target[:, c])

    loss.backward()
    optimiser.step()

    # The loss is a 0-dim tensor; indexing it (`loss.data[0]`) raises on
    # current PyTorch, so extract the scalar with .item().
    return loss.item() / chunk_len

In [271]:
def generate(decoder, prime_str='****', predict_len=100, temperature=0.8):
    """Sample ``predict_len`` characters from ``decoder``, seeded with ``prime_str``.

    The scores are divided by ``temperature`` before exponentiation, so lower
    values make the sampling more conservative. Returns ``prime_str`` followed
    by the sampled characters.
    """
    state = decoder.init_hidden(1)
    primer = torch.autograd.Variable(char_tensor(prime_str).unsqueeze(0))

    # Warm up the recurrent state on every priming character except the last,
    # which becomes the first input of the sampling loop.
    for step in range(len(prime_str) - 1):
        _, state = decoder(primer[:, step], state)

    current = primer[:, -1]
    pieces = [prime_str]

    for _ in range(predict_len):
        scores, state = decoder(current, state)

        # Treat the temperature-scaled, exponentiated scores as multinomial weights.
        weights = (scores.data.view(-1) / temperature).exp()
        choice = torch.multinomial(weights, 1)[0]

        # Record the sampled character and feed it back in as the next input.
        next_char = letters[choice]
        pieces.append(next_char)
        current = torch.autograd.Variable(char_tensor(next_char).unsqueeze(0))

    return ''.join(pieces)

In [272]:
# inputs = []
# outputs = []
# window_size = 4
# for name in tqdm(names):
#     name = '*' * window_size + name + '****'
#     for i in range(len(name) - window_size):
#         inputs.append(name[i:i + window_size])
#         outputs.append(name[i + window_size])

In [273]:
# Build one long training string: every name is prefixed with four '*' markers
# and the corpus ends with four more, so '****' separates consecutive names.
input_corpus = ''.join('*' * 4 + n for n in names) + '****'

In [274]:
# letter_to_int = {l:i for i, l in enumerate(letters)}

In [275]:
# input_arr = numpy.zeros((len(inputs), 4, len(letters)))
# for n, inp in enumerate(inputs):
#     for i, l in enumerate(inp):
#         input_arr[n, i, letter_to_int[l]] = 1

In [276]:
# output_arr = numpy.zeros((len(inputs), len(letters)))
# for n, out in enumerate(outputs):
#     output_arr[n, letter_to_int[l]] = 1

In [277]:
# Input and output vocabularies are both the `letters` alphabet; 16 hidden units, one LSTM layer.
decoder = RNN(input_size=len(letters), hidden_size=16, output_size=len(letters), n_layers=1)

In [278]:
# Adam over all of the model's parameters with a learning rate of 1e-3.
decoder_optimiser = torch.optim.Adam(decoder.parameters(), lr=1e-3)
# Cross-entropy over the decoder's raw scores; train() reads this as a global.
criterion = nn.CrossEntropyLoss()

In [None]:
# List meant to hold the training-loss history; it lives in its own cell, so
# re-running the training cell does not reset it.
all_losses = []

In [None]:
# Training driver. Note that each "epoch" here is a single optimisation step on
# one randomly sampled batch, not a full pass over the corpus.
start = time.time()
loss_avg = 0
n_epochs = 100000
batch_size = 128
chunk_len = 4
print_every = 100  # steps between progress reports / loss-history entries
for epoch in tqdm(range(1, n_epochs + 1)):
    loss = train(*random_training_set(input_corpus, chunk_len, batch_size), batch_size, chunk_len,
                 decoder, decoder_optimiser)
    loss_avg += loss

    # Report on steps 1, 101, 201, ... with the latest batch loss and a sample.
    if epoch % print_every == 1:
        print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 100, loss))
        print(generate(decoder, '****', 100), '\n')

    # Record the mean loss of the last `print_every` steps and restart the
    # running sum; the single-batch loss printed above is too noisy to track.
    if epoch % print_every == 0:
        all_losses.append(loss_avg / print_every)
        loss_avg = 0

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))



[0m 0s (1 0%) 1.8827]
****CAMBA***********************BRUND****************KESHILMTON****************************BOONDI******* 

[0m 2s (101 0%) 1.7728]
****BAGE****************LEAD****************TANDEN HOLLE*************BARRIGH****************WAMBURRONG** 

[0m 4s (201 0%) 1.7724]
****REEMBOIWARE****************THALLERIGLENAYANAH*********************WOOWALLEI************************* 

[0m 5s (301 0%) 1.9111]
****COOUN BRIEM*****************BERREEK*************CAH HURNTS**************RUNT**********************LA 

[0m 7s (401 0%) 1.9237]
****MOOT PARRABAIT*****************NINHAN******************THENG************NCHAVILL*****************ALL 

[0m 10s (501 0%) 1.8637]
****BA BIONG*****************TACKILLE***************DEENNORS MOULBI**************GOODHON**************** 

[0m 12s (601 0%) 1.8346]
****RROOREE************NTAH****************NERINGSY ROWONTHOO***************MARAB*******GRACH*********** 

[0m 14s (701 0%) 1.7630]
****CONMIST***************MONG VALE************THORTH PALI

[2m 7s (6201 6%) 1.7286]
****MUNDEN****************************************COOD************************************TOALMUNTH BEAR 

[2m 9s (6301 6%) 1.7596]
****LASWAREE*******************BOBINT*************************DOONBEMPI NOGREEFT HILLE HILLE************ 

[2m 10s (6401 6%) 1.9274]
****TALETER*****************WEACK**********************MIIGA******************MARRINER HILL************* 

[2m 12s (6501 6%) 1.9390]
****RIGTH****************AMARRS GROO***********EANT****************EEM**********ROWN CREEK*******EBUBAL* 

[2m 14s (6601 6%) 1.8166]
****CERAMBRA****************BIDMIOND PARKE****************KINTON****************MOURRIDGE**************T 

[2m 17s (6701 6%) 1.8186]
****CATEEK************KIRROCKEON*****************LEANGLA*****************BOGHEY**************EAST CREEK* 

[2m 19s (6801 6%) 1.8444]
****YENT CROST************ELILLS*************EBOMOOK****************EGRELLIG******************CREEK***** 

[2m 21s (6901 6%) 1.9081]
****WOON****************EANT NANDA***********

[4m 22s (12401 12%) 1.7649]
*****MIOD********LULLE ROVE***********EADVILLE**********EAWTON*********HILLE***********EAGHE POINT CREEK 

[4m 25s (12501 12%) 1.9008]
****GREEBOW**********ELAMARK*********ELUNTER PARDON***********EEAVEN**********OYEN************EAGHATT HI 

[4m 27s (12601 12%) 1.8004]
****FREEK**********HAID*******FALLGERGALA************EACDALE***********EATHANGI**********ALEEK*********E 

[4m 28s (12701 12%) 1.8620]
****MUGLE GABABROS*********ELADDI CREEK********EATHAMBALONVALETHER*********EOVE***********EIRDOY******** 

[4m 30s (12801 12%) 1.7894]
****WEELGRA************EIEENANG HILLE************EDUDGER POINGAND************ALEDIS CREEK*********EUCK** 

[4m 32s (12901 12%) 2.0077]
****EUTTIS PIRONDS***********EANGETS PARKTARRI********EWEEDDNERA************EARST CREED******SPRIND***** 

[4m 34s (13001 13%) 1.7184]
****FRSTAINT CREELDUN***********EENTEN NOTE**********EASTTANGLA************EIRAPPORI***********EESTH HIL 

[4m 36s (13101 13%) 1.8445]
****CELADDARRAY*********EDE

[6m 42s (18501 18%) 1.7685]
****PILDA*************EWOUDS CREEK************MIONTH ERAVE***********ELAVEAHOWGRESTEN*************EAREEK 

[6m 48s (18601 18%) 1.8212]
****MOUNTWAMBULLI POINT ALE***********EACK************EAGHEAVILLY***********ASVEISF PLAREEK**********EAS 

[6m 50s (18701 18%) 1.8323]
****INDELLS*************OONITS*************YBOPPARK************ELIONTIE**********EITHEFLEMBILD MING MING 

[6m 52s (18801 18%) 1.7956]
****ABK************EATHS*********EACDSORTON***********YOUGALONS BALLE CREEK*********AILDE**********EURID 

[6m 55s (18901 18%) 1.7755]
****MOUNT*************OIDA************EUDEEDIANGLEALAWORTH**************MARRA**********YOOCDANGALA****** 

[6m 59s (19001 19%) 1.8599]
****BARVERWELLIMBUL*************EPPEAKE***********EIGUNCH***********EAGHE*********EUILLLADE**********EAD 

[7m 2s (19101 19%) 1.8564]
****WILDIE***********EINTH*************LIA**************EATTH**************RUOLINCHALLANADAND ROYS BANGI 

[7m 5s (19201 19%) 1.8014]
****WARTHTON**********ERAMA**

[9m 10s (24601 24%) 1.8348]
****ELODS BAY***********EEAD***********BEADS**************IPPESTIVERSTALE*****************ELISTH HILLE** 

[9m 12s (24701 24%) 1.8819]
****RMBY RIVICAREMOUDABARK***********EACKY HEENNOUNEBOAN HILLE*************BIDGERREY**********OUNDENGAMB 

[9m 14s (24801 24%) 1.9284]
****OAPTAWOONA**************YALLANGAHBALMAMONGONS BERS CREEK************SMIESI**************ALLEMA****** 

[9m 17s (24901 24%) 1.7641]
****COOROOM*************BUDASE SHT BUSIWRAH************ELIEBKINGONS BAON****************EUDINCROSSHINGWO 

[9m 20s (25001 25%) 1.7359]
****GOF CREEK**********OUNDARNE JILLE************HILLA VALLE**********EANGOOLLYG*************EWOEN****** 

[9m 24s (25101 25%) 1.8789]
****THERA WAMBU VALE***********BUNTHARRAWABE MANGA*************ELAKE***********LOICH**********YALLE***** 

[9m 26s (25201 25%) 1.8662]
*****BALMORELLIN****************EANGA**************EAMIRO************EADGLEVILLI**************AYAREELLON 

[9m 29s (25301 25%) 1.8544]
****CREEK***********EALCK**

[11m 25s (30701 30%) 1.8097]
****BREE**************HILLONG DOYOLLALL PARK***************ELOUCTHN CREEN PARK****************ELCARRAIN* 

[11m 27s (30801 30%) 1.8484]
****ARINGLEY***********APPEER HEIGT CREEK**************AINT***************WILOBANDER**************RYARTH 

[11m 29s (30901 30%) 1.8033]
****NOOT FLABBALONS PARK***************HILLLE*****************ELAKIE****************EACHTON************* 

[11m 31s (31001 31%) 1.8032]
****MOOBYARNT COMALLARRAGHTON**************EDOACO***************EIGHES************EACK****************EU 

[11m 32s (31101 31%) 2.0066]
****BOPPER FLABETTY************AULLARA*************EWILLA*************EADDAWDILLE****************EGREELD 

[11m 34s (31201 31%) 1.9434]
****HOA******************EASTH***************YALCHABBELY**************WOUNGAL******************EBEK***** 

[11m 36s (31301 31%) 1.7950]
****MULMIRAMS CREEK************WISSTSITHEY**********ELSWUNT**************EILDSLANGAREAST************EIGL 

[11m 38s (31401 31%) 1.9303]
****COAST**********

[13m 38s (36801 36%) 1.9516]
****BERRILL*********************EDTHANDSY BER PANGAN GREKSYS FLASDAND****************HILD*************** 

[13m 42s (36901 36%) 1.7457]
****DOODEAK**************EUWPOIDJINGSARRA************ELIANT**************EBARRIDGE***********EADKEACH HE 

[13m 46s (37001 37%) 1.9024]
****UPPEAWMORG********************LIALNING********************OLMA*************OOBARANDA**************** 

[13m 49s (37101 37%) 1.6929]
****CHOUK****************BIRAST PERGEENS**************ELOUTH**************EMACK ARK***************EATTON 

[13m 53s (37201 37%) 1.8513]
****BALLILIORTOVASOWTH********************EASTZ*************EDCKONAL************************WACKONGY**** 

[13m 54s (37301 37%) 1.6950]
****BEACH HILL*******************ARME***************EDSFFLALLE********************BERRIDEE************** 

[13m 56s (37401 37%) 1.8519]
****MARRAY VALE**************PARLIVER********MIDSTING***************EIENURRILL****************WOODARA*** 

[13m 58s (37501 37%) 1.8692]
****MOUGGARA*******

[15m 52s (42901 42%) 1.8313]
****NEDE*********EAVE*************EIGHEACH**********EUKHOCKEWONG***************EDOONNEE COLA************ 

[15m 54s (43001 43%) 1.8143]
****AMBLI******************BORTH COLMONGEE********************EDTHANNEACONGH RAGH*******AITH************ 

[15m 56s (43101 43%) 1.7722]
****BRIGH*********JEELLONGARYARA**************EACKS MACKSTOITH BOOLHOMBOW*************EIGHERIE********** 

[15m 58s (43201 43%) 1.7534]
****EA**TRILLA*****************GLIATHALGATER MOUNTLAH**************EUNGAT***************BEDDAINNA******* 

[16m 0s (43301 43%) 1.7601]
****WEE**************JENTWORGOUTHATON****************EDEST*************EDEENGAROCK***************HILLIS* 

[16m 2s (43401 43%) 1.8042]
****TAMBRUMBARRA**************HEENNIVETER CREE***********LINDESTGO****************OOPNORAL WOLAMARIGHWIN 

[16m 4s (43501 43%) 1.9317]
****UPPIEK***********QUERRILLE****************ELUVEAMA****************EALDS****************EDENT PARK*** 

[16m 6s (43601 43%) 1.7594]
****WILDERONDA*********

[17m 51s (49001 49%) 1.8093]
****PESTON MOUNT CREEK*******************YBAZEAROGSURWIN*********************EUTORIGOL****************** 

[17m 53s (49101 49%) 1.8898]
****MELLE****************EDENTH CREEK*************AEKALE****************EIEK*****************ELINGANGA** 

[17m 55s (49201 49%) 1.8167]
****LUVILLLE**********************EAST**********************OLSTON*****************POXLAIN************** 

[17m 57s (49301 49%) 1.8333]
****ERAH***************EDTELE**********************EBIGHT PNS CREELD*****************THEAVE************* 

[17m 59s (49401 49%) 1.7630]
****GATHAZE*********************WARCKST********************TEYAMAR*************************HASVILLE***** 

[18m 1s (49501 49%) 1.8703]
****GALLAINTLOR**********************ESTH******************GHEALD*************OLVE******************MOOT 

[18m 3s (49601 49%) 1.9875]
****DALLE********************ELIGSWOOLLORES MOONALAINS*******************REEK*********************UPPERS 

[18m 5s (49701 49%) 1.8941]
****BLANA*************

[19m 51s (55101 55%) 1.7243]
****POODAN VALE*********************AITTH*******************EPP HILLE PARK********************EUMANS HOO 

[19m 53s (55201 55%) 1.7832]
****BARLINGH ARPHADE CREERSTON*********************EBLEAMUNGERS ULLANGO**********************COOMARLEMOO 

[19m 55s (55301 55%) 1.7881]
****KAKE***************BUCHERRILL*********************BANTINVILLE********************ELUREELLA********** 

[19m 57s (55401 55%) 1.7831]
****MARKOUTH BAYNAREANS BELLIEROM************************PEEMBERROWIGROBILS**************************WRI 

[19m 59s (55501 55%) 1.8935]
****CREEK***********************EBEDCA*******************SHUEMYS GOOLA******************************YPEM 

[20m 1s (55601 55%) 1.7513]
****LOCKEN***********************ARK********************EBEAGROODS**********************EBEACH********** 

[20m 3s (55701 55%) 1.7392]
****NUNGLENGANS********************************EPTER RIDELD**************************EPPER GULPE MORTH B 

[20m 5s (55801 55%) 1.8612]
****COLANCH CREEK*****

[22m 8s (61201 61%) 1.7849]
****CHADDS**********************ELIDE********************MIULLIE PARK*********************ELUND*****SUNS 

[22m 10s (61301 61%) 1.8165]
****WANDOMAREENDI***************************YULLAPPER VALE*****************************CALLA************ 

[22m 12s (61401 61%) 1.8251]
****MOUNTS*******************************ELAKI****************************EBATILAMLEST****************** 

[22m 14s (61501 61%) 1.7981]
****LEACH**************************************YEEWARRULGROSELLS BEAFLARLEY***************************** 

[22m 16s (61601 61%) 1.8481]
****NUNGOSHANG****************************MELIGHERRILL CREE*********OODGAROOK*************************** 

[22m 18s (61701 61%) 1.9099]
****WILDGE*****************************ELAK*****************************EBUPPEAND*********************** 

[22m 20s (61801 61%) 1.8329]
****SHT*****************************EMASOUTH************************************UCREEK****************** 

[22m 22s (61901 61%) 1.9431]
****WOWRING*********

[24m 7s (67301 67%) 1.8315]
****LELLE*********************************************************************************************** 

[24m 10s (67401 67%) 1.6982]
****MPUWING********************************************************************************************* 

[24m 12s (67501 67%) 1.7250]
****BANDACORA******************************************************************************************* 

[24m 14s (67601 67%) 1.8654]
****BELLANVILLE****************************************************************************UPPEEENE WEST 

[24m 16s (67701 67%) 1.7970]
****STY LIRARANG**************************************************************************************** 

[24m 19s (67801 67%) 1.8442]
****COOMBTLEN****************************************************************EDARTWELDS***************** 

[24m 21s (67901 67%) 1.8304]
****MUNTHA***************************************************************CAPPING************************ 

[24m 23s (68001 68%) 1.9145]
****BLARINVE********

[26m 13s (73401 73%) 1.8860]
****RIGHE PLEAST CREEN********************************************************************************** 

[26m 15s (73501 73%) 1.8043]
****NILLADA********************************************************************************************* 

[26m 17s (73601 73%) 1.9129]
****GOCD************************************************************************************************ 

[26m 19s (73701 73%) 1.8304]
****GAPSIN********************************************************************************************** 

[26m 21s (73801 73%) 1.7065]
****COSTONES ALASTEK************************************************************************************ 

[26m 22s (73901 73%) 1.8291]
****THER CROST****************************************************************************************** 

[26m 24s (74001 74%) 1.8032]
****COALGIMYRICKENLE PANHIN***************************************************************************** 

[26m 26s (74101 74%) 1.9248]
****COOLLANVINE****

[28m 18s (79501 79%) 1.7918]
****DOLSHACK******************************************************************************************** 

[28m 20s (79601 79%) 1.7984]
*****DUNGERRAYS RIVER****************************************************ELYDELDONS FLAREE************** 

[28m 23s (79701 79%) 1.7260]
****HAVE************************************************************************************************ 

[28m 25s (79801 79%) 1.8434]
****SQUIED POINT**************************************************************************************** 

[28m 27s (79901 79%) 1.8271]
****WOLESP********************************************************************************************** 

[28m 29s (80001 80%) 1.8064]
****COOM************************************************************************************************ 

[28m 31s (80101 80%) 1.8953]
****WAKEE FOREER**************************************************************************************** 

[28m 33s (80201 80%) 1.6503]
****MUNGHAPCUNNOW**

[31m 4s (85601 85%) 1.8885]
****MOUNTER CREEK*************************************************************************************** 

[31m 7s (85701 85%) 1.6797]
****COE BEARTON***************************************************************************************** 

[31m 10s (85801 85%) 1.8272]
****MOURRAWOONT SOUTTLE********************************************************************************* 

[31m 13s (85901 85%) 1.7766]
****BELLAND RIGLEBACH*********************************************************************************** 

[31m 16s (86001 86%) 1.7070]
****WARCH*********************************************************************************************** 

[31m 18s (86101 86%) 1.6675]
****BEMBO*********************************************************************************************** 

[31m 21s (86201 86%) 1.7704]
****BAREEK********************************************************************************************** 

[31m 23s (86301 86%) 1.7677]
****RACHEET**********

[33m 49s (91701 91%) 1.7820]
****WEWANS PARKELONG************************************************************************************ 

[33m 52s (91801 91%) 1.9033]
****BRYDING********************************************************************************************* 

[33m 56s (91901 91%) 1.7742]
****ROGBELL********************************************************************************************* 

[34m 1s (92001 92%) 1.8820]
****NOME************************************************************************************************ 

[34m 4s (92101 92%) 1.8010]
****NAPINMESTABUGARONG********************************************************************************** 

[34m 7s (92201 92%) 1.8843]
****DALLAND********************************************************************************************* 

[34m 10s (92301 92%) 1.7830]
****BURINDED******************************************************************************************** 

[34m 13s (92401 92%) 1.9198]
****HLAND*************

In [280]:
import pdb

In [None]:
# NOTE(review): leftover interactive debugging. pdb.pm() opens the post-mortem
# debugger on the most recent uncaught exception's traceback, so this cell
# waits for console input; skip or remove it when running the notebook top to bottom.
pdb.pm()

In [281]:
# Draw ten independent samples from the trained model, each primed with '****'.
for sample_no in range(10):
    sample = generate(decoder, '****', 100)
    print(sample, '\n')

****WEEST HELL***********************UPPERS RODFA********************DARALNANUNS HEACH**************KAR  

****UPPPER MART************************GOOK***************************WELKONS WALEN CREEK*********KENBTO 

****BOANGINGAROTRARRESELDALTONN*******************GURARRULD*******************PELWURRA**************MER  

****BA*******************TAREEK************MPEY***************************MOINT************************* 

****WEENTAN****************************NUNT BRI**********BATHALBILLUNTIN****************************MA** 

****GOUTHTS*******************TARRAGO**********************COOPBA*********************KENS PARK********* 

****MOOD*****************KELE HELS MOOMERT***************************TANTAH*********************MAMBAN** 

****SWOUO*****************************MOANT SOOREW BAREELL*************************COREEN*************** 

****MARTH VALLOO****************************TINVA*************WHYS***********REEK************THUNDMOBRIN 

****COOOD**********************CAT SI