In [1]:
import numpy as np
import pandas as pd
import re, random
from tqdm import tqdm
import time

In [2]:
from bwv import *

# Initialize Training Data

#### Clean text

In [8]:
# Read the novel line by line and keep everything that precedes the first line
# containing the Project Gutenberg end-of-text marker; that line and whatever
# follows it are dropped.
# The encoding is given explicitly so decoding does not depend on the platform
# locale (assumes the file is UTF-8 -- TODO confirm for this copy of the text).
kept_lines = []
with open('data/frankenstein.txt', encoding='utf-8') as f:
    for line in f:
        if 'End of the Project Gutenberg' in line:
            break
        kept_lines.append(line)
# Join once at the end instead of growing a string with repeated `+=`.
raw_text = ''.join(kept_lines)

In [39]:
# Preview the first 100 characters of the text exactly as read from disk.
# Nothing has been cleaned at this point; per the original note, cleaning is
# done in BWV's __init__.
raw_text[:100]

'Frankenstein;\n\n\nor, the Modern Prometheus\n\n\n\n\nby\n\n\nMary Wollstonecraft (Godwin) Shelley\n\n\n\n\n\n\nCONTEN'

In [13]:
# Hyperparameters for the model, collected in one place so they are easy to scan.
bwv_settings = dict(
    m=50,                           # dimensionality of word embeddings
    tau=1.0,                        # prior precision
    gamma=0.7,                      # decay
    n_without_stochastic_update=5,
    vocab_size=20000,
    sample=0.001,                   # discard probability during training
)

# BWV expects the text as a list of documents, so the single novel is wrapped
# in a one-element list.
bwv = BWV([raw_text], **bwv_settings)

In [24]:
# Spot-check the processed corpus: entry 20 of bwv.corpus[0] (presumably the
# first and only document). The output below is a list of lower-cased word tokens.
bwv.corpus[0][20]

['my',
 'education',
 'was',
 'neglected',
 'yet',
 'i',
 'was',
 'passionately',
 'fond',
 'of',
 'reading']

In [37]:
# Call bwv.train() a fixed number of times. Each call emits its own progress
# bar and diagnostics (see the output below); the loop counter itself is unused.
epochs = 10
for epoch_idx in range(epochs):
    bwv.train()

100%|██████████| 7154/7154 [00:56<00:00, 127.59it/s]


6.7033436335104275
i
('my', (0.8939452654938163, 0.0053996843582574845))
('as', (0.8931884572906233, 0.008229069096445928))
('which', (0.8914001780482579, 0.00715768977735576))
('to', (0.882285267285531, 0.005365697450127906))
('me', (0.8726310113454073, 0.006593103929436099))


100%|██████████| 7154/7154 [01:05<00:00, 109.74it/s]


4.961012386484738
i
('my', (0.8739828936321659, 0.007300644340333341))
('to', (0.8664588411242059, 0.006936439874769696))
('should', (0.8629923140431768, 0.014587072986239976))
('for', (0.860130832817147, 0.009611553146975003))
('this', (0.8555225422779064, 0.011558626171014292))


100%|██████████| 7154/7154 [01:00<00:00, 117.90it/s]


4.2186061040209175
i
('but', (0.9103611945649952, 0.009209571130478121))
('to', (0.9038115820165493, 0.007266483703761167))
('have', (0.9027340090972306, 0.011525105340652429))
('my', (0.8971356209035777, 0.007702425394886703))
('should', (0.8896501536153981, 0.015719628681166307))


100%|██████████| 7154/7154 [01:02<00:00, 114.11it/s]


3.8793064735703053
i
('to', (0.9535319217000036, 0.006477483372904824))
('my', (0.9508872094881784, 0.007038906377372512))
('that', (0.9443306766045727, 0.008069496444729065))
('as', (0.9315101285399576, 0.010035998161841315))
('this', (0.9242596215553383, 0.01053803995925408))


100%|██████████| 7154/7154 [00:59<00:00, 507.80it/s]


3.5650991168892237
i
('that', (0.9621024012984499, 0.007459255406521707))
('my', (0.9601379699531287, 0.007343938111240671))
('to', (0.9594963780198998, 0.006450196132721288))
('but', (0.9580000340411636, 0.008560566994566534))
('now', (0.9457278909637348, 0.014393254017822899))


100%|██████████| 7154/7154 [01:01<00:00, 116.70it/s]


2.785005737817819
i
('that', (0.968959957671477, 0.007588746744068574))
('my', (0.9673621931425691, 0.007303702226941126))
('but', (0.95964945283522, 0.008621016140221323))
('to', (0.9580952362985768, 0.006303233738569153))
('them', (0.9462486572018276, 0.016839535108979252))


100%|██████████| 7154/7154 [00:55<00:00, 129.24it/s]


2.4353971913100225
i
('that', (0.9793515150579643, 0.007490678497196039))
('but', (0.9765252434681292, 0.008427540559696753))
('to', (0.9637342748362759, 0.006196268213427134))
('my', (0.9611794754115979, 0.0070480744464493945))
('as', (0.9596235465355305, 0.009393276297168598))


100%|██████████| 7154/7154 [00:57<00:00, 123.49it/s]


2.228999544579802
i
('that', (0.9822774860599697, 0.007422742465607627))
('but', (0.9795788570302898, 0.008473453043556069))
('my', (0.9677307805812579, 0.006771268172080873))
('to', (0.9653253971859264, 0.006176094865259565))
('as', (0.959216230331605, 0.009556874515813473))


100%|██████████| 7154/7154 [00:58<00:00, 121.83it/s]


2.1226618530358663
i
('that', (0.9873805550054718, 0.007426109895335396))
('but', (0.9786980312561118, 0.008621138453170939))
('my', (0.9784143160250026, 0.006690938143928287))
('to', (0.9737562905279421, 0.00622818694419941))
('as', (0.9634879181968117, 0.009537268633507312))


100%|██████████| 7154/7154 [00:56<00:00, 599.65it/s]


2.041517970905439
i
('that', (0.9867771097578886, 0.007420603211714717))
('to', (0.9807728042530476, 0.006272575575317925))
('my', (0.9755888226979269, 0.0066339058475623465))
('but', (0.9751172043099771, 0.008439433912940521))
('as', (0.965460522188013, 0.0094364719698533))


In [38]:
# Look up the vocabulary id of 'frankenstein' and query the trained model for
# the words most similar to it.
# Each result below has the form (word, (a, b)); presumably a is a similarity
# score and b an uncertainty estimate -- TODO confirm against BWV.most_similar.
bwv.most_similar(bwv.vocab_id['frankenstein'])

[('repair', (0.9114051265822207, 4.000606589045391)),
 ('generous', (0.9033787547572756, 1.3560567689126537)),
 ('pitied', (0.9031875421802058, 5.957182994912279)),
 ('believe', (0.9028013896238349, 0.31829570687076425)),
 ('even', (0.9020945477237103, 0.2050366327691989)),
 ('dear', (0.8971253941882256, 0.1682351455142113)),
 ('infatuation', (0.8959493075904278, 4.054429359757156)),
 ('faults', (0.8944437981614649, 4.621389582787896)),
 ('whom', (0.8934161702830831, 0.1799124765226602)),
 ('pity', (0.8924250192050187, 0.8806871300259771)),
 ('fastidious', (0.8904503994766294, 4.387444683248204)),
 ('self-devoted', (0.8894575877745767, 5.438041424898311)),
 ('prepossess', (0.888428602407585, 4.379274069744737)),
 ('forbid', (0.8883012952370507, 3.766554517463197)),
 ('favour', (0.8876408425784285, 1.512250121817028)),
 ('misdeed', (0.8875606467426327, 6.193639960008444)),
 ('innocent', (0.8859169282913336, 0.48574574896129474)),
 ('thank', (0.8847375702497426, 1.1938744999338822)),
 ('