In [1]:
from bs4 import BeautifulSoup as BS
import numpy as np
import requests
import spacy
import random
import re
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

ModuleNotFoundError: No module named 'spacy'

In [366]:
# Fetching all essay webpages

BASE_URL = 'http://www.paulgraham.com/'
ARTICLES_URL = BASE_URL + 'articles.html'
BASE_URL
response = requests.get(ARTICLES_URL)
if response.status_code == 200:
    print('Fetched the articles page.')
else:
    print(f'Request failed with code {response.status_code}')

pgsoup = BS(response.text, 'lxml')
print(f"Total articles fetched: {len(pgsoup.select('a')[1:])}")

all_links = [BASE_URL + link.get('href') for link in pgsoup.find_all('a')]
success = 0
failure = 0
finished_requests = 0
all_pages = []
total_requests = len(all_links)
for link in all_links:
    finished_requests = finished_requests + 1
    rs = requests.get(link)
    if finished_requests % 10 == 0:
        print(f'Finished {finished_requests} out of {total_requests} requests')
    if rs.status_code == 200:
        success = success + 1
        all_pages.append(rs.text)
    else: 
        failure = failure + 1
print(f'total_success: {success}, total_failure: {failure}')
print(f'Approximately {float(sum([len(t) for t in all_pages])) / (1024 * 1024):.2f} MB of data fetched')

Fetched the articles page.
Total articles fetched: 218
Finished 10 out of 219 requests
Finished 20 out of 219 requests
Finished 30 out of 219 requests
Finished 40 out of 219 requests
Finished 50 out of 219 requests
Finished 60 out of 219 requests
Finished 70 out of 219 requests
Finished 80 out of 219 requests
Finished 90 out of 219 requests
Finished 100 out of 219 requests
Finished 110 out of 219 requests
Finished 120 out of 219 requests
Finished 130 out of 219 requests
Finished 140 out of 219 requests
Finished 150 out of 219 requests
Finished 160 out of 219 requests
Finished 170 out of 219 requests
Finished 180 out of 219 requests
Finished 190 out of 219 requests
Finished 200 out of 219 requests
Finished 210 out of 219 requests
total_success: 217, total_failure: 2


In [367]:
spacy.cli.download('en_core_web_sm')
tk = get_tokenizer('spacy', language='en_core_web_sm')

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 20.0 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [368]:
# Getting a list of essays and tokens
essays = [BS(p).find('table') for p in all_pages]
essays = [e.text.replace("\'", "").replace("\n", " ") for e in essays if (e is not None and len(e.text) >= 500)]
tokens = [tk(e)[1:] for e in essays] 
for t in tokens:
    t[0] = t[0][4:]
print(f'Approximately {float(sum([len(e) for e in essays])) / (1024 * 1024):.2f} MB of essays')

Approximately 2.81 MB of essays


In [373]:
# Building vocab
def yield_tokens(essays):
    for text in essays:
        yield tk(text)
vocab = build_vocab_from_iterator(yield_tokens(essays), specials=["<unk>"])

In [376]:
# Making n-grams
EMBED_SIZE = 10
CONTEXT_SIZE = 3
all_tokens = []
for ts in tokens:
    for t in ts:
        all_tokens.append(t)
ngrams = [
    (
        [all_tokens[i - j - 1] for j in range(CONTEXT_SIZE)],
        all_tokens[i]
    )
    for i in range(CONTEXT_SIZE, len(all_tokens))
]

n1 = int(0.8 * len(ngrams))
n2 = int(0.9 * len(ngrams))
ngrams_train = ngrams[0:n1]
ngrams_valid = ngrams[n1:n2]
ngrams_test = ngrams[n2:]
print(f'Training size: {len(ngrams_train)}')
print(f'Validation size: {len(ngrams_valid)}')
print(f'Testing size: {len(ngrams_test)}')

Training size: 492772
Validation size: 61597
Testing size: 61597


In [384]:
# Setup NN model
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBED_SIZE, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [407]:
for epoch in range(1):
    total_loss = 0
    steps = 0
    for context, target in ngrams_train[0:1000]:
        not_found = False 
        if target not in vocab:
            not_found = True
        for c in context:
            if c not in vocab:
                not_found = True
        if not_found:
            continue
        steps = steps + 1
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([vocab[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([vocab[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        #loss.backward()
        #optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        if steps % 100 == 0:
            print(f'{steps} steps done.')
            print(f'average loss: {total_loss / steps:.2f}')
    print(total_loss)
    losses.append(total_loss)


100 steps done.
average loss: 6.65
200 steps done.
average loss: 6.79
300 steps done.
average loss: 6.83
400 steps done.
average loss: 6.79
500 steps done.
average loss: 6.74
600 steps done.
average loss: 6.74
700 steps done.
average loss: 6.85
800 steps done.
average loss: 6.95
900 steps done.
average loss: 7.01
1000 steps done.
average loss: 7.12
1100 steps done.
average loss: 7.16
1200 steps done.
average loss: 7.08
1300 steps done.
average loss: 7.06
1400 steps done.
average loss: 7.06
1500 steps done.
average loss: 7.21
1600 steps done.
average loss: 7.34
1700 steps done.
average loss: 7.47
1800 steps done.
average loss: 7.50
1900 steps done.
average loss: 7.43
2000 steps done.
average loss: 7.40
2100 steps done.
average loss: 7.34
2200 steps done.
average loss: 7.38
2300 steps done.
average loss: 7.43
2400 steps done.
average loss: 7.43
2500 steps done.
average loss: 7.41
2600 steps done.
average loss: 7.39
2700 steps done.
average loss: 7.37
2800 steps done.
average loss: 7.37
2

In [416]:
def predict_next(sentence):
    words = sentence.split(' ')
    if len(words) != 3:
        return ''
    input = torch.tensor(vocab(words)[::-1], dtype=torch.long)
    logit = model(input)
    return vocab.lookup_tokens([torch.argmax(torch.exp(logit))])[0]
    

In [443]:
predict_next('be a chance')

'to'

In [438]:
ngrams_train

[(['some', 'are', 'There'], 'kinds'),
 (['kinds', 'some', 'are'], 'of'),
 (['of', 'kinds', 'some'], 'work'),
 (['work', 'of', 'kinds'], 'that'),
 (['that', 'work', 'of'], 'you'),
 (['you', 'that', 'work'], 'ca'),
 (['ca', 'you', 'that'], 'nt'),
 (['nt', 'ca', 'you'], 'do'),
 (['do', 'nt', 'ca'], 'well'),
 (['well', 'do', 'nt'], 'without'),
 (['without', 'well', 'do'], 'thinking'),
 (['thinking', 'without', 'well'], 'differently'),
 (['differently', 'thinking', 'without'], 'from'),
 (['from', 'differently', 'thinking'], 'your'),
 (['your', 'from', 'differently'], 'peers'),
 (['peers', 'your', 'from'], '.'),
 (['.', 'peers', 'your'], 'To'),
 (['To', '.', 'peers'], 'be'),
 (['be', 'To', '.'], 'a'),
 (['a', 'be', 'To'], 'successful'),
 (['successful', 'a', 'be'], 'scientist'),
 (['scientist', 'successful', 'a'], ','),
 ([',', 'scientist', 'successful'], 'for'),
 (['for', ',', 'scientist'], 'example'),
 (['example', 'for', ','], ','),
 ([',', 'example', 'for'], 'its'),
 (['its', ',', 'examp