In [224]:
import random
import re

import numpy as np
import requests
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from bs4 import BeautifulSoup as BS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [119]:
# Site root; the href of each anchor on the index page is appended to it later.
BASE_URL = 'http://www.paulgraham.com/'
# Index page whose anchors are treated as essay links by the cells below.
ARTICLES_URL = BASE_URL + 'articles.html'

# Download the index page. requests applies no timeout by default, so an
# explicit one keeps an unresponsive server from hanging the kernel.
response = requests.get(ARTICLES_URL, timeout=30)
if response.status_code == 200:
    print('Fetched the articles page.')
else:
    print(f'Request failed with code {response.status_code}')

Fetched the articles page.


In [120]:
# Parse the index page, then report the number of anchors it contains,
# leaving the very first anchor out of the count.
pgsoup = BS(response.text, 'lxml')
article_anchors = pgsoup.select('a')[1:]
print(f"Total articles fetched: {len(article_anchors)}")

Total articles fetched: 218


In [121]:
# Build one URL per anchor on the index page. Anchors that carry no href are
# skipped: link.get('href') returns None for them, and concatenating None onto
# BASE_URL would raise a TypeError.
all_links = [
    BASE_URL + link.get('href')
    for link in pgsoup.find_all('a')
    if link.get('href')
]
success = 0
failure = 0
finished_requests = 0
all_pages = []
total_requests = len(all_links)
for finished_requests, link in enumerate(all_links, start=1):
    # A connection error or timeout on one link is counted as a failure
    # instead of aborting the whole crawl.
    try:
        rs = requests.get(link, timeout=30)
    except requests.RequestException as exc:
        print(f'Request to {link} raised {type(exc).__name__}: {exc}')
        rs = None
    if finished_requests % 10 == 0:
        print(f'Finished {finished_requests} out of {total_requests} requests')
    if rs is not None and rs.status_code == 200:
        success += 1
        # Keep the raw HTML; it is parsed in a later cell.
        all_pages.append(rs.text)
    else:
        failure += 1
print(f'total_success: {success}, total_failure: {failure}')

Finished 10 out of 219 requests
Finished 20 out of 219 requests
Finished 30 out of 219 requests
Finished 40 out of 219 requests
Finished 50 out of 219 requests
Finished 60 out of 219 requests
Finished 70 out of 219 requests
Finished 80 out of 219 requests
Finished 90 out of 219 requests
Finished 100 out of 219 requests
Finished 110 out of 219 requests
Finished 120 out of 219 requests
Finished 130 out of 219 requests
Finished 140 out of 219 requests
Finished 150 out of 219 requests
Finished 160 out of 219 requests
Finished 170 out of 219 requests
Finished 180 out of 219 requests
Finished 190 out of 219 requests
Finished 200 out of 219 requests
Finished 210 out of 219 requests
total_success: 217, total_failure: 2


In [122]:
# Total length of the downloaded HTML. len() counts characters, so the figure
# printed as "MB" is characters divided by 1024 * 1024.
total_page_chars = sum(len(page) for page in all_pages)
print(f'Approximately {float(total_page_chars) / (1024 * 1024):.2f} MB of data fetched')

Approximately 4.44 MB of data fetched


In [196]:

# Take the first <table> element of every downloaded page. The parser is named
# explicitly (the same one used for the index page) so BeautifulSoup neither
# warns about guessing nor picks a different parser on another machine.
essay_tables = [BS(page, 'lxml').find('table') for page in all_pages]

# Keep only pages that have a table with at least 500 characters of text, and
# normalise that text: apostrophes are removed and newlines become spaces.
essays = [
    table.text.replace("'", "").replace("\n", " ")
    for table in essay_tables
    if table is not None and len(table.text) >= 500
]


In [184]:
# Total length of the cleaned essay text. len() counts characters, so the
# figure printed as "MB" is characters divided by 1024 * 1024.
total_essay_chars = sum(len(essay) for essay in essays)
print(f'Approximately {float(total_essay_chars) / (1024 * 1024):.2f} MB of essays')


Approximately 2.81 MB of essays


In [159]:
# Name of the spaCy pipeline used for tokenisation.
SPACY_MODEL = 'en_core_web_sm'

# Download the pipeline only when it is not already installed, so re-running
# the notebook does not hit the network and reinstall the package every time.
if not spacy.util.is_package(SPACY_MODEL):
    spacy.cli.download(SPACY_MODEL)

# torchtext wrapper around the spaCy tokenizer: maps a string to a list of tokens.
tk = get_tokenizer('spacy', language=SPACY_MODEL)

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 12.6 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [197]:
# Tokenise every essay, discard its first token, and cut the first four
# characters off the token that is now in front.
# NOTE(review): presumably this strips a fixed header prefix left over from
# the page layout -- confirm against a sample essay.
tokens = []
for essay in essays:
    essay_tokens = tk(essay)[1:]
    essay_tokens[0] = essay_tokens[0][4:]
    tokens.append(essay_tokens)

In [206]:
def yield_tokens(essays):
    """Yield, for each text in ``essays``, the list of tokens ``tk`` produces for it."""
    for text in essays:
        yield tk(text)


# Reserve an explicit unknown-word entry and make it the default index, so a
# lookup of a token that never occurred in the essays returns the <unk> index
# instead of raising. Membership tests (`token in vocab`) are unaffected.
UNK_TOKEN = '<unk>'
vocab = build_vocab_from_iterator(yield_tokens(essays), specials=[UNK_TOKEN])
vocab.set_default_index(vocab[UNK_TOKEN])

In [230]:
# Number of components in each word vector.
EMBED_SIZE = 10
# Stand-alone embedding table with one row per vocabulary entry.
embeds = nn.Embedding(num_embeddings=len(vocab), embedding_dim=EMBED_SIZE)

In [231]:
# Show the embedding row for a single word: word -> vocabulary index ->
# one-element index tensor -> vector.
hello_idx = vocab['hello']
lookup_tensor = torch.tensor([hello_idx], dtype=torch.long)
embeds(lookup_tensor)

tensor([[-1.2555, -0.2704,  0.7193,  1.0877, -1.1312,  0.0713, -0.3876, -0.3571,
          1.2550, -0.1371]], grad_fn=<EmbeddingBackward0>)

In [259]:
# Number of preceding tokens used to predict the next one.
CONTEXT_SIZE = 3

# Flatten the per-essay token lists into one stream. Because the stream is
# continuous, an n-gram can straddle the boundary between two essays.
all_tokens = [token for essay_tokens in tokens for token in essay_tokens]

# Each example is (context, target). The context holds the CONTEXT_SIZE tokens
# that precede the target, ordered nearest-first (i-1, i-2, ...).
ngrams = [
    (
        [all_tokens[i - j - 1] for j in range(CONTEXT_SIZE)],
        all_tokens[i],
    )
    for i in range(CONTEXT_SIZE, len(all_tokens))
]

# Sequential 80% / 10% / 10% split into train / validation / test. Both
# boundaries used to be 0.8, which left the validation slice empty.
n1 = int(0.8 * len(ngrams))
n2 = int(0.9 * len(ngrams))
ngrams_train = ngrams[:n1]
ngrams_valid = ngrams[n1:n2]
ngrams_test = ngrams[n2:]
# The original, misspelled name is kept as an alias for any cell still using it.
ngrams_tes = ngrams_test

In [237]:
# Number of (context, target) examples in `ngrams`.
len(ngrams)

615966

In [257]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The context words are embedded, their vectors are concatenated, and the
    result goes through a 128-unit ReLU layer followed by a linear layer and a
    log-softmax over the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output classes.
        embedding_dim: size of each word vector.
        context_size: number of context words per example.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: integer tensor of word indices, either a single example
                (0-D or 1-D, as before) or a batch of shape
                ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)``; ``batch`` is 1 for a
            single example.
        """
        # A lone example is promoted to a batch of one, which reproduces the
        # former `.view((1, -1))` behaviour for 0-D and 1-D inputs.
        if inputs.dim() < 2:
            inputs = inputs.reshape(1, -1)
        # (batch, context, dim) -> (batch, context * dim): concatenate the
        # context vectors of each example without mixing examples together.
        embeds = self.embeddings(inputs).flatten(start_dim=1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

# Negative log-likelihood loss, applied to the log-probabilities the model returns.
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(
    vocab_size=len(vocab),
    embedding_dim=EMBED_SIZE,
    context_size=CONTEXT_SIZE,
)
# Plain SGD over every model parameter.
optimizer = optim.SGD(model.parameters(), lr=0.001)
# One summed training loss per epoch is appended to this list during training.
losses = []

In [246]:
# Calling the vocab on a list of words returns their indices; embed both at once.
pair_indices = torch.tensor(vocab(['I', 'am']))
embeds(pair_indices)

tensor([[-0.2855, -0.2556, -0.3491,  1.0564,  0.4563,  1.3726, -1.8844, -0.7771,
         -0.2747, -0.2610],
        [ 1.3154, -0.7622,  0.8609,  0.9889, -0.3409,  1.0949,  0.3211,  0.0279,
          1.4135,  0.8031]], grad_fn=<EmbeddingBackward0>)

In [None]:
# Number of passes over the training subset.
NUM_EPOCHS = 10
# Only the first TRAIN_SUBSET training n-grams are used.
TRAIN_SUBSET = 2000

# Turn the usable examples into index tensors once, up front: the word -> index
# mapping does not change between epochs, so repeating it every epoch is wasted
# work.
train_examples = []
for context, target in ngrams_train[:TRAIN_SUBSET]:
    # Skip an example if ANY of its words is missing from the vocabulary.
    # Checking only the target is not enough: when the vocabulary has no
    # default index, looking up an unknown context word raises.
    if target not in vocab or any(word not in vocab for word in context):
        continue
    train_examples.append((
        torch.tensor([vocab[word] for word in context], dtype=torch.long),
        torch.tensor([vocab[target]], dtype=torch.long),
    ))

for epoch in range(NUM_EPOCHS):
    total_loss = 0
    steps = 0
    for steps, (context_idxs, target_idx) in enumerate(train_examples, start=1):
        # Torch accumulates gradients, so clear the ones left over from the
        # previous example before processing a new one.
        model.zero_grad()

        # Forward pass: log-probabilities over the next word.
        log_probs = model(context_idxs)

        # Loss against the true next word.
        loss = loss_function(log_probs, target_idx)

        # Backward pass, then update the parameters.
        loss.backward()
        optimizer.step()

        # .item() extracts the Python number from the one-element loss tensor.
        total_loss += loss.item()
        if steps % 100 == 0:
            print(f'{steps} steps done.')
    # Summed (not averaged) loss over the examples of this epoch.
    print(total_loss)
    losses.append(total_loss)


100 steps done.
200 steps done.
300 steps done.
400 steps done.
500 steps done.
600 steps done.
700 steps done.
800 steps done.
900 steps done.
1000 steps done.
1100 steps done.
1200 steps done.
1300 steps done.
1400 steps done.
1500 steps done.
1600 steps done.
1700 steps done.
1800 steps done.
1900 steps done.
2000 steps done.
10372.334538988769
100 steps done.
200 steps done.
300 steps done.
400 steps done.
500 steps done.
600 steps done.
700 steps done.
800 steps done.
900 steps done.
1000 steps done.
1100 steps done.
1200 steps done.
1300 steps done.
1400 steps done.
1500 steps done.
1600 steps done.
1700 steps done.
1800 steps done.
1900 steps done.
2000 steps done.
10177.664019532502
100 steps done.
200 steps done.
300 steps done.
400 steps done.
500 steps done.
600 steps done.
700 steps done.
800 steps done.
900 steps done.
1000 steps done.
1100 steps done.
1200 steps done.
1300 steps done.
1400 steps done.
1500 steps done.
1600 steps done.
1700 steps done.
1800 steps done.
190

In [272]:
# Row of the model's embedding table for the word 'sad'.
sad_index = torch.tensor(vocab['sad'], dtype=torch.long)
model.embeddings(sad_index)

tensor([-1.5463, -0.0729, -1.7869,  0.2753, -0.0032,  0.6494, -0.2053, -0.3618,
        -0.2261,  1.1085], grad_fn=<EmbeddingBackward0>)