In [1]:
import pandas as pd
import re
import torch
import time
from random import shuffle
import pickle

In [2]:
data = pd.read_json('data.json')[0]
data.head()

0    With Luke Perry gone, our idols are getting ol...
1    Science says you should be careful how you use...
2    Royal Family's new social media guidelines wan...
3    Facebook prepares to push Oculus VR headsets f...
4    Shazam gets unexpected help from Batman in new...
Name: 0, dtype: object

In [3]:
class Dictionary(object):
    def __init__(self):
        self.char2idx = {}
        self.idx2char = []

    def add_char(self, char):
        if char not in self.char2idx:
            self.idx2char.append(char)
            self.char2idx[char] = len(self.idx2char) - 1
        return self.char2idx[char]

    def __len__(self):
        return len(self.idx2char)

In [4]:
data = data.drop_duplicates()

In [5]:
dictry = Dictionary()
def fill_dict(title: str):
    chars = list(title)
    for char in chars:
        dictry.add_char(char)

In [6]:
data.apply(lambda title: fill_dict(title))
pass

In [7]:
print('~' in dictry.char2idx)
# '~' will denote the end of a sequence
dictry.add_char('~')

False


120

In [8]:
def tokenize(title):
    chars = list(title)
    chars.append('~')
    tokenized = list(map(lambda char: dictry.char2idx[char], chars))
    return tokenized
  
tokenizedData = data.apply(tokenize)

from sklearn.model_selection import train_test_split
trainData, valData = train_test_split(tokenizedData, test_size=0.01)

trainData = trainData.tolist()
valData = valData.tolist()

In [9]:
batch_size = 8
def construct_batch(source, i):
    begin = i * batch_size
    if begin >= len(source): return None, None
    end = min([len(source), begin + batch_size])
    vecs = source[begin:end]
    len_vec = max([len(title) for title in vecs])
    for vec in vecs:
        vec.extend([dictry.char2idx['~']] * (len_vec - len(vec)))
    x = [torch.tensor(vec, dtype=torch.long).to(device) for vec in vecs]
    y = [torch.tensor([*vec[1:], dictry.char2idx['~']], dtype=torch.long).to(device) for vec in vecs]
    return x, y

In [10]:
import torch.nn as nn
class LSTMModel(nn.Module):

    def __init__(self, ntoken, emsize, nhid, nlayers, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, emsize)
        self.lstm = nn.LSTM(emsize, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)
        self.nhid = nhid
        self.nlayers = nlayers

        self.init_weights()
        self.hidden = self.init_hidden()
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input):
        emb = self.drop(torch.cuda.FloatTensor(self.encoder(input)))
        output, _ = self.lstm(emb.view(len(input), 1, -1))
        output = self.drop(torch.cuda.FloatTensor(output))
        decoded = self.decoder(output.view(len(input), -1))
        
        return decoded

    def init_hidden(self):
        return (torch.zeros(self.nlayers, 1, self.nhid),
            torch.zeros(self.nlayers, 1, self.nhid))

In [11]:
def train(epoch):
    # Turn on training mode which enables dropout.
    model.train()
    
    print_after_batch_num = 100
    total_loss = 0.
    start_time = time.time()
    num_batches = len(trainData) // batch_size
    #num_batches = 101

    shuffle(trainData)
    
    for batch in range(0, num_batches):
        x, y = construct_batch(trainData, batch)
        
        for i in range(batch_size):
            model.zero_grad()
            output = model(x[i])
            loss = criterion(output, y[i])
            loss.backward()
            opt.step()
            
            total_loss += loss.item()

        if batch % print_after_batch_num == 0 and batch > 0:
            cur_loss = total_loss / (batch_size * print_after_batch_num)
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.2f}'.format(
                epoch, batch, num_batches,
                elapsed * 1000 / print_after_batch_num, cur_loss))
            total_loss = 0
    start_time = time.time()

In [12]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        num_batches = len(data_source) // batch_size
        for batch in range(0, num_batches):
            x, y = construct_batch(data_source, batch)
            for i in range(batch_size):
                output = model(x[i])
                total_loss += criterion(output, y[i])
    return total_loss / (len(data_source) - 1)

In [13]:
ntokens = len(dictry.idx2char)
nhid = 256
emsize = ntokens
nlayers = 1
dropout = 0.3
device = torch.device("cuda")
model = LSTMModel(ntokens, emsize, nhid, nlayers, dropout).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

  "num_layers={}".format(dropout, num_layers))


In [14]:
num_epochs = 5
best_val_loss = 10000
for epoch in range(0, num_epochs):
        epoch_start_time = time.time()
        train(epoch)
        val_loss = evaluate(valData)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} |'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if val_loss < best_val_loss:
            with open("lstm_char_level_model.tch", 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss

| epoch   0 |   100/13284 batches | ms/batch 133.35 | loss  2.36
| epoch   0 |   200/13284 batches | ms/batch 263.45 | loss  2.04
| epoch   0 |   300/13284 batches | ms/batch 390.66 | loss  1.96
| epoch   0 |   400/13284 batches | ms/batch 520.10 | loss  1.84
| epoch   0 |   500/13284 batches | ms/batch 648.68 | loss  1.83
| epoch   0 |   600/13284 batches | ms/batch 777.25 | loss  1.77
| epoch   0 |   700/13284 batches | ms/batch 906.43 | loss  1.72
| epoch   0 |   800/13284 batches | ms/batch 1035.53 | loss  1.68
| epoch   0 |   900/13284 batches | ms/batch 1165.80 | loss  1.65
| epoch   0 |  1000/13284 batches | ms/batch 1290.61 | loss  1.68
| epoch   0 |  1100/13284 batches | ms/batch 1422.34 | loss  1.61
| epoch   0 |  1200/13284 batches | ms/batch 1546.92 | loss  1.62
| epoch   0 |  1300/13284 batches | ms/batch 1674.25 | loss  1.62
| epoch   0 |  1400/13284 batches | ms/batch 1803.40 | loss  1.57
| epoch   0 |  1500/13284 batches | ms/batch 1934.24 | loss  1.56
| epoch   0 |  16

| epoch   0 | 12500/13284 batches | ms/batch 15832.03 | loss  1.36
| epoch   0 | 12600/13284 batches | ms/batch 15958.56 | loss  1.33
| epoch   0 | 12700/13284 batches | ms/batch 16086.60 | loss  1.34
| epoch   0 | 12800/13284 batches | ms/batch 16213.77 | loss  1.31
| epoch   0 | 12900/13284 batches | ms/batch 16341.26 | loss  1.38
| epoch   0 | 13000/13284 batches | ms/batch 16466.65 | loss  1.36
| epoch   0 | 13100/13284 batches | ms/batch 16595.31 | loss  1.33
| epoch   0 | 13200/13284 batches | ms/batch 16720.82 | loss  1.35
-----------------------------------------------------------------------------------------
| end of epoch   0 | time: 1689.42s | valid loss  1.20 |
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


| epoch   1 |   100/13284 batches | ms/batch 146.52 | loss  1.11
| epoch   1 |   200/13284 batches | ms/batch 292.42 | loss  1.11
| epoch   1 |   300/13284 batches | ms/batch 438.38 | loss  1.11
| epoch   1 |   400/13284 batches | ms/batch 582.20 | loss  1.12
| epoch   1 |   500/13284 batches | ms/batch 726.13 | loss  1.13
| epoch   1 |   600/13284 batches | ms/batch 872.81 | loss  1.09
| epoch   1 |   700/13284 batches | ms/batch 1015.71 | loss  1.13
| epoch   1 |   800/13284 batches | ms/batch 1160.31 | loss  1.12
| epoch   1 |   900/13284 batches | ms/batch 1305.10 | loss  1.10
| epoch   1 |  1000/13284 batches | ms/batch 1449.70 | loss  1.10
| epoch   1 |  1100/13284 batches | ms/batch 1596.80 | loss  1.12
| epoch   1 |  1200/13284 batches | ms/batch 1741.84 | loss  1.12
| epoch   1 |  1300/13284 batches | ms/batch 1887.31 | loss  1.10
| epoch   1 |  1400/13284 batches | ms/batch 2031.51 | loss  1.13
| epoch   1 |  1500/13284 batches | ms/batch 2177.72 | loss  1.11
| epoch   1 |  1

| epoch   1 | 12500/13284 batches | ms/batch 18053.99 | loss  1.10
| epoch   1 | 12600/13284 batches | ms/batch 18200.90 | loss  1.06
| epoch   1 | 12700/13284 batches | ms/batch 18344.55 | loss  1.09
| epoch   1 | 12800/13284 batches | ms/batch 18486.35 | loss  1.08
| epoch   1 | 12900/13284 batches | ms/batch 18629.35 | loss  1.07
| epoch   1 | 13000/13284 batches | ms/batch 18771.37 | loss  1.07
| epoch   1 | 13100/13284 batches | ms/batch 18914.64 | loss  1.09
| epoch   1 | 13200/13284 batches | ms/batch 19058.35 | loss  1.09
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 1923.49s | valid loss  1.16 |
-----------------------------------------------------------------------------------------
| epoch   2 |   100/13284 batches | ms/batch 163.43 | loss  0.93
| epoch   2 |   200/13284 batches | ms/batch 328.12 | loss  0.92
| epoch   2 |   300/13284 batches | ms/batch 491.52 | loss  0.92
| epoch   2 |   400/13284 batche

| epoch   2 | 11300/13284 batches | ms/batch 18286.47 | loss  0.92
| epoch   2 | 11400/13284 batches | ms/batch 18449.22 | loss  0.92
| epoch   2 | 11500/13284 batches | ms/batch 18612.25 | loss  0.90
| epoch   2 | 11600/13284 batches | ms/batch 18774.74 | loss  0.91
| epoch   2 | 11700/13284 batches | ms/batch 18936.29 | loss  0.90
| epoch   2 | 11800/13284 batches | ms/batch 19097.40 | loss  0.91
| epoch   2 | 11900/13284 batches | ms/batch 19260.76 | loss  0.91
| epoch   2 | 12000/13284 batches | ms/batch 19424.92 | loss  0.90
| epoch   2 | 12100/13284 batches | ms/batch 19584.05 | loss  0.93
| epoch   2 | 12200/13284 batches | ms/batch 19745.19 | loss  0.92
| epoch   2 | 12300/13284 batches | ms/batch 19906.71 | loss  0.88
| epoch   2 | 12400/13284 batches | ms/batch 20071.23 | loss  0.91
| epoch   2 | 12500/13284 batches | ms/batch 20232.94 | loss  0.90
| epoch   2 | 12600/13284 batches | ms/batch 20395.96 | loss  0.90
| epoch   2 | 12700/13284 batches | ms/batch 20555.45 | loss  

| epoch   3 | 10100/13284 batches | ms/batch 18136.74 | loss  0.82
| epoch   3 | 10200/13284 batches | ms/batch 18315.91 | loss  0.83
| epoch   3 | 10300/13284 batches | ms/batch 18495.05 | loss  0.81
| epoch   3 | 10400/13284 batches | ms/batch 18671.67 | loss  0.81
| epoch   3 | 10500/13284 batches | ms/batch 18848.50 | loss  0.83
| epoch   3 | 10600/13284 batches | ms/batch 19027.08 | loss  0.82
| epoch   3 | 10700/13284 batches | ms/batch 19204.87 | loss  0.81
| epoch   3 | 10800/13284 batches | ms/batch 19384.17 | loss  0.81
| epoch   3 | 10900/13284 batches | ms/batch 19563.94 | loss  0.82
| epoch   3 | 11000/13284 batches | ms/batch 19743.04 | loss  0.81
| epoch   3 | 11100/13284 batches | ms/batch 19922.59 | loss  0.82
| epoch   3 | 11200/13284 batches | ms/batch 20100.88 | loss  0.80
| epoch   3 | 11300/13284 batches | ms/batch 20279.74 | loss  0.81
| epoch   3 | 11400/13284 batches | ms/batch 20459.41 | loss  0.81
| epoch   3 | 11500/13284 batches | ms/batch 20638.15 | loss  

| epoch   4 |  8900/13284 batches | ms/batch 16574.08 | loss  0.77
| epoch   4 |  9000/13284 batches | ms/batch 16759.63 | loss  0.75
| epoch   4 |  9100/13284 batches | ms/batch 16943.75 | loss  0.76
| epoch   4 |  9200/13284 batches | ms/batch 17127.74 | loss  0.76
| epoch   4 |  9300/13284 batches | ms/batch 17313.51 | loss  0.77
| epoch   4 |  9400/13284 batches | ms/batch 17498.38 | loss  0.75
| epoch   4 |  9500/13284 batches | ms/batch 17683.66 | loss  0.74
| epoch   4 |  9600/13284 batches | ms/batch 17870.96 | loss  0.76
| epoch   4 |  9700/13284 batches | ms/batch 18058.63 | loss  0.77
| epoch   4 |  9800/13284 batches | ms/batch 18244.31 | loss  0.75
| epoch   4 |  9900/13284 batches | ms/batch 18429.66 | loss  0.76
| epoch   4 | 10000/13284 batches | ms/batch 18615.36 | loss  0.77
| epoch   4 | 10100/13284 batches | ms/batch 18802.16 | loss  0.76
| epoch   4 | 10200/13284 batches | ms/batch 18988.50 | loss  0.77
| epoch   4 | 10300/13284 batches | ms/batch 19176.03 | loss  

In [56]:
with open('preprocessed.pkl', 'wb') as output:
    pickle.dump([dictry, data, trainData, valData], output, pickle.HIGHEST_PROTOCOL)

In [19]:
with open('512_preprocessed.pkl', 'rb') as input:
    loaded = pickle.load(input)
    dictry = loaded[0]
    data = loaded[1]
    trainData = loaded[2]
    valData = loaded[3]
with open("512_lstm_char_level_model.tch", 'rb') as f:
    model = torch.load(f)

In [46]:
model.eval()
with torch.no_grad():
    initial_char = 'G'
    ch = initial_char
    title = ''
    i = 0
    while ch != '~':
        title += ch
        vecs = model(torch.tensor(list(map(lambda x: dictry.char2idx[x], list(title)))).to(device))
        ch = dictry.idx2char[torch.argmax(vecs[len(vecs) - 1])]
        i += 1
    print(title)
    print()

Google shows to be a star and star and star and star and star service

