In [1]:
#In this notebook i play with a sample i found before.
#this pytorch sample creates text from logits a random   probable item at next iteration 
#choice is not totally random nor totally persistent
#result of torch.nn.functional.softmax is given to choice as parameter so it chooses a semi random item
#method return logits without applying any transformation

In [2]:
import argparse
import torch
from torch import nn
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
import pandas as pd
from collections import Counter

In [3]:
train_df = pd.read_csv('D:/data7/reddit-cleanjokes.csv')
train_df.head(10)

Unnamed: 0,ID,Joke
0,1,What did the bartender say to the jumper cable...
1,2,Don't you hate jokes about German sausage? The...
2,3,Two artists had an art contest... It ended in ...
3,4,Why did the chicken cross the playground? To g...
4,5,What gun do you use to hunt a moose? A moosecut!
5,6,"If life gives you melons, you might have dysle..."
6,7,Broken pencils... ...are pointless.
7,8,What did one snowman say to the other snowman?...
8,9,How many hipsters does it take to change a lig...
9,10,Where do sick boats go? The dock!


In [4]:
train_df['Joke'].str.cat(sep=' ')
train_df.head(10)

Unnamed: 0,ID,Joke
0,1,What did the bartender say to the jumper cable...
1,2,Don't you hate jokes about German sausage? The...
2,3,Two artists had an art contest... It ended in ...
3,4,Why did the chicken cross the playground? To g...
4,5,What gun do you use to hunt a moose? A moosecut!
5,6,"If life gives you melons, you might have dysle..."
6,7,Broken pencils... ...are pointless.
7,8,What did one snowman say to the other snowman?...
8,9,How many hipsters does it take to change a lig...
9,10,Where do sick boats go? The dock!


In [5]:
class StepLogger():
    def __init__(self,capacity):
        self.tensor_datas = {}        
        self.capacity = capacity
        self.added_labels = []
        
    
    def add_info(self,tensor_data,tensor_label):
        if tensor_label not in self.added_labels:
            self.added_labels.append( tensor_label )
        
        if tensor_label in self.tensor_datas.keys():
            current_arr = self.tensor_datas.get(tensor_label)
            if len(current_arr) < self.capacity:
                current_arr = self.tensor_datas.get(tensor_label, [])
                current_arr.append(tensor_data)
        else:
            self.tensor_datas[tensor_label] = [tensor_data]
    
    def get_default_summary(self,show_data=False,summary_count=1):
        self.get_summary(self.added_labels,show_data,summary_count)
        
    def get_summary(self,labels,show_data=False,summary_count=1):
        print("summary_count",summary_count,"   self.capacity ",self.capacity)
        count = 0
        for i in range(summary_count):
            print(i," ------------------------------------------------")
            for l in labels:
                label_data = self.tensor_datas.get(l)[i]
                print(l)
                if torch.is_tensor(label_data):
                    print( list(label_data.size() ) )
                if not show_data and not torch.is_tensor(label_data):
                    print(label_data)
                if show_data:    
                    print(label_data)

In [6]:
class Model(nn.Module):
    def __init__(self, dataset):
        super(Model, self).__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        step_logger.add_info(x,"forward x")
        
        embed = self.embedding(x)
        step_logger.add_info(embed,"forward embed")
        
        output, state = self.lstm(embed, prev_state)
        step_logger.add_info(output,"forward output")
        step_logger.add_info(state,"forward state")
        
        logits = self.fc(output)
        step_logger.add_info(logits,"forward logits")

        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size),
                torch.zeros(self.num_layers, sequence_length, self.lstm_size))

In [7]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self,sequence_length,):
        self.sequence_length = sequence_length
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        train_df = pd.read_csv('D:/data7/reddit-cleanjokes.csv')
        text = train_df['Joke'].str.cat(sep=' ')
        return text.split(' ')

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+sequence_length+1]),
        )

In [44]:
def train(dataset, model, batch_size,sequence_length,max_epochs):
    model.train()

    dataloader = DataLoader(dataset,batch_size=batch_size,)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        state_h, state_c = model.init_state(sequence_length)

        for batch, (x, y) in enumerate(dataloader):

            step_logger.add_info(x,"epoch x")
            step_logger.add_info(y,"epoch y")
            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            step_logger.add_info(y_pred,"model y_pred")
            step_logger.add_info(state_h,"model state_h")
            step_logger.add_info(state_c,"model state_c")
            
            loss = criterion(y_pred.transpose(1, 2), y)
            step_logger.add_info(loss,"model loss")

            state_h = state_h.detach()
            state_c = state_c.detach()
            
            step_logger.add_info(state_h,"model state_h")
            step_logger.add_info(state_c,"model state_c")

            loss.backward()
            optimizer.step()

            print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })

def predict(dataset, model, text, next_words=100,use_max_possible=False):
    words = text.split(' ')
    model.eval()

    state_h, state_c = model.init_state(len(words))

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]])
        step_logger.add_info(x,"predict x")
        
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))
        step_logger.add_info(y_pred,"predict y_pred")
        step_logger.add_info(state_h,"predict state_h")
        step_logger.add_info(state_c,"predict state_c")

        last_word_logits = y_pred[0][-1]
        step_logger.add_info(last_word_logits,"predict last_word_logits")
        
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        step_logger.add_info(p,"predict p")

        
        word_index = np.random.choice(len(last_word_logits), p=p)
        step_logger.add_info(word_index,"predict word_index")
        if use_max_possible :
            word_index = np.argmax(p)
            
        
        words.append(dataset.index_to_word[word_index])
        step_logger.add_info(dataset.index_to_word[word_index],"predict dataset.index_to_word[word_index]")

    return words

In [None]:
step_logger =  StepLogger(2)

sequence_length = 4
dataset = Dataset(sequence_length)
model = Model(dataset)

In [14]:
print(model)

Model(
  (embedding): Embedding(6925, 128)
  (lstm): LSTM(128, 128, num_layers=3, dropout=0.2)
  (fc): Linear(in_features=128, out_features=6925, bias=True)
)


In [51]:
dataloader = DataLoader(dataset,batch_size=4)
for batch, (x, y) in enumerate(dataloader):
    if batch < 4:
        print("x", x)
        print("y", y)
        print("")

x tensor([[  2,   8,   0, 248],
        [  8,   0, 248,  20],
        [  0, 248,  20,   4],
        [248,  20,   4,   0]])
y tensor([[   8,    0,  248,   20],
        [   0,  248,   20,    4],
        [ 248,   20,    4,    0],
        [  20,    4,    0, 1905]])

x tensor([[  20,    4,    0, 1905],
        [   4,    0, 1905, 1906],
        [   0, 1905, 1906,   64],
        [1905, 1906,   64,  534]])
y tensor([[   4,    0, 1905, 1906],
        [   0, 1905, 1906,   64],
        [1905, 1906,   64,  534],
        [1906,   64,  534,   73]])

x tensor([[1906,   64,  534,   73],
        [  64,  534,   73,  535],
        [ 534,   73,  535,    4],
        [  73,  535,    4, 1907]])
y tensor([[  64,  534,   73,  535],
        [ 534,   73,  535,    4],
        [  73,  535,    4, 1907],
        [ 535,    4, 1907, 1908]])

x tensor([[ 535,    4, 1907, 1908],
        [   4, 1907, 1908,  225],
        [1907, 1908,  225,    3],
        [1908,  225,    3,  226]])
y tensor([[   4, 1907, 1908,  225],
    

In [9]:


train(dataset, model, batch_size=256,sequence_length=4,max_epochs=5)


{'epoch': 0, 'batch': 0, 'loss': 8.852972030639648}
{'epoch': 0, 'batch': 1, 'loss': 8.851600646972656}
{'epoch': 0, 'batch': 2, 'loss': 8.838411331176758}
{'epoch': 0, 'batch': 3, 'loss': 8.834159851074219}
{'epoch': 0, 'batch': 4, 'loss': 8.8235445022583}
{'epoch': 0, 'batch': 5, 'loss': 8.825240135192871}
{'epoch': 0, 'batch': 6, 'loss': 8.818626403808594}
{'epoch': 0, 'batch': 7, 'loss': 8.799959182739258}
{'epoch': 0, 'batch': 8, 'loss': 8.786689758300781}
{'epoch': 0, 'batch': 9, 'loss': 8.756302833557129}
{'epoch': 0, 'batch': 10, 'loss': 8.71680736541748}
{'epoch': 0, 'batch': 11, 'loss': 8.64470386505127}
{'epoch': 0, 'batch': 12, 'loss': 8.556174278259277}
{'epoch': 0, 'batch': 13, 'loss': 8.465197563171387}
{'epoch': 0, 'batch': 14, 'loss': 8.262255668640137}
{'epoch': 0, 'batch': 15, 'loss': 8.12722110748291}
{'epoch': 0, 'batch': 16, 'loss': 7.903187274932861}
{'epoch': 0, 'batch': 17, 'loss': 7.840312957763672}
{'epoch': 0, 'batch': 18, 'loss': 7.749034881591797}
{'epoch'

['Knock', 'knock.', 'Whos', 'there?', 'strip?', 'work?', 'type', 'a', 'chickens?', 'like', "can't", 'college', 'TV', 'LED', 'What', 'you', 'a', 'corner.', 'Bros.', 'rows', 'vote', 'an', 'major', 'to', 'a', 'Aonther', 'named', 'E,', 'I', 'native', 'I', 'Karl', 'do', 'did', 'four', 'be', 'dino', 'want', 'be', 'the', '-so', '"Kelp', 'were', 'felt', 'gold', 'disappointed.', 'if', 'the', 'tissues?', 'and', "What's", 'would', 'Vulgar', 'who?\'"', 'from', 'with', 'teeth!', 'moving', 'woof,', 'brand', 'How', 'do', 'you', 'the', '/r/cleanjokes', 'instead', 'had', 'fitness', 'many', 'sentence', 'I', 'would', 'hallucination?', 'Vector', 'She', 'Chucky.', 'think', 'Thai', 'programming', 'elephant', 'more', 'funny.', 'a', 'plant', 'Who', 'doors?', 'SC', 'Farts.', 'two', 'up', 'saying,', 'like?"', 'What', 'do', 'cow', 'What', 'is', 'there?', "Kellog's", 'a', '"Stay', 'dogs', 'remembered', 'their']


In [11]:
step_logger.get_default_summary(False)

summary_count 1    self.capacity  2
0  ------------------------------------------------
epoch x
[256, 4]
epoch y
[256, 4]
forward x
[256, 4]
forward embed
[256, 4, 128]
forward output
[256, 4, 128]
forward state
(tensor([[[ 0.0896, -0.0261,  0.0599,  ..., -0.1363,  0.3841,  0.0164],
         [ 0.1657, -0.1122,  0.0972,  ...,  0.0735,  0.0454,  0.0112],
         [ 0.1641, -0.0294,  0.1691,  ..., -0.1587,  0.0311, -0.1640],
         [ 0.0763, -0.0298,  0.1172,  ..., -0.2956,  0.1862, -0.0689]],

        [[ 0.0805,  0.0146,  0.0583,  ...,  0.0585, -0.0125,  0.0429],
         [ 0.0648,  0.0095,  0.0372,  ...,  0.0621, -0.0077,  0.0078],
         [ 0.0726, -0.0410,  0.0234,  ...,  0.0719, -0.0014, -0.0121],
         [ 0.0669, -0.0242,  0.0345,  ...,  0.0983,  0.0069,  0.0097]],

        [[-0.0025,  0.0107,  0.0083,  ..., -0.0345,  0.0055, -0.0223],
         [-0.0178,  0.0065,  0.0159,  ..., -0.0346,  0.0046, -0.0082],
         [-0.0117,  0.0136,  0.0143,  ..., -0.0329, -0.0005, -0.0131],
  

In [13]:
step_logger.get_default_summary(True)

summary_count 1    self.capacity  2
0  ------------------------------------------------
epoch x
[256, 4]
tensor([[   2,    8,    0,  248],
        [   8,    0,  248,   20],
        [   0,  248,   20,    4],
        ...,
        [ 105, 1959,   32,    4],
        [1959,   32,    4,   20],
        [  32,    4,   20,   25]])
epoch y
[256, 4]
tensor([[   8,    0,  248,   20],
        [   0,  248,   20,    4],
        [ 248,   20,    4,    0],
        ...,
        [1959,   32,    4,   20],
        [  32,    4,   20,   25],
        [   4,   20,   25, 1960]])
forward x
[256, 4]
tensor([[   2,    8,    0,  248],
        [   8,    0,  248,   20],
        [   0,  248,   20,    4],
        ...,
        [ 105, 1959,   32,    4],
        [1959,   32,    4,   20],
        [  32,    4,   20,   25]])
forward embed
[256, 4, 128]
tensor([[[-0.7877, -1.1969, -0.6383,  ...,  0.6313,  0.0760,  0.2087],
         [-0.1420,  0.3932,  0.4376,  ..., -0.5294, -0.7462,  0.3916],
         [-0.7515, -0.5439, -0.3029

In [46]:
#'Knock knock. Whos there?'
def predict_for_word(index):
    sentence = train_df.iloc[index][1]
    half_len= len(sentence.split( " ")) //2
    half_sentence = " ".join(sentence.split( " ")[0:half_len]) + " "

    print("original :",sentence)
    print("predicted :",predict(dataset, model, text=half_sentence,next_words=half_len))
    print("predicted with max :",predict(dataset, model, text=half_sentence,next_words=half_len,use_max_possible=True))
    
    print(" - ")


for i in range(5):
    predict_for_word(i)

original : What did the bartender say to the jumper cables? You better not try to start anything.
predicted : ['What', 'did', 'the', 'bartender', 'say', 'to', 'the', 'jumper', '', 'Houdini', 'Park', 'mama', 'knock', 'to', 'escaped', 'they', 'atoms']
predicted with max : ['What', 'did', 'the', 'bartender', 'say', 'to', 'the', 'jumper', '', 'and', 'the', 'the', 'What', 'do', 'the', 'the', 'What']
 - 
original : Don't you hate jokes about German sausage? They're the wurst!
predicted : ["Don't", 'you', 'hate', 'jokes', 'about', '', 'pretend', 'local', 'out;', 'electrical', 'as']
predicted with max : ["Don't", 'you', 'hate', 'jokes', 'about', '', 'and', 'the', 'the', 'What', 'do']
 - 
original : Two artists had an art contest... It ended in a draw
predicted : ['Two', 'artists', 'had', 'an', 'art', '', 'why', 'hose', 'aircraft.', 'mummies?', 'hotel...']
predicted with max : ['Two', 'artists', 'had', 'an', 'art', '', 'and', 'the', 'the', 'What', 'do']
 - 
original : Why did the chicken cross 