- all the input data has to be lowercase for the embeddings, it does not recoginize "The"
- only 20 unique sentences, they go in a loop.
- construct your own dataset from some book


In [1]:
# download embeddings:
import torchtext.vocab as vocab

def load_glove_embeddings(dim):
    # Loads the specified GloVe embeddings
    glove = vocab.GloVe(name='6B', dim=dim)
    return glove

# Specify the desired dimension
embed_dim = 300  # For 100-dimensional embeddings
glove_embeddings = load_glove_embeddings(embed_dim)

# Now, `glove_embeddings` holds the loaded embeddings
print(f"Loaded {len(glove_embeddings.stoi)} words with {embed_dim}-dimensional embeddings.")


Loaded 400000 words with 300-dimensional embeddings.


In [151]:
import json
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')  # Ensure the punkt tokenizer is downloaded

def create_dataset_from_file(input_file, output_file, window_size=9, step_size=1):
    # Read the text from file
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Generate data points
    data = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > window_size:
            for i in range(0, len(words) - window_size, step_size):
                current_sequence = " ".join(words[i:i + window_size])
                next_word = words[i + window_size]
                data.append({"sentence": current_sequence, "next_word": next_word})
    
    # Save to JSON file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump({"data": data}, outfile, indent=4, ensure_ascii=False)

# Set the path to your .txt file and the output file
input_file_path = './frankenstein.txt'
output_file_path = './frankenstein_dataset.json'

# Create the dataset
create_dataset_from_file(input_file_path, output_file_path, window_size=7)

[nltk_data] Downloading package punkt to /home/piragi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [160]:
import pandas as pd
import json

# Load JSON data
with open('./frankenstein_dataset.json', 'r') as file:
    data = json.load(file)

# Create DataFrame
df = pd.DataFrame(data['data'])

# Get the distribution of 'next_word'
next_word_distribution = df['next_word'].value_counts()

# Print the distribution
print(next_word_distribution.head(25))

# If you want to see the distribution in percentage
next_word_percentage = df['next_word'].value_counts(normalize=True) * 100
print(next_word_percentage.head(15))

next_word
the      3035
and      2335
of       2195
to       1664
I        1515
my       1252
a        1001
in        839
that      752
was       551
with      493
which     480
had       457
but       453
his       428
me        405
as        372
for       367
by        349
he        333
on        327
from      314
not       312
it        274
you       256
Name: count, dtype: int64
next_word
the      5.590451
and      4.301055
of       4.043176
to       3.065078
I        2.790621
my       2.306176
a        1.843836
in       1.545433
that     1.385179
was      1.014939
with     0.908103
which    0.884157
had      0.841791
but      0.834423
his      0.788373
Name: proportion, dtype: float64


In [71]:
import pandas as pd

# Enhanced dataset with more variety
sentences = [
    "The cat sat on the", "The dog ran around the", "The sun was shining in the",
    "The baby laughed at the", "The teacher wrote on the", "A car drove over the",
    "He opened the", "She closed the", "The player won the", "The artist drew a",
    "The girl danced in the", "The boy played in the", "The wind blew through the",
    "The author discussed the", "The scientist discovered a", "The historian studied the",
    "The chef cooked a", "The farmer planted a", "The journalist wrote about the",
    "The programmer debugged the"
]
next_words = [
    "mat", "block", "sky", "toy", "board", "bridge", "door", "window", "game", "portrait",
    "room", "yard", "trees", "topic", "method", "artifact", "dish", "seed", "event", "program"
]

df = pd.DataFrame({"sentence": sentences, "next_word": next_words})


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
# Load GloVe Embeddings
glove_file = "./.vector_cache/glove.6B.300d.txt"
glove_embeddings = {}
with open(glove_file, "r") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = torch.tensor([float(x) for x in values[1:]], dtype=torch.float32)
        glove_embeddings[word] = vector

In [62]:
import pandas as pd

df_json = pd.read_json('./frankenstein_dataset.json')
# turn into dataframe make the right columns
df = pd.DataFrame( columns=['sentence', 'next_word'])
for i in range(len(df_json['data'])):
    # no append
    df.loc[i] = [df_json['data'][i]['sentence'].lower(), df_json['data'][i]['next_word'].lower()]
'''
This is only an example. Change as you see fit. Right now there is not split between train and test data. You will have to implement that.
'''    
# Assuming glove_embeddings has been loaded as shown earlier
def sentence_to_embedding(sentence):
    words = sentence.split()
    embeddings = [glove_embeddings.get(word, torch.zeros(300, dtype=torch.float32)) for word in words]
    return torch.stack(embeddings)

class SentenceDataset(Dataset):
    def __init__(self, df):
        self.df = df
        next_words = df["next_word"].tolist()
        self.word_to_idx = {word: i for i, word in enumerate(sorted(set(next_words)))}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sentence = self.df.iloc[idx, 0] 
        next_word = self.df.iloc[idx, 1] 
        embedding = sentence_to_embedding(sentence)
        next_word_idx = self.word_to_idx[next_word]
        return {"sentence": embedding, "next_word": next_word_idx}

In [43]:
# shuffle data randomly
# split into 0.7, 0.15, 0.15
# evaluate using k-fold cross evaluation

#randomized_df = df.sample(frac=1, random_state=69).reset_index(drop=True)
data_len = len(df_json['data'])
train_split = int(data_len * 0.7)
test_split = train_split + int(data_len * 0.15)

train_df = df.iloc[:train_split]
test_df = df.iloc[train_split:test_split]
val_df = df.iloc[test_split:]

train_dataset = SentenceDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = SentenceDataset(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)
val_dataset = SentenceDataset(val_df)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True)

In [82]:
print(test_df.iloc[0, 1])
print(test_df.head(1))
print(test_dataset[0]['sentence'])
print(sentence_to_embedding(test_df.iloc[0,0]))

its
                                 sentence next_word
21508  edinburgh, its romantic castle and       its
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.3251,  0.0184, -0.0720,  ..., -0.1204, -0.2595, -0.4693],
        [-0.2959, -0.7609,  0.1986,  ...,  0.3912, -0.1467,  0.0756],
        [-0.1766, -0.1067,  0.0927,  ...,  0.6462, -0.2600,  0.5391],
        [ 0.0385, -0.0398,  0.0827,  ..., -0.3343,  0.0118,  0.0597]])
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.3251,  0.0184, -0.0720,  ..., -0.1204, -0.2595, -0.4693],
        [-0.2959, -0.7609,  0.1986,  ...,  0.3912, -0.1467,  0.0756],
        [-0.1766, -0.1067,  0.0927,  ...,  0.6462, -0.2600,  0.5391],
        [ 0.0385, -0.0398,  0.0827,  ..., -0.3343,  0.0118,  0.0597]])


RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [52]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim  # Save hidden_dim as an instance variable
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, num_layers=1)
        # self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, h0=None, c0=None):
        # Initialize hidden and cell states with dimensions: (num_layers, batch_size, hidden_dim)
        if h0 is None or c0 is None:
            h0 = torch.zeros(1, x.size(0), self.hidden_dim)
            c0 = torch.zeros(1, x.size(0), self.hidden_dim)
        # Forward pass through LSTM layer
        out, (hn,cn) = self.lstm(x, (h0, c0))
        # Pass the output of the last time step to the fully connected layer
        out = self.fc(out[:, -1, :])
        return out, (hn,cn)


output_dim = len(train_dataset.word_to_idx)  # Number of unique next words
model = LSTMModel(input_dim=300, hidden_dim=128, output_dim=output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

def calculate_perplexity(loss):
    return torch.exp(loss)

# Training Loop with Perplexity Calculation
for epoch in range(5):
    total_loss = 0
    train_accuracy = 0
    model.train()
    for batch in train_dataloader:
        sentences = batch["sentence"]
        next_words = batch["next_word"]
        optimizer.zero_grad()
        outputs, _ = model(sentences)
        loss = criterion(outputs, next_words)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        accuracy = 0
        for prediction, target in zip(outputs.argmax(1), next_words):
            if prediction == target:
                accuracy += 1
        train_accuracy += accuracy
    avg_loss = total_loss / len(train_dataloader.dataset)
    train_accuracy = (train_accuracy / len(train_dataloader.dataset)) * 100
    perplexity = calculate_perplexity(torch.tensor(avg_loss))

    model.eval()
    test_total_loss = 0
    test_accuracy = 0
    for test_batch in test_dataloader:
        sentences = test_batch["sentence"]
        next_words = test_batch["next_word"]
        outputs, _ = model(sentences)
        loss = criterion(outputs, next_words)
        test_total_loss += loss.item()
        accuracy = 0
        for prediction, target in zip(outputs.argmax(1), next_words):
            if prediction == target:
                accuracy += 1
        test_accuracy += accuracy
    test_avg_loss = test_total_loss / len(test_dataloader.dataset)
    test_accuracy = (test_accuracy / len(test_dataloader.dataset)) * 100

    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}, Perplexity: {perplexity.item():.4f}, Test Loss: {test_avg_loss:.4f}, Test Accuracy: {test_accuracy:.2f}% Training Accuracy: {train_accuracy:.2f}%")
    


Epoch 1, Loss: 0.2084, Perplexity: 1.2317, Test Loss: 0.3138, Test Accuracy: 0.02% Training Accuracy: 10.99%
Epoch 2, Loss: 0.1791, Perplexity: 1.1961, Test Loss: 0.3419, Test Accuracy: 0.04% Training Accuracy: 14.58%
Epoch 3, Loss: 0.1547, Perplexity: 1.1673, Test Loss: 0.3677, Test Accuracy: 0.04% Training Accuracy: 16.80%
Epoch 4, Loss: 0.1283, Perplexity: 1.1369, Test Loss: 0.4071, Test Accuracy: 0.02% Training Accuracy: 21.76%
Epoch 5, Loss: 0.1026, Perplexity: 1.1080, Test Loss: 0.4464, Test Accuracy: 0.02% Training Accuracy: 30.57%


In [None]:
train_total_loss = 0
train_accuracy = 0
for train_batch in test_dataloader:
    accuracy = 0
    sentences = train_batch["sentence"]
    next_words = train_batch["next_word"]
    outputs, _ = model(sentences)
    loss = criterion(outputs, next_words)
    train_total_loss += loss.item()
    for prediction, target in zip(outputs.argmax(1), next_words):
        if prediction == target:
            accuracy += 1
    train_accuracy += accuracy
print(f'Training Accuracy: {train_accuracy / len(train_dataloader.dataset)}')

Training Accuracy: 0.40068811604984195


In [39]:
train_total_loss = 0
train_accuracy = 0
for train_batch in test_dataloader:
    accuracy = 0
    sentences = train_batch["sentence"]
    next_words = train_batch["next_word"]
    outputs, _ = model(sentences)
    loss = criterion(outputs, next_words)
    train_total_loss += loss.item()
    for prediction, target in zip(outputs.argmax(1), next_words):
        if prediction == target:
            accuracy += 1
    train_accuracy += accuracy
print(f'Training Accuracy: {train_accuracy / len(train_dataloader.dataset)}')

Training Accuracy: 0.40068811604984195


In [42]:
correct = 0
for test_batch in val_dataloader:
    model.eval()
    sentences = test_batch["sentence"]
    next_words = test_batch["next_word"]
    outputs, _ = model(sentences)
    loss = criterion(outputs, next_words)
    test_total_loss += loss.item()
    for i in range(len(outputs)):
        if outputs[i].argmax() == next_words[i]:
            print(list(train_dataset.word_to_idx.keys())[next_words[i]])
    correct += (outputs.argmax(1) == next_words).sum().item()


print(f'correct {correct} out of {len(test_dataloader)}')

and
correct 1 out of 577


In [112]:
# check the model
model.eval()
sentence = "the cat sat on the "
sentence_embedding = sentence_to_embedding(sentence).unsqueeze(0)
output, _ = model(sentence_embedding)
topk_values, topk_indices = torch.topk(output, 5, dim=1)
predicted_words = [list(train_dataset.word_to_idx.keys())[idx.item()] for idx in topk_indices[0]]
random_index = torch.randint(0, 5, (1,)).item()
sentence = sentence + predicted_words[random_index]
print(sentence)

the cat sat on the table,


In [50]:

import torch.nn.functional as F
def generate_sentence(model, start_sentence, max_length=35):
    model.eval()  # Ensure the model is in evaluation mode
    sentence = start_sentence
    for _ in range(max_length):
        output, _ = model(sentence_to_embedding(sentence).unsqueeze(0))
        _, predicted_idx = torch.topk(output, 4, dim=1)
        random_index = torch.randint(0, 4, (1,)).item()
        predicted_idx = predicted_idx[0][random_index]
        predicted_word = list(train_dataset.word_to_idx.keys())[predicted_idx.item()]
        sentence += predicted_word + ' '

    words = sentence.split()
    return sentence

# Example usage after training
start_fragment = "The cat sat on the "
generated_sentence = generate_sentence(model, start_fragment)
#used_words = set(words)  # Keep track of words used in the sentence to apply penalties
print("Generated Sentence:", generated_sentence)



Generated Sentence: The cat sat on the heart, over the chairs, perish that ten where i mentioned the are of henry? but there is only all the dreadful vast sensations on the summit he left that departed. which i had at some 


Ideas to mitigate repeating words.
change the hidden state?

In [103]:
import torch

# Load GloVe embeddings
def load_glove_embeddings(path, embedding_dim=50):
    with open(path, 'r', encoding='utf-8') as f:
        vocab, vectors = {}, []
        for line in f:
            parts = line.split()
            word = parts[0]
            vector = [float(x) for x in parts[1:]]
            vocab[word] = len(vectors)
            vectors.append(vector)
        return vocab, torch.tensor(vectors, dtype=torch.float32)

glove_vocab, glove_vectors = load_glove_embeddings('.vector_cache/glove.6B.50d.txt')
print('loaded glove embedding')

from sklearn.model_selection import train_test_split

class TextDataset(Dataset):
    def __init__(self, data, glove_vocab, glove_vectors):
        self.data = data
        self.glove_vocab = glove_vocab
        self.glove_vectors = glove_vectors

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence, next_word = self.data[idx]['sentence'], self.data[idx]['next_word']
        # Convert sentence to indices
        indices = [self.glove_vocab[word] if word in self.glove_vocab else 0 for word in sentence.split()]
        # Convert next_word to index
        next_word_idx = self.glove_vocab.get(next_word, 0)  # using 0 for unknown words
        return torch.tensor(indices), next_word_idx

# Assuming `data` is loaded as shown in your JSON structure example
train_data, test_data = train_test_split(data['data'], test_size=0.3, random_state=420)
train_dataset = TextDataset(train_data, glove_vocab, glove_vectors)
test_dataset = TextDataset(test_data, glove_vocab, glove_vectors)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

loaded glove embedding


In [161]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import json

# Function to load GloVe embeddings
def load_glove_embeddings(path, embedding_dim=50):
    with open(path, 'r', encoding='utf-8') as f:
        vocab, vectors = {}, []
        for line in f:
            parts = line.split()
            word = parts[0]
            vector = [float(x) for x in parts[1:]]
            vocab[word] = len(vectors)
            vectors.append(vector)
        return vocab, torch.tensor(vectors, dtype=torch.float32)

# Load the GloVe embeddings
glove_vocab, glove_vectors = load_glove_embeddings('.vector_cache/glove.6B.50d.txt')
print('Loaded GloVe embeddings')

# Define the custom dataset class
class TextDataset(Dataset):
    def __init__(self, data, glove_vocab):
        self.data = data
        self.glove_vocab = glove_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence, next_word = self.data[idx]['sentence'], self.data[idx]['next_word']
        # Convert sentence to indices based on GloVe vocabulary
        indices = [self.glove_vocab.get(word.lower(), 0) for word in sentence.split()]  # Unknown words are indexed as 0
        # Convert next_word to its index, using 0 for unknown words
        next_word_idx = self.glove_vocab.get(next_word.lower(), 0)
        return torch.tensor(indices, dtype=torch.long), next_word_idx

# Load data from JSON file
with open('frankenstein_dataset.json', 'r') as file:
    data = json.load(file)

# Split data into training and test sets
train_data, test_data = train_test_split(data['data'], test_size=0.3, random_state=69)

# Create datasets
train_dataset = TextDataset(train_data, glove_vocab)
test_dataset = TextDataset(test_data, glove_vocab)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

Loaded GloVe embeddings


In [162]:
# Fetch the first batch from the DataLoader
first_batch_indices, first_batch_next_word_idxs = next(iter(test_loader))

# Display the first five entries from the first batch
for i in range(15):
    indices = first_batch_indices[i]
    next_word_idx = first_batch_next_word_idxs[i]
    print(f"Sentence indices: {indices}, Next word index: {next_word_idx}")

Sentence indices: tensor([ 4838,    44,  1454,     4, 24507,     0,  2677]), Next word index: 53714
Sentence indices: tensor([   0,    0,   14,   56, 3958,    3,  392]), Next word index: 8625
Sentence indices: tensor([   19,     0,  4821,     0,    41, 23672,    75]), Next word index: 7
Sentence indices: tensor([   0,   42,   41,   40, 1435,    3,  151]), Next word index: 6
Sentence indices: tensor([  67,   40,  227,    0, 1058,    3,    0]), Next word index: 629
Sentence indices: tensor([   41,   238,    12,     0, 19740,    35, 16265]), Next word index: 21
Sentence indices: tensor([    22,      0,   3453,   2641,      0,  71977, 122900]), Next word index: 12
Sentence indices: tensor([   40, 44416,     0,     5,     0,   215,     0]), Next word index: 4
Sentence indices: tensor([ 2025,   249,     5,  5173,     6, 27388,     5]), Next word index: 0
Sentence indices: tensor([ 26,   0,  18, 822,   7,   0,  17]), Next word index: 7
Sentence indices: tensor([103,  19,  18,  16,   0,  26, 6

In [163]:
import torch.nn as nn
import torch.optim.lr_scheduler as lr_scheduler

class LSTMModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(glove_vectors, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=2, dropout=0.2)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        logits = self.fc(lstm_out[:, -1, :])  # We only use the output of the last time step
        return logits

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMModel(50, 256, len(glove_vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for sentences, targets in test_loader:
            sentences, targets = sentences.to(device), targets.to(device)
            outputs = model(sentences.long())
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == targets).sum().item()
            total_samples += targets.size(0)

    avg_loss = total_loss / len(test_loader)
    accuracy = total_correct / total_samples
    return avg_loss, accuracy

# Training and evaluation loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    for sentences, targets in train_loader:
        sentences, targets = sentences.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(sentences.long())
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    
    avg_train_loss = total_train_loss / len(train_loader)
    test_loss, test_accuracy = evaluate_model(model, test_loader, criterion)
    print(f'Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

Epoch 1, Train Loss: 6.5213, Test Loss: 5.9896, Test Accuracy: 0.2001
Epoch 2, Train Loss: 5.6480, Test Loss: 5.9448, Test Accuracy: 0.2001


KeyboardInterrupt: 

In [148]:
import torch.nn.functional as F
import random as rand

def embed_sentence(sentence, glove_vocab): return [glove_vocab.get(word.lower(), 0) for word in sentence.split()] 
def deembed_sentence(indices, glove_vocab): return [list(glove_vocab.keys())[idx] for idx in indices]
def generate_sentence(model, start_sentence, max_length=20):
    model.eval()
    for _ in range(max_length):
        embedded_sentence = torch.tensor(embed_sentence(start_sentence, glove_vocab)).unsqueeze(0)
        embedded_sentence.to(device)
        outputs = model(embedded_sentence)
        _, predicted_idx = torch.topk(outputs, 5, dim=1)
        print(predicted_idx)
        random_index = rand.randint(0, 4)
        predicted_idx = predicted_idx[0][random_index]
        predicted_word = list(glove_vocab.keys())[predicted_idx.item()]
        start_sentence += predicted_word + ' '
    return start_sentence

sentence = "the cat sat on the "
generate_sentence(model, sentence)

tensor([[ 0,  5, 34, 41, 42]])
tensor([[ 0,  5, 41, 34, 42]])
tensor([[41, 40, 18, 20, 15]])
tensor([[ 40,   0, 913,  15, 189]])
tensor([[33,  0, 36, 30, 41]])
tensor([[ 0, 34,  5, 41, 18]])
tensor([[ 0, 41,  5, 34, 42]])
tensor([[ 40,   0,  15, 913, 189]])
tensor([[ 0,  4, 40, 36, 33]])
tensor([[  0,  51,  36, 192,  40]])
tensor([[   0,  261,  629,    7, 1773]])
tensor([[ 0, 40, 15,  5, 14]])
tensor([[  0,  41, 192,   7,  12]])
tensor([[ 40,   0,  15, 913,  33]])
tensor([[  0,  36, 114, 192,   6]])
tensor([[ 0,  5, 34, 41,  4]])
tensor([[ 40,   0,  15, 913, 189]])
tensor([[   0,   36,  114,    6, 2282]])
tensor([[ 4,  0,  6,  7, 17]])
tensor([[ 5,  0, 34, 41,  4]])


'the cat sat on the the which i should the the i am have my father and i was the i was now the i '