In [1]:
import pandas as pd

# Enhanced dataset with more variety
sentences = [
    "The cat sat on the", "The dog ran around the", "The sun was shining in the",
    "The baby laughed at the", "The teacher wrote on the", "A car drove over the",
    "He opened the", "She closed the", "The player won the", "The artist drew a",
    "The girl danced in the", "The boy played in the", "The wind blew through the",
    "The author discussed the", "The scientist discovered a", "The historian studied the",
    "The chef cooked a", "The farmer planted a", "The journalist wrote about the",
    "The programmer debugged the"
]
next_words = [
    "mat", "block", "sky", "toy", "board", "bridge", "door", "window", "game", "portrait",
    "room", "yard", "trees", "topic", "method", "artifact", "dish", "seed", "event", "program"
]

df = pd.DataFrame({"sentence": sentences, "next_word": next_words})


In [21]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
# Load GloVe Embeddings
glove_file = "glove.6B.100d.txt"
glove_embeddings = {}
with open(glove_file, "r") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = torch.tensor([float(x) for x in values[1:]], dtype=torch.float32)
        glove_embeddings[word] = vector
import pandas as pd
df_json = pd.read_json('./dataset_rnn.json')
# turn into dataframe make the right columns
df = pd.DataFrame( columns=['sentence', 'next_word'])
for i in range(len(df_json['data'])):
    # no append
    df.loc[i] = [df_json['data'][i]['sentence'], df_json['data'][i]['next_word']]
'''
This is only an example. Change as you see fit. Right now there is not split between train and test data. You will have to implement that.
'''    

# Assuming glove_embeddings has been loaded as shown earlier
def sentence_to_embedding(sentence):
    words = sentence.split()
    embeddings = [glove_embeddings.get(word, torch.zeros(100, dtype=torch.float32)) for word in words]
    return torch.stack(embeddings)

class SentenceDataset(Dataset):
    def __init__(self, df):
        self.df = df
        next_words = df["next_word"].tolist()
        self.word_to_idx = {word: i for i, word in enumerate(sorted(set(next_words)))}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sentence = self.df.iloc[idx, 0]
        next_word = self.df.iloc[idx, 1]
        embedding = sentence_to_embedding(sentence)
        next_word_idx = self.word_to_idx[next_word]
        return {"sentence": embedding, "next_word": next_word_idx}

dataset = SentenceDataset(df)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim  # Save hidden_dim as an instance variable
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden and cell states with dimensions: (num_layers, batch_size, hidden_dim)
        h0 = torch.zeros(1, x.size(0), self.hidden_dim)
        c0 = torch.zeros(1, x.size(0), self.hidden_dim)
        # Forward pass through LSTM layer
        out, _ = self.lstm(x, (h0, c0))
        # Pass the output of the last time step to the fully connected layer
        out = self.fc(out[:, -1, :])
        return out


output_dim = len(dataset.word_to_idx)  # Number of unique next words
model = LSTMModel(input_dim=100, hidden_dim=128, output_dim=output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def calculate_perplexity(loss):
    return torch.exp(loss)

# Training Loop with Perplexity Calculation
for epoch in range(10):
    total_loss = 0
    for batch in dataloader:
        model.train()
        sentences = batch["sentence"]
        next_words = batch["next_word"]
        optimizer.zero_grad()
        outputs = model(sentences)
        loss = criterion(outputs, next_words)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    perplexity = calculate_perplexity(torch.tensor(avg_loss))
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}, Perplexity: {perplexity.item():.4f}")


Epoch 1, Loss: 2.1249, Perplexity: 8.3717
Epoch 2, Loss: 0.3114, Perplexity: 1.3653
Epoch 3, Loss: 0.0739, Perplexity: 1.0767
Epoch 4, Loss: 0.0269, Perplexity: 1.0273
Epoch 5, Loss: 0.0136, Perplexity: 1.0137
Epoch 6, Loss: 0.0086, Perplexity: 1.0086
Epoch 7, Loss: 0.0062, Perplexity: 1.0062
Epoch 8, Loss: 0.0048, Perplexity: 1.0048
Epoch 9, Loss: 0.0038, Perplexity: 1.0038
Epoch 10, Loss: 0.0031, Perplexity: 1.0031


In [12]:
# check the model
model.eval()
sentence = "The cat sat on the"
sentence_embedding = sentence_to_embedding(sentence).unsqueeze(0)
output = model(sentence_embedding)
_, predicted_idx = torch.max(output, 1)
predicted_word = list(dataset.word_to_idx.keys())[predicted_idx.item()]
print(f"Predicted next word: {predicted_word}")
# Output: Predicted next word: mat

Predicted next word: mat


In [27]:

import torch.nn.functional as F
def generate_sentence(model, start_sentence, max_length=20):
    model.eval()  # Ensure the model is in evaluation mode
    sentence = start_sentence
    words = sentence.split()
    return sentence

# Example usage after training
start_fragment = "The cat sat on the"
generated_sentence = generate_sentence(model, start_fragment)
#used_words = set(words)  # Keep track of words used in the sentence to apply penalties
print("Generated Sentence:", generated_sentence)



Generated Sentence: The cat sat on the mat mat mat mat mat mat mat toy toy toy toy toy toy toy toy
Generated Sentence: I am lucky to trees window block game trees trees trees trees trees seed trees trees trees trees sky dish
