In [14]:
import torch
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader


In [2]:
# Function to preprocess text (replace with your preferred method)
def preprocess_text(text):
    # Lowercase, remove punctuation, tokenize (consider stemming/lemmatization)
    text = text.lower()
    text = ''.join([c for c in text if c.isalnum() or c.isspace()])
    words = text.split()
    return words

In [4]:
df = pd.read_csv('/home/nadun/Documents/projects/flower/simulation_env/data/stack_overflow/train.csv')

In [5]:
words = df['text'][0]

In [6]:
vocabulary = set(words)
vocab_size = len(vocabulary)

In [7]:
word_to_idx = {word: idx for idx, word in enumerate(vocabulary)}

In [8]:
# Convert words to numerical representation (one-hot encoding)
def word_to_bow(word, vocab_size):
    bow = torch.zeros(vocab_size)
    if word in word_to_idx:
        bow[word_to_idx[word]] = 1
    return bow

In [9]:
# Convert entire paragraph to BoW sequences
bow_sequences = []
for i in range(1, len(words)):
    context = words[i-1]  # Use previous word as context
    bow_sequences.append((word_to_bow(context, vocab_size), word_to_bow(words[i], vocab_size)))


In [10]:
# Convert to PyTorch tensors
all_inputs, all_targets = zip(*bow_sequences)
inputs_tensor = torch.stack(all_inputs)
targets_tensor = torch.stack(all_targets)

In [11]:
# Create a simple neural network (consider exploring more complex architectures for better results)
class NextWordPredictor(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NextWordPredictor, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [12]:
# Hyperparameters (adjust based on your dataset and task)
input_size = vocab_size
hidden_size = 128
output_size = vocab_size  # Predict from same vocabulary

model = NextWordPredictor(input_size, hidden_size, output_size)


In [15]:
# Define loss function and optimizer (consider experimenting with different options)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model (adjust epochs and batch size as needed)
num_epochs = 10
batch_size = 32

dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [21]:
for epoch in range(num_epochs):
    for i, (inputs, targets) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}')

Epoch [1/10], Step [1/6243], Loss: 2.2905
Epoch [1/10], Step [101/6243], Loss: 2.7493
Epoch [1/10], Step [201/6243], Loss: 2.1032
Epoch [1/10], Step [301/6243], Loss: 2.8214
Epoch [1/10], Step [401/6243], Loss: 2.3467
Epoch [1/10], Step [501/6243], Loss: 2.3989
Epoch [1/10], Step [601/6243], Loss: 2.8484
Epoch [1/10], Step [701/6243], Loss: 2.4473
Epoch [1/10], Step [801/6243], Loss: 2.9426
Epoch [1/10], Step [901/6243], Loss: 2.8750
Epoch [1/10], Step [1001/6243], Loss: 2.5477
Epoch [1/10], Step [1101/6243], Loss: 3.0320
Epoch [1/10], Step [1201/6243], Loss: 2.2162
Epoch [1/10], Step [1301/6243], Loss: 2.6120
Epoch [1/10], Step [1401/6243], Loss: 2.1599
Epoch [1/10], Step [1501/6243], Loss: 2.2253
Epoch [1/10], Step [1601/6243], Loss: 2.1433
Epoch [1/10], Step [1701/6243], Loss: 2.6234
Epoch [1/10], Step [1801/6243], Loss: 2.2519
Epoch [1/10], Step [1901/6243], Loss: 2.4371
Epoch [1/10], Step [2001/6243], Loss: 2.4474
Epoch [1/10], Step [2101/6243], Loss: 2.2856
Epoch [1/10], Step [22

In [22]:
def predict_next_word(model, word, word_to_idx, vocab_size):
    """Predicts the next word based on the given word."""
    with torch.no_grad():  # Disable gradient calculation for prediction
        bow = word_to_bow(word, vocab_size)
        bow = bow.unsqueeze(0)  # Add a batch dimension for the model
        output = model(bow)
        _, predicted_idx = torch.max(output.data, dim=1)
        predicted_word = [word for word, idx in word_to_idx.items() if idx == predicted_idx.item()][0]
        return predicted_word

In [25]:
sample_word = "programación"
predicted_word = predict_next_word(model, sample_word, word_to_idx, vocab_size)
print(f"Predicted next word for '{sample_word}': {predicted_word}")

Predicted next word for 'programación': J
