In [None]:
!pip install torch


In [None]:
!pip install torchtext

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import numpy as np

# Set random seeds for reproducibility.
# Both RNG-backed libraries imported above are seeded, so that code drawing
# from numpy is as repeatable as code drawing from torch.
SEED = 1234
torch.manual_seed(SEED)
np.random.seed(SEED)


In [None]:
!pip install fasttext gensim


In [None]:
class RNN(nn.Module):
    """Token-level language model: embedding -> nn.RNN -> linear projection.

    Args:
        vocab_size: Number of rows in the embedding table and width of the
            output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the recurrent hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        # batch_first=True: the recurrent layer consumes (batch, seq, feature).
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map a batch of token-id sequences to per-position vocabulary logits.

        Args:
            x: Integer tensor of token ids, laid out as (batch, seq_len).

        Returns:
            Tensor of logits with a trailing dimension of ``vocab_size``, one
            vector per input position.
        """
        embedded = self.embedding(x)
        # The final hidden state is not needed; only the per-step outputs are.
        per_step_hidden, _final_hidden = self.rnn(embedded)
        return self.fc(per_step_hidden)

In [None]:
# Load the dataset into a DataFrame and preview its first rows.
# NOTE(review): "/" is the filesystem root, not a CSV file — this looks like a
# placeholder and must be replaced with the real dataset path before running.
df=pd.read_csv("/")
df.head()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Collect the 'Overview' texts as plain strings, in row order.
# Rows with a missing overview are dropped: str() on a missing value would
# otherwise inject the literal word "nan" into the vocabulary. Working on the
# column as a whole also avoids label-based lookups that assume a 0..n-1 index.
corpus = df['Overview'].dropna().astype(str).tolist()

In [None]:
# Display the first collected text as a quick sanity check.
corpus[0]

In [None]:
# Case-fold every text so that "The" and "the" share one vocabulary entry.
corpus = [text.lower() for text in corpus]

# Vocabulary: whitespace-separated tokens get ids in first-seen order,
# starting at 1 (id 0 is used below as the padding value).
tokenizer = {}
for text in corpus:
    for token in text.split():
        tokenizer.setdefault(token, len(tokenizer) + 1)

# Training sequences: every prefix of length >= 2 of every text,
# e.g. [a, b, c] -> [a, b], [a, b, c].
sequences = []
for text in corpus:
    token_ids = [tokenizer[token] for token in text.split()]
    sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

# Right-pad with 0 so all sequences share the length of the longest one.
max_len = max(map(len, sequences))
sequences = [ids + [0] * (max_len - len(ids)) for ids in sequences]

# One row per padded sequence.
sequences = torch.tensor(sequences)
print(sequences)

# Dataset and DataLoader
class TextDataset(Dataset):
    """Dataset of (input, target) pairs for next-token prediction.

    Each stored sequence yields the sequence without its last element as the
    input and the sequence without its first element as the target, i.e. the
    target is the input shifted one step to the left.
    """

    def __init__(self, sequences):
        # Indexable collection of equal-length token-id sequences.
        self.sequences = sequences

    def __len__(self):
        """Number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(sequence[:-1], sequence[1:])`` for the sequence at ``idx``."""
        row = self.sequences[idx]
        model_input = row[:-1]
        shifted_target = row[1:]
        return model_input, shifted_target

# Wrap the padded sequences and serve them in shuffled mini-batches of two.
dataset = TextDataset(sequences)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=2)


In [None]:
# Model hyper-parameters. Token ids in `tokenizer` start at 1, so one extra
# slot is added to the vocabulary size for id 0.
embedding_dim, hidden_dim = 10, 20
vocab_size = len(tokenizer) + 1
model = RNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
)

In [None]:
# Id 0 is the value used to right-pad the sequences, not a real word. Without
# ignore_index the loss is dominated by padding positions and the model is
# mostly rewarded for predicting padding.
PAD_IDX = 0
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
accuracy = 0

num_epochs = 15
for epoch in range(num_epochs):
    model.train()  # an earlier inference call may have left the model in eval mode
    loss_sum = 0.0      # sum of per-token losses over the epoch
    correct_tokens = 0  # correctly predicted non-padding targets
    real_tokens = 0     # non-padding targets seen

    for inputs, targets in dataloader:
        outputs = model(inputs)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) to match the flat targets.
        loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Bookkeeping only: keep it out of the autograd graph.
        with torch.no_grad():
            is_real = targets != PAD_IDX
            batch_real = int(is_real.sum())
            hits = (torch.argmax(outputs, dim=2) == targets) & is_real
            correct_tokens += int(hits.sum())
            real_tokens += batch_real
            # criterion returns the mean over non-padding targets; re-weight by
            # their count so the epoch figure is a true per-token average.
            loss_sum += loss.item() * batch_real

    # Epoch-level averages instead of whatever the last mini-batch happened to give.
    epoch_loss = loss_sum / max(real_tokens, 1)
    accuracy = correct_tokens / max(real_tokens, 1)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}, Accuracy: {accuracy}')


In [None]:
def predict_next_word(model, tokenizer, input_text, max_len):
    """Return the word the model ranks highest as the continuation of a prompt.

    Args:
        model: Module that maps a (1, seq_len) tensor of token ids to logits
            indexed as [batch][position][vocab id]. It is switched to eval
            mode and left in that mode.
        tokenizer: Dict mapping word -> integer id.
        input_text: Prompt, split on whitespace. A word missing from
            ``tokenizer`` is retried in lower case and dropped if still
            unknown.
        max_len: Not used; kept so existing callers keep working.

    Returns:
        The predicted word, or ``None`` when no prompt word is known or when
        the predicted id has no entry in ``tokenizer``.
    """
    model.eval()

    seq = []
    for word in input_text.split():
        idx = tokenizer.get(word)
        if idx is None:
            # Fall back to the case-folded form for vocabularies built from
            # lower-cased text.
            idx = tokenizer.get(word.lower())
        if idx is not None:
            seq.append(idx)

    # Nothing to condition on: an empty tensor would be float-typed and make
    # the embedding lookup fail, so bail out explicitly.
    if not seq:
        return None

    input_seq = torch.tensor(seq, dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_seq)
    # Logits at the last position score the token that follows the prompt.
    predicted_word_idx = torch.argmax(outputs[0][-1]).item()

    # Reverse lookup id -> word; None if the id is not in the vocabulary.
    return next(
        (word for word, idx in tokenizer.items() if idx == predicted_word_idx),
        None,
    )

In [None]:
# Sample prompt for a quick qualitative check of the trained model.
input_text = "my name is say, i"
# max_len is passed to satisfy the signature; predict_next_word does not read it.
next_word = predict_next_word(model, tokenizer, input_text, max_len)
print(f'The next word is: {next_word}')