In [1]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader,TensorDataset,random_split
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('clickbait_data.gz',sep='\t',names=['clickbait'], header=None)

In [3]:
df.head()

Unnamed: 0,clickbait
0,Should I Get Bings
1,Which TV Female Friend Group Do You Belong In
2,"The New ""Star Wars: The Force Awakens"" Trailer..."
3,"This Vine Of New York On ""Celebrity Big Brothe..."
4,A Couple Did A Stunning Photo Shoot With Their...


In [4]:
def preprocess_headlines(headlines):
    headlines = [i for i in headlines if i.isalpha() or i.isspace()] 
    headlines = ''.join(headlines)
    headlines = headlines.lower()
    return headlines

df['clickbait'] = df['clickbait'].apply(preprocess_headlines)
df

Unnamed: 0,clickbait
0,should i get bings
1,which tv female friend group do you belong in
2,the new star wars the force awakens trailer is...
3,this vine of new york on celebrity big brother...
4,a couple did a stunning photo shoot with their...
...,...
15994,there was a mini sisterhood of the traveling p...
15995,dogs who are thankful for their best friends
15996,people are proving no dick is too big by dropp...
15997,im an atheist but im not


In [5]:
import torchtext

# Define the tokenizer
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

# Tokenize the headlines
tokenized_headlines = [tokenizer(headline) for headline in df.clickbait]

word_counts = Counter()
for headline in tokenized_headlines:
    word_counts.update(headline)

# Create a dictionary that maps each word to a unique index
word_to_idx = {}
for i, (word, count) in enumerate(word_counts.items()):
    word_to_idx[word] = i
    
# Build the vocabulary
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_headlines)
vocab_size = len(word_to_idx)

# Convert the tokenized headlines to numerical sequences
numerical_headlines = [[word_to_idx[token] for token in headline] for headline in tokenized_headlines]

In [6]:

# Convert the numerical headlines to PyTorch tensors
tensor_headlines = [torch.tensor(headline) for headline in numerical_headlines]

# Pad the tensors to the same length
padded_headlines = torch.nn.utils.rnn.pad_sequence(tensor_headlines, batch_first=True)

# Split the padded tensors into input and target sequences
input_seqs = padded_headlines[:, :-1]
target_seqs = padded_headlines[:, 1:]

# Create a dataset from the input and target sequences
dataset = TensorDataset(input_seqs, target_seqs)

max_seq_length = 100
pad_idx = 0
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    inputs = [torch.tensor(item[0]) for item in batch]
    labels = [torch.tensor(item[1]) for item in batch]
    inputs = pad_sequence(inputs, batch_first=True, padding_value=pad_idx)
    labels = pad_sequence(labels, batch_first=True, padding_value=pad_idx)
    return inputs, labels

# Define data loaders for the dataset
batch_size = 32
train_size = int(0.7 * len(dataset))
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [7]:
class cLSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers, embedding_size, max_seq_length):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.max_seq_length = max_seq_length

    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs)
        output, (hidden, cell) = self.lstm(embedded)
        logits = self.fc(output)
        return output.reshape(-1, output.size(2)), hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.num_layers, batch_size, self.hidden_size).zero_()
        cell = weight.new(self.num_layers, batch_size, self.hidden_size).zero_()
        return hidden, cell


In [8]:
vocab_size

12184

In [9]:
def truncate_sequence(seq, max_len):
    """
    Truncates a sequence to a maximum length.
    """
    if len(seq) > max_len:
        return seq[:max_len]
    else:
        return seq

In [17]:
vocab_size = 12184
embed_size = 300
hidden_size = 512
num_layers = 2
dropout = 0.5
learning_rate = 0.001
max_seq_length = 100
model = cLSTM(vocab_size, hidden_size=hidden_size, num_layers=num_layers, embedding_size=embed_size, max_seq_length=max_seq_length)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Set the number of training epochs
num_epochs = 10

# Train the model
for epoch in range(num_epochs):
    # Initialize hidden state
    hidden = model.init_hidden(batch_size)

    # Set model to train mode
    model.train()

    # Loop over training set
    for i, (inputs, labels) in enumerate(train_loader):
      # truncate the input sequence if it is longer than the maximum length
      if inputs.shape[1] > max_seq_length:
        inputs = truncate_sequence(inputs, max_seq_length)

      # Initialize hidden state
      hidden = model.init_hidden(batch_size)

      # Zero out gradients
      optimizer.zero_grad()

      # Forward pass
      output, hidden = model(inputs, hidden)
      output_flat = output.view(-1, output.shape[-1])
      labels_flat = labels.view(-1)
      loss = criterion(output_flat, labels_flat)

      # Backward pass and optimization
      loss.backward()
      optimizer.step()

      # Print statistics
      if i % print_every == 0:
          print(f"Epoch {epoch+1}/{num_epochs}, Step {i+1}/{num_batches}, Loss: {loss.item():.4f}")


    
    # Set evaluation mode
    model.eval()

    # Disable gradient computation
    with torch.no_grad():
        total_loss = 0
        total_batches = 0
        for inputs, labels in eval_loader:
          # Forward pass
          outputs = model(inputs)
          output_flat = output.view(-1, output.shape[-1])
          labels_flat = labels.view(-1) 
          loss = criterion(output_flat, labels_flat)
          
          # Update total loss and batches
          total_loss += loss.item() * inputs.size(0)
          total_batches += inputs.size(0)
        
        # Calculate average loss
        avg_loss = total_loss / total_batches
        print(f'Test Loss: {avg_loss:.4f}')

  inputs = [torch.tensor(item[0]) for item in batch]
  labels = [torch.tensor(item[1]) for item in batch]


IndexError: ignored

In [None]:
import torch.nn.functional as F

# Set the model to evaluation mode
model.eval()

# Set the seed text
seed_text = "Artificial Intelligence"

# Set the number of words to generate
num_words = 10

# Convert the seed text to a tensor
input_tensor = torch.LongTensor([[word_to_index.get(word, 0) for word in seed_text.split()]])

# Generate new headlines
for i in range(num_words):
    # Feed the input tensor to the model to get the output logits
    with torch.no_grad():
        output, _, _ = model(input_tensor)
    
    # Get the last predicted index of the next word
    logits = output[0, -1]
    index = torch.argmax(F.softmax(logits, dim=0)).item()
    
    # Convert the predicted index to its corresponding word and add it to the input text
    predicted_word = index_to_word.get(index, '')
    seed_text += ' ' + predicted_word
    
    # Convert the updated input text to a tensor for the next iteration
    input_tensor = torch.LongTensor([[word_to_index.get(word, 0) for word in seed_text.split()]])
    
print(seed_text)
