In [1]:
import pandas as pd
import sentencepiece as spm
import math
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [2]:
!pip install kaggle
!kaggle datasets download -d rmisra/news-category-dataset
!unzip news-category-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/rmisra/news-category-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading news-category-dataset.zip to /content
 19% 5.00M/26.5M [00:00<00:00, 35.5MB/s]
100% 26.5M/26.5M [00:00<00:00, 122MB/s] 
Archive:  news-category-dataset.zip
  inflating: News_Category_Dataset_v3.json  


In [3]:
# https://www.kaggle.com/datasets/rmisra/news-category-dataset/data

directory = "./News_Category_Dataset_v3.json"
data = pd.read_json(directory, lines=True)
data = data.drop('link', axis=1)
data.head()

Unnamed: 0,headline,category,short_description,authors,date
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
def encode(
    df: pd.DataFrame,
    bos_token: int,
    eos_token: int,
    mean: float = None,
    std:float = None
    ):
  """ Tokenize dataframe in the form

      | headline |	category |	short_description |	authors |	date |
            ||          ||              ||             ||       ||
      sentencepiece  index-based   sentencepiece  index-based  Normalized

  Args:
      df: The dataframe to Tokenize.
      mean: Mean for normalization of date.
      std: Standard deviation for normalization of date.

  Returns:
      The tokenized dataframe.
  """

  categories         = df.category.unique().tolist()
  category_tokens_ix = dict(enumerate(categories))
  category_tokens_xi = {x:i for i, x in category_tokens_ix.items()}

  authors           = df.authors.unique().tolist()
  authors_tokens_ix = dict(enumerate(authors))
  authors_tokens_xi = {x:i for i, x in authors_tokens_ix.items()}

  print("Total number of categories:", len(category_tokens_ix))
  print("Total number of Authors:",    len(authors_tokens_ix))

  df['date']    = (df['date'] - df['date'].min()).dt.total_seconds()

  mean = mean if mean else df['date'].mean()
  std  = std if std else df['date'].std()

  print('Using mean as {} for date'.format(mean))
  print('Using std as {} for date'.format(std))

  df['date']              = (df['date'] - mean)/std
  df['category']          = df['category'].map(category_tokens_xi)
  df['authors']           = df['authors'].map(authors_tokens_xi)
  df['headline']          = df['headline'].map(lambda x: [bos_token] + sp.encode_as_ids(x) + [eos_token])
  df['short_description'] = df['short_description'].map(lambda x: [bos_token] + sp.encode_as_ids(x) + [eos_token])

  return df, category_tokens_ix, authors_tokens_ix, mean, std, (len(category_tokens_ix),
                                                                len(authors_tokens_ix))

In [5]:
def pad_dataset(df: pd.DataFrame, PAD_TOKEN: int):
  """Pads the headline and short_description columns of the dataframe with the given pad token.

  Args:
      df: The dataframe to pad.
      pad_token: The token to use for padding.

  Returns:
      The padded dataframe.
  """
  MAX_HEADLINE_LENGHT       = df['headline'].str.len().max()
  MAX_SHORT_DESCRIPTION_LEN = df['short_description'].str.len().max()

  print("Maxlen for headline:", MAX_HEADLINE_LENGHT)
  print("Maxlen for short_description:", MAX_SHORT_DESCRIPTION_LEN)

  df['headline'] = df['headline'].map(lambda x: x + [PAD_TOKEN] * (MAX_HEADLINE_LENGHT - len(x)))
  df['short_description'] = df['short_description'].map(lambda x: x + [PAD_TOKEN] * (MAX_SHORT_DESCRIPTION_LEN - len(x)))

  return df, MAX_HEADLINE_LENGHT, MAX_SHORT_DESCRIPTION_LEN


In [6]:
!curl -o m.vocab https://raw.githubusercontent.com/messi10tom/Fake-news-Generator/main/m.vocab
!curl -o m.model https://raw.githubusercontent.com/messi10tom/Fake-news-Generator/main/m.model

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 88432  100 88432    0     0   333k      0 --:--:-- --:--:-- --:--:--  333k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  313k  100  313k    0     0   985k      0 --:--:-- --:--:-- --:--:--  987k


In [7]:
sp = spm.SentencePieceProcessor()
sp.load('m.model')

# dataset, token_category, token_author, mean, std, vocabsizes = encode(data.copy(),
#                                                                         sp.bos_id(),
#                                                                        sp.eos_id())
# dataset, maxlen_H, maxlen_S = pad_dataset(dataset, sp.unk_id())

True

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [32]:
class FN_Dataset(Dataset):
    """  Fake News dataset """
    def __init__(self,
                 df: pd.DataFrame,
                 bos_token: int,
                 eos_token: int,
                 pad_token: int,
                 mean: float = None,
                 std:float = None
                 ):
      """

      Args:
        df              : Dataframe containing the data.
        bos_token       : Beginning of sentence token.
        eos_token       : End of sentence token.
        pad_token       : Padding token.
        mean(Optional)  : Mean for normalization of date.
        std(Optional)   : Standard deviation for normalization of date.

      """

      super().__init__()

      self.df, self.Category_decoder, self.Author_decoder, self.mean, self.std, self.vocabsizes = encode(df, bos_token, eos_token, mean, std)
      self.df, self.maxlen_H, self.maxlen_S = pad_dataset(self.df, pad_token)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
      sample = self.df.iloc[idx]

      return (torch.tensor(sample['headline']),
              torch.tensor([sample['authors'], sample['category'], sample['date']]).to(torch.float32),
              torch.tensor(sample['short_description']))

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self,
                 embed_d: int,
                 dropout: float,
                 max_len: int
                 ):
        """
        Args:
          d_model: Dimension of the embedding.
          dropout: Dropout rate.
          max_len: Maximum length of the sequence.

        """

        super().__init__()

        self.dropout = nn.Dropout(dropout)

        pos_encoding = torch.zeros(max_len, embed_d)
        positions_list = torch.arange(0,
                                      max_len,
                                      dtype=torch.float).view(-1, 1) # 0, 1, 2, 3, 4, 5

        division_term = torch.exp(torch.arange(0, embed_d, 2).float() * (-math.log(10000.0)) / embed_d) # 1000^(2i/dim_model)

        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)

        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)

        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)

    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        # Residual connection + pos encoding
        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])

In [28]:
class FN_Generator(nn.Module):
    """ Generate short_description """
    def __init__(self,
                 output_dim: int,
                 embed_dim: int,
                 headline_vocabsize: int,
                 inputsize_H: int,
                 inputsize_S: int,
                 cat_vocabsize: int,
                 auth_vocabsize: int,
                 d_model: int = 512,
                 nhead: int = 8,
                 num_encoder_layers: int = 6,
                 num_decoder_layers: int = 6,
                 dropout: float = 0.1
                 ) -> None:

        """
        Args:
          output_dim          : Dimension of the output.
          embed_dim           : Dimension of the embedding.
          headline_vocabsize  : Vocabulary size of the headline.
          inputsize_H         : Input size of the headline.
          inputsize_S         : Input size of the short_description.
          cat_vocabsize       : Vocabulary size of the category.
          auth_vocabsize      : Vocabulary size of the authors.
          d_model             : Dimension of the Transformer.
          nhead               : Number of heads.
          num_encoder_layers  : Number of encoder layers.
          num_decoder_layers  : Number of decoder layers.
          dropout             : Dropout rate.

        """
        super(FN_Generator, self).__init__()

        self.positional_encoder = PositionalEncoding(embed_d=d_model,
                                                     dropout=dropout,
                                                     max_len=inputsize_H)
        self.data_embed = nn.Linear(3, d_model)
        self.relu       = nn.ReLU()
        self.flatten    = nn.Flatten()

        self.headline_embed = nn.Embedding(headline_vocabsize, d_model)

        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, batch_first=True)
        self.transformer = nn.TransformerDecoder(decoder_layer, num_decoder_layers)

        self.output = nn.Linear(d_model * (inputsize_H+1), output_dim)

    def forward(self, src_o, src_h, tgt):
        lin = self.relu(self.data_embed(src_o)).unsqueeze(1)

        headline = self.positional_encoder(self.headline_embed(src_h))
        short_description = self.positional_encoder(self.headline_embed(tgt))

        trans_in = torch.cat((headline, lin), dim=1)
        trans_out = self.transformer(trans_in, short_description)

        return nn.functional.softmax(self.output(self.flatten(trans_out)), dim=1)

In [33]:
dataset = FN_Dataset(df=data.copy(),
                     bos_token=sp.bos_id(),
                     eos_token=sp.eos_id(),
                     pad_token=sp.unk_id(),
                     )
dataloader = DataLoader(dataset,
                        batch_size=32,
                        shuffle=True,
                        num_workers=2,
                        pin_memory=True)

Total number of categories: 42
Total number of Authors: 29169
Using mean as 102645854.34430885 for date
Using std as 65527224.61900345 for date
Maxlen for headline: 187
Maxlen for short_description: 419


In [30]:

"""
Args:
  output_dim          : Dimension of the output.
  embed_dim           : Dimension of the embedding.
  headline_vocabsize  : Vocabulary size of the headline.
  inputsize_H         : Input size of the headline.
  inputsize_S         : Input size of the short_description.
  cat_vocabsize       : Vocabulary size of the category.
  auth_vocabsize      : Vocabulary size of the authors.
  d_model             : Dimension of the Transformer.
  nhead               : Number of heads.
  num_encoder_layers  : Number of encoder layers.
  num_decoder_layers  : Number of decoder layers.
  dropout             : Dropout rate.

"""
generator = FN_Generator(output_dim=sp.vocab_size(),
                         embed_dim=8,
                         headline_vocabsize=sp.vocab_size(),
                         inputsize_H=dataset.maxlen_H,
                         inputsize_S=dataset.maxlen_S,
                         cat_vocabsize=dataset.vocabsizes[0],
                         auth_vocabsize=dataset.vocabsizes[1]).to(device)

In [34]:
src_h, src_o, tgt = next(iter(dataloader))
generator(src_o.to(device),
          src_h.to(device),
          tgt.to(device)).shape

torch.Size([32, 5000])

In [35]:
crossentropy = nn.CrossEntropyLoss().to(device)
optimizer = optim.AdamW(generator.parameters(), lr=0.005)

AttributeError: 'AdamW' object has no attribute 'to'

In [None]:
for epoch in range(1):
    for i, ds in enumerate(dataloader):
        optimizer.zero_grad(set_to_none=True)
        output = generator(ds)
        loss = crossentropy(output, ds['target'])
        loss.backward()
        optimizer.step()
    print('Epoch',epoch,' Loss -->',loss.item())

TypeError: FN_Generator.forward() missing 1 required positional argument: 'tgt'