In [1]:
import pandas as pd
import sentencepiece as spm
import math
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [2]:
!pip install kaggle
!kaggle datasets download -d rmisra/news-category-dataset
!unzip news-category-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/rmisra/news-category-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading news-category-dataset.zip to /content
 34% 9.00M/26.5M [00:00<00:00, 44.5MB/s]
100% 26.5M/26.5M [00:00<00:00, 100MB/s] 
Archive:  news-category-dataset.zip
  inflating: News_Category_Dataset_v3.json  


In [3]:
# https://www.kaggle.com/datasets/rmisra/news-category-dataset/data

directory = "./News_Category_Dataset_v3.json"
data = pd.read_json(directory, lines=True)
data = data.drop('link', axis=1)
data.head()

Unnamed: 0,headline,category,short_description,authors,date
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
def encode(
    df: pd.DataFrame,
    bos_token: int,
    eos_token: int,
    mean: float = None,
    std:float = None
    ):
  """ Tokenize dataframe in the form

      | headline |	category |	short_description |	authors |	date |
            ||          ||              ||             ||       ||
      sentencepiece  index-based   sentencepiece  index-based  Normalized

  Args:
      df: The dataframe to Tokenize.
      mean: Mean for normalization of date.
      std: Standard deviation for normalization of date.

  Returns:
      The tokenized dataframe.
  """

  categories         = df.category.unique().tolist()
  category_tokens_ix = dict(enumerate(categories))
  category_tokens_xi = {x:i for i, x in category_tokens_ix.items()}

  authors           = df.authors.unique().tolist()
  authors_tokens_ix = dict(enumerate(authors))
  authors_tokens_xi = {x:i for i, x in authors_tokens_ix.items()}

  print("Total number of categories:", len(category_tokens_ix))
  print("Total number of Authors:",    len(authors_tokens_ix))

  df['date']    = (df['date'] - df['date'].min()).dt.total_seconds()

  mean = mean if mean else df['date'].mean()
  std  = std if std else df['date'].std()

  print('Using mean as {} for date'.format(mean))
  print('Using std as {} for date'.format(std))

  df['date']              = (df['date'] - mean)/std
  df['category']          = df['category'].map(category_tokens_xi)
  df['authors']           = df['authors'].map(authors_tokens_xi)
  df['headline']          = df['headline'].map(lambda x: [bos_token] + sp.encode_as_ids(x) + [eos_token])
  df['short_description'] = df['short_description'].map(lambda x: [bos_token] + sp.encode_as_ids(x) + [eos_token])

  return df, category_tokens_ix, authors_tokens_ix, mean, std, (len(category_tokens_ix),
                                                                len(authors_tokens_ix))

In [5]:
def pad_dataset(df: pd.DataFrame, PAD_TOKEN: int):
  """Pads the headline and short_description columns of the dataframe with the given pad token.

  Args:
      df: The dataframe to pad.
      pad_token: The token to use for padding.

  Returns:
      The padded dataframe.
  """
  MAX_HEADLINE_LENGHT       = df['headline'].str.len().max()
  MAX_SHORT_DESCRIPTION_LEN = df['short_description'].str.len().max()

  print("Maxlen for headline:", MAX_HEADLINE_LENGHT)
  print("Maxlen for short_description:", MAX_SHORT_DESCRIPTION_LEN)

  df['headline'] = df['headline'].map(lambda x: x + [PAD_TOKEN] * (MAX_HEADLINE_LENGHT - len(x)))
  df['short_description'] = df['short_description'].map(lambda x: x + [PAD_TOKEN] * (MAX_SHORT_DESCRIPTION_LEN - len(x)))

  return df, MAX_HEADLINE_LENGHT, MAX_SHORT_DESCRIPTION_LEN


In [6]:
def block_dataset(df: pd.DataFrame,
                  blocksize: int,
                  maxlen: int,
                  pad_token: int
                  ):

  context = torch.ones(len(df), maxlen, blocksize, dtype=torch.int32) * pad_token
  target = torch.ones(len(df), maxlen-1, dtype=torch.int32) * pad_token

  for i, row in enumerate(df['short_description']):
    for j in range(maxlen-1):
      context[i, j+1] = torch.concat([context[i, j, 1:], torch.tensor([row[j]])])
      target[i, j] = row[j+1]

  context = context[:, 1:].reshape(-1, blocksize)
  target = target.reshape(-1)

  df = df.loc[df.index.repeat([maxlen-1] * len(df))].reset_index(drop=True)
  df['context'] = context.tolist()
  df['target'] = target.tolist()

  return df


In [7]:
!curl -o m.vocab https://raw.githubusercontent.com/messi10tom/Fake-news-Generator/main/m.vocab
!curl -o m.model https://raw.githubusercontent.com/messi10tom/Fake-news-Generator/main/m.model

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 88432  100 88432    0     0   211k      0 --:--:-- --:--:-- --:--:--  211k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  313k  100  313k    0     0   799k      0 --:--:-- --:--:-- --:--:--  800k


In [8]:
sp = spm.SentencePieceProcessor()
sp.load('m.model')

# dataset, token_category, token_author, mean, std, vocabsizes = encode(data.copy(),
#                                                                         sp.bos_id(),
#                                                                        sp.eos_id())
# dataset, maxlen_H, maxlen_S = pad_dataset(dataset, sp.unk_id())

True

In [24]:
class FN_Dataset(Dataset):
    """  Fake News dataset """
    def __init__(self,
                 df: pd.DataFrame,
                 bos_token: int,
                 eos_token: int,
                 pad_token: int,
                 mean: float = None,
                 std:float = None
                 ):
      """

      Args:
        df              : Dataframe containing the data.
        bos_token       : Beginning of sentence token.
        eos_token       : End of sentence token.
        pad_token       : Padding token.
        mean(Optional)  : Mean for normalization of date.
        std(Optional)   : Standard deviation for normalization of date.

      """

      super().__init__()

      self.df, self.Category_decoder, self.Author_decoder, self.mean, self.std, self.vocabsizes = encode(df, bos_token, eos_token, mean, std)
      self.df, self.maxlen_H, self.maxlen_S = pad_dataset(self.df, pad_token)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
      sample = self.df.iloc[idx]

      return ({
          'headline': torch.tensor(sample['headline']),
          'date': torch.tensor([sample['date']]).to(torch.float32),
          'category': torch.tensor(sample['category']),
          'authors': torch.tensor(sample['authors'])
      }, torch.tensor(sample['short_description']))

In [15]:
class PositionalEncoding(nn.Module):
    def __init__(self,
                 d_model: int,
                 dropout: float,
                 max_len: int
                 ):
        """
        Args:
          d_model: Dimension of the embedding.
          dropout: Dropout rate.
          max_len: Maximum length of the sequence.

        """

        super().__init__()

        self.dropout = nn.Dropout(dropout)

        pos_encoding = torch.zeros(max_len, d_model)
        positions_list = torch.arange(0,
                                      max_len,
                                      dtype=torch.float).view(-1, 1) # 0, 1, 2, 3, 4, 5

        division_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)) / d_model) # 1000^(2i/dim_model)

        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)

        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)

        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)

    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        # Residual connection + pos encoding
        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])

In [17]:
class FN_Generator(nn.Module):
    """ Generate short_description """
    def __init__(self,
                 output_dim: int,
                 embed_dim: int,
                 headline_vocabsize: int,
                 inputsize_H: int,
                 inputsize_S: int,
                 cat_vocabsize: int,
                 auth_vocabsize: int,
                 d_model: int = 512,
                 nhead: int = 8,
                 num_encoder_layers: int = 6,
                 num_decoder_layers: int = 6,
                 dropout: float = 0.1
                 ) -> None:

        """
        Args:
          output_dim          : Dimension of the output.
          embed_dim           : Dimension of the embedding.
          headline_vocabsize  : Vocabulary size of the headline.
          inputsize_H         : Input size of the headline.
          inputsize_S         : Input size of the short_description.
          cat_vocabsize       : Vocabulary size of the category.
          auth_vocabsize      : Vocabulary size of the authors.
          d_model             : Dimension of the Transformer.
          nhead               : Number of heads.
          num_encoder_layers  : Number of encoder layers.
          num_decoder_layers  : Number of decoder layers.
          dropout             : Dropout rate.

        """
        super(FN_Generator, self).__init__()

        self.positional_encoder = PositionalEncoding(d_model=d_model,
                                                     dropout=dropout,
                                                     max_len=5000)

        self.embed_headline    = nn.Embedding(headline_vocabsize, embed_dim)
        self.embed_cat         = nn.Embedding(cat_vocabsize, embed_dim)
        self.embed_auth        = nn.Embedding(auth_vocabsize, embed_dim)
        self.embed_desc     = nn.Embedding(headline_vocabsize, embed_dim)

        self.flatten = nn.Flatten()
        self.droupout = nn.Dropout(p=dropout)

        self.Wd = nn.Linear(1, d_model)
        self.Wh = nn.Linear(inputsize_H * embed_dim, d_model)
        self.Wc = nn.Linear(embed_dim, d_model)
        self.Wa = nn.Linear(embed_dim, d_model)
        self.Ws = nn.Linear(inputsize_S * embed_dim, d_model)

        self.transformer = nn.Transformer(d_model=d_model,
                                          nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dropout=dropout
                                         )

        self.output = nn.Linear(d_model, output_dim)

    def forward(self, src, tgt):
        EMB_h    = self.embed_headline(src['headline'])
        EMB_c    = self.embed_cat(src['category'])
        EMB_a    = self.embed_auth(src['authors'])
        EMB_s    = self.embed_desc(tgt)

        EMB_h = self.flatten(EMB_h)
        EMB_s = self.flatten(EMB_s)


        lin = self.droupout(torch.relu(self.Wh(EMB_h) +
                                      self.Wc(EMB_c) +
                                      self.Wa(EMB_a) +
                                      self.Wd(src['date'])
                                      ))

        lin = self.positional_encoder(lin)
        out = self.transformer(lin, self.Ws(EMB_s))


        return torch.softmax(self.output(out), dim=1)

In [25]:
dataset = FN_Dataset(df=data.copy(),
                     bos_token=sp.bos_id(),
                     eos_token=sp.eos_id(),
                     pad_token=sp.unk_id()
                     )
dataloader = DataLoader(dataset, batch_size=32,
                        shuffle=True, num_workers=2)

Total number of categories: 42
Total number of Authors: 29169
Using mean as 102645854.34430885 for date
Using std as 65527224.61900345 for date
Maxlen for headline: 187
Maxlen for short_description: 419


In [18]:

"""
Args:
  output_dim          : Dimension of the output.
  embed_dim           : Dimension of the embedding.
  headline_vocabsize  : Vocabulary size of the headline.
  inputsize_H         : Input size of the headline.
  inputsize_S         : Input size of the short_description.
  cat_vocabsize       : Vocabulary size of the category.
  auth_vocabsize      : Vocabulary size of the authors.
  d_model             : Dimension of the Transformer.
  nhead               : Number of heads.
  num_encoder_layers  : Number of encoder layers.
  num_decoder_layers  : Number of decoder layers.
  dropout             : Dropout rate.

"""
generator = FN_Generator(output_dim=sp.vocab_size(),
                         embed_dim=8,
                         headline_vocabsize=sp.vocab_size(),
                         inputsize_H=dataset.maxlen_H,
                         inputsize_S=dataset.maxlen_S,
                         cat_vocabsize=dataset.vocabsizes[0],
                         auth_vocabsize=dataset.vocabsizes[1])



In [26]:
src, tgt = next(iter(dataloader))

In [28]:
src

{'headline': tensor([[   1,  471, 3160,  ...,    0,    0,    0],
         [   1,   40, 2288,  ...,    0,    0,    0],
         [   1,  258,  671,  ...,    0,    0,    0],
         ...,
         [   1,  803,    7,  ...,    0,    0,    0],
         [   1,  281,  654,  ...,    0,    0,    0],
         [   1, 3894,   13,  ...,    0,    0,    0]]),
 'date': tensor([[ 1.0759],
         [ 1.0746],
         [ 0.0698],
         [ 0.7331],
         [ 0.8227],
         [ 1.1880],
         [ 1.9132],
         [ 0.5986],
         [ 0.0158],
         [ 0.0949],
         [ 0.5326],
         [ 1.4293],
         [-1.4122],
         [ 1.1919],
         [ 0.6447],
         [ 0.9005],
         [-0.0857],
         [ 0.8926],
         [ 0.3560],
         [ 1.9633],
         [-1.2672],
         [-0.6593],
         [ 0.1094],
         [ 1.1075],
         [-1.4887],
         [-1.2896],
         [ 0.7054],
         [ 0.5617],
         [-0.8360],
         [ 0.0593],
         [ 0.5392],
         [-0.8043]]),
 'ca

In [21]:
generator(sample).shape

TypeError: FN_Generator.forward() missing 1 required positional argument: 'tgt'

In [None]:
crossentropy = nn.CrossEntropyLoss()
optimizer = optim.AdamW(generator.parameters(), lr=0.005)

In [None]:
for epoch in range(1):
    for i, ds in enumerate(dataloader):
        optimizer.zero_grad(set_to_none=True)
        output = generator(ds)
        loss = crossentropy(output, ds['target'])
        loss.backward()
        optimizer.step()
    print('Epoch',epoch,' Loss -->',loss.item())

KeyboardInterrupt: 