In [1]:
import torch
import torch.nn as nn
import numpy as np
from datasets import load_dataset
from string import punctuation

In [2]:
!pip install --upgrade datasets
dataset = load_dataset("cnn_dailymail","3.0.0")

Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platfo

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [3]:
len(dataset['train'])

287113

In [4]:
train_articles = np.array(dataset['train'][:14000]['article'])
train_summaries = np.array(dataset['train'][:14000]['highlights'])
text_training = {'articles':train_articles,
                 'summaries':train_summaries}

# Pre-Processing

In [5]:
def removing_unwanted_characters(text):
    punctuations = punctuation.replace("'","")
    punctuations = punctuation.replace(".","")
    punctuations = punctuation.replace(",","")
    punctuations += '•1234567890'

    text = "".join([c for c in text if c not in punctuations])
    text = text.split("\n")
    text = " ".join(text)
    text = text.strip()
    text = " ".join(text.split())

    new_text = str(text[0])
    for char in range(1, len(text)):
        if (text[char] == '.' or text[char] == ',') and new_text[-1] != ' ':
            new_text += ' '
            new_text += text[char]
        elif (new_text[-1] == '.' or new_text[-1] == ',') and text[char] != ' ':
            new_text += ' '
            new_text += text[char]
        else:
            new_text += text[char]

    return new_text

In [6]:
def tokenizing(data):
    all_unique_words = []
    for text in data:
        all_unique_words.extend(text.split())
    vocab = set(all_unique_words)
    vocab = {word:ii for ii, word in enumerate(vocab, 3)}# '<pad>': 0, '<bos>': 1, '<eos>': 2

    didicted_tokens = ['<pad>', '<bos>', '<eos>']
    for i in range(3):
        vocab[didicted_tokens[i]] = i

    return vocab

In [7]:
def convert_words_to_tokens(data, vocab):
    data_ints = []

    for text in data:
        data_ints.append([vocab.get(word) for word in text.split() if vocab.get(word) != None])
        data_ints[-1].insert(0,1)
        data_ints[-1].append(2)

    size = int(data.shape[0] // 2)
    input_data = data_ints[:size]
    target_data = data_ints[size:]

    return input_data, target_data

In [8]:
def remove_short_articles(data, input_data, target_data):
    small_lenghts_indices = [index for index, text in enumerate(input_data) if len(text) < 100]
    small_lenghts_indices = small_lenghts_indices[::-1]
    for index in small_lenghts_indices:
         input_data.pop(index)
         target_data.pop(index)

    size = int(data.shape[0] // 2)
    small_lenghts_indices_targets = [length + size for length in small_lenghts_indices]
    small_lenghts_indices = small_lenghts_indices + small_lenghts_indices_targets
    data = np.delete(data, small_lenghts_indices, axis=0)

    return data, input_data, target_data

In [9]:
def paddings(input_data, length):
    size = len(input_data)
    for i in range(size):
        if len(input_data[i]) < length:
            rem = length - len(input_data[i])
            input_data[i].extend([0]*rem)

        elif len(input_data[i]) > length:
            input_data[i] = input_data[i][:length]

    return input_data

In [10]:
def preprocessing(data, length_data, length_target, vocab):
    keys = list(data.keys())
    data = np.concatenate((data.get(keys[0]), data.get(keys[1])))
    data = np.array([text.lower() for text in data])
    items = data.shape[0]

    for i in range(items):
       data[i] += " ."
       data[i] = removing_unwanted_characters(data[i])
    if not vocab:
        vocab = tokenizing(data)

    input_data, target_data = convert_words_to_tokens(data, vocab)

    data, input_data, target_data = remove_short_articles(data, input_data, target_data)

    input_data = paddings(input_data, length_data)
    target_data = paddings(target_data, length_target)

    return data, torch.tensor(input_data), torch.tensor(target_data), vocab

In [11]:
text_train, train_data, train_target, vocab = preprocessing(text_training, 1000, 85, {})

In [12]:
type(train_data), type(train_target), type(vocab)

(torch.Tensor, torch.Tensor, dict)

In [13]:
train_data.shape, train_target.shape, len(vocab)

(torch.Size([13966, 1000]), torch.Size([13966, 85]), 113285)

In [14]:
embedding_dim = 256
vocab_size = len(vocab)
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=vocab['<pad>'])

In [15]:
train_data = train_data.cuda()
embedding_layer = embedding_layer.cuda()

In [16]:
training_data = embedding_layer(train_data)

In [17]:
training_data.shape

torch.Size([13966, 1000, 256])

In [18]:
trained = {'vocab':vocab,
        'embdedding_weights':embedding_layer.state_dict()
}

torch.save(trained, 'trained.pt')