In [1]:
!pip install sentencepiece tqdm

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.3 MB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [3]:
!wget https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt

--2023-07-02 12:47:29--  https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.103.128, 108.177.120.128, 142.251.171.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.103.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘shakespeare.txt’


2023-07-02 12:47:29 (129 MB/s) - ‘shakespeare.txt’ saved [1115394/1115394]



In [4]:
import io
import re
import string

import tqdm
import numpy as np
import sentencepiece as spm
import tensorflow as tf
from tensorflow.keras import layers

In [5]:
# sentencepiece 라이브러리를 사용해 vocab 생성
txt_path = '/content/shakespeare.txt'
model_prefix = '/content/tokenize.txt'
vocab_size = 8007
model_type = 'bpe'

spm.SentencePieceTrainer.Train(
    f"--input={txt_path} --model_prefix={model_prefix} --vocab_size={vocab_size}"
    + f" --model_type={model_type}"
    + f" --pad_id=0 --pad_piece=[PAD]"
    + f" --unk_id=1 --unk_piece=[UNK]"
    + f" --bos_id=2 --bos_piece=[BOS]"
    + f" --eos_id=3 --eos_piece=[EOS]"
    + " --user_defined_symbols=[SEP],[CLS],[MASK]"
)

In [6]:
sp = spm.SentencePieceProcessor(model_file=model_prefix + '.model')

In [7]:
%load_ext tensorboard

In [8]:
with open(txt_path) as f:
    lines = f.read().splitlines()

lines = [line for line in lines if len(line) != 0]

for line in lines[:20]:
    print(line)

First Citizen:
Before we proceed any further, hear me speak.
All:
Speak, speak.
First Citizen:
You are all resolved rather to die than to famish?
All:
Resolved. resolved.
First Citizen:
First, you know Caius Marcius is chief enemy to the people.
All:
We know't, we know't.
First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?
All:
No more talking on't; let it be done: away, away!
Second Citizen:
One word, good citizens.
First Citizen:


In [9]:
max_length = 100

def tokenize(lines):
    tokenized = []
    for line in lines:
        if not isinstance(line, str):
            line = str(line)
        token = sp.tokenize(line)
        token += [0] * (max_length - len(token))
        tokenized.append(tf.convert_to_tensor(token))
    return tf.stack(tokenized, 0)

tokenized = tokenize(lines)
tokenized.shape

TensorShape([32777, 100])

In [10]:
text_vector_ds = tf.data.Dataset.from_tensor_slices(tokenized)
text_vector_ds

<_TensorSliceDataset element_spec=TensorSpec(shape=(100,), dtype=tf.int32, name=None)>

In [11]:
sequences = list(text_vector_ds.as_numpy_iterator())
len(sequences)

32777

In [12]:
for seq in sequences[:5]:
    print(f"{seq} => {[sp.decode(int(i)) for i in seq]}")

[ 427  811 7971    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0] => ['First', 'Citizen', ':', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
[2092   88 2442  552 2018 7963  428   72  366 7972    0    0    0    0
   

In [18]:
sequences[1]

array([2092,   88, 2442,  552, 2018, 7963,  428,   72,  366, 7972,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)

In [21]:
tf.keras.preprocessing.sequence.skipgrams(
    sequences[1],
    vocabulary_size=vocab_size,
    window_size=2,
    negative_samples=0
)[0]

[[2018, 428],
 [7963, 428],
 [366, 72],
 [72, 7972],
 [552, 7963],
 [7972, 72],
 [7963, 72],
 [72, 428],
 [2442, 552],
 [2442, 2018],
 [7963, 2018],
 [2018, 7963],
 [366, 7972],
 [2018, 552],
 [2092, 2442],
 [428, 2018],
 [2442, 2092],
 [72, 7963],
 [552, 2018],
 [428, 7963],
 [88, 552],
 [88, 2442],
 [72, 366],
 [552, 88],
 [428, 366],
 [552, 2442],
 [2018, 2442],
 [7972, 366],
 [88, 2092],
 [2442, 88],
 [7963, 552],
 [428, 72],
 [366, 428],
 [2092, 88]]

In [150]:
import random

def generate_training_data(sequences, window_size, vocab_size, seed):
    targets, contexts, labels = [], [], []

    # Build the sampling table for `vocab_size` tokens
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)
    # Iterate over all sequence(sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):
        # Generate skip_gram pairs for a sequence
        skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0
        )
        sequence = list(sequence)
        # Iterate over each skip-gram pair to produce training examples
        for target_word, context_word in skip_grams:
            label = None
            # find all index of value
            ids = []
            for i, seq in enumerate(sequence):
                if seq == target_word:
                    ids.append(i)
                if seq == 0:
                    break
            # find label index in window size
            for idx in ids:
                for i, skip in enumerate([-2, -1, 1, 2]):
                    if sequence[idx + skip] == context_word:
                        label = i
                if label:
                    break
            if label is None:
                raise ValueError()
            label = label if label < 2 else label - 1


            # Append each element from the training example to global lists
            context_word = tf.constant([context_word], dtype='int64')
            label = tf.constant([label], dtype='float64')
            targets.append(target_word)
            contexts.append(context_word)
            labels.append(label)

    return targets, contexts, labels

In [151]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    vocab_size=vocab_size,
    seed=42
)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print(f'targets.shape: {targets.shape}')
print(f'contexts.shape: {contexts.shape}')
print(f'labels.shape: {labels.shape}')

100%|██████████| 32777/32777 [02:18<00:00, 236.60it/s]


targets.shape: (319668,)
contexts.shape: (319668, 1)
labels.shape: (319668, 1)


In [152]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int32, name=None), TensorSpec(shape=(1024, 1), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 1), dtype=tf.float64, name=None))>


In [153]:
AUTOTUNE = tf.data.AUTOTUNE
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int32, name=None), TensorSpec(shape=(1024, 1), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 1), dtype=tf.float64, name=None))>


In [158]:
# 모델
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=1,
            name='w2v_embedding'
        )
        self.context_embedding = layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=1
        )

    def call(self, pair):
        target, context = pair
        # target: (batch, dummy?)
        # context: (batch, context)
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # target: (batch,)
        word_emb = self.target_embedding(target)
        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        # context_emb: (batch, context, embed)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        # dots: (batch, context)
        return dots

In [159]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs')

word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f881bc4a800>