In [1]:
import re
import pickle
from collections import Counter
import numpy as np
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter

with open("./Data/dokujo-tsushin.txt", mode="r",encoding="utf-8") as f:
    corpus = []
    for line in f:
        cleaned_line = line.replace('\u3000', '').replace('\n', '')
        if cleaned_line!="":
            corpus.append(cleaned_line)

ModuleNotFoundError: No module named 'tensorboard'

In [2]:
# コンテキストとターゲットの作成関数の実装
def create_contexts_target(corpus, window_size=1):
    
    # ターゲットを抽出
    target = corpus[window_size:-window_size]
    
    # コンテキストを初期化
    contexts = []
    
    # ターゲットごとにコンテキストを格納
    for idx in range(window_size, len(corpus) - window_size):
        
        # 現在のターゲットのコンテキストを初期化
        cs = []
        
        # 現在のターゲットのコンテキストを1単語ずつ格納
        for t in range(-window_size, window_size + 1):
            
            # 0番目の要素はターゲットそのものなので処理を省略
            if t == 0:
                continue
            
            # コンテキストを格納
            cs.append(corpus[idx + t])
            
        # 現在のターゲットのコンテキストのセットを格納
        contexts.append(cs)
    
    # NumPy配列に変換
    return np.array(contexts), np.array(target) 

In [3]:
import MeCab
from tqdm.notebook import tqdm
def tokenize_with_mecab(sentences):
    # Initialize MeCab with the specified dictionary
    corpus = []
    for sentence in sentences:
        sentence = re.sub("http://news.livedoor.com/article/detail/[0-9]{7}/","", sentence) # 注2）
        sentence = re.sub("[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\+[0-9]{4}","", sentence) # 注3）
        sentence = re.sub("[「」]","", sentence)
        # Parse the sentence
        node = mecab.parseToNode(sentence)
        # Iterate over all nodes
        while node:
            # Extract the surface form of the word
            word = node.surface
            # Skip empty words and add to the corpus
            if word:
                corpus.append(word)
            node = node.next
    return corpus


# Initialize the MeCab tokenizer
#mecab = MeCab.Tagger()
mecab = MeCab.Tagger()
corpus = tokenize_with_mecab(corpus)

In [4]:
def filter_by_frequency(tokens, min_freq=5):
    # Count word frequencies
    frequency = Counter(tokens)
    # Filter tokens by frequency
    tokens = [token for token in tokens if frequency[token] >= min_freq]
    return tokens

In [5]:
word_to_id = {}
id_to_word = {}

for word in corpus:
    if word not in word_to_id:
        new_id = len(word_to_id)
        word_to_id[word] = new_id
        id_to_word[new_id] = word

In [6]:
# リストに変換
corpus = [word_to_id[word] for word in corpus]

# NumPy配列に変換
corpus = np.array(corpus)

In [7]:
len(set(corpus))

25427

In [8]:
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [9]:
# コンテキストとターゲットを作成
contexts, targets = create_contexts_target(corpus, window_size=2)
contexts = torch.tensor(contexts, dtype=torch.long).to(device)
targets = torch.tensor(targets, dtype=torch.long).to(device)
print(contexts)
print(targets)

tensor([[   0,    1,    3,    4],
        [   1,    2,    4,    5],
        [   2,    3,    5,    6],
        ...,
        [  93,  832, 1225, 1226],
        [ 832,  506, 1226, 1227],
        [ 506, 1225, 1227,   96]], device='cuda:0')
tensor([   2,    3,    4,  ...,  506, 1225, 1226], device='cuda:0')


In [10]:
from torch.utils.data import Dataset, DataLoader

class CBOWDataset(Dataset):
    def __init__(self, contexts, targets):
        self.contexts = contexts
        self.targets = targets
    
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, idx):
        return self.contexts[idx], self.targets[idx]

# Convert contexts and targets to tensors
contexts_tensor = torch.tensor(contexts, dtype=torch.long).to(device)
targets_tensor = torch.tensor(targets, dtype=torch.long).to(device)

# Create the dataset
dataset = CBOWDataset(contexts_tensor, targets_tensor)

# Create the DataLoader
batch_size = 256  # You can adjust the batch size
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

  contexts_tensor = torch.tensor(contexts, dtype=torch.long).to(device)
  targets_tensor = torch.tensor(targets, dtype=torch.long).to(device)


In [11]:
class SimpleCBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(SimpleCBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, vocab_size)

    def forward(self, inputs):
        # Embed the input words. 
        # Inputs should have the shape [batch_size, context_size]
        embeds = self.embeddings(inputs)  # Resulting shape [batch_size, context_size, embedding_size]
        
        # Sum the embeddings for each context word to get a single embedding vector per batch sample.
        # The resulting shape should be [batch_size, embedding_size]
        out = torch.sum(embeds, dim=1)
        
        # Pass the summed embeddings through the linear layer
        # The output shape will be [batch_size, vocab_size]
        out = self.linear1(out)
        
        # Apply log softmax to get log probabilities over the vocabulary for each sample in the batch
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [None]:
# パラメータの設定
embedding_size = 10
learning_rate = 0.01
epochs = 500
vocab_size = len(word_to_id)

# モデルのインスタンス化
model = SimpleCBOW(vocab_size, embedding_size).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

writer = SummaryWriter('runs/cbow_experiment_2')

# Training loop with batch processing
for epoch in range(epochs):
    total_loss = 0
    for i, (context_batch, target_batch) in enumerate(data_loader):
        # Zero out the gradients from the last step
        model.zero_grad()
        # Forward pass through the model
        log_probs = model(context_batch)
        # Compute the loss
        loss = loss_function(log_probs, target_batch)
        # Backward pass to compute gradients
        loss.backward()
        # Update the model parameters
        optimizer.step()
        # Accumulate the loss
        total_loss += loss.item()
        writer.add_scalar('Training loss', loss.item(), epoch * len(data_loader) + i)
    # Log the total loss for the epoch
    writer.add_scalar('Total Training loss', total_loss, epoch)
    print(f'Epoch {epoch}, Total loss: {total_loss}')

Epoch 0, Total loss: 31397.376534461975
Epoch 1, Total loss: 28187.27686357498
Epoch 2, Total loss: 26342.247032165527
Epoch 3, Total loss: 25164.694566488266
Epoch 4, Total loss: 24353.24726819992
Epoch 5, Total loss: 23723.983380794525
Epoch 6, Total loss: 23238.247279167175
Epoch 7, Total loss: 22841.27783536911
Epoch 8, Total loss: 22517.91306066513
Epoch 9, Total loss: 22247.803350925446
Epoch 10, Total loss: 21995.72559070587
Epoch 11, Total loss: 21773.0639834404
Epoch 12, Total loss: 21581.493795871735
Epoch 13, Total loss: 21411.580601215363
Epoch 14, Total loss: 21256.410546302795
Epoch 15, Total loss: 21103.692857265472
Epoch 16, Total loss: 20973.788657188416
Epoch 17, Total loss: 20847.244908571243
Epoch 18, Total loss: 20738.265578985214
Epoch 19, Total loss: 20643.255791187286
Epoch 20, Total loss: 20544.659994602203
Epoch 21, Total loss: 20456.14571905136
Epoch 22, Total loss: 20366.695989608765
Epoch 23, Total loss: 20292.23054742813
Epoch 24, Total loss: 20217.2446956

In [None]:
word_embeddings = model.embeddings.weight.data

In [None]:
word_embeddings

tensor([[ 0.2567, -0.6427, -0.5169,  ..., -0.7604, -0.0346,  0.0217],
        [ 0.8107, -0.9065, -1.1830,  ..., -0.4636, -0.0320,  0.9819],
        [ 1.7672, -0.7820, -0.0137,  ...,  2.3656, -0.4327,  0.0266],
        ...,
        [ 1.5716,  0.7011,  1.5195,  ...,  0.6961,  1.1100,  0.1157],
        [ 0.5649,  0.5198, -0.5296,  ...,  0.2395,  0.1166,  0.7280],
        [-2.4054, -0.4683, -1.3342,  ...,  0.9051, -1.2672, -0.4244]],
       device='cuda:0')

In [None]:
words = [id_to_word[i] for i in range(len(id_to_word))]

In [None]:
from torch.utils.tensorboard import SummaryWriter

# Initialize the writer
writer = SummaryWriter('runs/cbow_embeddings')

# Add embedding to the writer
writer.add_embedding(word_embeddings, metadata=words)

# Close the writer
writer.close()