<a href="https://colab.research.google.com/github/no-akatsu/training/blob/main/240910_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 施設データのEmbedding

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [None]:
# ランダムデータ生成のためのパラメータ
num_facilities = 100 # 施設の数
embedding_dim = 8 # Embeddingの次元数
num_samples = 1000 # サンプルの数
num_categories = 10 # カテゴリの種類

In [None]:
# 施設属性のランダム生成（種類、場所、規模など）
facility_attributes = np.random.randint(0, num_categories, size=(num_facilities, 3)) # 3つのカテゴリ属性

# ペアのラベル（良い=1, 悪い=0）のランダム生成
pair_labels = np.random.randint(0, 2, size=(num_samples,))

# ランダムに施設ペアを作成
facility_pairs = np.random.randint(0, num_facilities, size=(num_samples, 2))

# PyTorchようにデータをTensorに変換
facility_pairs_tensor = torch.tensor(facility_pairs)
pair_labels_tensor = torch.tensor(pair_labels, dtype=torch.float32)

In [None]:
facility_pairs_tensor

tensor([[88, 73],
        [69, 26],
        [62, 21],
        ...,
        [ 1, 55],
        [47, 53],
        [ 7, 18]])

In [None]:
pair_labels_tensor

tensor([1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1.,
        1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1.,
        1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
        1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1.,
        0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1.,
        0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0.,
        0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1.,
        0., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0.,
        0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
        0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1.,
        0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 1., 

In [None]:
# Embeddingとペア評価のモデル
class FacilityPairModel(nn.Module):
    def __init__(self, num_facilities, embedding_dim):
        super(FacilityPairModel, self).__init__()
        self.embedding = nn.Embedding(num_facilities, embedding_dim)
        self.fc = nn.Linear(embedding_dim * 2, 1)

    def forward(self, facility_A, facility_B):
        embed_A = self.embedding(facility_A)
        embed_B = self.embedding(facility_B)
        combined = torch.cat((embed_A, embed_B), dim=-1)
        output = torch.sigmoid(self.fc(combined))
        return output

In [None]:
# モデルのインスタンス化
model = FacilityPairModel(num_facilities, embedding_dim)

In [None]:
# 損失関数とオプティマイザ
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# トレーニングループ
num_epochs = 100
for epoch in range(num_epochs):
    total_loss = 0
    optimizer.zero_grad()

    # 各サンプルに対してペアの評価を行う
    for i in range(num_samples):
        # facilityIdに当てはまる属性データを持ってくる
        facility_A = facility_pairs_tensor[i, 0]
        facility_B = facility_pairs_tensor[i, 1]
        label = pair_labels_tensor[i]

        # forward
        output = model(facility_A, facility_B)
        # ラベルと出力の比較
        loss = criterion(output, label.unsqueeze(0)) # .unsqueeze(0)で次元数を1つ増やす

        # backwardとパラメータ更新
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/num_samples:.4f}')

Epoch [10/100], Loss: 1.0743
Epoch [20/100], Loss: 0.8521
Epoch [30/100], Loss: 1.3101
Epoch [40/100], Loss: 0.7938
Epoch [50/100], Loss: 1.0251
Epoch [60/100], Loss: 0.8323
Epoch [70/100], Loss: 1.0933
Epoch [80/100], Loss: 0.8194
Epoch [90/100], Loss: 1.0845
Epoch [100/100], Loss: 0.7348


In [None]:
for name, param in model.named_parameters():
    print('name: ', name)
    print('param: ', param)

name:  embedding.weight
param:  Parameter containing:
tensor([[-1.0351e+00,  4.0616e-01,  5.7616e-01, -3.7503e-01, -2.7113e-01,
          1.2983e+00,  1.2164e+00, -1.2338e+00],
        [ 5.7113e-01, -1.0568e+00, -9.5879e-01,  3.6652e-01,  5.7391e-01,
         -6.0868e-01,  4.4244e-01,  4.0115e+00],
        [-2.1642e-02,  2.6350e-02,  7.3855e-02,  4.5061e-01,  1.5336e-01,
          4.6575e-01,  8.8991e-01,  1.0094e+00],
        [-4.6441e-02, -4.0365e-01, -1.4407e+00,  5.0979e-01,  3.2106e-01,
          1.1514e+00,  6.6025e-01,  2.1105e+00],
        [ 7.5675e-01, -6.8885e-01, -1.1736e+00, -1.4521e-01, -3.7170e-01,
          3.6361e-01,  8.1408e-01,  2.5311e+00],
        [-1.6786e-01, -6.7407e-02, -4.0116e-01, -1.4668e+00, -9.8128e-01,
         -9.0988e-01, -3.4569e-01,  2.0675e+00],
        [ 2.4688e-01, -1.4722e-01, -2.9687e-01, -8.8416e-01,  2.7323e-01,
          7.8787e-01,  4.9587e-01, -3.9499e-01],
        [-1.0756e+00,  1.0373e-01, -1.7783e-01, -1.0076e+00, -1.1888e-01,
          1

# Embedding2 null処理

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [3]:
num_facilities = 1000
num_categories_1 = 10
num_categories_2 = 15
num_continuous_features = 5

np.random.seed(42)
category_1_data = np.random.choice([np.nan] + list(range(num_categories_1)), num_facilities, p=[0.1] + [0.9 / num_categories_1] * num_categories_1)
category_2_data = np.random.choice([np.nan] + list(range(num_categories_2)), num_facilities, p=[0.1] + [0.9 / num_categories_2] * num_categories_2)
continuous_data = np.random.rand(num_facilities, num_continuous_features)

df = pd.DataFrame({
    'category_1': category_1_data,
    'category_2': category_2_data,
    'continuous_features': list(continuous_data)
})

# Null値を各カテゴリの数で置換
df['category_1'] = df['category_1'].fillna(num_categories_1)
df['category_2'] = df['category_2'].fillna(num_categories_2)

category_1_tensor = torch.LongTensor(df['category_1'].values)
category_2_tensor = torch.LongTensor(df['category_2'].values)
continuous_tensor = torch.FloatTensor(np.stack(df['continuous_features'].values))

In [6]:
class FacilityEmbeddingModel(nn.Module):
    def __init__(self, num_categories_1, num_categories_2, embedding_dim, num_continuous_features):
        super().__init__()

        # Null値のインデックス用に+1
        self.category_1_embedding = nn.Embedding(num_categories_1 + 1, embedding_dim)
        self.category_2_embedding = nn.Embedding(num_categories_2 + 1, embedding_dim)
        self.fc_continuous = nn.Linear(num_continuous_features, embedding_dim)
        self.fc_output = nn.Linear(embedding_dim * 3, 1)

    def forward(self, category_1, category_2, continuous_features):
        category_1_embed = self.category_1_embedding(category_1)
        category_2_embed = self.category_2_embedding(category_2)
        continuous_embed = self.fc_continuous(continuous_features)

        x = torch.cat([category_1_embed, category_2_embed, continuous_embed], dim=1)
        output = self.fc_output(x)
        return output

In [8]:
embedding_dim = 50
model = FacilityEmbeddingModel(num_categories_1, num_categories_2, embedding_dim, num_continuous_features)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

labels = np.random.rand(num_facilities, 1)
labels_tensor = torch.FloatTensor(labels)

In [10]:
num_epochs = 100
for epoch in range(num_epochs):
    model.train()

    predictions = model(category_1_tensor, category_2_tensor, continuous_tensor)

    loss = nn.MSELoss()(predictions, labels_tensor)

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1} / {num_epochs}], Loss: {loss.item():.4f}')

Epoch [10 / 100], Loss: 0.1115
Epoch [20 / 100], Loss: 0.0908
Epoch [30 / 100], Loss: 0.0826
Epoch [40 / 100], Loss: 0.0815
Epoch [50 / 100], Loss: 0.0800
Epoch [60 / 100], Loss: 0.0795
Epoch [70 / 100], Loss: 0.0792
Epoch [80 / 100], Loss: 0.0790
Epoch [90 / 100], Loss: 0.0789
Epoch [100 / 100], Loss: 0.0788
