In [23]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

## データ読み込み

coraのデータを読み込む

In [2]:
df_cora_edges = pd.read_csv(
    "data/cora/cora.cites", sep="\t", names=["source", "target"]
)
df_cora_edges.head(2)

Unnamed: 0,source,target
0,35,1033
1,35,103482


In [3]:
names = ["node"] + [f"word{i}" for i in range(1433)] + ["label"]
df_cora_contexts = pd.read_csv("data/cora/cora.content", sep="\t", names=names)
df_cora_contexts.head(2)

Unnamed: 0,node,word0,word1,word2,word3,word4,word5,word6,word7,word8,...,word1424,word1425,word1426,word1427,word1428,word1429,word1430,word1431,word1432,label
0,31336,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Neural_Networks
1,1061127,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Rule_Learning


## 前処理

In [4]:
# ノードの連番を作成
df_nodes = pd.concat(
    [
        df_cora_contexts["node"],
        pd.Series(np.arange(len(df_cora_contexts)), name="node_index"),
    ],
    axis="columns",
)
df_nodes

Unnamed: 0,node,node_index
0,31336,0
1,1061127,1
2,1106406,2
3,13195,3
4,37879,4
...,...,...
2703,1128975,2703
2704,1128977,2704
2705,1128978,2705
2706,117328,2706


In [5]:
# labelをエンコード
df_labels = pd.concat(
    [
        pd.Series(df_cora_contexts["label"].unique(), name="label"),
        pd.Series(np.arange(df_cora_contexts["label"].nunique()), name="label_index"),
    ],
    axis="columns",
)
df_labels

Unnamed: 0,label,label_index
0,Neural_Networks,0
1,Rule_Learning,1
2,Reinforcement_Learning,2
3,Probabilistic_Methods,3
4,Theory,4
5,Genetic_Algorithms,5
6,Case_Based,6


In [6]:
# エッジをノードの連番で置き換え
df_edges_renamed = (
    df_cora_edges.merge(df_nodes, how="left", left_on="source", right_on="node")
    .drop(["node", "source"], axis="columns")
    .rename(columns={"node_index": "source_node_index"})
    .merge(df_nodes, how="left", left_on="target", right_on="node")
    .drop(["node", "target"], axis="columns")
    .rename(columns={"node_index": "target_node_index"})
)

df_edges_renamed

Unnamed: 0,source_node_index,target_node_index
0,163,402
1,163,659
2,163,1696
3,163,2295
4,163,1274
...,...,...
5424,1886,745
5425,1886,1902
5426,1887,2258
5427,1902,1887


In [7]:
# ラベルを連番で置き換え
df_labels_renamed = df_cora_contexts[["node", "label"]].merge(
    df_labels, how="left", left_on="label", right_on="label"
)
df_labels_renamed

Unnamed: 0,node,label,label_index
0,31336,Neural_Networks,0
1,1061127,Rule_Learning,1
2,1106406,Reinforcement_Learning,2
3,13195,Reinforcement_Learning,2
4,37879,Probabilistic_Methods,3
...,...,...,...
2703,1128975,Genetic_Algorithms,5
2704,1128977,Genetic_Algorithms,5
2705,1128978,Genetic_Algorithms,5
2706,117328,Case_Based,6


In [8]:
# torch_geometric.data.Dataに必要な形式に変更
x = df_cora_contexts.drop(["node", "label"], axis="columns").values
edge_index = df_edges_renamed.values.T
y = df_labels_renamed["label_index"].values

print("shape x:", x.shape)
print("shape y:", y.shape)
print("shape edge_list:", edge_index.shape)

shape x: (2708, 1433)
shape y: (2708,)
shape edge_list: (2, 5429)


In [9]:
# メモリ解放
del df_cora_edges, df_cora_contexts, df_labels_renamed, df_edges_renamed

## モデルの定義

In [10]:
class Net(torch.nn.Module):
    def __init__(
        self,
        feature_size: int,
        output_size: int,
        hidden_size: int,
        dropout_ratio: float = 0.5,
    ):
        super().__init__()

        self.conv1 = GCNConv(feature_size, hidden_size)
        self.conv2 = GCNConv(hidden_size, output_size)

        self.dropout_ratio = dropout_ratio

    def forward(self, x: torch.tensor, edge_index: torch.tensor) -> torch.tensor:
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout_ratio, training=self.training)
        x = self.conv2(x, edge_index)
        return F.softmax(x, dim=1)


model = Net(x.shape[1], len(df_labels), 16)
print(model)

Net(
  (conv1): GCNConv(1433, 16)
  (conv2): GCNConv(16, 7)
)


## 学習

In [11]:
def train(
    model: Net,
    data: Data,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.CrossEntropyLoss,
) -> float:
    """学習"""
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    accuracy = (
        torch.argmax(out[data.train_mask], dim=1) == data.y[data.train_mask]
    ).sum() / data.train_mask.sum()
    return loss.item(), accuracy


def evaluate(
    model: Net, data: Data, criterion: torch.nn.CrossEntropyLoss
) -> (float, float):
    """評価"""
    model.eval()
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.test_mask], data.y[data.test_mask])
    accuracy = (
        torch.argmax(out[data.test_mask], dim=1) == data.y[data.test_mask]
    ).sum() / data.test_mask.sum()
    return loss.item(), accuracy

In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

oof = np.zeros(len(x), dtype=int)

for fold, (train_index, test_index) in enumerate(cv.split(x, y)):
    print(f"------------ start fold {fold + 1} ------------")
    torch.manual_seed(42)

    # setup data
    data = Data(
        x=torch.tensor(x, dtype=torch.float32),
        edge_index=torch.tensor(edge_index, dtype=torch.long),
        y=torch.tensor(y, dtype=torch.long),
    )
    data.train_mask = torch.tensor(
        [index in train_index for index in range(len(x))], dtype=torch.bool
    )
    data.test_mask = torch.tensor(
        [index in test_index for index in range(len(x))], dtype=torch.bool
    )
    data.to(device)

    # setup model
    model = Net(x.shape[1], len(df_labels), 16).to(device)
    # setup optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    # setup loss
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(1, 101, 1):
        train_loss, train_accuracy = train(model, data, optimizer, criterion)
        test_loss, test_accuracy = evaluate(model, data, criterion)
        if epoch % 10 == 0:
            print(
                f"epoch: {epoch}, train loss: {train_loss: .4f}, train accuracy: {train_accuracy: .4f}, test loss: {test_loss: .4f}, test accuracy: {test_accuracy: .4f}"
            )

    # predict
    oof[test_index] = torch.argmax(
        model(data.x, data.edge_index)[data.test_mask], dim=1
    )

    # save model
    torch.save(model.to("cpu").state_dict(), f"./output/model_{fold}.pth")

------------ start fold 1 ------------
epoch: 10, train loss:  1.6243, train accuracy:  0.6570, test loss:  1.5812, test accuracy:  0.7768
epoch: 20, train loss:  1.4617, train accuracy:  0.7862, test loss:  1.3879, test accuracy:  0.8727
epoch: 30, train loss:  1.3791, train accuracy:  0.8310, test loss:  1.3223, test accuracy:  0.8801
epoch: 40, train loss:  1.3418, train accuracy:  0.8578, test loss:  1.3088, test accuracy:  0.8875
epoch: 50, train loss:  1.3240, train accuracy:  0.8684, test loss:  1.3031, test accuracy:  0.8875
epoch: 60, train loss:  1.3257, train accuracy:  0.8712, test loss:  1.3001, test accuracy:  0.8893
epoch: 70, train loss:  1.3127, train accuracy:  0.8864, test loss:  1.2968, test accuracy:  0.8930
epoch: 80, train loss:  1.3129, train accuracy:  0.8883, test loss:  1.2954, test accuracy:  0.8893
epoch: 90, train loss:  1.3059, train accuracy:  0.8929, test loss:  1.2956, test accuracy:  0.8875
epoch: 100, train loss:  1.3077, train accuracy:  0.8906, tes

In [13]:
print(classification_report(y, oof, target_names=df_labels["label"].values))

                        precision    recall  f1-score   support

       Neural_Networks       0.87      0.90      0.89       818
         Rule_Learning       0.88      0.81      0.84       180
Reinforcement_Learning       0.87      0.82      0.84       217
 Probabilistic_Methods       0.90      0.88      0.89       426
                Theory       0.76      0.75      0.76       351
    Genetic_Algorithms       0.91      0.95      0.93       418
            Case_Based       0.84      0.85      0.84       298

              accuracy                           0.87      2708
             macro avg       0.86      0.85      0.86      2708
          weighted avg       0.87      0.87      0.87      2708



In [16]:
cm = pd.DataFrame(
    confusion_matrix(y, oof),
    index=df_labels["label"].values,
    columns=df_labels["label"].values,
)
cm

Unnamed: 0,Neural_Networks,Rule_Learning,Reinforcement_Learning,Probabilistic_Methods,Theory,Genetic_Algorithms,Case_Based
Neural_Networks,736,0,9,22,26,11,14
Rule_Learning,3,145,0,0,21,1,10
Reinforcement_Learning,13,1,178,2,6,14,3
Probabilistic_Methods,31,0,4,374,12,0,5
Theory,39,10,9,14,264,5,10
Genetic_Algorithms,10,0,4,1,1,397,5
Case_Based,10,9,1,2,17,7,252


## LGBMの精度と比べてみる

In [31]:
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

oof = np.zeros(len(x), dtype=int)

for fold, (train_index, test_index) in enumerate(cv.split(x, y)):
    print(f"------------ start fold {fold + 1} ------------")

    x_train = x[train_index]
    y_train = y[train_index]
    x_test = x[test_index]
    y_test = y[test_index]

    params = {
        "objective": "multiclass",
        "num_class": len(df_labels),
        "seed": 42,
        "verbose": 1,
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(x_train, y_train)
    oof[test_index] = model.predict(x_test)

------------ start fold 1 ------------
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 984
[LightGBM] [Info] Number of data points in the train set: 2166, number of used features: 492
[LightGBM] [Info] Start training from score -1.197530
[LightGBM] [Info] Start training from score -2.710824
[LightGBM] [Info] Start training from score -2.521582
[LightGBM] [Info] Start training from score -1.851692
[LightGBM] [Info] Start training from score -2.042283
[LightGBM] [Info] Start training from score -1.866507
[LightGBM] [Info] Start training from score -2.208367
------------ start fold 2 ------------
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 976
[LightGBM] [Info] Number of data points in the train set: 2166, number of used features: 488
[LightGBM] [Info] Start training from score -1.197530
[LightGBM] [Info] Start training from score -2.710824
[LightGBM] [Info] Start training from score -2.521582
[LightGBM] [In

In [32]:
print(classification_report(y, oof, target_names=df_labels["label"].values))

                        precision    recall  f1-score   support

       Neural_Networks       0.75      0.83      0.79       818
         Rule_Learning       0.74      0.58      0.65       180
Reinforcement_Learning       0.81      0.65      0.72       217
 Probabilistic_Methods       0.76      0.73      0.75       426
                Theory       0.64      0.68      0.66       351
    Genetic_Algorithms       0.86      0.84      0.85       418
            Case_Based       0.74      0.72      0.73       298

              accuracy                           0.75      2708
             macro avg       0.76      0.72      0.73      2708
          weighted avg       0.76      0.75      0.75      2708



In [33]:
cm = pd.DataFrame(
    confusion_matrix(y, oof),
    index=df_labels["label"].values,
    columns=df_labels["label"].values,
)
cm

Unnamed: 0,Neural_Networks,Rule_Learning,Reinforcement_Learning,Probabilistic_Methods,Theory,Genetic_Algorithms,Case_Based
Neural_Networks,682,5,9,51,42,17,12
Rule_Learning,20,104,3,4,22,6,21
Reinforcement_Learning,28,4,141,8,15,12,9
Probabilistic_Methods,74,2,1,311,18,6,14
Theory,53,14,9,16,239,7,13
Genetic_Algorithms,33,4,5,9,9,351,7
Case_Based,23,7,6,9,29,9,215
