In [27]:
# 53のコードの最後を変更して，正解率を求めている

from tqdm import tqdm

# bag of wordsの辞書を作成
def create_BoW_dict(file_path):
    BoW_dict = {}
    i = 0
    with open(file_path, "r", encoding="utf-8")as f:
        data_list = f.readlines()
        for data in data_list:
            data = data.split('\t')
            word_list = data[0].split()
            for word in word_list:
                if word in BoW_dict:
                    continue
                else:
                    BoW_dict[word] = i
                    i += 1
    return BoW_dict

# t=0, e=1, m=2, b=3としてテストデータのベクトルとラベルをそれぞれリストとして作成
def create_vector_and_label(file_path, BoW_dict):
    with open(file_path, "r", encoding="utf-8")as f:
        data_list = f.readlines()
        vector_list = []
        label_list = []
        for data in tqdm(data_list):
            data = data.split("\t")
            word_list = data[0].split()
            vector = [0] * len(BoW_dict)
            for word in word_list:
                if word in BoW_dict:
                    vector[BoW_dict[word]] += 1
            vector_list.append(vector)
            label_alph = data[1].strip("\n")
            if label_alph == "t":
                label = [1, 0, 0, 0]
            elif label_alph == "e":
                label = [0, 1, 0, 0]
            elif label_alph == "m":
                label = [0, 0, 1, 0]
            elif label_alph == "b":
                label = [0, 0, 0, 1]
            else:
                print("error label")
            label_list.append(label)
    return vector_list, label_list


TRAIN_FEATURE_FILE_PATH = "./news+aggregator/train.feature.txt"

# bag of wordsの辞書型を作成
BoW_dict = create_BoW_dict(TRAIN_FEATURE_FILE_PATH)

In [28]:
import torch

# モデルのクラスが定義されていないとloadできないため
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.l1 = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        x = self.l1(x)
        return x

# 52で学習したモデルのロード
model = torch.load('model_weight.pth')

In [29]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import numpy as np

correct = 0
total = 0

TEST_FEATURE_FILE_PATH = "./news+aggregator/test.feature.txt"

# test.txtからベクトルリストとラベルリストを作成
x_test_list, y_test_list = create_vector_and_label(TEST_FEATURE_FILE_PATH, BoW_dict)

# データの準備
x_test = torch.Tensor(x_test_list)
y_test = torch.Tensor(y_test_list).long()

# 訓練データのデータローダー
test = TensorDataset(x_test, y_test)
test_loader = DataLoader(test, batch_size=5)

test_matrix = np.zeros((4,4))

for x, y in test_loader:
    outputs = model(x)
    _, predicted = torch.max(outputs.data, 1)   #確率の最大値を取得
    total += y.size(0)  #batch処理をしているのでy.size == 5
    _, label = torch.max(y, 1)
    for l,p in zip(label, predicted):
        test_matrix[l][p] += 1

print(f"混合行列:\n {test_matrix}")
print(f"全要素数:{int(np.sum(test_matrix))}")   #testデータ(1334個)と一致しているかの確認用

#     実測値
# 予
# 測
# 値

100%|██████████| 1334/1334 [00:00<00:00, 11734.48it/s]


混合行列:
 [[103.  11.   1.  39.]
 [  6. 518.   0.  13.]
 [  4.  13.  49.  14.]
 [ 12.  15.   1. 535.]]
全要素数:1334


In [31]:
# 56の実装開始
import pandas as pd
# 適合率を求める関数
def get_precision(index: int, matrix):
    """
    index: ラベルのインデックス．
    t->0
    e->1
    m->2
    b->3

    matrix:混合行列
    """
    total = np.sum(matrix, axis=1)
    TP_FP= total[index]
    TP = matrix[index][index]
    precision = TP/TP_FP
    return precision

# 再現率を求める
def get_recall(index: int, matrix):
    """
    index: ラベルのインデックス．
    t->0
    e->1
    m->2
    b->3

    matrix:混合行列
    """
    total = np.sum(matrix, axis=0)
    TP_FN= total[index]
    TP = matrix[index][index]
    recall = TP/TP_FN
    return recall

# F1値を求める
def get_f1(index: int, matrix):
    """
    index: ラベルのインデックス．
    t->0
    e->1
    m->2
    b->3

    matrix:混合行列
    """
    P = get_precision(index, matrix)
    R = get_recall(index, matrix)
    numerator = 2 * P * R #分子
    denominator = P + R #分母
    return numerator/denominator

def get_macro_ave(column: str):
    """
    column: 次のどれか ["適合率", "再現率", "F1値"]
    """
    column_series = df.loc["t":"b", column]
    macro_ave = column_series.sum()/4
    return macro_ave

# マイクロ平均 (適合率も再現率もF1値も同じになる)
def get_mairo_ave(matrix):
    total = np.sum(matrix)
    TP = 0
    for i in range(0,4):
        TP += test_matrix[i][i]
    return TP/total

df = pd.DataFrame(columns = ["適合率", "再現率", "F1値"], index=["t", "e", "m", "b", "macro-average", "micro-average"])
df.loc["t", "適合率"] = get_precision(0, test_matrix)
df.loc["e", "適合率"] = get_precision(1, test_matrix)
df.loc["m", "適合率"] = get_precision(2, test_matrix)
df.loc["b", "適合率"] = get_precision(3, test_matrix)
df.loc["t", "再現率"] = get_recall(0, test_matrix)
df.loc["e", "再現率"] = get_recall(1, test_matrix)
df.loc["m", "再現率"] = get_recall(2, test_matrix)
df.loc["b", "再現率"] = get_recall(3, test_matrix)
df.loc["t", "F1値"] = get_f1(0, test_matrix)
df.loc["e", "F1値"] = get_f1(1, test_matrix)
df.loc["m", "F1値"] = get_f1(2, test_matrix)
df.loc["b", "F1値"] = get_f1(3, test_matrix)
df.loc["macro-average", "適合率"] = get_macro_ave("適合率")
df.loc["macro-average", "再現率"] = get_macro_ave("再現率")
df.loc["macro-average", "F1値"] = get_macro_ave("F1値")
df.loc["micro-average", "適合率"] = get_mairo_ave(test_matrix)
df.loc["micro-average", "再現率"] = get_mairo_ave(test_matrix)
df.loc["micro-average", "F1値"] = get_mairo_ave(test_matrix)
df

Unnamed: 0,適合率,再現率,F1値
t,0.668831,0.824,0.738351
e,0.964618,0.929982,0.946984
m,0.6125,0.960784,0.748092
b,0.950266,0.890183,0.919244
macro-average,0.799054,0.901237,0.838168
micro-average,0.903298,0.903298,0.903298
