In [1]:
from tqdm import tqdm

# bag of wordsの辞書を作成
def create_BoW_dict(file_path):
    BoW_dict = {}
    i = 0
    with open(file_path, "r", encoding="utf-8")as f:
        data_list = f.readlines()
        for data in data_list:
            data = data.split('\t')
            word_list = data[0].split()
            for word in word_list:
                if word in BoW_dict:
                    continue
                else:
                    BoW_dict[word] = i
                    i += 1
    return BoW_dict

# t=0, e=1, m=2, b=3としてテストデータのベクトルとラベルをそれぞれリストとして作成
def create_vector_and_label(file_path, BoW_dict):
    with open(file_path, "r", encoding="utf-8")as f:
        data_list = f.readlines()
        vector_list = []
        label_list = []
        for data in tqdm(data_list):
            data = data.split("\t")
            word_list = data[0].split()
            vector = [0] * len(BoW_dict)
            for word in word_list:
                if word in BoW_dict:
                    vector[BoW_dict[word]] += 1
            vector_list.append(vector)
            label_alph = data[1].strip("\n")
            if label_alph == "t":
                label = [1, 0, 0, 0]
            elif label_alph == "e":
                label = [0, 1, 0, 0]
            elif label_alph == "m":
                label = [0, 0, 1, 0]
            elif label_alph == "b":
                label = [0, 0, 0, 1]
            else:
                print("error label")
            label_list.append(label)
    return vector_list, label_list

In [2]:
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

TRAIN_FEATURE_FILE_PATH = "./news+aggregator/train.feature.txt"

# bag of wordsの辞書型を作成
BoW_dict = create_BoW_dict(TRAIN_FEATURE_FILE_PATH)

# test.txtからベクトルリストとラベルリストを作成
x_train_list, y_train_list = create_vector_and_label(TRAIN_FEATURE_FILE_PATH, BoW_dict)

# データの準備
x_train = torch.Tensor(x_train_list)
y_train = torch.Tensor(y_train_list)

# 訓練データのデータローダー
train = TensorDataset(x_train, y_train)
train_loader = DataLoader(train, batch_size=10, shuffle=True)

100%|██████████| 10671/10671 [00:01<00:00, 9188.47it/s]


In [3]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.l1 = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        x = self.l1(x)
        return x

input_dim = len(x_train_list[0])
print(input_dim)
output_dim = 4
model1 = LogisticRegression(input_dim, output_dim)
model2 = LogisticRegression(input_dim, output_dim)

18036


In [4]:
lr_late = 0.001
weight_decay_1 = 0.01
weight_decay_2 = 1
criterion = torch.nn.CrossEntropyLoss()

#weight_decayではL2正則化が行われる
optimizer_1 = torch.optim.SGD(model1.parameters(), lr = lr_late, weight_decay=weight_decay_1)
optimizer_2 = torch.optim.SGD(model2.parameters(), lr = lr_late, weight_decay=weight_decay_2)

In [5]:
# L2ノルムの係数が0.01の場合
loss_history = []
for epoch in tqdm(range(500)):
    total_loss = 0
    for x, y in train_loader:

        # 学習ステップ
        optimizer_1.zero_grad()
        outputs = model1(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer_1.step()

        total_loss += loss.item()

    loss_history.append(total_loss)
    if (epoch +1) % 100 == 0:
        print(epoch + 1, total_loss)

# L2ノルムの係数が1の場合
loss_history = []
for epoch in tqdm(range(500)):
    total_loss = 0
    for x, y in train_loader:

        # 学習ステップ
        optimizer_2.zero_grad()
        outputs = model2(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer_2.step()

        total_loss += loss.item()

    loss_history.append(total_loss)
    if (epoch +1) % 100 == 0:
        print(epoch + 1, total_loss)

 20%|██        | 100/500 [00:33<02:06,  3.17it/s]

100 857.9507199525833


 40%|████      | 200/500 [01:06<01:39,  3.02it/s]

200 821.4074583053589


 60%|██████    | 300/500 [01:38<01:05,  3.06it/s]

300 811.690823495388


 80%|████████  | 400/500 [02:11<00:32,  3.09it/s]

400 809.0689907819033


100%|██████████| 500/500 [02:47<00:00,  2.99it/s]


500 808.831681728363


 20%|██        | 100/500 [00:36<02:37,  2.55it/s]

100 1369.6489729881287


 40%|████      | 200/500 [01:19<02:13,  2.24it/s]

200 1370.0429347753525


 60%|██████    | 300/500 [02:03<01:30,  2.20it/s]

300 1370.1339852809906


 80%|████████  | 400/500 [02:45<00:40,  2.49it/s]

400 1369.8560975790024


100%|██████████| 500/500 [03:25<00:00,  2.43it/s]

500 1369.8315967321396





In [7]:
correct = 0
total = 0

TEST_FEATURE_FILE_PATH = "./news+aggregator/test.feature.txt"
VALID_FEATURE_FILE_PATH = "./news+aggregator/valid.feature.txt"

# test.txtからベクトルリストとラベルリストを作成
x_test_list, y_test_list = create_vector_and_label(TEST_FEATURE_FILE_PATH, BoW_dict)
# データの準備
x_test = torch.Tensor(x_test_list)
y_test = torch.Tensor(y_test_list)
# テストデータのデータローダー
test = TensorDataset(x_test, y_test)
test_loader = DataLoader(test, batch_size=5)

# test.txtからベクトルリストとラベルリストを作成
x_valid_list, y_valid_list = create_vector_and_label(VALID_FEATURE_FILE_PATH, BoW_dict)
# データの準備
x_valid = torch.Tensor(x_valid_list)
y_valid = torch.Tensor(y_valid_list)
# テストデータのデータローダー
valid = TensorDataset(x_valid, y_valid)
valid_loader = DataLoader(valid, batch_size=5)

# model1の学習データでの正解率計算
correct = 0
total = 0
for x, y in train_loader:
    outputs = model1(x)
    _, predicted = torch.max(outputs.data, 0)   #確率の最大値を取得
    total += y.size(0)  #batch処理をしているのでy.size == 5
    correct += (predicted == y).sum().item()    #ラベルと一致している個数を計算．　item()はtensor->intへの変換
print('model1,学習データ,正解率', int(correct)/total*100)

# model2の学習データでの正解率計算
correct = 0
total = 0
for x, y in train_loader:
    outputs = model2(x)
    _, predicted = torch.max(outputs.data, 0)   #確率の最大値を取得
    total += y.size(0)  #batch処理をしているのでy.size == 5
    correct += (predicted == y).sum().item()    #ラベルと一致している個数を計算．　item()はtensor->intへの変換
print('model2,学習データ,正解率', int(correct)/total*100)

# model1の検証データでの正解率計算
correct = 0
total = 0
for x, y in valid_loader:
    outputs = model1(x)
    _, predicted = torch.max(outputs.data, 0)   #確率の最大値を取得
    total += y.size(0)  #batch処理をしているのでy.size == 5
    correct += (predicted == y).sum().item()    #ラベルと一致している個数を計算．　item()はtensor->intへの変換
print('model1,検証データ,正解率', int(correct)/total*100)

# model2の検証データでの正解率計算
correct = 0
total = 0
for x, y in valid_loader:
    outputs = model2(x)
    _, predicted = torch.max(outputs.data, 0)   #確率の最大値を取得
    total += y.size(0)  #batch処理をしているのでy.size == 5
    correct += (predicted == y).sum().item()    #ラベルと一致している個数を計算．　item()はtensor->intへの変換
print('model2,検証データ,正解率', int(correct)/total*100)

# model1のテストデータでの正解率計算
correct = 0
total = 0
for x, y in test_loader:
    outputs = model1(x)
    _, predicted = torch.max(outputs.data, 0)   #確率の最大値を取得
    total += y.size(0)  #batch処理をしているのでy.size == 5
    correct += (predicted == y).sum().item()    #ラベルと一致している個数を計算．　item()はtensor->intへの変換
print('model 1,テストデータ,正解率', int(correct)/total*100)

# model2のテストデータでの正解率計算
correct = 0
total = 0
for x, y in test_loader:
    outputs = model2(x)
    _, predicted = torch.max(outputs.data, 0)   #確率の最大値を取得
    total += y.size(0)  #batch処理をしているのでy.size == 5
    correct += (predicted == y).sum().item()    #ラベルと一致している個数を計算．　item()はtensor->intへの変換
print('model 2,テストデータ,正解率', int(correct)/total*100)

100%|██████████| 1335/1335 [00:00<00:00, 10888.98it/s]
100%|██████████| 1333/1333 [00:00<00:00, 14797.29it/s]


model1,学習データ,正解率 39.64951738356292
model2,学習データ,正解率 38.74051166713522
model1,検証データ,正解率 80.3450862715679
model2,検証データ,正解率 79.74493623405851
model 1,テストデータ,正解率 81.94756554307116
model 2,テストデータ,正解率 81.64794007490637
