In [17]:
from tqdm import tqdm

# Build the bag-of-words vocabulary (word -> column index).
def create_BoW_dict(file_path):
    """Assign an integer index to every distinct word found in a feature file.

    Each line of the file is split on tab characters; only the first field is
    used, and it is split on whitespace into words. Words receive consecutive
    indices starting at 0, in order of first appearance.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        dict mapping each word to its index.
    """
    vocabulary = {}
    with open(file_path, "r", encoding="utf-8") as handle:
        for line in handle:
            text_field = line.split('\t')[0]
            for token in text_field.split():
                # A known word keeps its index; a new word gets the next free
                # index, which always equals the current vocabulary size.
                vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

# Build bag-of-words count vectors and one-hot labels (t=0, e=1, m=2, b=3).
def create_vector_and_label(file_path, BoW_dict):
    """Convert a "text<TAB>label" file into count vectors and one-hot labels.

    For every line, the first tab-separated field is split on whitespace and
    the occurrences of each word known to ``BoW_dict`` are counted; words not
    in ``BoW_dict`` are ignored. The second field is the class letter, encoded
    as a one-hot list in the order t, e, m, b.

    Blank lines are skipped. A line with a missing or unknown label prints
    "error label" and is skipped as a whole, so ``vector_list`` and
    ``label_list`` always have the same length and stay aligned row by row.

    Args:
        file_path: Path of the UTF-8 text file to read.
        BoW_dict: dict mapping word -> column index (see create_BoW_dict).

    Returns:
        (vector_list, label_list): a list of count vectors, each of length
        ``len(BoW_dict)``, and a list of 4-element one-hot label lists.
    """
    label_to_index = {"t": 0, "e": 1, "m": 2, "b": 3}
    num_classes = len(label_to_index)
    vocab_size = len(BoW_dict)

    with open(file_path, "r", encoding="utf-8") as f:
        # Read everything up front so tqdm knows the total number of lines.
        data_list = f.readlines()

    vector_list = []
    label_list = []
    for data in tqdm(data_list):
        if not data.strip():
            continue  # blank line: nothing to encode

        fields = data.split("\t")
        # strip() also removes "\r" and stray spaces, not only "\n".
        label_alph = fields[1].strip() if len(fields) > 1 else None
        class_index = label_to_index.get(label_alph)
        if class_index is None:
            # Validate the label BEFORE touching either list: appending the
            # vector and then reusing a stale label would misalign the data.
            print("error label")
            continue

        vector = [0] * vocab_size
        for word in fields[0].split():
            column = BoW_dict.get(word)
            if column is not None:
                vector[column] += 1

        vector_list.append(vector)
        label_list.append([int(i == class_index) for i in range(num_classes)])
    return vector_list, label_list

In [18]:
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

TRAIN_FEATURE_FILE_PATH = "./news+aggregator/train.feature.txt"

# Build the bag-of-words vocabulary (word -> column index) from the training file.
BoW_dict = create_BoW_dict(TRAIN_FEATURE_FILE_PATH)

# Build the count vectors and one-hot labels from the training file.
x_train_list, y_train_list = create_vector_and_label(TRAIN_FEATURE_FILE_PATH, BoW_dict)

# Convert the nested Python lists into tensors.
x_train = torch.Tensor(x_train_list)
y_train = torch.Tensor(y_train_list)

# DataLoader for the training data: mini-batches of 10, shuffling enabled.
train = TensorDataset(x_train, y_train)
train_loader = DataLoader(train, batch_size=10, shuffle=True)

100%|██████████| 10671/10671 [00:01<00:00, 8840.46it/s]


In [19]:
class LogisticRegression(torch.nn.Module):
    """Multinomial logistic regression expressed as a single linear layer.

    ``forward`` returns the raw output of the linear layer; no softmax is
    applied inside the module.
    """

    def __init__(self, input_dim, output_dim):
        """Create the linear layer mapping ``input_dim`` features to ``output_dim`` scores."""
        super().__init__()
        # The attribute stays named "l1" so the state_dict keys are unchanged.
        self.l1 = torch.nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the linear layer's output for the batch ``x``."""
        return self.l1(x)

# Input width = length of one bag-of-words vector.
# Note: indexing [0] raises IndexError if the training list is empty.
input_dim = len(x_train_list[0])
print(input_dim)
# One output per class; labels are 4-element one-hot lists (t, e, m, b).
output_dim = 4
model = LogisticRegression(input_dim, output_dim)

18036


In [20]:
# Learning rate passed to SGD below.
# NOTE(review): "lr_late" looks like a typo for "lr_rate"; the name is kept
# because it is a notebook-level variable that other cells may reference.
lr_late = 0.001
# Cross-entropy loss applied to the model's raw linear outputs.
criterion = torch.nn.CrossEntropyLoss()
# Plain stochastic gradient descent over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr = lr_late)

In [24]:
# One entry per epoch: the sum of that epoch's per-batch loss values.
loss_history = []
for epoch in tqdm(range(1000)):
    # Running sum of the batch losses within this epoch.
    total_loss = 0
    for x, y in train_loader:

        # Training step: clear old gradients, forward pass, compute the loss,
        # back-propagate, then update the parameters.
        # y holds the one-hot rows from create_vector_and_label as float tensors.
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        # .item() converts the scalar loss tensor to a Python number.
        total_loss += loss.item()

    loss_history.append(total_loss)
    # Report the epoch number (1-based) and its summed loss every 100 epochs.
    if (epoch +1) % 100 == 0:
        print(epoch + 1, total_loss)


 10%|█         | 100/1000 [00:38<05:30,  2.72it/s]

100 395.76273143291473


 20%|██        | 200/1000 [01:16<04:22,  3.04it/s]

200 366.5188947543502


 30%|███       | 300/1000 [01:49<03:44,  3.12it/s]

300 342.0197876319289


 40%|████      | 400/1000 [02:22<03:16,  3.05it/s]

400 320.86812723241746


 50%|█████     | 500/1000 [02:55<02:54,  2.86it/s]

500 302.60727171972394


 60%|██████    | 600/1000 [03:31<02:28,  2.70it/s]

600 286.5454768240452


 70%|███████   | 700/1000 [04:12<02:03,  2.42it/s]

700 272.10299603268504


 80%|████████  | 800/1000 [04:56<01:25,  2.35it/s]

800 259.3595562428236


 90%|█████████ | 900/1000 [05:36<00:40,  2.49it/s]

900 247.2589638736099


100%|██████████| 1000/1000 [06:16<00:00,  2.66it/s]

1000 236.6337717808783





In [25]:
# Save the model object itself (the whole module, not just its state_dict).
# NOTE(review): the file name suggests weights only. Saving
# model.state_dict() is the usual recommendation, but confirm how this file
# is loaded elsewhere before changing what is stored.
torch.save(model, 'model_weight.pth')

In [32]:
# Counters for the accuracy computation: correct predictions / samples seen.
correct = 0
total = 0

TEST_FEATURE_FILE_PATH = "./news+aggregator/test.feature.txt"

# Build the count vectors and one-hot labels for the test file, using the
# vocabulary built from the training data (words outside it are ignored).
x_test_list, y_test_list = create_vector_and_label(TEST_FEATURE_FILE_PATH, BoW_dict)

# Convert the nested Python lists into tensors.
x_test = torch.Tensor(x_test_list)
y_test = torch.Tensor(y_test_list)

# DataLoader for the test data (no shuffling requested).
test = TensorDataset(x_test, y_test)
test_loader = DataLoader(test, batch_size=5)

# Evaluation only: disable autograd so no computation graph is built per batch.
with torch.no_grad():
    for x, y in test_loader:
        outputs = model(x)
        # Predicted class = index of the largest output score in each row.
        predicted = outputs.argmax(dim=1)
        # True class = position of the 1 in each one-hot label row.
        label = y.argmax(dim=1)
        # y.size(0) is the actual batch size; the last batch may be smaller than 5.
        total += y.size(0)
        # Count matches; .item() converts the 0-dim tensor to a Python int.
        correct += (predicted == label).sum().item()
print('テストデータでの正解率', correct / total * 100)

100%|██████████| 1335/1335 [00:00<00:00, 9882.45it/s]


テストデータでの正解率 86.74157303370787
