In [138]:
from tqdm import tqdm

# bag of wordsの辞書を作成
def create_BoW_dict(file_path):
    BoW_dict = {}
    i = 0
    with open(file_path, "r", encoding="utf-8")as f:
        data_list = f.readlines()
        for data in data_list:
            data = data.split('\t')
            word_list = data[0].split()
            for word in word_list:
                if word in BoW_dict:
                    continue
                else:
                    BoW_dict[word] = i
                    i += 1
    return BoW_dict

# t=0, e=1, m=2, b=3としてテストデータのベクトルとラベルをそれぞれリストとして作成
def create_vector_and_label(file_path, BoW_dict):
    with open(file_path, "r", encoding="utf-8")as f:
        data_list = f.readlines()
        vector_list = []
        label_list = []
        for data in tqdm(data_list):
            data = data.split("\t")
            word_list = data[0].split()
            vector = [0] * len(BoW_dict)
            for word in word_list:
                if word in BoW_dict:
                    vector[BoW_dict[word]] += 1
            vector_list.append(vector)
            label_alph = data[1].strip("\n")
            if label_alph == "t":
                label = [1, 0, 0, 0]
            elif label_alph == "e":
                label = [0, 1, 0, 0]
            elif label_alph == "m":
                label = [0, 0, 1, 0]
            elif label_alph == "b":
                label = [0, 0, 0, 1]
            else:
                print("error label")
            label_list.append(label)
    return vector_list, label_list

In [139]:
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

TRAIN_FEATURE_FILE_PATH = "./news+aggregator/train.feature.txt"

# bag of wordsの辞書型を作成
BoW_dict = create_BoW_dict(TRAIN_FEATURE_FILE_PATH)

# test.txtからベクトルリストとラベルリストを作成
x_train_list, y_train_list = create_vector_and_label(TRAIN_FEATURE_FILE_PATH, BoW_dict)

# データの準備
x_train = torch.Tensor(x_train_list)
y_train = torch.Tensor(y_train_list)

# 訓練データのデータローダー
train = TensorDataset(x_train, y_train)
train_loader = DataLoader(train, batch_size=10, shuffle=True)

100%|██████████| 10672/10672 [00:01<00:00, 8508.30it/s]


In [140]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.l1 = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        x = self.l1(x)
        return x

input_dim = len(x_train_list[0])
print(input_dim)
output_dim = 4
model = LogisticRegression(input_dim, output_dim)

19433


In [141]:
lr_late = 0.001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = lr_late)

In [142]:
loss_history = []
for epoch in tqdm(range(5000)):
    total_loss = 0
    for x, y in train_loader:

        # 学習ステップ
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    loss_history.append(total_loss)
    if (epoch +1) % 100 == 0:
        print(epoch + 1, total_loss)


  2%|▏         | 100/5000 [00:33<28:18,  2.88it/s]

100 761.1860098838806


  4%|▍         | 200/5000 [01:08<27:56,  2.86it/s]

200 625.5535930544138


  6%|▌         | 300/5000 [01:45<29:59,  2.61it/s]

300 542.0386967211962


  8%|▊         | 400/5000 [02:30<39:38,  1.93it/s]

400 482.75428840518


 10%|█         | 500/5000 [03:18<34:07,  2.20it/s]

500 437.0078830495477


 12%|█▏        | 600/5000 [04:01<30:55,  2.37it/s]

600 401.25587888062


 14%|█▍        | 700/5000 [04:42<29:16,  2.45it/s]

700 371.9822965115309


 16%|█▌        | 800/5000 [05:23<27:45,  2.52it/s]

800 347.05813186615705


 18%|█▊        | 900/5000 [06:04<27:03,  2.53it/s]

900 325.6603614836931


 20%|██        | 1000/5000 [06:44<27:26,  2.43it/s]

1000 307.45335640385747


 22%|██▏       | 1100/5000 [07:25<26:40,  2.44it/s]

1100 290.630559489131


 24%|██▍       | 1200/5000 [08:06<26:33,  2.39it/s]

1200 275.46822855621576


 26%|██▌       | 1300/5000 [08:47<25:04,  2.46it/s]

1300 262.25558410957456


 28%|██▊       | 1400/5000 [09:27<23:53,  2.51it/s]

1400 250.30101216584444


 30%|███       | 1500/5000 [10:08<23:58,  2.43it/s]

1500 239.49997985735536


 32%|███▏      | 1600/5000 [10:49<23:20,  2.43it/s]

1600 229.59570173732936


 34%|███▍      | 1700/5000 [11:30<24:08,  2.28it/s]

1700 220.03269326873124


 36%|███▌      | 1800/5000 [12:12<21:43,  2.46it/s]

1800 211.7617136761546


 38%|███▊      | 1900/5000 [17:44<17:07,  3.02it/s]   

1900 203.6610907241702


 40%|████      | 2000/5000 [18:17<17:18,  2.89it/s]

2000 196.46574130281806


 42%|████▏     | 2100/5000 [18:51<15:55,  3.04it/s]

2100 189.73275798000395


 44%|████▍     | 2200/5000 [19:24<15:30,  3.01it/s]

2200 183.30839114636183


 46%|████▌     | 2300/5000 [19:57<16:14,  2.77it/s]

2300 177.17994842864573


 48%|████▊     | 2400/5000 [20:30<14:03,  3.08it/s]

2400 171.74735465366393


 50%|█████     | 2500/5000 [21:09<17:43,  2.35it/s]

2500 166.40923403389752


 52%|█████▏    | 2600/5000 [21:47<15:30,  2.58it/s]

2600 161.71981749869883


 54%|█████▍    | 2700/5000 [22:29<22:09,  1.73it/s]

2700 156.5944049945101


 56%|█████▌    | 2800/5000 [23:16<16:28,  2.23it/s]

2800 152.25181629881263


 58%|█████▊    | 2900/5000 [24:01<14:13,  2.46it/s]

2900 147.84349730052054


 60%|██████    | 3000/5000 [24:44<14:15,  2.34it/s]

3000 143.9211634825915


 62%|██████▏   | 3100/5000 [25:28<12:51,  2.46it/s]

3100 140.04854682646692


 64%|██████▍   | 3200/5000 [26:08<12:13,  2.46it/s]

3200 136.5620765388012


 66%|██████▌   | 3300/5000 [26:49<12:01,  2.35it/s]

3300 133.13560885749757


 68%|██████▊   | 3400/5000 [31:56<08:57,  2.98it/s]   

3400 129.97308901138604


 70%|███████   | 3500/5000 [32:30<08:40,  2.88it/s]

3500 126.59668392129242


 72%|███████▏  | 3600/5000 [33:04<07:56,  2.94it/s]

3600 123.58141821529716


 74%|███████▍  | 3700/5000 [38:57<12:05,  1.79it/s]   

3700 120.7086262544617


 76%|███████▌  | 3800/5000 [39:34<07:00,  2.86it/s]

3800 118.02180839795619


 78%|███████▊  | 3900/5000 [40:12<07:06,  2.58it/s]

3900 115.33378690760583


 80%|████████  | 4000/5000 [40:52<06:38,  2.51it/s]

4000 112.82221245206892


 82%|████████▏ | 4100/5000 [41:29<05:57,  2.52it/s]

4100 110.51758272200823


 84%|████████▍ | 4200/5000 [42:08<04:51,  2.74it/s]

4200 108.13682402484119


 86%|████████▌ | 4300/5000 [42:45<04:14,  2.75it/s]

4300 105.85063517745584


 88%|████████▊ | 4400/5000 [43:27<04:33,  2.19it/s]

4400 103.69817931205034


 90%|█████████ | 4500/5000 [44:10<03:35,  2.32it/s]

4500 101.87434778781608


 92%|█████████▏| 4600/5000 [44:57<04:44,  1.40it/s]

4600 100.16657078173012


 94%|█████████▍| 4700/5000 [45:40<02:08,  2.33it/s]

4700 97.84056044649333


 96%|█████████▌| 4800/5000 [46:22<01:21,  2.45it/s]

4800 95.92304863967001


 98%|█████████▊| 4900/5000 [47:04<00:42,  2.38it/s]

4900 94.20696676149964


100%|██████████| 5000/5000 [47:45<00:00,  1.74it/s]

5000 92.46189489495009





In [151]:
torch.save(model, 'model_weight.pth')

In [150]:
correct = 0
total = 0

TEST_FEATURE_FILE_PATH = "./news+aggregator/test.feature.txt"

# test.txtからベクトルリストとラベルリストを作成
x_test_list, y_test_list = create_vector_and_label(TEST_FEATURE_FILE_PATH, BoW_dict)

# データの準備
x_test = torch.Tensor(x_test_list)
y_test = torch.Tensor(y_test_list).long()

# 訓練データのデータローダー
test = TensorDataset(x_test, y_test)
test_loader = DataLoader(test, batch_size=5)

for x, y in test_loader:
    outputs = model(x)
    _, predicted = torch.max(outputs.data, 0)   #確率の最大値を取得
    total += y.size(0)  #batch処理をしているのでy.size == 5
    correct += (predicted == y).sum().item()    #ラベルと一致している個数を計算．　item()はtensor->intへの変換
print('正解率', int(correct)/total*100)

100%|██████████| 1334/1334 [00:00<00:00, 4988.87it/s]


正解率 79.83508245877061
