In [2]:
from tqdm import tqdm

# bag of wordsの辞書を作成
def create_BoW_dict(file_path):
    BoW_dict = {}
    i = 0
    with open(file_path, "r", encoding="utf-8")as f:
        data_list = f.readlines()
        for data in data_list:
            data = data.split('\t')
            word_list = data[0].split()
            for word in word_list:
                if word in BoW_dict:
                    continue
                else:
                    BoW_dict[word] = i
                    i += 1
    return BoW_dict

# t=0, e=1, m=2, b=3としてテストデータのベクトルとラベルをそれぞれリストとして作成
def create_vector_and_label(file_path, BoW_dict):
    with open(file_path, "r", encoding="utf-8")as f:
        data_list = f.readlines()
        vector_list = []
        label_list = []
        for data in tqdm(data_list):
            data = data.split("\t")
            word_list = data[0].split()
            vector = [0] * len(BoW_dict)
            for word in word_list:
                if word in BoW_dict:
                    vector[BoW_dict[word]] += 1
            vector_list.append(vector)
            label_alph = data[1].strip("\n")
            if label_alph == "t":
                label = [1, 0, 0, 0]
            elif label_alph == "e":
                label = [0, 1, 0, 0]
            elif label_alph == "m":
                label = [0, 0, 1, 0]
            elif label_alph == "b":
                label = [0, 0, 0, 1]
            else:
                print("error label")
            label_list.append(label)
    return vector_list, label_list

In [3]:
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

TRAIN_FEATURE_FILE_PATH = "./news+aggregator/train.feature.txt"

# bag of wordsの辞書型を作成
BoW_dict = create_BoW_dict(TRAIN_FEATURE_FILE_PATH)

# test.txtからベクトルリストとラベルリストを作成
x_train_list, y_train_list = create_vector_and_label(TRAIN_FEATURE_FILE_PATH, BoW_dict)

# データの準備
x_train = torch.Tensor(x_train_list)
y_train = torch.Tensor(y_train_list)

# 訓練データのデータローダー
train = TensorDataset(x_train, y_train)
train_loader = DataLoader(train, batch_size=10, shuffle=True)

100%|██████████| 10671/10671 [00:01<00:00, 7309.79it/s]


In [17]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, middle_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.l1 = torch.nn.Linear(input_dim, middle_dim)
        self.relu = torch.nn.ReLU()
        self.l2 = torch.nn.Linear(middle_dim, output_dim)
        self.softmax =  torch.nn.Softmax(dim = 1)

    def forward(self, x):
        x = self.l1(x)
        x = self.relu(x)
        x = self.l2(x)
        x = self.softmax(x)
        return x

In [18]:
TEST_FEATURE_FILE_PATH = "./news+aggregator/test.feature.txt"

# test.txtからベクトルリストとラベルリストを作成
x_test_list, y_test_list = create_vector_and_label(TEST_FEATURE_FILE_PATH, BoW_dict)
# データの準備
x_test = torch.Tensor(x_test_list)
y_test = torch.Tensor(y_test_list)
# テストデータのデータローダー
test = TensorDataset(x_test, y_test)
test_loader = DataLoader(test, batch_size=5)

100%|██████████| 1335/1335 [00:00<00:00, 14728.74it/s]


In [19]:
criterion = torch.nn.CrossEntropyLoss()
input_dim = len(x_train_list[0])
output_dim = 4
middle_dim = 100
model= LogisticRegression(input_dim, middle_dim, output_dim)
lr_late_list = [0.001, 0.01 ,0.1 ,1]
for lr_late in lr_late_list:
    model= LogisticRegression(input_dim, middle_dim, output_dim)
    optimizer= torch.optim.SGD(model.parameters(), lr = lr_late)
    for epoch in range(100):
        total_loss = 0
        for x, y in train_loader:

            # 学習ステップ
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
    correct = 0
    total = 0
    for x, y in test_loader:
        outputs = model(x)
        _, predicted = torch.max(outputs.data, 0)   #確率の最大値を取得
        total += y.size(0)  #batch処理をしているのでy.size == 5
        correct += (predicted == y).sum().item()    #ラベルと一致している個数を計算．　item()はtensor->intへの変換
    print(f'lr_late = {lr_late}, 正解率: {correct/total}')

lr_late = 0.001, 正解率: 0.7625468164794007
lr_late = 0.01, 正解率: 0.7872659176029962
lr_late = 0.1, 正解率: 0.800749063670412
lr_late = 1, 正解率: 0.8554307116104869
