70. 単語ベクトルの和による特徴量

In [39]:
import json
import re
import random
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the tab-separated corpus; the file has no header row, so column names are supplied here.
df = pd.read_csv(
    'NewsAggregatorDataset/newsCorpora.csv',
    sep='\t',
    header=None,
    names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
)

# Keep only rows from the five selected publishers, and only the two columns used afterwards.
df = df.loc[
    df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']),
    ['TITLE', 'CATEGORY'],
]

# train_test_split can split lists, ndarrays and DataFrames.
# First cut: 80% train / 20% held out. Second cut: the held-out part is halved into
# validation and test. Both cuts are stratified on CATEGORY with a fixed random_state.
train, valid_once = train_test_split(
    df, test_size=0.2, shuffle=True, random_state=100, stratify=df['CATEGORY']
)
valid, test = train_test_split(
    valid_once, test_size=0.5, shuffle=True, random_state=100, stratify=valid_once['CATEGORY']
)

In [31]:
from gensim.models import KeyedVectors
# Load the GoogleNews word vectors from a binary word2vec-format file.
# NOTE(review): the path points into one user's home directory; adjust it before running elsewhere.
# NOTE(review): later cells rebind the name `model` to a torch module, so processing(), which
# reads this global, has to be run before those cells.
model = KeyedVectors.load_word2vec_format('~/Desktop/ニューラル勉強会/hlab2023-nlp100/7/GoogleNews-vectors-negative300.bin', binary=True)

In [50]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import string
import torch
import torch.nn as nn

In [77]:
def processing(data, embeddings=None, vector_size=300):
    """Convert (title, category) rows into averaged word-vector features and integer labels.

    Each title is normalised (punctuation replaced by spaces, every run of digits
    replaced by '0', lower-cased), split on whitespace, and represented by the mean
    of the vectors of the tokens that are present in ``embeddings``.

    Args:
        data: iterable of ``(title, category)`` pairs, where category is one of
            'b', 't', 'e', 'm'.
        embeddings: object supporting ``word in embeddings`` and ``embeddings[word]``.
            When omitted, the notebook-level ``model`` is used; that name is rebound
            to a classifier in later cells, so pass the word vectors explicitly when
            calling this function after those cells have run.
        vector_size: length of the zero vector used for a title in which no token
            has a vector. It must equal the embedding dimension.

    Returns:
        ``(x, y)``: a float32 tensor with one feature row per input pair, and a
        tensor holding the integer label of each pair.

    Raises:
        KeyError: if a category is not one of the four known codes.
        ValueError: if ``vector_size`` differs from the embedding dimension and a
            zero-vector row has to be mixed with real rows.
    """
    if embeddings is None:
        embeddings = model  # notebook-level word vectors
    label = {'b': 0, 't': 1, 'e': 2, 'm': 3}
    # Built once instead of once per title: maps every punctuation character to a space.
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    x = []
    y = []
    for title, category in data:
        tokens = re.sub('[0-9]+', '0', title.translate(table)).lower().split()
        vec = [embeddings[word] for word in tokens if word in embeddings]
        if vec:
            x.append(np.mean(vec, axis=0))
        else:
            # No token has a vector. The previous sum(vec) / len(vec) raised
            # ZeroDivisionError here; fall back to an all-zero feature row instead.
            x.append(np.zeros(vector_size, dtype=np.float32))
        y.append(label[category])

    # Stack the rows into one array first: building a tensor from a list of
    # separate ndarrays is much slower.
    return torch.tensor(np.asarray(x, dtype=np.float32)), torch.tensor(y)

# processing() iterates over (title, category) rows, so convert each DataFrame split to an array.
train, valid, test = (np.array(split) for split in (train, valid, test))

# Featurise every split: x_* receives the first tensor returned by processing(), y_* the second.
x_train, y_train = processing(train)
x_valid, y_valid = processing(valid)
x_test, y_test = processing(test)

torch.Size(x_train.size())  # bare expression in the middle of a cell: evaluated, nothing is displayed
print(x_train)
print(y_train)


tensor([[-0.0200,  0.1468, -0.1442,  ..., -0.0660,  0.0160,  0.1448],
        [-0.0325,  0.0205,  0.0668,  ..., -0.0393,  0.0037, -0.0476],
        [ 0.0546, -0.0825, -0.0760,  ..., -0.0602,  0.0048, -0.0069],
        ...,
        [-0.0367,  0.0171, -0.0423,  ..., -0.0434,  0.0469,  0.0620],
        [ 0.0225,  0.0613,  0.0038,  ..., -0.0829, -0.0349, -0.0185],
        [-0.0289,  0.0737, -0.0573,  ..., -0.0636,  0.0144,  0.1008]])
tensor([2, 0, 1,  ..., 0, 2, 2])


71. 単層ニューラルネットワークによる予測

In [240]:
class SLPNet(nn.Module):
    """Single-layer network: one bias-free linear map from input features to class scores."""

    def __init__(self, input_size, output_size):
        """Create the only layer, ``fc``, taking ``input_size`` features to ``output_size`` outputs."""
        super().__init__()
        # No custom weight initialisation is applied; nn.Linear's own initialisation is kept.
        self.fc = nn.Linear(in_features=input_size, out_features=output_size, bias=False)

    def forward(self, x):
        """Return the output of ``fc`` for ``x`` (no softmax is applied here)."""
        return self.fc(x)

In [241]:
# Forward pass of an untrained 300 -> 4 linear layer, followed by softmax, on the first
# one and the first four training rows. The seed is fixed before the random initialisation.
torch.manual_seed(1)
#model = SLPNet(300, 4)
# Author's open question: is it worth defining SLPNet at all? A plain nn.Linear is used here.
# Note that SLPNet passes bias=False, whereas this call leaves `bias` at its default.
# NOTE(review): this rebinds `model`, which earlier held the word vectors read by processing().
model = nn.Linear(300, 4)
nn.init.normal_(model.weight, 0.0, 1.0)   # initialise the weights with normal random numbers (mean 0, std 1)

print(model(x_train[:1]))
# Author's open question: how does softmax cope with negative inputs?
# (Softmax exponentiates each value before normalising, so negative inputs are fine.)

# Calling model(...) ends up invoking forward();
# prefer that over calling forward() directly.
y_hat_1 = torch.softmax(model(x_train[:1]), dim=-1)
# dim selects the axis that is normalised. For a 2-D tensor, dim=-1 and dim=1 name the same
# (last) axis, so each row sums to 1; dim=-2 and dim=0 name the first axis. Negative values
# count from the end, which is why every axis has two spellings.
print(y_hat_1)
Y_hat = torch.softmax(model(x_train[:4]), dim=-1)
print(Y_hat)

tensor([[-1.0368,  1.7777,  0.8214,  0.5145]], grad_fn=<AddmmBackward0>)
tensor([[0.0347, 0.5790, 0.2225, 0.1637]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0347, 0.5790, 0.2225, 0.1637],
        [0.1497, 0.4886, 0.2236, 0.1380],
        [0.0493, 0.3584, 0.3407, 0.2516],
        [0.1980, 0.2933, 0.1872, 0.3215]], grad_fn=<SoftmaxBackward0>)


72. 損失と勾配の計算

In [206]:
# Cross-entropy loss object used by the following cells.
criterion = nn.CrossEntropyLoss()

In [242]:
# Loss and weight gradient for the first training example only.
l_1 = criterion(model(x_train[:1]), y_train[:1])
# The loss is given the pre-softmax values (the raw model outputs).
# Author's observation: passing post-softmax values made the gradient computation fail.

model.zero_grad()  # reset the gradients to zero
l_1.backward()  # compute the gradients
print(f'損失: {l_1:.4f}')
print(f'勾配:\n{model.weight.grad}')

損失: 1.5027
勾配:
tensor([[-0.0007,  0.0051, -0.0050,  ..., -0.0023,  0.0006,  0.0050],
        [-0.0116,  0.0850, -0.0835,  ..., -0.0382,  0.0093,  0.0838],
        [ 0.0156, -0.1141,  0.1121,  ...,  0.0513, -0.0124, -0.1125],
        [-0.0033,  0.0240, -0.0236,  ..., -0.0108,  0.0026,  0.0237]])


73. 確率的勾配降下法による学習

In [243]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
  """Pairs a feature container ``X`` with a label container ``y`` behind the Dataset interface."""

  def __init__(self, X, y):
    """Keep references to the features and labels exactly as given."""
    self.X, self.y = X, y

  def __len__(self):
    """Value returned by ``len(dataset)``: the length of the label container."""
    return len(self.y)

  def __getitem__(self, idx):
    """Value returned by ``dataset[idx]``: the list ``[features, label]`` at that position."""
    features = self.X[idx]
    target = self.y[idx]
    return [features, target]

In [244]:
from torch.utils.data import DataLoader

# Wrap the features and labels of each split in a Dataset.
dataset_train, dataset_valid, dataset_test = (
    NewsDataset(features, targets)
    for features, targets in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test))
)

# A DataLoader is built from a Dataset object, a batch size and a shuffle flag:
#   - Dataset: defines __len__ and __getitem__, giving a uniform way to access the data.
#   - batch_size: number of samples grouped into one batch.
#   - shuffle: whether the data is reshuffled every epoch.
# Training is served one shuffled sample at a time; validation and test are each served
# as a single batch holding the whole split, in fixed order.
dataloader_train = DataLoader(dataset=dataset_train, batch_size=1, shuffle=True)
dataloader_valid = DataLoader(dataset=dataset_valid, batch_size=len(dataset_valid), shuffle=False)
dataloader_test = DataLoader(dataset=dataset_test, batch_size=len(dataset_test), shuffle=False)

In [257]:
# Model definition
model = SLPNet(300, 4)

# Loss function
criterion = nn.CrossEntropyLoss()

# Prints the repr of the bound method (which embeds the module summary), not the parameter values.
print(model.parameters)

# Optimizer: uses the gradients produced by autograd to update the model.
# model.parameters() hands the model's parameters to the optimizer.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

# Training
num_epochs = 15
for epoch in range(num_epochs):
  # Switch to training mode
  model.train()
  loss_train = 0.0
  for inputs, labels in dataloader_train:
    # Reset the gradients to zero
    optimizer.zero_grad()

    # Forward pass + backpropagation + weight update
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # Accumulate the loss
    loss_train += loss.item()

  # Mean loss per batch. Divide by the number of batches: the earlier code divided by the
  # last enumerate() index, which is one less than the batch count and inflated the mean.
  loss_train = loss_train / len(dataloader_train)

  # Loss on the validation data (the loader serves the whole split as its first batch)
  model.eval()
  with torch.no_grad():
    inputs, labels = next(iter(dataloader_valid))
    outputs = model(inputs)
    loss_valid = criterion(outputs, labels)

  # Log output
  print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, loss_valid: {loss_valid:.4f}')

<bound method Module.parameters of SLPNet(
  (fc): Linear(in_features=300, out_features=4, bias=False)
)>
epoch: 1, loss_train: 0.5297, loss_valid: 0.3998
epoch: 2, loss_train: 0.3648, loss_valid: 0.3648
epoch: 3, loss_train: 0.3323, loss_valid: 0.3495
epoch: 4, loss_train: 0.3155, loss_valid: 0.3437
epoch: 5, loss_train: 0.3044, loss_valid: 0.3392
epoch: 6, loss_train: 0.2970, loss_valid: 0.3303
epoch: 7, loss_train: 0.2909, loss_valid: 0.3264
epoch: 8, loss_train: 0.2868, loss_valid: 0.3243
epoch: 9, loss_train: 0.2825, loss_valid: 0.3219
epoch: 10, loss_train: 0.2799, loss_valid: 0.3234
epoch: 11, loss_train: 0.2767, loss_valid: 0.3232
epoch: 12, loss_train: 0.2748, loss_valid: 0.3198
epoch: 13, loss_train: 0.2724, loss_valid: 0.3196
epoch: 14, loss_train: 0.2711, loss_valid: 0.3221
epoch: 15, loss_train: 0.2700, loss_valid: 0.3199


74. 正解率の計測

In [256]:
def calculate_accuracy(model, loader):
  """Return the fraction of samples in ``loader`` whose arg-max prediction equals the label.

  ``model`` is switched to eval mode and the batches are evaluated under
  ``torch.no_grad()``. ``loader`` yields ``(inputs, labels)`` batches.
  """
  model.eval()
  n_seen, n_hit = 0, 0
  with torch.no_grad():
    for batch_x, batch_y in loader:
      # The predicted class is the index of the largest output along the last axis.
      predicted = model(batch_x).argmax(dim=-1)
      n_seen += len(batch_x)
      n_hit += torch.eq(predicted, batch_y).sum().item()

  return n_hit / n_seen

# Accuracy of the trained model on the training loader and on the test loader, to three decimals.
acc_train, acc_test = (
    calculate_accuracy(model, loader) for loader in (dataloader_train, dataloader_test)
)
print(f'正解率（学習データ）：{acc_train:.3f}')
print(f'正解率（評価データ）：{acc_test:.3f}')

正解率（学習データ）：0.908
正解率（評価データ）：0.884
