# 70. 単語ベクトルの和による特徴量

In [1]:
import torch
import pandas as pd
import string
import re

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
# Load the train / validation / test splits (tab-separated files) in one pass.
df_train, df_valid, df_test = (
    pd.read_csv(split_path, sep='\t')
    for split_path in (
        './../6/data/train.txt',
        './../6/data/valid.txt',
        './../6/data/test.txt',
    )
)

In [35]:
# Inspect the test split as loaded, before any text preprocessing is applied.
print(df_test)

     CATEGORY                                              TITLE
0           t  UPDATE 1-US eyes bankruptcy link in GM ignitio...
1           e  17 Times Zach Braff's 'Wish I Was Here' Refere...
2           e  Orlando Bloom - Orlando Bloom wants to inspire...
3           b  Factbox: Energy Future Holdings and largest de...
4           b  UPDATE 10-Oil steadies after big drop on easin...
...       ...                                                ...
1329        t  To snap a thief: App takes 'theftie' photos an...
1330        e  Macaulay Culkin's The Pizza Underground Booed ...
1331        b  Treasury Volatility Drops on Yellen Low-Rates ...
1332        b  Shell Profit Falls 3 Percent on Lower Producti...
1333        e  Peter Mayhew - Peter Mayhew returning to Star ...

[1334 rows x 2 columns]


In [4]:
def preprocessing_text(text):
    """Normalize a headline before word-vector lookup.

    Every character listed in ``string.punctuation`` is turned into a single
    space, the result is lower-cased, and each run of ASCII digits is collapsed
    to a single ``0``.

    Args:
        text: Raw title string.

    Returns:
        The normalized string.
    """
    # One translation table does the work of the per-character replace loop.
    punct_to_space = str.maketrans(dict.fromkeys(string.punctuation, " "))
    lowered = text.translate(punct_to_space).lower()
    return re.sub('[0-9]+', '0', lowered)

In [36]:
# Normalize the TITLE column of every split with the same text cleaner.
for split_df in (df_train, df_valid, df_test):
    split_df['TITLE'] = split_df['TITLE'].map(preprocessing_text)

In [37]:
# Inspect the test split again to check the effect of preprocessing on TITLE.
print(df_test)

     CATEGORY                                              TITLE
0           t  update 0 us eyes bankruptcy link in gm ignitio...
1           e  0 times zach braff s  wish i was here  referen...
2           e  orlando bloom   orlando bloom wants to inspire...
3           b  factbox  energy future holdings and largest de...
4           b  update 0 oil steadies after big drop on easing...
...       ...                                                ...
1329        t  to snap a thief  app takes  theftie  photos an...
1330        e  macaulay culkin s the pizza underground booed ...
1331        b  treasury volatility drops on yellen low rates ...
1332        b  shell profit falls 0 percent on lower producti...
1333        e  peter mayhew   peter mayhew returning to star ...

[1334 rows x 2 columns]


In [8]:
# Load word2vec vectors from the gzipped binary file into a KeyedVectors object.
# NOTE: the name `model` is reassigned to the SP classifier later in this
# notebook, so sentence2vec (which reads this global) must run before that.
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [30]:
def sentence2vec(sentence):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Reads the module-level word-vector ``model`` (the KeyedVectors object).
    Note that this notebook later rebinds ``model`` to the classifier, so this
    function has to be called while ``model`` still holds the word vectors.

    Args:
        sentence: Text that is tokenized by splitting on whitespace.

    Returns:
        A 1-D ``torch.Tensor`` with the element-wise mean of the vectors of all
        words found in ``model.key_to_index``. When no word is in the vocabulary
        (including the empty string) a zero vector of length
        ``model.vector_size`` is returned; previously this case raised
        ``ZeroDivisionError`` and aborted feature extraction.
    """
    vectors = [model[word] for word in sentence.split() if word in model.key_to_index]
    if not vectors:
        # No known word: fall back to a neutral all-zero embedding.
        return torch.zeros(model.vector_size, dtype=torch.float32)
    return torch.tensor(sum(vectors) / len(vectors))

In [40]:
# Embed one preprocessed test title once, then show the vector and its shape.
title_vec = sentence2vec(df_test.loc[2, 'TITLE'])
print(title_vec)
print(title_vec.shape)

tensor([-0.0064,  0.1666,  0.0461,  0.0635, -0.0651, -0.0059,  0.1076, -0.0627,
        -0.0975,  0.0043, -0.0691, -0.1365, -0.0534,  0.0007, -0.0624,  0.1193,
         0.0442,  0.1467,  0.1217, -0.1173, -0.1886,  0.0437,  0.1072,  0.0053,
        -0.1796, -0.0396, -0.1024,  0.0732,  0.0245, -0.0174, -0.0630,  0.0473,
        -0.2296, -0.0962, -0.1084, -0.0104,  0.0239, -0.0671,  0.0096,  0.0771,
         0.1613, -0.1780,  0.1324, -0.0715, -0.0157, -0.2377, -0.0110, -0.1320,
        -0.0738,  0.0753, -0.0649,  0.0859,  0.1310,  0.1611,  0.1163,  0.0762,
        -0.0607, -0.0691, -0.0372,  0.0264,  0.1182, -0.0639, -0.0516, -0.0295,
        -0.1074, -0.0647, -0.0289,  0.0035,  0.0780,  0.0920,  0.1152,  0.0538,
        -0.0842,  0.0648, -0.1462,  0.0180,  0.0691, -0.0541, -0.0080,  0.1553,
        -0.0535,  0.0303, -0.0819, -0.1530, -0.0576,  0.1530, -0.0348,  0.0070,
        -0.0972,  0.0549,  0.0286, -0.0829, -0.1797, -0.2237, -0.0668, -0.0294,
         0.0943,  0.0873,  0.0802, -0.05

In [41]:
# Build one feature matrix per split: each row is the sentence vector of a title.
X_train, X_valid, X_test = (
    torch.stack([sentence2vec(headline) for headline in split_df['TITLE']])
    for split_df in (df_train, df_valid, df_test)
)

In [42]:
# Sanity check: one row per training title, one column per embedding dimension.
X_train.shape

torch.Size([10672, 300])

In [43]:
# Integer class id assigned to each CATEGORY code.
category_label = {
    'b': 0,
    't': 1,
    'e': 2,
    'm': 3,
}

In [54]:
# Encode each split's CATEGORY column as a tensor of integer class ids.
y_train, y_valid, y_test = (
    torch.tensor(split_df['CATEGORY'].map(lambda code: category_label[code]))
    for split_df in (df_train, df_valid, df_test)
)

In [56]:
# Sanity check: the label tensor should have one entry per training title.
print(y_train.shape)

torch.Size([10672])


In [57]:
# Persist the feature matrices and label vectors so later steps can reload them.
for tensor, out_path in (
    (X_train, './data/X_train.pt'),
    (X_valid, './data/X_valid.pt'),
    (X_test, './data/X_test.pt'),
    (y_train, './data/y_train.pt'),
    (y_valid, './data/y_valid.pt'),
    (y_test, './data/y_test.pt'),
):
    torch.save(tensor, out_path)

# 71. 単層ニューラルネットワークによる予測

In [58]:
from torch import nn

In [90]:
class SP(nn.Module):
    """Single-layer perceptron: one linear map from 300 input features to 4 outputs.

    ``forward`` returns the raw linear outputs; no softmax is applied here.
    """

    def __init__(self):
        super().__init__()
        # The attribute name ``l`` is part of the interface: other cells read
        # ``model.l.weight.grad`` and it determines the parameter names.
        self.l = nn.Linear(300, 4)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.l(x)

In [91]:
# Create the classifier.
# NOTE: this rebinds `model`, which until now held the word2vec KeyedVectors
# read by sentence2vec; re-running sentence2vec after this cell will not work.
model = SP()

In [92]:
# Show the module structure of the classifier.
print(model)

SP(
  (l): Linear(in_features=300, out_features=4, bias=True)
)


In [103]:
# Forward pass on the first training example, then on the first four at once.
# The module is called directly (model(x)) instead of through model.forward(x)
# so that nn.Module's __call__ machinery (e.g. registered hooks) is not bypassed.
logits = model(X_train[0])
# Softmax over the last dimension turns the logits into class probabilities.
y_1 = torch.softmax(logits, dim=-1)
print(logits)
print(y_1)
y_14 = torch.softmax(model(X_train[:4]), dim=-1)

tensor([ 0.0190, -0.0010,  0.0423, -0.0975], grad_fn=<AddBackward0>)
tensor([0.2568, 0.2517, 0.2629, 0.2286], grad_fn=<SoftmaxBackward0>)


In [94]:
# Class probabilities for the first four training examples, one row each.
y_14

tensor([[0.2568, 0.2517, 0.2629, 0.2286],
        [0.2453, 0.2469, 0.2574, 0.2505],
        [0.2574, 0.2567, 0.2483, 0.2376],
        [0.2530, 0.2501, 0.2550, 0.2419]], grad_fn=<SoftmaxBackward0>)

# 72. 損失と勾配の計算

In [112]:
# Cross-entropy loss; below it is applied to the raw logits returned by the model.
criterion = nn.CrossEntropyLoss()

In [113]:
# Loss of the first training example against its label.
logits = model(X_train[0])
loss = criterion(logits, y_train[0])

In [114]:
# Report the loss computed for the single example.
print(f'損失: {loss}')

損失: 1.3361155986785889


In [115]:
# Reset the gradients, then backpropagate the single-example loss and show the
# weight gradient of the linear layer before and after backward().
# NOTE(review): the recorded output shows an all-zero tensor before backward();
# depending on the PyTorch version zero_grad() may leave .grad as None instead
# - confirm on the version in use.
model.zero_grad()
print(f'初期化勾配：{model.l.weight.grad}')
loss.backward()
print(f'更新後勾配：{model.l.weight.grad}')

初期化勾配：tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
更新後勾配：tensor([[-0.0188, -0.0018,  0.0171,  ..., -0.0169,  0.0233,  0.0182],
        [-0.0184, -0.0018,  0.0168,  ..., -0.0166,  0.0228,  0.0179],
        [ 0.0540,  0.0052, -0.0492,  ...,  0.0486, -0.0669, -0.0524],
        [-0.0167, -0.0016,  0.0153,  ..., -0.0151,  0.0207,  0.0162]])


# 73. 確率的勾配降下法による学習

In [116]:
import torch.utils.data as data

In [117]:
class NewsDataset(data.Dataset):
    """Map-style dataset pairing each feature row with its label.

    Args:
        X: Indexable collection of features.
        y: Indexable collection of labels; its length defines the dataset size.
        phase: Split tag stored on the instance (default ``'train'``); nothing
            in this class reads it.
    """

    def __init__(self, X, y, phase='train'):
        self.X, self.y, self.phase = X, y, phase

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        feature, label = self.X[idx], self.y[idx]
        return feature, label

In [124]:
# Wrap each split's features and labels in a NewsDataset.
# NOTE(review): the test set is tagged phase='val'; NewsDataset only stores the
# tag, so this has no effect here, but confirm whether 'test' was intended.
train_ds = NewsDataset(X_train, y_train, phase='train')
valid_ds = NewsDataset(X_valid, y_valid, phase='val')
test_ds = NewsDataset(X_test, y_test, phase='val')

In [123]:
# Show the first (features, label) pair; indexing goes through __getitem__.
print(train_ds[0])

(tensor([-7.3273e-02, -7.0496e-03,  6.6753e-02,  6.8593e-02,  8.5124e-02,
        -7.9244e-02, -3.2888e-02, -2.3075e-01,  9.4126e-02,  6.0750e-02,
         2.7913e-02, -8.7077e-02,  3.9673e-04,  3.7638e-03,  1.7118e-02,
        -2.7059e-03,  9.9487e-03,  1.0543e-01,  1.3762e-02, -1.6479e-02,
        -4.8584e-02,  4.0927e-02,  6.1971e-02, -2.4729e-02,  6.2948e-02,
        -1.0811e-01, -2.1730e-02,  6.1198e-02,  1.0673e-01, -1.7207e-02,
         5.8484e-02,  3.3276e-02, -7.4178e-02,  3.8737e-02, -8.0037e-02,
         7.7947e-02,  1.9435e-02,  9.6232e-03,  3.3539e-02,  8.5426e-02,
        -2.1596e-02, -1.3161e-01,  1.3269e-01,  4.5644e-02,  1.0569e-01,
         6.9936e-05, -2.6754e-02, -3.2705e-02, -3.0909e-02,  6.8665e-02,
         9.3282e-03,  9.4467e-02,  8.2347e-02,  3.9958e-02,  4.2124e-02,
         6.1462e-02, -1.2727e-01, -3.9810e-02, -9.5774e-03, -1.0060e-01,
         5.3304e-02,  7.2286e-02,  1.9762e-02, -1.2910e-01,  2.4913e-02,
        -6.4936e-02, -3.1235e-02, -1.1332e-02,  2.

In [140]:
# Mini-batches of 64 for training, reshuffled every epoch.
train_dataloader = data.DataLoader(train_ds, batch_size=64, shuffle=True)
# The validation and test sets are each served as one full-size batch and are
# only used for evaluation, where sample order does not change the loss or the
# accuracy. Shuffling is therefore disabled: it keeps evaluation order
# deterministic and avoids consuming random-number state for no benefit.
valid_dataloader = data.DataLoader(valid_ds, batch_size=len(valid_ds), shuffle=False)
test_dataloader = data.DataLoader(test_ds, batch_size=len(test_ds), shuffle=False)

In [141]:
# Loaders keyed by phase name; train() reads the 'train' and 'val' entries and
# the accuracy cells additionally read 'test'.
dataloader = {'train': train_dataloader,
              'val': valid_dataloader,
              'test': test_dataloader}

In [147]:
from tqdm import tqdm

# Start training from a freshly constructed classifier, discarding the instance
# used in the earlier forward/gradient demonstrations.
model = SP()

In [143]:
def train(model, dataloader, num_epochs, lr=0.01):
    """Train ``model`` with SGD and print loss/accuracy for every epoch.

    Each epoch runs a training pass over ``dataloader['train']`` (parameters are
    updated after every mini-batch) followed by an evaluation pass over
    ``dataloader['val']`` (no gradients, no updates). After each pass the
    per-sample average cross-entropy loss and the accuracy are printed.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to per-class scores.
        dataloader: Mapping with ``'train'`` and ``'val'`` keys whose values are
            data loaders yielding ``(inputs, labels)`` batches and exposing a
            ``dataset`` attribute with a length.
        num_epochs: Number of epochs to run.
        lr: SGD learning rate. Defaults to 0.01, the value that used to be
            hard-coded, so existing calls behave as before.

    Returns:
        None. ``model`` is updated in place and is left in eval mode.
    """
    # Loss function
    criterion = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1} / {num_epochs}')
        print('-'*50)

        for phase in ['train', 'val']:
            is_train = phase == 'train'
            # train(True) == train mode, train(False) == eval mode.
            model.train(is_train)

            epoch_loss = 0.0
            # Kept as a plain Python int so the accuracy below is well defined
            # even if the loader yields no batch (a tensor-only method such as
            # .double() would fail on the initial int).
            epoch_corrects = 0

            for inputs, labels in tqdm(dataloader[phase]):
                # Autograd bookkeeping is only needed while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    # Predicted class = index of the largest score in each row.
                    _, preds = torch.max(outputs, 1)
                    if is_train:
                        # Gradients accumulate, so clear them right before backward.
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                # The criterion returns the batch mean; weight it by the batch
                # size so the epoch value is a true per-sample average even
                # when the last batch is smaller.
                epoch_loss += loss.item() * inputs.size(0)
                epoch_corrects += (preds == labels).sum().item()

            # Per-epoch loss and accuracy
            num_samples = len(dataloader[phase].dataset)
            epoch_loss = epoch_loss / num_samples
            epoch_acc = epoch_corrects / num_samples

            print('{} Loss: {:.4f}, Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

In [148]:
# Train the classifier for 10 epochs on the 'train' loader, validating on 'val'.
train(model, dataloader, 10)

Epoch 1 / 10
--------------------------------------------------


100%|██████████| 167/167 [00:00<00:00, 1115.79it/s]


train Loss: 1.2727, Acc: 0.6088


100%|██████████| 1/1 [00:00<00:00, 122.31it/s]


val Loss: 1.1879, Acc: 0.6957
Epoch 2 / 10
--------------------------------------------------


100%|██████████| 167/167 [00:00<00:00, 1018.49it/s]


train Loss: 1.1392, Acc: 0.7191


100%|██████████| 1/1 [00:00<00:00, 98.23it/s]


val Loss: 1.1032, Acc: 0.7376
Epoch 3 / 10
--------------------------------------------------


100%|██████████| 167/167 [00:00<00:00, 997.00it/s] 


train Loss: 1.0741, Acc: 0.7451


100%|██████████| 1/1 [00:00<00:00, 114.43it/s]


val Loss: 1.0540, Acc: 0.7504
Epoch 4 / 10
--------------------------------------------------


100%|██████████| 167/167 [00:00<00:00, 1010.50it/s]


train Loss: 1.0318, Acc: 0.7539


100%|██████████| 1/1 [00:00<00:00, 118.72it/s]


val Loss: 1.0187, Acc: 0.7519
Epoch 5 / 10
--------------------------------------------------


100%|██████████| 167/167 [00:00<00:00, 1107.90it/s]


train Loss: 0.9995, Acc: 0.7570


100%|██████████| 1/1 [00:00<00:00, 131.89it/s]


val Loss: 0.9902, Acc: 0.7534
Epoch 6 / 10
--------------------------------------------------


100%|██████████| 167/167 [00:00<00:00, 1100.18it/s]


train Loss: 0.9726, Acc: 0.7594


100%|██████████| 1/1 [00:00<00:00, 130.13it/s]


val Loss: 0.9658, Acc: 0.7564
Epoch 7 / 10
--------------------------------------------------


100%|██████████| 167/167 [00:00<00:00, 1105.80it/s]


train Loss: 0.9493, Acc: 0.7607


100%|██████████| 1/1 [00:00<00:00, 134.77it/s]


val Loss: 0.9444, Acc: 0.7624
Epoch 8 / 10
--------------------------------------------------


100%|██████████| 167/167 [00:00<00:00, 1100.35it/s]


train Loss: 0.9285, Acc: 0.7608


100%|██████████| 1/1 [00:00<00:00, 124.38it/s]


val Loss: 0.9252, Acc: 0.7639
Epoch 9 / 10
--------------------------------------------------


100%|██████████| 167/167 [00:00<00:00, 1110.21it/s]


train Loss: 0.9098, Acc: 0.7618


100%|██████████| 1/1 [00:00<00:00, 123.61it/s]


val Loss: 0.9078, Acc: 0.7654
Epoch 10 / 10
--------------------------------------------------


100%|██████████| 167/167 [00:00<00:00, 1090.41it/s]


train Loss: 0.8928, Acc: 0.7626


100%|██████████| 1/1 [00:00<00:00, 123.58it/s]

val Loss: 0.8918, Acc: 0.7661





# 74. 正解率の計測

In [149]:
def calc_acc(model, dataloader):
    """Return the fraction of samples in ``dataloader`` that ``model`` labels correctly.

    The model is switched to eval mode and all forward passes run with
    gradients disabled.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        dataloader: Loader yielding ``(inputs, labels)`` batches and exposing a
            ``dataset`` attribute with a length.

    Returns:
        Number of correct predictions divided by ``len(dataloader.dataset)``
        (a 0-dim tensor once at least one batch has been processed).
    """
    model.eval()
    n_hits = 0
    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            # Predicted label = index of the highest score in each row.
            predicted = torch.max(model(batch_x), 1)[1]
            n_hits = n_hits + (predicted == batch_y.data).sum()
    return n_hits / len(dataloader.dataset)

In [152]:
# Accuracy of the trained classifier on each of the three splits.
acc_train = calc_acc(model, dataloader['train'])
acc_valid = calc_acc(model, dataloader['val'])
acc_test = calc_acc(model, dataloader['test'])

In [155]:
# Report the three accuracies.
# NOTE(review): 'acuracy' in the labels is a typo for 'accuracy'; the strings are
# left untouched here so they still match the recorded output.
print(f'train acuracy: {acc_train}')
print(f'valid acuracy: {acc_valid}')
print(f'test acuracy: {acc_test}')

train acuracy: 0.761712908744812
valid acuracy: 0.7661169171333313
test acuracy: 0.7533733248710632
