# 第8章: ニューラルネット

[https://nlp100.github.io/ja/ch08.html](https://nlp100.github.io/ja/ch08.html)

第6章で取り組んだニュース記事のカテゴリ分類を題材として，ニューラルネットワークでカテゴリ分類モデルを実装する．なお，この章ではPyTorch, TensorFlow, Chainerなどの機械学習プラットフォームを活用せよ．

## 70. 単語ベクトルの和による特徴量

In [1]:
import gensim
import numpy as np
import pandas as pd
import spacy
import torch
import tqdm

# spaCy pipeline used to tokenize the article titles.
nlp = spacy.load("en_core_web_sm")

# Pre-trained Word2Vec vectors (GoogleNews, binary format).
w2v = gensim.models.KeyedVectors.load_word2vec_format('../Chapter7/GoogleNews-vectors-negative300.bin', binary=True)

# Dimensionality of a single word vector, taken from the loaded model
# instead of being hard-coded.
embedding_dim = w2v.vector_size

# Load texts and make tensors.
# Xs[split] holds one averaged word-vector row per title; ys[split] holds the
# integer category id of each title.
dataset_types = ['train', 'valid', 'test']
Xs, ys = {}, {}
label2int = {
    "b": 0,
    "t": 1,
    "e": 2,
    "m": 3
}

for dataset in tqdm.tqdm(dataset_types):
    tmp_x, tmp_y = [], []
    tmp_df = pd.read_table('../Chapter6/{:}.txt'.format(dataset))

    for each in tqdm.tqdm(tmp_df.itertuples(), total=len(tmp_df)):

        # make X: mean of the embeddings of the in-vocabulary tokens
        tokens = [token for token in nlp(each.TITLE)]
        num_tokens = len(tokens)

        x_i = np.zeros(embedding_dim)
        for token in tokens:
            try:
                token_embedding = w2v[str(token)]
            except KeyError:
                # Out-of-vocabulary token: exclude it from the average.
                num_tokens -= 1
                continue
            x_i = np.add(x_i, token_embedding)

        # If no token had an embedding (or the title produced no tokens),
        # keep the zero vector; dividing by zero would fill the row with NaN.
        if num_tokens > 0:
            x_i = np.divide(x_i, num_tokens)
        tmp_x.append(x_i)

        # make y
        tmp_y.append(label2int[each.CATEGORY])

    # convert to torch.Tensor
    # Stack into a single ndarray first: building a tensor directly from a
    # list of ndarrays is very slow. The dtype stays float32 as before.
    Xs[dataset] = torch.tensor(np.array(tmp_x), dtype=torch.float32)
    # NOTE(review): labels are stored as float32 (torch.Tensor default), as in
    # the original; cast with .long() before using them as class indices.
    ys[dataset] = torch.Tensor(tmp_y)

    # pickle
    # The raw Python lists (not the tensors above) are what gets serialized.
    torch.save(tmp_x, 'X_{:}.pickle'.format(dataset))
    torch.save(tmp_y, 'y_{:}.pickle'.format(dataset))

s][A
2105it [00:13, 162.40it/s][A
2122it [00:13, 163.49it/s][A
2139it [00:13, 163.86it/s][A
2156it [00:13, 164.65it/s][A
2173it [00:13, 163.71it/s][A
2190it [00:13, 164.48it/s][A
2207it [00:13, 162.67it/s][A
2224it [00:13, 164.22it/s][A
2241it [00:14, 164.25it/s][A
2259it [00:14, 166.32it/s][A
2276it [00:14, 162.77it/s][A
2293it [00:14, 159.22it/s][A
2309it [00:14, 158.70it/s][A
2326it [00:14, 159.85it/s][A
2343it [00:14, 159.82it/s][A
2359it [00:14, 159.78it/s][A
2376it [00:14, 161.89it/s][A
2393it [00:15, 157.49it/s][A
2409it [00:15, 156.75it/s][A
2425it [00:15, 155.11it/s][A
2442it [00:15, 158.49it/s][A
2459it [00:15, 159.92it/s][A
2476it [00:15, 160.71it/s][A
2493it [00:15, 160.67it/s][A
2510it [00:15, 160.77it/s][A
2527it [00:15, 162.50it/s][A
2544it [00:15, 163.00it/s][A
2561it [00:16, 164.42it/s][A
2578it [00:16, 161.31it/s][A
2595it [00:16, 161.54it/s][A
2612it [00:16, 150.57it/s][A
2629it [00:16, 155.86it/s][A
2646it [00:16, 159.65it/s][A
2663