# 使用20W训练集数据，生成词向量

In [1]:
!pip install torch torchvision

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import logging
import random

import numpy as np
import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

# set seed
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f49ec1069d0>

In [3]:
# split data to 10 fold
fold_num = 10
data_file = '../input/train_set.csv'
import pandas as pd


def all_data2fold(fold_num, num=200000):
    fold_data = []
    
    # 读取数据文件
    f = pd.read_csv(data_file, sep='\t', encoding='UTF-8')
    
    # 读取到num到的数据到text
    texts = f['text'].tolist()[:num]
    
    # 读取到num到的数据到label
    labels = f['label'].tolist()[:num]
    
    # 获取标签数
    total = len(labels)

    # 使用标签数，生成列表
    index = list(range(total))
    
    # 打乱顺序
    np.random.shuffle(index)

    # 定义所有text
    all_texts = []
    # 定义所有label
    all_labels = []
    
    for i in index:
        # 把打乱顺序的text，放进所有text
        all_texts.append(texts[i])
        
        # 把打乱顺序的label，放进所有label
        all_labels.append(labels[i])

    label2id = {}
    
    #给每个label赋予索引值
    for i in range(total):
        
        # 按索引顺序 ，从打乱顺序的所有label例取出label
        label = str(all_labels[i])
        
        # 如果 label不在label2id里的话：
        if label not in label2id:
            # 给这个label更新索引值
            label2id[label] = [i]
        else:
            # 给这个label追加索引值
            label2id[label].append(i)
    
    # 定义fold_num个列表
    all_index = [[] for _ in range(fold_num)]
    
    # 遍历label2id里每个label和data
    for label, data in label2id.items():
        # print(label, len(data))
        
        # data和fold_num来确定批量尺寸
        batch_size = int(len(data) / fold_num)
        
        # 
        other = len(data) - batch_size * fold_num
        for i in range(fold_num):
            cur_batch_size = batch_size + 1 if i < other else batch_size
            # print(cur_batch_size)
            batch_data = [data[i * batch_size + b] for b in range(cur_batch_size)]
            all_index[i].extend(batch_data)

    batch_size = int(total / fold_num)
    other_texts = []
    other_labels = []
    other_num = 0
    start = 0
    for fold in range(fold_num):
        num = len(all_index[fold])
        texts = [all_texts[i] for i in all_index[fold]]
        labels = [all_labels[i] for i in all_index[fold]]

        if num > batch_size:
            fold_texts = texts[:batch_size]
            other_texts.extend(texts[batch_size:])
            fold_labels = labels[:batch_size]
            other_labels.extend(labels[batch_size:])
            other_num += num - batch_size
        elif num < batch_size:
            end = start + batch_size - num
            fold_texts = texts + other_texts[start: end]
            fold_labels = labels + other_labels[start: end]
            start = end
        else:
            fold_texts = texts
            fold_labels = labels

        assert batch_size == len(fold_labels)

        # shuffle
        index = list(range(batch_size))
        np.random.shuffle(index)

        shuffle_fold_texts = []
        shuffle_fold_labels = []
        for i in index:
            shuffle_fold_texts.append(fold_texts[i])
            shuffle_fold_labels.append(fold_labels[i])

        data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts}
        fold_data.append(data)

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))

    return fold_data


fold_data = all_data2fold(10)

2020-07-31 02:26:30,536 INFO: generated new fontManager
2020-07-31 02:26:37,617 INFO: Fold lens [20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000]


In [4]:
# build train data for word2vec
fold_id = 9

train_texts = []
for i in range(0, fold_id):
    data = fold_data[i]
    train_texts.extend(data['text'])
    
logging.info('Total %d docs.' % len(train_texts))

2020-07-31 02:26:37,655 INFO: Total 180000 docs.


In [5]:
!pip install -U gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 10.9MB/s eta 0:00:01
[?25hCollecting smart-open>=1.8.1 (from gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/0b/8e/464b06f5efd26f2dc16ce7bd1662c2f31cadf9104fdbcbf5994674cc3a51/smart_open-2.1.0.tar.gz (116kB)
[K     |████████████████████████████████| 122kB 56.2MB/s eta 0:00:01
Collecting requests (from smart-open>=1.8.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/45/1e/0c169c6a5381e241ba7404532c16a21d86ab872c9bed8bdcd4c423954103/requests-2.24.0-py2.py3-none-any.whl (61kB)
[K     |████████████████████████████████| 71kB 28.6MB/s eta 0:00:01
[?25hCollecting boto (from smart-open>=1.8.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/23/10/c0b78c27298029e4454a472a1919bde20cb18

In [6]:
logging.info('Start training...')
from gensim.models.word2vec import Word2Vec

num_features = 100     # Word vector dimensionality
num_workers = 8       # Number of threads to run in parallel

train_texts = list(map(lambda x: list(x.split()), train_texts))
model = Word2Vec(train_texts, workers=num_workers, size=num_features)
model.init_sims(replace=True)

# save model
model.save("../emb/word2vec.bin")

2020-07-31 02:26:51,439 INFO: Start training...
2020-07-31 02:27:10,142 INFO: collecting all words and their counts
2020-07-31 02:27:10,143 INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-31 02:27:11,336 INFO: PROGRESS: at sentence #10000, processed 9131556 words, keeping 5300 word types
2020-07-31 02:27:12,523 INFO: PROGRESS: at sentence #20000, processed 18143665 words, keeping 5689 word types
2020-07-31 02:27:13,703 INFO: PROGRESS: at sentence #30000, processed 27090844 words, keeping 5872 word types
2020-07-31 02:27:14,909 INFO: PROGRESS: at sentence #40000, processed 36068099 words, keeping 6008 word types
2020-07-31 02:27:16,170 INFO: PROGRESS: at sentence #50000, processed 45236085 words, keeping 6121 word types
2020-07-31 02:27:17,390 INFO: PROGRESS: at sentence #60000, processed 54263660 words, keeping 6210 word types
2020-07-31 02:27:18,571 INFO: PROGRESS: at sentence #70000, processed 63258307 words, keeping 6279 word types
2020-07-31 02:27:19

In [7]:
# load model
model = Word2Vec.load("../emb/word2vec.bin")

# convert format
model.wv.save_word2vec_format('../emb/word2vec.txt', binary=False)

2020-07-31 02:32:51,333 INFO: loading Word2Vec object from ../emb/word2vec.bin
2020-07-31 02:32:53,880 INFO: loading wv recursively from ../emb/word2vec.bin.wv.* with mmap=None
2020-07-31 02:32:53,881 INFO: setting ignored attribute vectors_norm to None
2020-07-31 02:32:53,882 INFO: loading vocabulary recursively from ../emb/word2vec.bin.vocabulary.* with mmap=None
2020-07-31 02:32:53,882 INFO: loading trainables recursively from ../emb/word2vec.bin.trainables.* with mmap=None
2020-07-31 02:32:53,884 INFO: setting ignored attribute cum_table to None
2020-07-31 02:32:53,885 INFO: loaded ../emb/word2vec.bin
2020-07-31 02:32:53,896 INFO: storing 5976x100 projection weights into ../emb/word2vec.txt


# 这里转换生成的word2vec.txt,为之后的TextCNN和TextRNN作为数据来源
参考目录结构：
- datawhale
-- app
-- emb
-- input
-- output
-- zip