**使用gensim训练word2vec**

In [1]:
import logging
import random

import numpy as np
import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

# set seed 
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x23cbd884bd0>

In [7]:
# split data to 10 fold
fold_num = 10
data_file = r'E:\deeplearning_data\NewsTextClassification_Data\train_set.csv'
import pandas as pd

In [8]:
def all_data2fold(fold_num):
    fold_data = []
    f = pd.read_csv(data_file, sep='\t', encoding='UTF-8')
    texts = f['text'].tolist()
    labels = f['label'].tolist()

    total = len(labels)

    index = list(range(total))
    np.random.shuffle(index)

    all_texts = []
    all_labels = []
    for i in index:
        all_texts.append(texts[i])
        all_labels.append(labels[i])

    # 对应前面章节的代码
    label2id = {}  # 这里储存的是 标签对应 内容的编码索引 i 么？
    for i in range(total):
        label = str(all_labels[i])
        if label not in label2id:
            label2id[label] = [i]
        else:
            label2id[label].append(i)
            
    all_index = [[] for _ in range(fold_num)]
    for label, data in label2id.items():
        # print(label, len(data))
        batch_size = int(len(data) / fold_num)
        other = len(data) - batch_size * fold_num
        for i in range(fold_num):
            cur_batch_size = batch_size + 1 if i < other else batch_size
            # print(cur_batch_size)
            batch_data = [data[i * batch_size + b] for b in range(cur_batch_size)]
            all_index[i].extend(batch_data) #存入索引
            
    batch_size = int(total / fold_num)
    other_texts = []
    other_labels = []
    other_num = 0
    start = 0
    for fold in range(fold_num):
        num = len(all_index[fold])
        texts = [all_texts[i] for i in all_index[fold]] #依据索引找到数据
        labels = [all_labels[i] for i in all_index[fold]]

        if num > batch_size: #裁剪
            fold_texts = texts[:batch_size]
            other_texts.extend(texts[batch_size:])
            fold_labels = labels[:batch_size]
            other_labels.extend(labels[batch_size:])
            other_num += num - batch_size
        elif num < batch_size:
            end = start + batch_size - num
            fold_texts = texts + other_texts[start: end]
            fold_labels = labels + other_labels[start: end]
            start = end
        else:
            fold_texts = texts
            fold_labels = labels

        assert batch_size == len(fold_labels)

        # shuffle
        index = list(range(batch_size))
        np.random.shuffle(index)

        shuffle_fold_texts = []
        shuffle_fold_labels = []
        for i in index:
            shuffle_fold_texts.append(fold_texts[i])
            shuffle_fold_labels.append(fold_labels[i])

        data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts}
        fold_data.append(data)

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))

    return fold_data


fold_data = all_data2fold(10)

2021-09-24 17:09:36,694 INFO: Fold lens [20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000]


In [12]:
fold_data[0]['text'].__len__()

20000

In [16]:
fold_data[0]['text'][2].__len__()

1584

In [27]:
# build train data for word2vec
fold_id = 9

train_texts = []
for i in range(0, fold_id):
    data = fold_data[i]
    train_texts.extend(data['text'])
    
logging.info('Total %d docs.' % len(train_texts))

2021-09-24 17:21:09,597 INFO: Total 180000 docs.


In [28]:
train_texts[179999]

'6811 1580 7539 6043 3099 980 3223 980 2935 443 2465 2827 5165 307 6656 5619 5883 5736 1953 2899 1315 7539 6043 3099 980 3223 980 2935 443 648 5780 1722 1816 2827 4151 5165 7194 5057 669 3097 7239 1953 7123 3484 6043 3750 3618 7206 2729 5393 6043 3099 6111 5560 6043 3099 7096 1394 4939 6122 5520 2109 6248 1031 5598 3750 1474 4151 648 2827 4151 5165 7194 2212 742 669 3800 656 4490 490 7123 803 5619 5883 5736 1953 1635 648 2891 2210 3015 4148 6017 2770 3300 6038 4231 648 5530 2835 340 3893 2265 2003 900 5028 4223 531 893 4315 3809 1066 6284 2119 2465 3283 2119 4301 3560 648 2827 6485 3661 4089 3477 2119 3694 2465 2851 1018 4301 2119 5057 151 4211 5530 4216 6122 4853 6734 1394 5282 137 623 6656 4490 900 5619 5883 5736 1953 7399 4967 5530 299 1767 5915 1696 3901 669 5169 3750 2345 1060 5560 4558 1060 4939 4603 5948 1699 2975 3750 3440 1953 3692 3694 3809 4128 5915 2975 5393 3901 4659 3750 5998 1460 6040 152 6833 4939 307 6656 4659 6122 1877 2316 671 5598 900 565 742 4811 5445 1323 3800 750

In [1]:
# logging.info('Start training...')
# from gensim.models.word2vec import Word2Vec

# num_features = 100     # Word vector dimensionality
# num_workers = 8       # Number of threads to run in parallel

# train_texts = list(map(lambda x: list(x.split()), train_texts))
# model = Word2Vec(train_texts, workers=num_workers, vector_size=num_features)
# model.init_sims(replace=True)

# # save model
# model.save("./word2vec.bin")

In [2]:
# Word2Vec??

**注：训练后得到的 word2vec 已经保存在 ./data 文件夹中。**