## 训练word2vec

In [1]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.callbacks import CallbackAny2Vec
import time, csv
# Source training CSV; in this file it is only referenced by the commented-out
# PrepareWord2vecSamples call.
ori_train_file = "/home/kesci/input/bytedance/first-round/train.csv"
# One-sentence-per-line corpus file that LineSentence streams for
# build_vocab() and train().
word2vec_train_sentences = "./inputs/word2vec_train_sentences.txt"
# Directory and file name handed to EpochSaver, which joins them into the
# path the model is saved to.
model_save_dir = "./outputs/models"
model_save_file = "word2vec.model"
import os

In [2]:
# Master switch: the model-construction, vocabulary-building and training
# cells only run when this is True.
regen_word2vec = False

将数据处理成适合word2vec模型直接读取的格式：每行一个句子。

In [3]:
def PrepareWord2vecSamples(source_csv, save_file):
    """Flatten a comma-separated file into a one-sentence-per-line text file.

    For every non-empty CSV row, column 3 is written as its own line.
    Column 1 is written once, just before it, whenever column 0 differs from
    the previous row's column 0. Rows that share a column-0 value are
    therefore expected to be adjacent; otherwise column 1 is emitted again
    each time the value reappears.
    NOTE(review): columns 0/1/3 look like query id / query text / candidate
    text -- confirm against the dataset description.

    Blank lines in the CSV are skipped instead of raising ``IndexError``.
    Both files are handled as UTF-8, the encoding gensim's ``LineSentence``
    uses when it later reads the output.

    Args:
        source_csv: Path of the input CSV file.
        save_file: Path of the text file to create (overwritten if present).

    Returns:
        None. Progress and the elapsed time are printed to stdout.
    """
    # Sentinel: csv fields are always str, so the first row never matches it.
    query_id = -1
    since = time.time()
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for files passed to csv.reader.
    with open(save_file, 'w', encoding='utf-8') as f, \
            open(source_csv, encoding='utf-8', newline='') as csv_file:
        line_count = 0
        for row in csv.reader(csv_file, delimiter=','):
            if not row:
                # csv.reader yields [] for a blank line; nothing to write.
                continue
            line_count += 1
            if row[0] != query_id:
                # First row of a new group: emit its column-1 text once.
                query_id = row[0]
                f.write("{0}\n".format(row[1]))
            f.write("{0}\n".format(row[3]))
            if line_count % 5000000 == 0:
                print(f'Processed {line_count} lines.')
    end = time.time()
    print("Process use time: %ds" % (end-since))
# PrepareWord2vecSamples(ori_train_file, word2vec_train_sentences)

建立用于记录word2vec训练过程（每个epoch的损失与耗时）并保存最优模型的回调类。

In [5]:
class EpochSaver(CallbackAny2Vec):
    """Training callback that reports per-epoch loss and saves the best model.

    After every epoch it prints the loss accumulated during that epoch and
    the wall-clock time the epoch took. Whenever the epoch loss is positive
    and lower than any seen before, the model is saved to ``save_path``
    (overwriting the previous save).

    Args:
        savedir: Directory to save into; created if it does not exist.
        save_name: File name of the saved model inside ``savedir``.
    """

    def __init__(self, savedir, save_name="word2vector.model"):
        os.makedirs(savedir, exist_ok=True)
        self.save_path = os.path.join(savedir, save_name)
        self.epoch = 0      # number of epochs finished so far
        self.pre_loss = 0   # cumulative loss reported at the previous epoch end
        # Start at +inf so the first valid epoch always counts as an
        # improvement, whatever the magnitude of its loss.
        self.best_loss = float("inf")
        self.since = time.time()  # start time of the epoch being measured

    def on_epoch_end(self, model):
        """Print this epoch's loss and duration; save the model if it improved.

        Args:
            model: The model being trained; must provide
                ``get_latest_training_loss()`` and ``save(path)``.
        """
        self.epoch += 1
        # Per the original author's note, this value is cumulative since the
        # first epoch, so the per-epoch loss is the difference from last time.
        cum_loss = model.get_latest_training_loss()
        epoch_loss = cum_loss - self.pre_loss
        time_taken = time.time() - self.since
        print("Epoch %d, loss: %.2f, time: %dmin %ds" %
                    (self.epoch, epoch_loss, time_taken//60, time_taken%60))
        # Non-positive epoch losses are never treated as an improvement
        # (presumably to ignore epochs where the reported cumulative loss
        # did not grow -- TODO confirm).
        if self.best_loss > epoch_loss and epoch_loss > 0:
            self.best_loss = epoch_loss
            print("Better model. Best loss: %.2f" % self.best_loss)
            model.save(self.save_path)
            print("Model %s save done!" % self.save_path)
        self.pre_loss = cum_loss
        # Restart the timer so the next report covers only the next epoch.
        self.since = time.time()

In [5]:
if regen_word2vec:
    # Step 1: create the model object only. No corpus is passed here, so the
    # vocabulary scan and the training run happen in the later steps.
    model_word2vec = Word2Vec(**{
        "min_count": 2,
        "window": 10,
        "size": 200,
        "workers": 4,
        "batch_words": 100000,
    })

In [6]:
if regen_word2vec:
    # Step 2: one pass over the corpus file to build the vocabulary.
    since = time.time()
    model_word2vec.build_vocab(LineSentence(word2vec_train_sentences),
                               progress_per=20000000)
    time_elapsed = time.time() - since
    # divmod splits the elapsed seconds into (whole minutes, remaining seconds).
    print('Time to build vocab: {:.0f}min {:.0f}s'.format(*divmod(time_elapsed, 60)))

In [7]:
if regen_word2vec:
    # Step 3: train on the same corpus file; EpochSaver prints the per-epoch
    # loss and saves the model whenever that loss improves.
    since = time.time()
    model_word2vec.train(
        LineSentence(word2vec_train_sentences),
        total_examples=model_word2vec.corpus_count,
        epochs=15,
        compute_loss=True,
        report_delay=10 * 60,  # emit a progress log every 10 minutes
        callbacks=[EpochSaver(model_save_dir, model_save_file)],
    )
    time_elapsed = time.time() - since
    # divmod splits the elapsed seconds into (whole minutes, remaining seconds).
    print('Time to train: {:.0f}min {:.0f}s'.format(*divmod(time_elapsed, 60)))