## 包引入与变量准备

In [1]:
import os
import sys
import csv
sys.path.append("./global")
from helper import Timer, usetime, OFFLINE, cp

In [2]:
# Locations of the original (raw) training and test data.
if not OFFLINE:
    # Online run: read the competition input files.
    ori_train_file = "/home/kesci/input/bytedance/first-round/train.csv"
    ori_test_file = "/home/kesci/input/bytedance/first-round/test.csv"
else:
    # Offline run: use the local legacy copies; a test label file is also available.
    ori_train_file = "./legacy/data/train.csv"
    ori_test_file = "./legacy/data/test.csv"
    ori_test_label_file = "./legacy/data/test_label.csv"

In [3]:
# Stage 0 settings.
# Stage-0 outputs are written into the stage-1 input directory.
stage0_output_dir = "./stage1/input/"

## Training data
if not OFFLINE:
    # Online run: train on a subset of the original training file.
    random_sample = False
    if not random_sample:
        # Default: keep the first 5,000,000 rows.
        sample_num = 5000000
        train_file = stage0_output_dir + "train_front_500w.csv"
    else:
        # Alternative: random sampling at the given rate.
        random_rate = 0.05
        train_file = stage0_output_dir + "train_random_005.csv"
else:
    train_file = stage0_output_dir + "train.csv"

## Test data
test_file = stage0_output_dir + "test.csv"
# Offline runs additionally have a label file for the test set.
if OFFLINE:
    test_label_file = stage0_output_dir + "test_label.csv"

In [4]:
# Stage 1 directory layout.
stage1_dir = "./stage1/"
# Backward-compatible alias: this variable was originally misspelled "state1_dir".
state1_dir = stage1_dir
stage1_code_dir = stage1_dir + "code/"      # stage-1 source code
stage1_input_dir = stage1_dir + "input/"    # inputs consumed by stage 1
stage1_output_dir = stage1_dir + "output/"  # artifacts produced by stage 1

In [5]:
# Stage 2 directory layout.
stage2_dir = "./stage2/"
# Backward-compatible alias: this variable was originally misspelled "state2_dir".
state2_dir = stage2_dir
stage2_input_dir = stage2_dir + "input/"    # inputs consumed by stage 2
stage2_output_dir = stage2_dir + "output/"  # artifacts produced by stage 2

In [6]:
# Master switch: when True, every "regenerate_all or ..." guard in this notebook
# fires and all stages are rebuilt.
regenerate_all = False  # use this flag with caution

## Stage0阶段

这一阶段主要是数据准备，且只需要运行一次。

In [7]:
from stage0.code.data_prepare import RandomSampleCSV, SampleCSV
# When True, the stage-0 data copy/sampling cells and the label-extraction cell re-run.
regenerate_data = False

In [8]:
# Prepare the training data (only when regeneration is requested).
if regenerate_all or regenerate_data:
    if OFFLINE:
        # Offline: use the original training file as-is.
        cp(ori_train_file, train_file)
    elif random_sample:
        # Online: randomly sample a `rate` fraction of the original training data.
        RandomSampleCSV(source_csv=ori_train_file, save_file=train_file, rate=random_rate)
    else:
        # Online: keep only the first `count` rows of the original training data.
        SampleCSV(source_csv=ori_train_file, save_file=train_file, count=sample_num)

In [9]:
# Prepare the test data (only when regeneration is requested).
if regenerate_all or regenerate_data:
    # The test file is copied in both offline and online modes.
    cp(ori_test_file, test_file)
    if OFFLINE:
        # Offline runs also copy the test labels.
        cp(ori_test_label_file, test_label_file)

## Stage1阶段

### 提取词向量

In [10]:
from stage1.code.build_word2vec import PrepareWord2vecSamples, TrainWord2vec
# Sentences file used as the word2vec training input (written to the stage-1 input dir).
stage1_word2vec_sentences = stage1_input_dir+"word2vec_sentences.txt"
# Base path of the word2vec model (the feature cells below load this path + ".kv").
stage1_word2vec_model = stage1_output_dir+"word2vec"
regenerate_embedding_samples = False  # rebuild the sample set used for embedding training
regenerate_embedding_model = False  # retrain the embedding model

#### Word2vec

1. 生成Word2vec可直接训练的sentences文件
2. 训练word2vec

In [11]:
# Build the sample set that word2vec can train on directly.
rebuild_w2v_samples = regenerate_all or regenerate_embedding_samples
if rebuild_w2v_samples:
    PrepareWord2vecSamples(train_file, stage1_word2vec_sentences)

----->Started 'Prepare for word2vec samples' block...
----->Finished 'Prepare for word2vec samples' block, time used: 0.07s.


In [12]:
# Train the word2vec model from the prepared sentences file.
rebuild_w2v_model = regenerate_all or regenerate_embedding_model
if rebuild_w2v_model:
    TrainWord2vec(stage1_word2vec_sentences, stage1_word2vec_model)

----->Started 'Train word2vec' block...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


----->Finished 'Train word2vec' block, time used: 0.98s.


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


#### FastText

FastText的使用：https://blog.csdn.net/ymaini/article/details/81489599

1. 生成适合fastText训练的文本格式
2. 有监督训练

In [13]:
# ProcessForTrainFastText(train_file_data, stage1_input_dir+"fastText_labeled_content.txt", add_label=True)

In [14]:
# !cd stage1/code; python build_fastText.py -f "../input/fastText_labeled_content.txt" -d "../output/" -s "fastText_supervised.bin"

### 特征抽取

In [15]:
from stage1.code.feature_text import ExtractTextFeature
from stage1.code.feature_embed import ExtractEmbedFeature
from helper import ORI_TRAIN_NAMES, ORI_TEST_NAMES, ORI_TRAIN_DTYPE, ORI_TEST_DTYPE
# Version tag embedded in feature file names (e.g. prefix "train_v1" / "test_v1").
feature_prefix = "v1"
# Rebuild both feature families (text and embedding) and the concatenated output.
regenerate_feature = False
# Rebuild only the text-mining features.
regenerate_text_feature = False
# Rebuild only the embedding features.
regenerate_embed_feature = False

#### Text Mining Features

In [16]:
# Extract text-mining features for the training set.
rebuild_train_text = regenerate_all or regenerate_feature or regenerate_text_feature
if rebuild_train_text:
    ExtractTextFeature(
        train_file,
        save_dir=stage1_output_dir,
        prefix="train_{}".format(feature_prefix),
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        # The training file has a label column, so it is dropped here as well.
        drop_cols=['query', 'title', 'label', "query_id", "query_title_id"],
    )

----->Started 'Extract text feature' block...
----->Finished 'Extract text feature' block, time used: 10.8s.


In [17]:
# Extract text-mining features for the test set.
rebuild_test_text = regenerate_all or regenerate_feature or regenerate_text_feature
if rebuild_test_text:
    ExtractTextFeature(
        test_file,
        save_dir=stage1_output_dir,
        prefix="test_{}".format(feature_prefix),
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', "query_id", "query_title_id"],
    )

----->Started 'Extract text feature' block...
----->Finished 'Extract text feature' block, time used: 3.5s.


#### Embedding Features

In [None]:
# Extract embedding features for the training set using the word2vec vectors.
rebuild_train_embed = regenerate_all or regenerate_feature or regenerate_embed_feature
if rebuild_train_embed:
    ExtractEmbedFeature(
        train_file,
        save_dir=stage1_output_dir,
        prefix="train_{}".format(feature_prefix),
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        # The training file has a label column, so it is dropped here as well.
        drop_cols=['query', 'title', 'label', "query_id", "query_title_id"],
        embed_mode="word2vec",
        embed_model_file=stage1_word2vec_model+".kv",
    )

----->Started 'Extract embedding feature' block...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Extract embedding features for the test set using the word2vec vectors.
rebuild_test_embed = regenerate_all or regenerate_feature or regenerate_embed_feature
if rebuild_test_embed:
    ExtractEmbedFeature(
        test_file,
        save_dir=stage1_output_dir,
        prefix="test_{}".format(feature_prefix),
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', "query_id", "query_title_id"],
        embed_mode="word2vec",
        embed_model_file=stage1_word2vec_model+".kv",
    )

### Prepare for Stage2

本节是将一些stage1产生的结果经过处理后放到stage2的input里面去。

In [None]:
from stage1.code.prepare_for_stage2 import ExtractTrainLabel, ConvertCSVToNPY, CombineFeatures

# Stage-1 artifacts (CSV files in the stage-1 output directory)
## text-mining features
train_text_feature_stage1 = stage1_output_dir + "train_{}_feature_text.csv".format(feature_prefix)
test_text_feature_stage1 = stage1_output_dir + "test_{}_feature_text.csv".format(feature_prefix)
## embedding features
train_embed_feature_stage1 = stage1_output_dir + "train_{}_feature_embed.csv".format(feature_prefix)
test_embed_feature_stage1 = stage1_output_dir + "test_{}_feature_embed.csv".format(feature_prefix)
## concatenation of the text and embedding features
train_concat_feature_stage1 = stage1_output_dir + "train_{}_feature_concat.csv".format(feature_prefix)
test_concat_feature_stage1 = stage1_output_dir + "test_{}_feature_concat.csv".format(feature_prefix)

# Stage-2 inputs (.npy files in the stage-2 input directory)
## concatenated features
train_feature_stage2 = stage2_input_dir + "train_{}_feature_concat.npy".format(feature_prefix)
test_feature_stage2 = stage2_input_dir + "test_{}_feature_concat.npy".format(feature_prefix)
## labels; test labels are only set offline, otherwise None
train_labels_stage2 = stage2_input_dir + "train_labels.npy"
test_labels_stage2 = (stage2_input_dir + "test_labels.npy") if OFFLINE else None

In [None]:
# Extract labels and store them separately for stage 2.
if regenerate_all or regenerate_data:
    # The training labels are always extracted.
    label_jobs = [(train_file, train_labels_stage2)]
    if OFFLINE:
        # Offline runs also extract the test-set labels.
        label_jobs.append((test_label_file, test_labels_stage2))
    for label_source, label_target in label_jobs:
        ExtractTrainLabel(label_source, label_target)

In [None]:
# Concatenate the stage-1 feature files and export them to stage 2.
# The concatenated CSV and the stage-2 .npy are derived from BOTH the text and the
# embedding feature files, so they are rebuilt whenever either family was regenerated
# (not only on regenerate_feature); otherwise stage 2 would train on stale features.
features_changed = (
    regenerate_all
    or regenerate_feature
    or regenerate_text_feature
    or regenerate_embed_feature
)
if features_changed:
    # Merge the text + embedding features into one CSV per split.
    CombineFeatures([train_text_feature_stage1, train_embed_feature_stage1], train_concat_feature_stage1)
    CombineFeatures([test_text_feature_stage1, test_embed_feature_stage1], test_concat_feature_stage1)
    # Convert the merged CSVs into the .npy files that stage 2 loads.
    ConvertCSVToNPY(train_concat_feature_stage1, train_feature_stage2)
    ConvertCSVToNPY(test_concat_feature_stage1, test_feature_stage2)

## Stage2阶段

In [None]:
from stage2.code.run_hyperopt import LoadDataset, run_lgb_gbdt, run_lgb_dart

In [None]:
# Load the dataset for stage 2.
# in_test_label is None unless OFFLINE is set (see test_labels_stage2).
LoadDataset(in_train_feature=train_feature_stage2, 
            in_train_label=train_labels_stage2, 
            in_test_feature=test_feature_stage2,
            in_test_label=test_labels_stage2)

In [None]:
# Run the LightGBM GBDT routine from stage2.code.run_hyperopt.
# NOTE(review): eval_num is presumably the number of hyperopt evaluations — confirm.
run_lgb_gbdt(eval_num=20)

In [None]:
# run_lgb_dart(eval_num=20)