In [1]:
# Install hyperopt and pyemd from the package index at http://pypi/simple.
# --trusted-host is passed because that index is served over plain HTTP.
! pip install  --index "http://pypi/simple" --trusted-host pypi  hyperopt
! pip install  --index "http://pypi/simple" --trusted-host pypi  pyemd

Looking in indexes: http://pypi/simple
Looking in indexes: http://pypi/simple


## 包引入与变量准备

In [2]:
import os
import sys
import csv
import numpy as np
sys.path.append("./global")
from helper import Timer, usetime, OFFLINE, cp

In [3]:
# Override the OFFLINE flag imported from helper: False selects the online
# code paths in the cells below (sampled training data, no test labels).
OFFLINE = False

In [4]:
# Locations of the raw training and test data (identical offline and online).
ori_train_file = "./legacy/data/train.csv"
ori_test_file = "./legacy/data/test.csv"
if OFFLINE:
    # A separate test-label file is only configured for offline runs.
    ori_test_label_file = "./legacy/data/test_label.csv"
# Alternative online locations, kept for reference:
#     ori_train_file = "/home/kesci/input/bytedance/first-round/train.csv"
#     ori_test_file = "/home/kesci/input/bytedance/first-round/test.csv"

In [5]:
# Stage 0: directory that receives the prepared training / test files.
# It is the same path as stage 1's input directory ("./stage1/input/").
stage0_output_dir = "./stage1/input/"

## Training data
if OFFLINE:
    train_file = stage0_output_dir + "train.csv"
else:
    # Sampling strategy for the training set:
    # one of "random", "pre_fixed" or "orderly_sample".
    sample_way = "orderly_sample"
    if sample_way == "random":
        # Fraction of the source rows to sample at random.
        random_rate = 0.05
        train_file = stage0_output_dir + "train_random_005.csv"
    elif sample_way == "pre_fixed":
        # Take the first 5,000,000 rows by default.
        sample_num = 5000000
        train_file = stage0_output_dir + "train_front_500w.csv"
    elif sample_way == "orderly_sample":
        sample_rate = 0.05
        train_file = stage0_output_dir + "train_orderly_500w.csv"
    else:
        # Fail here instead of hitting a NameError (or silently reusing a
        # stale train_file left in the kernel) in a later cell.
        raise ValueError("Unknown sample_way: {!r}".format(sample_way))

## Test data
test_file = stage0_output_dir + "test.csv"
# A test-label file is only available offline.
if OFFLINE:
    test_label_file = stage0_output_dir + "test_label.csv"

In [6]:
# Stage 1 directory layout.
stage1_dir = "./stage1/"
# Backward-compatible alias for the original, misspelled name.
state1_dir = stage1_dir
stage1_code_dir = stage1_dir + "code/"
stage1_input_dir = stage1_dir + "input/"
stage1_output_dir = stage1_dir + "output/"

In [7]:
# Stage 2 directory layout.
stage2_dir = "./stage2/"
# Backward-compatible alias for the original, misspelled name.
state2_dir = stage2_dir
stage2_input_dir = stage2_dir + "input/"
stage2_output_dir = stage2_dir + "output/"

In [8]:
# Master switch: when True, every `regenerate_*`-guarded cell below re-runs
# regardless of its own flag.
regenerate_all = False  # use this flag with caution

## Stage0阶段

这一阶段主要是数据准备，且只需要运行一次。

In [9]:
from stage0.code.data_prepare import RandomSampleCSV, SampleCSV, SampleWithRate
# Set to True to re-run the stage 0 data preparation and the label extraction further below.
regenerate_data = False

In [10]:
if regenerate_all or regenerate_data:
    # Prepare the training data.
    if OFFLINE:
        # Offline: hand the original training file to cp (presumably a plain copy).
        cp(ori_train_file, train_file)
    elif sample_way == "random":
        # Randomly sample a `random_rate` fraction of the source rows for training.
        RandomSampleCSV(
            source_csv=ori_train_file,
            save_file=train_file,
            rate=random_rate,
        )
    elif sample_way == "pre_fixed":
        # Keep the first `sample_num` rows for training.
        SampleCSV(
            source_csv=ori_train_file,
            save_file=train_file,
            count=sample_num,
        )
    elif sample_way == "orderly_sample":
        SampleWithRate(
            source_csv=ori_train_file,
            save_file=train_file,
            rate=sample_rate,
            chunk_size=5000,
        )

----->Started 'Sample with rate' block...
processing chunk...
processing chunk...
processing chunk...
processing chunk...
Finished process.
----->Finished 'Sample with rate' block, time used: 0.1s.


In [11]:
if regenerate_all or regenerate_data:
    # Prepare the test data: the test file is needed both offline and online.
    cp(ori_test_file, test_file)
    if OFFLINE:
        # The label file only exists for offline runs.
        cp(ori_test_label_file, test_label_file)

## Stage1阶段

### 提取词向量

In [12]:
from stage1.code.build_word2vec import PrepareWord2vecSamples, TrainWord2vec
# Sentence file fed to word2vec training, and the path prefix of the saved model.
stage1_word2vec_sentences = "{}word2vec_sentences.txt".format(stage1_input_dir)
stage1_word2vec_model = "{}word2vec".format(stage1_output_dir)
# Re-create the sample set used for embedding training.
regenerate_embedding_samples = False
# Re-train the embedding model.
regenerate_embedding_model = False

#### Word2vec

1. 生成Word2vec可直接训练的sentences文件
2. 训练word2vec

In [13]:
# Build the sentence file that word2vec can train on directly.
if regenerate_all or regenerate_embedding_samples:
    PrepareWord2vecSamples(train_file, stage1_word2vec_sentences)

In [14]:
# Train the word2vec model from the prepared sentence file.
if regenerate_all or regenerate_embedding_model:
    TrainWord2vec(stage1_word2vec_sentences, stage1_word2vec_model)

#### FastText

FastText的使用：https://blog.csdn.net/ymaini/article/details/81489599

1. 生成适合fastText训练的文本格式
2. 有监督训练

In [15]:
# ProcessForTrainFastText(train_file_data, stage1_input_dir+"fastText_labeled_content.txt", add_label=True)

In [16]:
# !cd stage1/code; python build_fastText.py -f "../input/fastText_labeled_content.txt" -d "../output/" -s "fastText_supervised.bin"

### 特征抽取

In [17]:
from stage1.code.feature_text import ExtractTextFeature
from stage1.code.feature_embed import ExtractEmbedFeature
from stage1.code.feature_vector_space import ExtractVectorSpaceFeature
from helper import ORI_TRAIN_NAMES, ORI_TEST_NAMES, ORI_TRAIN_DTYPE, ORI_TEST_DTYPE
# Version tag embedded in the names of the generated feature files.
feature_prefix = "v1"
# Switches for re-extracting features: regenerate_feature covers all three
# feature groups, the others cover one group each.
regenerate_feature = False
regenerate_text_feature = False
regenerate_embed_feature = False
regenerate_vector_feature = True

#### Text Mining Features

In [18]:
if regenerate_all or regenerate_feature or regenerate_text_feature:
    # Text-mining features for the training set.
    ExtractTextFeature(
        train_file,
        save_dir=stage1_output_dir,
        prefix="train_{}".format(feature_prefix),
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', 'label', "query_id", "query_title_id"]
    )

In [19]:
if regenerate_all or regenerate_feature or regenerate_text_feature:
    # Text-mining features for the test set (no 'label' column to drop).
    ExtractTextFeature(
        test_file,
        save_dir=stage1_output_dir,
        prefix="test_{}".format(feature_prefix),
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', "query_id", "query_title_id"]
    )

#### Embedding Features

In [20]:
if regenerate_all or regenerate_feature or regenerate_embed_feature:
    # Embedding features for the training set, using the saved word2vec ".kv" file.
    ExtractEmbedFeature(
        train_file,
        save_dir=stage1_output_dir,
        prefix="train_{}".format(feature_prefix),
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', 'label', "query_id", "query_title_id"],
        embed_mode="word2vec",
        embed_model_file=stage1_word2vec_model+".kv"
    )

In [21]:
if regenerate_all or regenerate_feature or regenerate_embed_feature:
    # Embedding features for the test set, using the saved word2vec ".kv" file.
    ExtractEmbedFeature(
        test_file,
        save_dir=stage1_output_dir,
        prefix="test_{}".format(feature_prefix),
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', "query_id", "query_title_id"],
        embed_mode="word2vec",
        embed_model_file=stage1_word2vec_model+".kv"
    )

#### Vector space feature

In [22]:
if regenerate_all or regenerate_feature or regenerate_vector_feature:
    # Vector-space features for the training set.
    ExtractVectorSpaceFeature(
        train_file,
        save_dir=stage1_output_dir,
        prefix="train_{}".format(feature_prefix),
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', 'label', "query_id", "query_title_id"]
    )

----->Started 'Extract vector space feature' block...
----->Started 'Extract one chunk feature' block...
----->Started 'Fit tf-idf' block...
----->Finished 'Fit tf-idf' block, time used: 0.13s.
----->Started 'transform df1' block...
----->Finished 'transform df1' block, time used: 0.25s.
----->Started 'transform df2' block...
----->Finished 'transform df2' block, time used: 0.22s.
----->Finished 'Extract one chunk feature' block, time used: 0.71s.
file saved to ./stage1/output/train_v1_feature_vector_space.csv
----->Finished 'Extract vector space feature' block, time used: 0.77s.


In [23]:
if regenerate_all or regenerate_feature or regenerate_vector_feature:
    # Vector-space features for the test set.
    ExtractVectorSpaceFeature(
        test_file,
        save_dir=stage1_output_dir,
        prefix="test_{}".format(feature_prefix),
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', "query_id", "query_title_id"]
    )

----->Started 'Extract vector space feature' block...
----->Started 'Extract one chunk feature' block...
----->Started 'Fit tf-idf' block...
----->Finished 'Fit tf-idf' block, time used: 0.81s.
----->Started 'transform df1' block...
----->Finished 'transform df1' block, time used: 1.77s.
----->Started 'transform df2' block...
----->Finished 'transform df2' block, time used: 2.05s.
----->Finished 'Extract one chunk feature' block, time used: 5.11s.
file saved to ./stage1/output/test_v1_feature_vector_space.csv
----->Finished 'Extract vector space feature' block, time used: 5.18s.


### Prepare for Stage2

本节是将一些stage1产生的结果经过处理后放到stage2的input里面去。

In [24]:
from stage1.code.prepare_for_stage2 import ExtractTrainLabel, ConvertCSVToNPY, CombineFeatures

# Stage 1 outputs (CSV), one file per feature group.
## text features
train_text_feature_stage1 = "{}train_{}_feature_text.csv".format(stage1_output_dir, feature_prefix)
test_text_feature_stage1 = "{}test_{}_feature_text.csv".format(stage1_output_dir, feature_prefix)
## embedding features
train_embed_feature_stage1 = "{}train_{}_feature_embed.csv".format(stage1_output_dir, feature_prefix)
test_embed_feature_stage1 = "{}test_{}_feature_embed.csv".format(stage1_output_dir, feature_prefix)
## vector space features
train_vector_feature_stage1 = "{}train_{}_feature_vector_space.csv".format(stage1_output_dir, feature_prefix)
test_vector_feature_stage1 = "{}test_{}_feature_vector_space.csv".format(stage1_output_dir, feature_prefix)
## all feature groups concatenated
train_concat_feature_stage1 = "{}train_{}_feature_concat.csv".format(stage1_output_dir, feature_prefix)
test_concat_feature_stage1 = "{}test_{}_feature_concat.csv".format(stage1_output_dir, feature_prefix)

# Stage 2 inputs (NPY).
## concatenated features
train_feature_stage2 = "{}train_{}_feature_concat.npy".format(stage2_input_dir, feature_prefix)
test_feature_stage2 = "{}test_{}_feature_concat.npy".format(stage2_input_dir, feature_prefix)
## labels; a test-label path is only set for offline runs, otherwise it stays None
train_labels_stage2 = "{}train_labels.npy".format(stage2_input_dir)
test_labels_stage2 = "{}test_labels.npy".format(stage2_input_dir) if OFFLINE else None

In [25]:
# Extract labels. This cell is gated by regenerate_data, not by the feature flags.
if regenerate_all or regenerate_data:
    # Extract the training-set labels and store them in their own file.
    ExtractTrainLabel(train_file, train_labels_stage2)
    if OFFLINE:
        # Extract the test-set labels and store them in their own file.
        ExtractTrainLabel(test_label_file, test_labels_stage2)

----->Started 'Extract label' block...
Part of labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Label file saved to ./stage2/input/train_labels.npy
----->Finished 'Extract label' block, time used: 0.0s.


In [26]:
# Concatenate the stage 1 feature files and export the result to stage 2 as .npy.
# Runs only when at least one feature group was flagged for regeneration; the
# previous trailing `or True` made this guard always pass, so the step re-ran
# on every execution regardless of the flags.
if (regenerate_all or regenerate_feature or regenerate_text_feature
        or regenerate_embed_feature or regenerate_vector_feature):
    # Merge the text, embedding and vector-space feature CSVs per data split.
    CombineFeatures([train_text_feature_stage1, train_embed_feature_stage1, train_vector_feature_stage1],
                    train_concat_feature_stage1)
    CombineFeatures([test_text_feature_stage1, test_embed_feature_stage1, test_vector_feature_stage1],
                    test_concat_feature_stage1)
    # Convert the concatenated CSVs into the .npy files that stage 2 loads.
    ConvertCSVToNPY(train_concat_feature_stage1, train_feature_stage2)
    ConvertCSVToNPY(test_concat_feature_stage1, test_feature_stage2)

----->Started 'Concat features' block...
----->Finished 'Concat features' block, time used: 0.19s.
----->Started 'Change inf to nan' block...
----->Finished 'Change inf to nan' block, time used: 0.03s.
----->Started 'Fill nan' block...
----->Finished 'Fill nan' block, time used: 0.0s.
----->Started 'Concat features' block...
----->Finished 'Concat features' block, time used: 0.06s.
----->Started 'Change inf to nan' block...
----->Finished 'Change inf to nan' block, time used: 0.0s.
----->Started 'Fill nan' block...
----->Finished 'Fill nan' block, time used: 0.0s.
----->Started 'Convert CSV To NPY' block...
Part of rows: [[ 3.80000000e+01  1.80000000e+01  3.66356165e+00  2.94443898e+00
   1.22448980e-01  6.52173913e-02  2.99206349e-01  4.48015873e-01
   3.70000000e+01  1.70000000e+01  3.63758616e+00  2.89037176e+00
   3.70370370e-02  1.88679245e-02  2.94723485e-01  3.85405710e-01
   3.60000000e+01  1.60000000e+01  3.61091791e+00  2.83321334e+00
   0.00000000e+00  0.00000000e+00  2.9030

In [27]:
# Sanity check: load the stage 2 training feature array and print its shape.
tmp = np.load(train_feature_stage2)
print(tmp.shape)

(15000, 38)


## Stage2阶段

In [28]:
from stage2.code.run_hyperopt import LoadDataset, run_lgb_gbdt, run_lgb_dart

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [29]:
# Load the stage 2 dataset (train/test features and labels).
# NOTE(review): when OFFLINE is False, test_labels_stage2 is None, and the
# recorded output of this cell ends in
# "TypeError: expected str, bytes or os.PathLike object, not NoneType"
# while running _load_test. Presumably LoadDataset loads the test-label path
# unconditionally - confirm in stage2/code/run_hyperopt.py and either skip a
# missing label file there or run with OFFLINE = True.
LoadDataset(in_train_feature=train_feature_stage2, 
            in_train_label=train_labels_stage2, 
            in_test_feature=test_feature_stage2,
            in_test_label=test_labels_stage2)

-------------------------------------
[Running function]: _load_train
[Used Time]: 0:00:00.004904
-------------------------------------
-------------------------------------
[Running function]: _load_test


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [None]:
# Run the LightGBM GBDT search on the loaded dataset.
# eval_num is presumably the number of hyperopt evaluations - TODO confirm in run_hyperopt.
run_lgb_gbdt(eval_num=10)

In [None]:
# run_lgb_dart(eval_num=20)