## 包引入与变量准备

In [1]:
import sys
import csv
sys.path.append("./global")
from helper import Timer, usetime, OFFLINE, cp

In [2]:
# Locations of the raw train/test CSVs: local copies under ./legacy/data when
# OFFLINE is set, the /home/kesci/input path otherwise.
ori_train_file = (
    "./legacy/data/train.csv"
    if OFFLINE
    else "/home/kesci/input/bytedance/first-round/train.csv"
)
ori_test_file = (
    "./legacy/data/test.csv"
    if OFFLINE
    else "/home/kesci/input/bytedance/first-round/test.csv"
)
# A label file for the test set is only defined in offline mode.
if OFFLINE:
    ori_test_label_file = "./legacy/data/test_label.csv"

In [3]:
# Stage 0: directory that receives the prepared train/test CSVs.
stage0_output_dir = "./stage1/input/"

# The test file has the same name in both modes.
test_file = stage0_output_dir + "test.csv"

if OFFLINE:
    # Offline: plain training file name, plus a label file for the test set.
    train_file = stage0_output_dir + "train.csv"
    test_label_file = stage0_output_dir + "test_label.csv"
else:
    # Online: train on a subset, either a random fraction or the leading rows.
    random_sample = False
    if not random_sample:
        # Default: keep the first 5,000,000 rows.
        sample_num = 5000000
        train_file = stage0_output_dir + "train_front_500w.csv"
    else:
        random_rate = 0.05
        train_file = stage0_output_dir + "train_random_005.csv"

In [4]:
# Stage 1 directory layout.
stage1_dir = "./stage1/"
# Backward-compatible alias: the original (misspelled) name is kept so that any
# code still referencing `state1_dir` keeps working.
state1_dir = stage1_dir
stage1_code_dir = stage1_dir + "code/"
stage1_input_dir = stage1_dir + "input/"
stage1_output_dir = stage1_dir + "output/"

In [5]:
# Stage 2 directory layout.
stage2_dir = "./stage2/"
# Backward-compatible alias: the original (misspelled) name is kept so that any
# code still referencing `state2_dir` keeps working.
state2_dir = stage2_dir
stage2_input_dir = stage2_dir + "input/"
stage2_output_dir = stage2_dir + "output/"

## Stage0阶段

这一阶段主要是数据准备，且只需要运行一次。

In [6]:
from stage0.code.data_prepare import RandomSampleCSV, SampleCSV
# Set to True to rebuild the stage-0 train/test files and the extracted label
# files; every cell guarded by this flag is skipped while it is False.
regenerate_data = False

In [7]:
# Build the stage-0 training file; nothing happens unless regenerate_data is set.
if regenerate_data and OFFLINE:
    # Offline: copy the local training CSV.
    cp(ori_train_file, train_file)
elif regenerate_data and random_sample:
    # Randomly sample a `rate` fraction of the (about 100M-row) raw data for training.
    RandomSampleCSV(source_csv=ori_train_file, save_file=train_file, rate=random_rate)
elif regenerate_data:
    # Take the first `count` rows for training.
    SampleCSV(source_csv=ori_train_file, save_file=train_file, count=sample_num)

In [8]:
# Build the stage-0 test files; nothing happens unless regenerate_data is set.
if regenerate_data:
    # The test CSV is copied in both modes.
    cp(ori_test_file, test_file)
    if OFFLINE:
        # The label file only exists offline.
        cp(ori_test_label_file, test_label_file)

## Stage1阶段

### 提取词向量

In [9]:
from stage1.code.build_word2vec import PrepareWord2vecSamples, TrainWord2vec
# Sentences file handed to both PrepareWord2vecSamples and TrainWord2vec.
stage1_word2vec_sentences = stage1_input_dir+"word2vec_sentences.txt"
# Model path handed to TrainWord2vec; the embedding-feature cells pass it on with a ".kv" suffix.
stage1_word2vec_model = stage1_output_dir+"word2vec"
regenerate_embedding_samples = False  # rebuild the sample set used for embedding training
regenerate_embedding_model = False  # rebuild the embedding model

#### Word2vec

1. 生成Word2vec可直接训练的sentences文件
2. 训练word2vec

In [10]:
# Build the sample set used for word2vec training; skipped unless
# regenerate_embedding_samples is True.
if regenerate_embedding_samples:
    PrepareWord2vecSamples(train_file, stage1_word2vec_sentences)

In [11]:
# Build the word2vec model from the sentences file; skipped unless
# regenerate_embedding_model is True.
if regenerate_embedding_model:
    TrainWord2vec(stage1_word2vec_sentences, stage1_word2vec_model)

#### FastText

FastText的使用：https://blog.csdn.net/ymaini/article/details/81489599

1. 生成适合fastText训练的文本格式
2. 有监督训练

In [12]:
# ProcessForTrainFastText(train_file_data, stage1_input_dir+"fastText_labeled_content.txt", add_label=True)

In [13]:
# !cd stage1/code; python build_fastText.py -f "../input/fastText_labeled_content.txt" -d "../output/" -s "fastText_supervised.bin"

### 特征抽取

In [14]:
from stage1.code.feature_text import ExtractTextFeature
from stage1.code.feature_embed import ExtractEmbedFeature
from helper import ORI_TRAIN_NAMES, ORI_TEST_NAMES, ORI_TRAIN_DTYPE, ORI_TEST_DTYPE
# Version tag inserted into the prefix of every generated feature file.
feature_prefix = "v1"
# Global switch: when True, both the text and the embedding features are re-extracted.
regenerate_feature = False
# Per-family switches, OR-ed with the global switch above.
regenerate_text_feature = False
regenerate_embed_feature = False

#### Text Mining Features

In [15]:
# Text-mining features for the training set; runs when either the global or the
# text-specific regeneration flag is set.
if regenerate_text_feature or regenerate_feature:
    ExtractTextFeature(
        train_file,
        save_dir=stage1_output_dir,
        prefix="train_{}".format(feature_prefix),
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', 'label', "query_id", "query_title_id"],
    )

In [16]:
# Text-mining features for the test set; same guard as the training set. The
# test call uses the test column names/dtypes and has no 'label' in drop_cols.
if regenerate_text_feature or regenerate_feature:
    ExtractTextFeature(
        test_file,
        save_dir=stage1_output_dir,
        prefix="test_{}".format(feature_prefix),
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', "query_id", "query_title_id"],
    )

#### Embedding Features

In [17]:
# Embedding features for the training set; runs when either the global or the
# embedding-specific regeneration flag is set. The word2vec model path is passed
# with a ".kv" suffix.
if regenerate_embed_feature or regenerate_feature:
    ExtractEmbedFeature(
        train_file,
        save_dir=stage1_output_dir,
        prefix="train_{}".format(feature_prefix),
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', 'label', "query_id", "query_title_id"],
        embed_mode="word2vec",
        embed_model_file=stage1_word2vec_model+".kv",
    )

In [18]:
# Embedding features for the test set; same guard as the training set. The test
# call uses the test column names/dtypes and has no 'label' in drop_cols.
if regenerate_embed_feature or regenerate_feature:
    ExtractEmbedFeature(
        test_file,
        save_dir=stage1_output_dir,
        prefix="test_{}".format(feature_prefix),
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=False,
        drop_cols=['query', 'title', "query_id", "query_title_id"],
        embed_mode="word2vec",
        embed_model_file=stage1_word2vec_model+".kv",
    )

### Prepare for Stage2

本节是将一些stage1产生的结果经过处理后放到stage2的input里面去。

In [19]:
from stage1.code.prepare_for_stage2 import ExtractTrainLabel, ConvertCSVToNPY, CombineFeatures
# Stage-1 feature CSVs (text, embedding, and their concatenation), grouped by split.
## train
train_text_feature_stage1 = stage1_output_dir + "train_{}_feature_text.csv".format(feature_prefix)
train_embed_feature_stage1 = stage1_output_dir + "train_{}_feature_embed.csv".format(feature_prefix)
train_concat_feature_stage1 = stage1_output_dir + "train_{}_feature_concat.csv".format(feature_prefix)
## test
test_text_feature_stage1 = stage1_output_dir + "test_{}_feature_text.csv".format(feature_prefix)
test_embed_feature_stage1 = stage1_output_dir + "test_{}_feature_embed.csv".format(feature_prefix)
test_concat_feature_stage1 = stage1_output_dir + "test_{}_feature_concat.csv".format(feature_prefix)

# Stage-2 inputs: the concatenated features and the labels as .npy files.
train_feature_stage2 = stage2_input_dir + "train_{}_feature_concat.npy".format(feature_prefix)
test_feature_stage2 = stage2_input_dir + "test_{}_feature_concat.npy".format(feature_prefix)
train_labels_stage2 = stage2_input_dir+"train_labels.npy"
# Test labels get a path only in offline mode; otherwise the name is bound to None.
test_labels_stage2 = stage2_input_dir+"test_labels.npy" if OFFLINE else None

In [20]:
# Extract the training labels and store them separately (only when the data is rebuilt).
if regenerate_data:
    ExtractTrainLabel(train_file, train_labels_stage2)
# Offline only: extract the test labels and store them separately as well.
if regenerate_data and OFFLINE:
    ExtractTrainLabel(test_label_file, test_labels_stage2)

In [21]:
import os


def _concat_outdated(target, sources):
    """Return True when `target` is missing or older than any existing file in `sources`."""
    if not os.path.exists(target):
        return True
    target_mtime = os.path.getmtime(target)
    return any(
        os.path.exists(src) and os.path.getmtime(src) > target_mtime
        for src in sources
    )


# The guard used to be `regenerate_feature or True`, which is always true and
# made the flag meaningless. Re-concatenate only when a feature family was
# regenerated, or when a concat file is missing or older than its inputs.
# NOTE(review): assumes CombineFeatures writes exactly to the given target path;
# if it does not, the check simply falls back to always re-running.
_train_parts = [train_text_feature_stage1, train_embed_feature_stage1]
_test_parts = [test_text_feature_stage1, test_embed_feature_stage1]
if (
    regenerate_feature
    or regenerate_text_feature
    or regenerate_embed_feature
    or _concat_outdated(train_concat_feature_stage1, _train_parts)
    or _concat_outdated(test_concat_feature_stage1, _test_parts)
):
    CombineFeatures(_train_parts, train_concat_feature_stage1)
    CombineFeatures(_test_parts, test_concat_feature_stage1)

----->Started 'Concat feature' block...
----->Finished 'Concat feature' block, time used: 0.12s.
----->Started 'Change inf to nan' block...
----->Finished 'Change inf to nan' block, time used: 0.01s.
----->Started 'Fill nan' block...
----->Finished 'Fill nan' block, time used: 0.0s.
----->Started 'Concat feature' block...
----->Finished 'Concat feature' block, time used: 0.04s.
----->Started 'Change inf to nan' block...
----->Finished 'Change inf to nan' block, time used: 0.0s.
----->Started 'Fill nan' block...
----->Finished 'Fill nan' block, time used: 0.0s.


In [25]:
import os


def _npy_outdated(csv_path, npy_path):
    """Return True when `npy_path` is missing or older than an existing `csv_path`."""
    if not os.path.exists(npy_path):
        return True
    if not os.path.exists(csv_path):
        # No source to convert from; keep using the existing .npy file.
        return False
    return os.path.getmtime(csv_path) > os.path.getmtime(npy_path)


# The guard used to be `regenerate_feature or True`, which is always true and
# made the flag meaningless. Convert only when a feature family was regenerated,
# or when an .npy file is missing or older than the CSV it is built from.
# NOTE(review): assumes ConvertCSVToNPY writes exactly to the given .npy path;
# if it does not, the check simply falls back to always re-running.
if (
    regenerate_feature
    or regenerate_text_feature
    or regenerate_embed_feature
    or _npy_outdated(train_concat_feature_stage1, train_feature_stage2)
    or _npy_outdated(test_concat_feature_stage1, test_feature_stage2)
):
    # Convert the features to NumPy format so they are easy to load.
    ConvertCSVToNPY(train_concat_feature_stage1, train_feature_stage2)
    ConvertCSVToNPY(test_concat_feature_stage1, test_feature_stage2)

----->Started 'Convert CSV To NPY' block...
Part of rows: [[3.6635616461296463, 2.9444389791664403, 0.12244897959183673, 0.06521739130434782, 0.2992063492063493, 0.4480158730158729, 3.6375861597263857, 2.8903717578961645, 0.037037037037037035, 0.018867924528301886, 0.29472348543555965, 0.3854057102423114, 3.610917912644224, 2.8332133440562166, 0.0, 0.0, 0.290305992782974, 0.3487576082394259, 0.037313907474065613, -0.030803564026652464, -0.0654901015377979, -0.06418299538368988, -0.11691944201652138, 0.984094505750426, 57.81236737169481, 0.15177127474245647, 0.013282850209497349, 0.006355855561862918, 0.2270298477938941], [3.6635616461296463, 2.772588722239781, 0.25, 0.14285714285714285, 0.3268044324648097, 0.4362084456424078, 3.6375861597263857, 2.7080502011022096, 0.0784313725490196, 0.040816326530612235, 0.32240058416529005, 0.3875202972088785, 3.610917912644224, 2.6390573296152584, 0.0, 0.0, 0.3097278527447116, 0.3628817581667688, 0.02543202961783023, -0.030803564026652464, -0.07947

## Stage2阶段

In [23]:
from stage2.code.run_hyperopt import LoadDataset, run_lgb_gbdt, run_lgb_dart

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [26]:
# Load the dataset from the stage-2 .npy inputs. `in_test_label` is None unless
# OFFLINE is set, because test labels only get a path in offline mode.
LoadDataset(in_train_feature=train_feature_stage2, 
            in_train_label=train_labels_stage2, 
            in_test_feature=test_feature_stage2,
            in_test_label=test_labels_stage2)

-------------------------------------
[Running function]: _load_train
[Used Time]: 0:00:00.011002
-------------------------------------
-------------------------------------
[Running function]: _load_test
[Used Time]: 0:00:00.010424
-------------------------------------
X: (15000, 29) [[ 3.66356165e+00  2.94443898e+00  1.22448980e-01  6.52173913e-02
   2.99206349e-01  4.48015873e-01  3.63758616e+00  2.89037176e+00
   3.70370370e-02  1.88679245e-02  2.94723485e-01  3.85405710e-01
   3.61091791e+00  2.83321334e+00  0.00000000e+00  0.00000000e+00
   2.90305993e-01  3.48757608e-01  3.73139075e-02 -3.08035640e-02
  -6.54901015e-02 -6.41829954e-02 -1.16919442e-01  9.84094506e-01
   5.78123674e+01  1.51771275e-01  1.32828502e-02  6.35585556e-03
   2.27029848e-01]
 [ 3.66356165e+00  2.77258872e+00  2.50000000e-01  1.42857143e-01
   3.26804432e-01  4.36208446e-01  3.63758616e+00  2.70805020e+00
   7.84313725e-02  4.08163265e-02  3.22400584e-01  3.87520297e-01
   3.61091791e+00  2.63905733e+00  

In [29]:
# Run the LightGBM GBDT routine imported from stage2.code.run_hyperopt.
# NOTE(review): eval_num is presumably the number of hyper-parameter evaluations - confirm.
run_lgb_gbdt(eval_num=20)

{                                                   
 "bagging_fraction": 0.9,
 "bagging_freq": 14.0,
 "feature_fraction": 0.6000000000000001,
 "learning_rate": 0.09,
 "max_bin": 255,
 "min_data_in_leaf": 100.0,
 "num_iterations": 40.0,
 "num_leaves": 85.0,
 "num_threads": 4
}
----->Started 'lgb fit' block...                    
----->Finished 'lgb fit' block, time used:          
0.28s.                                              
----->Started 'lgb fit' block...                    
----->Finished 'lgb fit' block, time used:          
0.23s.                                              
----->Started 'lgb fit' block...                    
----->Finished 'lgb fit' block, time used:          
0.22s.                                              
----->Started 'lgb fit' block...                    
----->Finished 'lgb fit' block, time used:          
0.21s.                                              
----->Started 'lgb fit' block...                    
----->Finished 'lgb fit' block, t

----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.22s.                                                                       
Test AUC:0.5279066097267402                                                  
{                                                                            
 "bagging_fraction": 0.6000000000000001,
 "bagging_freq": 16.0,
 "feature_fraction": 0.7000000000000001,
 "learning_rate": 0.05,
 "max_bin": 1023,
 "min_data_in_leaf": 60.0,
 "num_iterations": 200.0,
 "num_leaves": 60.0,
 "num_threads": 4
}
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.548991	valid's binary_logloss: 0.565899                  
[100]	valid's auc: 0.539735	valid's binary_logloss: 0.574853                 
[150]	valid's auc: 0.52934	valid's binary_logloss: 0.585153                  
[200]	valid's auc: 0.526807	valid's binary_logloss: 0.593

[100]	valid's auc: 0.561187	valid's binary_logloss: 0.57275                  
[150]	valid's auc: 0.559928	valid's binary_logloss: 0.582004                 
[200]	valid's auc: 0.552866	valid's binary_logloss: 0.592676                 
----->Finished 'lgb fit' block, time used:                                   
0.55s.                                                                       
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.532168	valid's binary_logloss: 0.575648                  
[100]	valid's auc: 0.525684	valid's binary_logloss: 0.587768                 
[150]	valid's auc: 0.521429	valid's binary_logloss: 0.599797                 
[200]	valid's auc: 0.521522	valid's binary_logloss: 0.61092                  
----->Finished 'lgb fit' block, time used:                                   
0.54s.                                                                       
----->Started 'lgb fit' block...                                

----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.531606	valid's binary_logloss: 0.573381                  
[100]	valid's auc: 0.526193	valid's binary_logloss: 0.586115                 
[150]	valid's auc: 0.520775	valid's binary_logloss: 0.599127                 
----->Finished 'lgb fit' block, time used:                                   
0.47s.                                                                       
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.540008	valid's binary_logloss: 0.571737                  
[100]	valid's auc: 0.527511	valid's binary_logloss: 0.585527                 
[150]	valid's auc: 0.522806	valid's binary_logloss: 0.597215                 
----->Finished 'lgb fit' block, time used:                                   
0.46s.                                                                       
----->Started 'lgb fit' block...                                

----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.541379	valid's binary_logloss: 0.580565                  
[100]	valid's auc: 0.528383	valid's binary_logloss: 0.602525                 
[150]	valid's auc: 0.526135	valid's binary_logloss: 0.620269                 
----->Finished 'lgb fit' block, time used:                                   
0.62s.                                                                       
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.558915	valid's binary_logloss: 0.572992                  
[100]	valid's auc: 0.552262	valid's binary_logloss: 0.589075                 
[150]	valid's auc: 0.554085	valid's binary_logloss: 0.605018                 
----->Finished 'lgb fit' block, time used:                                   
0.57s.                                                                       
----->Started 'lgb fit' block...                                

----->Finished 'lgb fit' block, time used:                                   
0.41s.                                                                       
Test AUC:0.5249186768769094                                                  
{                                                                            
 "bagging_fraction": 0.7000000000000001,
 "bagging_freq": 18.0,
 "feature_fraction": 0.7000000000000001,
 "learning_rate": 0.02,
 "max_bin": 63,
 "min_data_in_leaf": 30.0,
 "num_iterations": 40.0,
 "num_leaves": 95.0,
 "num_threads": 4
}
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.16s.                                                                       
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.15s.                                                      

0.27s.                                                                       
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.568525	valid's binary_logloss: 0.558689                  
----->Finished 'lgb fit' block, time used:                                   
0.27s.                                                                       
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.56453	valid's binary_logloss: 0.559694                   
----->Finished 'lgb fit' block, time used:                                   
0.29s.                                                                       
Valid AUC (chunk, total):0.5643338718279916 0.5685948591572626               
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.19s.                                                          

In [28]:
# run_lgb_dart(eval_num=20)

{                                                   
 "bagging_fraction": 0.9,
 "bagging_freq": 18.0,
 "drop_rate": 0.6000000000000001,
 "feature_fraction": 1.0,
 "learning_rate": 0.07,
 "max_bin": 1023,
 "min_data_in_leaf": 50.0,
 "num_iterations": 120.0,
 "num_leaves": 55.0,
 "num_threads": 4,
 "skip_drop": 0.5
}
----->Started 'lgb fit' block...                    
[50]	valid's auc: 0.54157	valid's binary_logloss: 0.569471
[100]	valid's auc: 0.54297	valid's binary_logloss: 0.568131
----->Finished 'lgb fit' block, time used:          
2.98s.                                              
----->Started 'lgb fit' block...                    
[50]	valid's auc: 0.570909	valid's binary_logloss: 0.565632
[100]	valid's auc: 0.576469	valid's binary_logloss: 0.562611
----->Finished 'lgb fit' block, time used:          
3.19s.                                              
----->Started 'lgb fit' block...                    
[50]	valid's auc: 0.535978	valid's binary_logloss: 0.571329
[100]	valid'

{                                                                            
 "bagging_fraction": 0.9,
 "bagging_freq": 16.0,
 "drop_rate": 0.6000000000000001,
 "feature_fraction": 0.9,
 "learning_rate": 0.09,
 "max_bin": 15,
 "min_data_in_leaf": 20.0,
 "num_iterations": 120.0,
 "num_leaves": 100.0,
 "num_threads": 4,
 "skip_drop": 0.8
}
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.546817	valid's binary_logloss: 0.569129                  
[100]	valid's auc: 0.53757	valid's binary_logloss: 0.577122                  
----->Finished 'lgb fit' block, time used:                                   
1.61s.                                                                       
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.558103	valid's binary_logloss: 0.564762                  
[100]	valid's auc: 0.554383	valid's binary_logloss: 0.56865                  
----->Finished 'lgb fit' block, tim

----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.41s.                                                                       
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.4s.                                                                        
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.41s.                                                                       
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.47s.                                                                       
----->Started 'lgb fit' block...                                

0.46s.                                                                       
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.553877	valid's binary_logloss: 0.564425                  
----->Finished 'lgb fit' block, time used:                                   
0.46s.                                                                       
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.55127	valid's binary_logloss: 0.565638                   
----->Finished 'lgb fit' block, time used:                                   
0.46s.                                                                       
Valid AUC (chunk, total):0.5515956776726945 0.54998713434045                 
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.46s.                                                          

----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.555217	valid's binary_logloss: 0.563634                  
[100]	valid's auc: 0.547474	valid's binary_logloss: 0.568486                 
----->Finished 'lgb fit' block, time used:                                   
0.81s.                                                                       
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.564722	valid's binary_logloss: 0.562337                  
[100]	valid's auc: 0.567119	valid's binary_logloss: 0.56299                  
----->Finished 'lgb fit' block, time used:                                   
0.87s.                                                                       
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.546705	valid's binary_logloss: 0.566203                  
[100]	valid's auc: 0.540249	valid's binary_logloss: 0.570398    

----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.29s.                                                                       
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.32s.                                                                       
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.23s.                                                                       
Valid AUC (chunk, total):0.5561366590448062 0.5609810380695526               
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.27s.                                                          

[50]	valid's auc: 0.546067	valid's binary_logloss: 0.565375                   
[100]	valid's auc: 0.547707	valid's binary_logloss: 0.569                     
[150]	valid's auc: 0.537433	valid's binary_logloss: 0.576129                  
[200]	valid's auc: 0.534275	valid's binary_logloss: 0.581559                  
----->Finished 'lgb fit' block, time used:                                    
1.5s.                                                                         
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.54897	valid's binary_logloss: 0.564216                    
[100]	valid's auc: 0.540781	valid's binary_logloss: 0.570574                  
[150]	valid's auc: 0.536895	valid's binary_logloss: 0.574261                  
[200]	valid's auc: 0.525751	valid's binary_logloss: 0.582059                  
----->Finished 'lgb fit' block, time used:                                    
1.56s.                                              