## 包引入与变量准备

In [1]:
import sys
import csv
sys.path.append("./global")
from helper import Timer, usetime, OFFLINE, cp

In [2]:
# Locations of the original (raw) training and test data.
if not OFFLINE:
    # Online mode: the first-round data lives under the absolute /home/kesci/input path.
    ori_train_file = "/home/kesci/input/bytedance/first-round/train.csv"
    ori_test_file = "/home/kesci/input/bytedance/first-round/test.csv"
else:
    # Offline mode: local copies are used, and only here is a test-label file defined.
    ori_train_file = "./legacy/data/train.csv"
    ori_test_file = "./legacy/data/test.csv"
    ori_test_label_file = "./legacy/data/test_label.csv"

In [3]:
# Stage 0: all prepared files are written below this directory.
stage0_output_dir = "./stage1/input/"

## Test data
test_file = "{}test.csv".format(stage0_output_dir)

if OFFLINE:
    ## Training data: offline mode uses a single fixed file name.
    train_file = "{}train.csv".format(stage0_output_dir)
    # Offline mode additionally has a label file for the test set.
    test_label_file = "{}test_label.csv".format(stage0_output_dir)
else:
    ## Training data: online mode trains on a sample of the original file.
    random_sample = False
    if not random_sample:
        # Default: take the first 5 million rows.
        sample_num = 5000000
        train_file = "{}train_front_500w.csv".format(stage0_output_dir)
    else:
        # Alternative: keep a random fraction of the rows.
        random_rate = 0.05
        train_file = "{}train_random_005.csv".format(stage0_output_dir)

In [4]:
# Stage 1: root directory and its code / input / output sub-directories.
stage1_dir = "./stage1/"
# Backward-compatible alias for the original, misspelled variable name.
state1_dir = stage1_dir
stage1_code_dir = stage1_dir + "code/"
stage1_input_dir = stage1_dir + "input/"
stage1_output_dir = stage1_dir + "output/"

In [5]:
# Stage 2: root directory and its input / output sub-directories.
stage2_dir = "./stage2/"
# Backward-compatible alias for the original, misspelled variable name.
state2_dir = stage2_dir
stage2_input_dir = stage2_dir + "input/"
stage2_output_dir = stage2_dir + "output/"

## Stage0阶段

这一阶段主要是数据准备，且只需要运行一次。

In [6]:
from stage0.code.data_prepare import RandomSampleCSV, SampleCSV
# Switch for the data-preparation steps in this notebook: the cells that copy or
# sample the raw files and extract the labels run only when this is True.
regenerate_data = True

In [7]:
# Prepare the training data (skipped unless regeneration is requested).
if regenerate_data:
    if OFFLINE:
        # Offline: the original training file is copied as-is.
        cp(ori_train_file, train_file)
    elif random_sample:
        # Online, random mode: sample a `random_rate` fraction of the original rows.
        RandomSampleCSV(source_csv=ori_train_file, save_file=train_file, rate=random_rate)
    else:
        # Online, default mode: keep the first `sample_num` rows.
        SampleCSV(source_csv=ori_train_file, save_file=train_file, count=sample_num)

In [8]:
# Prepare the test data (skipped unless regeneration is requested).
if regenerate_data:
    # The test file is copied in both offline and online mode.
    cp(ori_test_file, test_file)
    if OFFLINE:
        # Only offline mode has a test-label file to copy.
        cp(ori_test_label_file, test_label_file)

## Stage1阶段

### 提取词向量

#### Word2vec

1. 生成Word2vec可直接训练的sentences文件
2. 训练word2vec

In [9]:
# ProcessForTrainWord2vec(train_file_data, stage1_input_dir+"word2vec_sentences.txt")

In [10]:
# !cd stage1/code; python build_word2vec.py -f "../input/word2vec_sentences.txt" -d "../output/" -m "word2vec.model"

#### FastText

FastText的使用：https://blog.csdn.net/ymaini/article/details/81489599

1. 生成适合fastText训练的文本格式
2. 有监督训练

In [11]:
# ProcessForTrainFastText(train_file_data, stage1_input_dir+"fastText_labeled_content.txt", add_label=True)

In [12]:
# !cd stage1/code; python build_fastText.py -f "../input/fastText_labeled_content.txt" -d "../output/" -s "fastText_supervised.bin"

### 特征抽取

In [13]:
from stage1.code.feature_text import ExtractTextFeature
from helper import ORI_TRAIN_NAMES, ORI_TEST_NAMES, ORI_TRAIN_DTYPE, ORI_TEST_DTYPE
# Version tag embedded in the prefixes and file names of the generated features.
feature_prefix = "v3"
# Switch for the feature steps: extraction and CSV-to-NPY conversion run only when True.
regenerate_feature = True

#### Text Mining Features

In [14]:
# Extract the text features of the training set (only when regeneration is on).
if regenerate_feature:
    ExtractTextFeature(
        train_file,
        save_dir=stage1_output_dir,
        prefix="train_{}".format(feature_prefix),
        names=ORI_TRAIN_NAMES,
        dtype=ORI_TRAIN_DTYPE,
        process_chunkly=False,
        # The training call also lists the 'label' column among the columns to drop.
        drop_cols=["query", "title", "label", "query_id", "query_title_id"],
    )

----->Started 'Extract text feature' block...
----->Finished 'Extract text feature' block, time used: 11.07s.


In [15]:
# Extract the text features of the test set (only when regeneration is on).
if regenerate_feature:
    ExtractTextFeature(
        test_file,
        save_dir=stage1_output_dir,
        prefix="test_{}".format(feature_prefix),
        names=ORI_TEST_NAMES,
        dtype=ORI_TEST_DTYPE,
        process_chunkly=False,
        # Unlike the training call, no 'label' column is listed here.
        drop_cols=["query", "title", "query_id", "query_title_id"],
    )

----->Started 'Extract text feature' block...
----->Finished 'Extract text feature' block, time used: 3.94s.


#### Embedding Features

In [16]:
# ! cd stage1/code; python feature_embed.py -f "../input/train_01_0607.csv" -p "train_v1" -d "../output" -b "true" -e "word2vec"
# ! cd stage1/code; python feature_embed.py -f "../input/test_copy.csv" -p "test_v1" -d "../output" -b "true" -e "word2vec" -t "false"

### Prepare for Stage2

本节是将一些stage1产生的结果经过处理后放到stage2的input里面去。

In [17]:
from stage1.code.prepare_for_stage2 import ExtractTrainLabel, ConvertCSVToNPY
# Stage 1 side
## Text-feature CSV files, looked up in the stage-1 output directory.
train_text_feature_stage1 = "{}train_{}_feature_text.csv".format(stage1_output_dir, feature_prefix)
test_text_feature_stage1 = "{}test_{}_feature_text.csv".format(stage1_output_dir, feature_prefix)
# Stage 2 side
## Text-feature NPY files, placed in the stage-2 input directory.
train_text_feature_stage2 = "{}train_{}_feature_text.npy".format(stage2_input_dir, feature_prefix)
test_text_feature_stage2 = "{}test_{}_feature_text.npy".format(stage2_input_dir, feature_prefix)
train_labels_stage2 = "{}train_labels.npy".format(stage2_input_dir)
# No test labels by default; a later cell may replace this with a file path.
test_labels_stage2 = None

In [18]:
# Path of the test-set labels. It depends on OFFLINE alone, so later cells still
# receive it when regenerate_data is False and the label file written by an
# earlier run is reused (previously it silently stayed None in that case).
if OFFLINE:
    test_labels_stage2 = stage2_input_dir + "test_labels.npy"

if regenerate_data:
    # Extract the training-set labels and store them separately.
    ExtractTrainLabel(train_file, train_labels_stage2)
    if OFFLINE:
        # Offline only: extract the test-set labels and store them separately.
        ExtractTrainLabel(test_label_file, test_labels_stage2)

----->Started 'Extract label' block...
Part of labels: [1, 0, 1, 0, 1, 0, 0, 1, 0, 0]
Label file saved to ./stage2/input/train_labels.npy
----->Finished 'Extract label' block, time used: 0.06s.
----->Started 'Extract label' block...
Part of labels: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Label file saved to ./stage2/input/test_labels.npy
----->Finished 'Extract label' block, time used: 0.01s.


In [19]:
# Convert the text-feature CSV files to NumPy format so they are easy to load.
if regenerate_feature:
    # Train first, then test: same order as the two explicit calls this replaces.
    for _csv_path, _npy_path in (
        (train_text_feature_stage1, train_text_feature_stage2),
        (test_text_feature_stage1, test_text_feature_stage2),
    ):
        ConvertCSVToNPY(_csv_path, _npy_path)

----->Started 'Convert CSV To NPY' block...
Part of rows: [[3.6635616461296463, 2.9444389791664403, 0.12244897959183673, 0.06521739130434782, 0.2992063492063493, 0.44801587301587287, 3.6375861597263857, 2.8903717578961645, 0.037037037037037035, 0.018867924528301886, 0.29472348543555965, 0.3854057102423114, 3.6109179126442243, 2.833213344056216, 0.0, 0.0, 0.290305992782974, 0.3487576082394259], [3.6635616461296463, 2.772588722239781, 0.25, 0.14285714285714285, 0.3268044324648097, 0.4362084456424078, 3.6375861597263857, 2.70805020110221, 0.0784313725490196, 0.04081632653061224, 0.32240058416529005, 0.3875202972088785, 3.6109179126442243, 2.6390573296152584, 0.0, 0.0, 0.3097278527447116, 0.3628817581667688]]
Npy file saved to ./stage2/input/train_v3_feature_text.npy
----->Finished 'Convert CSV To NPY' block, time used: 0.26s.
----->Started 'Convert CSV To NPY' block...
Part of rows: [[1.6094379124341003, 2.0794415416798357, 0.5454545454545454, 0.375, 0.5454545454545454, 0.6060606060606061

## Stage2阶段

In [20]:
from stage2.code.run_hyperopt import LoadDataset, run_lgb_gbdt, run_lgb_dart

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [21]:
# Load the dataset for stage 2 from the feature and label paths defined above.
# test_labels_stage2 may be None when no test-label path has been set.
LoadDataset(in_train_feature=train_text_feature_stage2, 
            in_train_label=train_labels_stage2, 
            in_test_feature=test_text_feature_stage2,
            in_test_label=test_labels_stage2)

-------------------------------------
[Running function]: _load_train
[Used Time]: 0:00:00.005559
-------------------------------------
-------------------------------------
[Running function]: _load_test
[Used Time]: 0:00:00.002973
-------------------------------------
X: (15000, 18) [[3.66356165 2.94443898 0.12244898 0.06521739 0.29920635 0.44801587
  3.63758616 2.89037176 0.03703704 0.01886792 0.29472349 0.38540571
  3.61091791 2.83321334 0.         0.         0.29030599 0.34875761]
 [3.66356165 2.77258872 0.25       0.14285714 0.32680443 0.43620845
  3.63758616 2.7080502  0.07843137 0.04081633 0.32240058 0.3875203
  3.61091791 2.63905733 0.         0.         0.30972785 0.36288176]]
y: (15000,) [1 0]
Test x: (5000, 18) [[1.60943791 2.07944154 0.54545455 0.375      0.54545455 0.60606061
  1.38629436 1.94591015 0.44444444 0.28571429 0.44444444 0.54901961
  1.09861229 1.79175947 0.28571429 0.16666667 0.39162562 0.41758242]
 [1.38629436 2.94443898 0.3        0.17647059 0.28571429 0.285

In [22]:
# Run the LightGBM GBDT routine imported from stage2.code.run_hyperopt.
# NOTE(review): eval_num is presumably the number of hyper-parameter evaluations - confirm.
run_lgb_gbdt(eval_num=50)

{                                                   
 "bagging_fraction": 0.8,
 "bagging_freq": 4.0,
 "feature_fraction": 1.0,
 "learning_rate": 0.06,
 "max_bin": 1023,
 "min_data_in_leaf": 80.0,
 "num_iterations": 40.0,
 "num_leaves": 55.0,
 "num_threads": 4
}
----->Started 'lgb fit' block...                    
----->Finished 'lgb fit' block, time used:          
0.45s.                                              
----->Started 'lgb fit' block...                    
----->Finished 'lgb fit' block, time used:          
0.33s.                                              
----->Started 'lgb fit' block...                    
----->Finished 'lgb fit' block, time used:          
0.22s.                                              
----->Started 'lgb fit' block...                    
----->Finished 'lgb fit' block, time used:          
0.2s.                                               
----->Started 'lgb fit' block...                    
----->Finished 'lgb fit' block, time used:       

[150]	valid's auc: 0.527334	valid's binary_logloss: 0.584251                 
[200]	valid's auc: 0.51892	valid's binary_logloss: 0.59327                   
----->Finished 'lgb fit' block, time used:                                   
0.54s.                                                                       
Valid AUC (chunk, total):0.5281365640341575 0.5254311478276518               
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.64s.                                                                       
Test AUC:0.5322113658151874                                                  
{                                                                            
 "bagging_fraction": 0.8,
 "bagging_freq": 0.0,
 "feature_fraction": 0.9,
 "learning_rate": 0.04,
 "max_bin": 255,
 "min_data_in_leaf": 80.0,
 "num_iterations": 120.0,
 "num_leaves": 55.0,
 "num_threads": 4
}
----->Start

0.69s.                                                                       
Valid AUC (chunk, total):0.530621426892435 0.5259699855550737                
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.35s.                                                                       
Test AUC:0.5281137853480689                                                  
{                                                                            
 "bagging_fraction": 0.8,
 "bagging_freq": 2.0,
 "feature_fraction": 0.9,
 "learning_rate": 0.02,
 "max_bin": 15,
 "min_data_in_leaf": 10.0,
 "num_iterations": 200.0,
 "num_leaves": 70.0,
 "num_threads": 4
}
----->Started 'lgb fit' block...                                             
[50]	valid's auc: 0.5549	valid's binary_logloss: 0.561113                    
[100]	valid's auc: 0.556463	valid's binary_logloss: 0.561681                 
[150]	valid'

----->Finished 'lgb fit' block, time used:                                   
0.11s.                                                                       
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.1s.                                                                        
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.11s.                                                                       
Valid AUC (chunk, total):0.5378272172291693 0.538142645341483                
----->Started 'lgb fit' block...                                             
----->Finished 'lgb fit' block, time used:                                   
0.11s.                                                                       
Test AUC:0.5385734240038065                                     

----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.544083	valid's binary_logloss: 0.56233                    
----->Finished 'lgb fit' block, time used:                                    
0.18s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.545723	valid's binary_logloss: 0.562645                   
----->Finished 'lgb fit' block, time used:                                    
0.18s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.532595	valid's binary_logloss: 0.5642                     
----->Finished 'lgb fit' block, time used:                                    
0.19s.                                                                        
----->Started 'lgb fit' block...                    

----->Started 'lgb fit' block...                                              
----->Finished 'lgb fit' block, time used:                                    
0.35s.                                                                        
Test AUC:0.5291914197814923                                                   
{                                                                             
 "bagging_fraction": 1.0,
 "bagging_freq": 14.0,
 "feature_fraction": 0.6000000000000001,
 "learning_rate": 0.08,
 "max_bin": 15,
 "min_data_in_leaf": 70.0,
 "num_iterations": 40.0,
 "num_leaves": 75.0,
 "num_threads": 4
}
----->Started 'lgb fit' block...                                              
----->Finished 'lgb fit' block, time used:                                    
0.1s.                                                                         
----->Started 'lgb fit' block...                                              
----->Finished 'lgb fit' block, time used:                        

----->Started 'lgb fit' block...                                              
----->Finished 'lgb fit' block, time used:                                    
0.07s.                                                                        
----->Started 'lgb fit' block...                                              
----->Finished 'lgb fit' block, time used:                                    
0.07s.                                                                        
Valid AUC (chunk, total):0.5453979723189164 0.5434671884270428                
----->Started 'lgb fit' block...                                              
----->Finished 'lgb fit' block, time used:                                    
0.06s.                                                                        
Test AUC:0.5442714495291279                                                   
{                                                                             
 "bagging_fraction": 1.0,
 "bagging_freq": 18.0,
 "f

----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.546577	valid's binary_logloss: 0.561956                   
----->Finished 'lgb fit' block, time used:                                    
0.16s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.557127	valid's binary_logloss: 0.56119                    
----->Finished 'lgb fit' block, time used:                                    
0.29s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.537721	valid's binary_logloss: 0.564702                   
----->Finished 'lgb fit' block, time used:                                    
0.42s.                                                                        
----->Started 'lgb fit' block...                    

----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.54014	valid's binary_logloss: 0.562547                    
----->Finished 'lgb fit' block, time used:                                    
0.1s.                                                                         
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.564009	valid's binary_logloss: 0.56104                    
----->Finished 'lgb fit' block, time used:                                    
0.12s.                                                                        
Valid AUC (chunk, total):0.5505871913890138 0.551181301895572                 
----->Started 'lgb fit' block...                                              
----->Finished 'lgb fit' block, time used:                                    
0.11s.                                                                        
Test AUC:0.547184863395783                          

0.58s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.540958	valid's binary_logloss: 0.562511                   
[100]	valid's auc: 0.543636	valid's binary_logloss: 0.561879                  
----->Finished 'lgb fit' block, time used:                                    
0.28s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.564773	valid's binary_logloss: 0.562164                   
[100]	valid's auc: 0.565074	valid's binary_logloss: 0.561591                  
----->Finished 'lgb fit' block, time used:                                    
0.45s.                                                                        
Valid AUC (chunk, total):0.55451665560482 0.5539358894867138                  
----->Started 'lgb fit' block...                    

[50]	valid's auc: 0.545372	valid's binary_logloss: 0.563451                   
[100]	valid's auc: 0.544766	valid's binary_logloss: 0.563335                  
----->Finished 'lgb fit' block, time used:                                    
0.16s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.537732	valid's binary_logloss: 0.562632                   
[100]	valid's auc: 0.540614	valid's binary_logloss: 0.562262                  
----->Finished 'lgb fit' block, time used:                                    
0.18s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.560597	valid's binary_logloss: 0.562252                   
[100]	valid's auc: 0.559968	valid's binary_logloss: 0.561678                  
----->Finished 'lgb fit' block, time used:          

----->Finished 'lgb fit' block, time used:                                    
0.19s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.527495	valid's binary_logloss: 0.565849                   
[100]	valid's auc: 0.524242	valid's binary_logloss: 0.569147                  
----->Finished 'lgb fit' block, time used:                                    
0.17s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.576852	valid's binary_logloss: 0.559362                   
[100]	valid's auc: 0.55958	valid's binary_logloss: 0.561757                   
----->Finished 'lgb fit' block, time used:                                    
0.2s.                                                                         
Valid AUC (chunk, total):0.5448443885718524 0.540086

Valid AUC (chunk, total):0.537907480905768 0.5366011661539226                 
----->Started 'lgb fit' block...                                              
----->Finished 'lgb fit' block, time used:                                    
0.09s.                                                                        
Test AUC:0.5383396956413461                                                   
{                                                                             
 "bagging_fraction": 1.0,
 "bagging_freq": 6.0,
 "feature_fraction": 0.7000000000000001,
 "learning_rate": 0.04,
 "max_bin": 63,
 "min_data_in_leaf": 70.0,
 "num_iterations": 120.0,
 "num_leaves": 85.0,
 "num_threads": 4
}
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.532362	valid's binary_logloss: 0.566984                   
[100]	valid's auc: 0.528095	valid's binary_logloss: 0.574146                  
----->Finished 'lgb fit' block, time used:                        

[200]	valid's auc: 0.537442	valid's binary_logloss: 0.57273                   
----->Finished 'lgb fit' block, time used:                                    
0.42s.                                                                        
Valid AUC (chunk, total):0.5332910031162519 0.5315714061987068                
----->Started 'lgb fit' block...                                              
----->Finished 'lgb fit' block, time used:                                    
0.35s.                                                                        
Test AUC:0.5310672424614306                                                   
{                                                                             
 "bagging_fraction": 0.8,
 "bagging_freq": 10.0,
 "feature_fraction": 0.7000000000000001,
 "learning_rate": 0.05,
 "max_bin": 63,
 "min_data_in_leaf": 40.0,
 "num_iterations": 120.0,
 "num_leaves": 50.0,
 "num_threads": 4
}
----->Started 'lgb fit' block...                                 

Test AUC:0.5362351767716621                                                   
{                                                                             
 "bagging_fraction": 0.9,
 "bagging_freq": 8.0,
 "feature_fraction": 0.9,
 "learning_rate": 0.06,
 "max_bin": 15,
 "min_data_in_leaf": 80.0,
 "num_iterations": 80.0,
 "num_leaves": 65.0,
 "num_threads": 4
}
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.548073	valid's binary_logloss: 0.566066                   
----->Finished 'lgb fit' block, time used:                                    
0.22s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.549315	valid's binary_logloss: 0.565863                   
----->Finished 'lgb fit' block, time used:                                    
0.2s.                                                                         
---

0.2s.                                                                         
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.527478	valid's binary_logloss: 0.568881                   
----->Finished 'lgb fit' block, time used:                                    
0.22s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.521021	valid's binary_logloss: 0.570281                   
----->Finished 'lgb fit' block, time used:                                    
0.21s.                                                                        
----->Started 'lgb fit' block...                                              
[50]	valid's auc: 0.532112	valid's binary_logloss: 0.567611                   
----->Finished 'lgb fit' block, time used:                                    
0.21s.                                              

In [23]:
# run_lgb_dart(eval_num=10)