In [1]:
# -*- coding: utf-8 -*-
import lightgbm as lgb
import csv
import pandas as pd
import time
import json
from joblib import load, dump
from tqdm import tqdm
import re
from utils.features_ents import feature_ents
from utils.ner import ner
from sklearn.model_selection import train_test_split

In [2]:
class Train():
    """Train a LightGBM binary classifier that flags core entities in news.

    Feature rows come from ``feature_ents.combine_features``; labels are
    derived from each record's ``coreEntityEmotions`` list. The feature and
    label lists are persisted with joblib under ``features/`` and the fitted
    model under ``models/``.
    """

    def __init__(self):
        # Each line of this file is parsed with json.loads in train_ents,
        # i.e. one JSON object per line.
        self.train_data_path = "data/coreEntityEmotion_train.txt"

    def model_lgb(self, X, Y, process_num):
        """Fit a LightGBM model on (X, Y) and save it as a joblib file.

        Args:
            X: Feature rows, one per candidate (as collected by train_ents).
            Y: Labels aligned with ``X``; train_ents produces 0/1 values.
            process_num: Base name of the model file; the model is written to
                ``models/<process_num>.joblib``.
        """
        # Split into training and validation sets (90% / 10%, fixed seed).
        train_x, valid_x, train_y, valid_y = train_test_split(X, Y, test_size=0.1, random_state=0)
        lgb_train = lgb.Dataset(train_x, train_y)
        lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)

        # Training configuration passed verbatim to lgb.train.
        params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'cross_entropy'},
        'num_leaves': 31,
        'max_depth' : 3,
        'learning_rate': 0.1,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed' : 0
        }

        print("Training lgb model....")
        # NOTE(review): the early_stopping_rounds keyword is not accepted by
        # lgb.train in LightGBM >= 4.0 (use callbacks=[lgb.early_stopping(10)]
        # there) -- confirm which version this project pins before upgrading.
        gbm = lgb.train(params,lgb_train,num_boost_round=100,valid_sets=lgb_eval,early_stopping_rounds=10)
        print("Save model to "+process_num+".joblib")
        dump(gbm, "models/"+process_num+".joblib")

    def train_ents(self, max_news=9):
        """Build features/labels from the training file and train the model.

        Every line of ``self.train_data_path`` is parsed as JSON, turned into
        candidate feature rows via ``feature_ents.combine_features`` and
        labelled 1 when the candidate appears among the record's
        ``coreEntityEmotions`` entities, otherwise 0.

        Args:
            max_news: Maximum number of news records to read. Defaults to 9,
                the smoke-test cap that used to be hard-coded; pass ``None``
                to process the whole file (required for the holdout snapshot
                at record 28000 to be written).
        """
        fea_ents = feature_ents()
        X = []
        Y = []
        # The context manager closes the file even if parsing or feature
        # extraction raises; the encoding is explicit so reading the JSON
        # text does not depend on the machine's locale.
        with open(self.train_data_path, encoding="utf-8") as train_data:
            for count, line in enumerate(tqdm(train_data), start=1):
                # Smoke-test cap: stop once max_news records have been consumed.
                if max_news is not None and count > max_news:
                    break
                news = json.loads(line)
                candidates = fea_ents.combine_features(news)
                gold_entities = [item['entity'] for item in news['coreEntityEmotions']]
                for cand in candidates:
                    # cand[0][0] is presumably the candidate entity text and
                    # cand[1] its feature row -- verify against feature_ents.
                    Y.append(1 if cand[0][0] in gold_entities else 0)
                    X.append(cand[1])
                if count == 28000:
                    # NOTE(review): this snapshot holds the first 28000 records'
                    # features, which are also part of the final x1/y1 dump
                    # below -- confirm that overlap is intended for a holdout.
                    print("Save features for holdout... ")
                    dump(X, "features/holdout_x1.joblib")
                    dump(Y, "features/holdout_y1.joblib")
        print("Save features... ")
        dump(X, "features/x1.joblib")
        dump(Y, "features/y1.joblib")
        self.model_lgb(X, Y, "model1")
        print("done!")
        
if __name__ == "__main__":
    # Entry point: runs the whole pipeline (feature extraction, feature dump,
    # model training). `train` stays bound at module level afterwards.
    train = Train()
    train.train_ents()

0it [00:00, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.383 seconds.
Prefix dict has been built succesfully.
9it [00:02,  2.94it/s]


Save features... 
Training lgb model....
[1]	valid_0's xentropy: 0.0345824
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's xentropy: 0.0351084
[3]	valid_0's xentropy: 0.0356076
[4]	valid_0's xentropy: 0.0360813
[5]	valid_0's xentropy: 0.0365295
[6]	valid_0's xentropy: 0.0371864
[7]	valid_0's xentropy: 0.0378312
[8]	valid_0's xentropy: 0.0384247
[9]	valid_0's xentropy: 0.0389891
[10]	valid_0's xentropy: 0.0395244
[11]	valid_0's xentropy: 0.0397566
Early stopping, best iteration is:
[1]	valid_0's xentropy: 0.0345824
Save model to model1.joblib
done!
