In [1]:
import pandas as pd
import numpy as np

In [5]:
# Read the training split and then the test split from the shared data folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/new_data/train_set.csv',
                     '../data/new_data/test_set.csv')
)

## 特征工程

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer over word tokens, using unigrams and bigrams.
word_vec = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=3,           # ignore rare terms (present in fewer than 3 documents)
    max_df=0.9,         # ignore very common terms (present in more than 90% of documents)
    sublinear_tf=True,  # dampen term counts: 1 + log(tf) instead of raw tf
    use_idf=True,
    smooth_idf=True,
)

In [7]:
# Learn the vocabulary and IDF weights from the training text only, then
# map the test text into that same feature space.
train_doc = word_vec.fit_transform(df_train.loc[:, 'word_seg'])
test_doc = word_vec.transform(df_test.loc[:, 'word_seg'])

In [8]:
# Sanity check: show which container type the vectorizer produced.
print(train_doc.__class__)

<class 'scipy.sparse.csr.csr_matrix'>


In [None]:
# Second TF-IDF vectorizer, configured identically to word_vec but fitted on
# the 'article' column instead of 'word_seg'.
# NOTE(review): analyzer='word' is used despite the char_ prefix — presumably
# 'article' holds whitespace-separated character tokens; confirm.
char_vec = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=3,           # ignore rare terms (present in fewer than 3 documents)
    max_df=0.9,         # ignore very common terms (present in more than 90% of documents)
    sublinear_tf=True,  # dampen term counts: 1 + log(tf) instead of raw tf
    use_idf=True,
    smooth_idf=True,
)
# Fit on the training articles only; the test articles reuse the fitted vocabulary.
train_char = char_vec.fit_transform(df_train.loc[:, 'article'])
test_char = char_vec.transform(df_test.loc[:, 'article'])

In [None]:
from scipy.sparse import hstack

# Concatenate the word-level and article-level TF-IDF matrices column-wise,
# first for the training rows and then for the test rows.
# NOTE(review): train_all / test_all are not referenced by the later cells of
# this notebook (the split and the prediction use train_doc / test_doc).
train_all, test_all = (
    hstack([word_part, char_part])
    for word_part, char_part in ((train_doc, train_char), (test_doc, test_char))
)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode the raw 'class' values as integer ids in a new 'label' column.
# The fitted encoder is kept so predictions can be decoded back later.
lb = LabelEncoder()
df_train['label'] = lb.fit_transform(df_train.loc[:, 'class'])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the shuffle is seeded so
# the split is reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_doc,
    df_train['label'],
    test_size=0.2,
    shuffle=True,
    random_state=2019,
)

## 模型训练

In [11]:
import lightgbm as lgb

In [12]:
# Gradient-boosted decision tree classifier for the multiclass problem.
# `colsample_bylevel` was dropped from the argument list: it is an XGBoost
# parameter, not a LightGBM parameter or alias, so LightGBM only warned about
# it and ignored the value.
# NOTE(review): in LightGBM, `subsample` only takes effect when
# `subsample_freq` > 0 (its default is 0) — confirm whether row bagging was
# intended here.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=2**5,
    max_depth=-1,             # no explicit depth limit
    learning_rate=0.1,
    n_estimators=2000,        # upper bound on boosting rounds
    objective='multiclass',
    subsample=0.7,
    colsample_bytree=0.5,     # fraction of features sampled per tree
    reg_lambda=10,            # L2 regularization strength
    n_jobs=16,
    num_class=19,
    silent=True,
    random_state=2019,
    min_child_weight=1.5,
    metric='multi_logloss',
)

In [None]:
# Train on the training split, evaluating on the hold-out split each round and
# stopping once the validation metric has not improved for 100 consecutive
# rounds. Early stopping is requested through a callback because the
# `early_stopping_rounds` keyword of fit() was removed in LightGBM 4.0; the
# callback form works on both older and newer releases.
model.fit(
    x_train,
    y_train,
    eval_set=[(x_valid, y_valid)],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

## 模型验证

In [None]:
from sklearn.metrics import f1_score

# Score the model on the hold-out split with macro-averaged F1.
# Predictions are stored in their own variables so the ground-truth `y_valid`
# is not overwritten; the score then compares truth against prediction
# (both in the integer-encoded label space).
valid_prob = model.predict_proba(x_valid)
# Index of the most probable class per row; assumes probability column i
# corresponds to encoded label i (i.e. every class occurs in y_train) — TODO confirm.
pred_valid = np.argmax(valid_prob, axis=1)
valid_score = f1_score(y_valid, pred_valid, average='macro')
print("valid score is {}".format(valid_score))

## 预测提交

In [9]:
# Class probabilities for the test set, taken from the classifier trained in
# this notebook (`model`). The previous name `clf` is not defined anywhere in
# the notebook, so the cell failed with NameError on a fresh kernel.
test_prob = model.predict_proba(test_doc)

In [10]:
# Take the most probable class index per test row, decode it back to the
# original class value, and write the id/class pairs as the submission file.
test_pred = test_prob.argmax(axis=1)
df_test['class'] = lb.inverse_transform(test_pred)
df_test[["id", "class"]].to_csv(
    "submission_traditional_baseline.csv",
    index=False,
    header=True,
    encoding='utf-8',
)