In [1]:
# 代码链接
# https://mp.weixin.qq.com/s/m01J5Mi25txyRkKo7_BAuw

# 3. 简单实现fastText
# coding: utf-8
from __future__ import unicode_literals

from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.layers import Dense

VOCAB_SIZE = 2000
EMBEDDING_DIM = 100
MAX_WORDS = 500
CLASS_NUM = 5


def build_fastText():
    model = Sequential()
    # 将词汇数VOCAB_SIZE映射为EMBEDDING_DIM维
    model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_WORDS))
    # 平均文档中所有词的embedding
    model.add(GlobalAveragePooling1D())
    # softmax分类
    model.add(Dense(CLASS_NUM, activation='softmax'))
    # 定义损失函数、优化器、分类度量指标
    model.compile(loss='categorical_crossentropy', optimizer='SGD', metrics=['accuracy'])
    return model

if __name__ == '__main__':
    model = build_fastText()
    print(model.summary())

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          200000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 200,505
Trainable params: 200,505
Non-trainable params: 0
_________________________________________________________________
None


In [3]:
# 4. 使用fastText文本分类
# 4.1 加载库
import time
import numpy as np
import fasttext
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold

# 4.2 fastText分类
# 主要超参数：
# lr: 学习率
#
# dim: 词向量的维度
#
# epoch: 每轮的个数
#
# wordNgrams: 词的n-gram，一般设置为2或3
#
# loss: 损失函数 ns(negative sampling, 负采样)、hs(hierarchical softmax, 分层softmax)、softmax、ova(One-VS-ALL)

def fasttext_model(nrows, train_num, lr=1.0, wordNgrams=2, minCount=1, epoch=25, loss='hs', dim=100):
    start_time = time.time()

    # 转换为FastText需要的格式
    train_df = pd.read_csv('data/train_set.csv', sep='\t', nrows=nrows)

    # shuffle
    train_df = shuffle(train_df, random_state=666)

    train_df['label_ft'] = '__label__' + train_df['label'].astype('str')
    train_df[['text', 'label_ft']].iloc[:train_num].to_csv('data/fastText_train.csv', index=None, header=None, sep='\t')

    # 调用fasttext的方法有监督的训练
    model = fasttext.train_supervised('data/fastText_train.csv', lr=lr, wordNgrams=wordNgrams, verbose=2,
                                      minCount=minCount, epoch=epoch, loss=loss, dim=dim)

    train_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[:train_num]['text']]
    print('Train f1_score:', f1_score(train_df['label'].values[:train_num].astype(str), train_pred, average='macro'))
    val_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[train_num:]['text']]
    print('Val f1_score:', f1_score(train_df['label'].values[train_num:].astype(str), val_pred, average='macro'))
    train_time = time.time()
    print('Train time: {:.2f}s'.format(train_time - start_time))  # 输出训练时间

     # 预测并保存  
    test_df = pd.read_csv('data/test_a.csv')  #pd pandas

    test_pred = [model.predict(x)[0][0].split('__')[-1] for x in test_df['text']]
    test_pred = pd.DataFrame(test_pred, columns=['label'])
    test_pred.to_csv('data/test_fastText_ridgeclassifier.csv', index=False)
    print('Test predict saved.')
    end_time = time.time()
    print('Predict time:{:.2f}s'.format(end_time - train_time))  # 输出预测时间


if __name__ == '__main__':
    nrows = 200000
    train_num = int(nrows * 0.7)  # 只使用部分训练数据
    lr=0.01
    wordNgrams=2
    minCount=1
    epoch=25
    loss='hs'  # 层softmax

    fasttext_model(nrows, train_num)

Train f1_score: 0.9984212272599677
Val f1_score: 0.9112293284548515
Train time: 628.02s
Test predict saved.
Predict time:19.34s


In [8]:
def fasttext_kfold_model(nrows, train_num, n_splits, lr=1.0, wordNgrams=2, minCount=1, epoch=25, loss='hs', dim=100):
    start_time = time.time()

    # 转换为FastText需要的格式
    train_df = pd.read_csv('data/train_set.csv', sep='\t', nrows=nrows)

    # shuffle
    train_df = shuffle(train_df, random_state=666)

    train_df['label_ft'] = '__label__' + train_df['label'].astype('str')

    models = []
    train_scores = []
    val_scores = []

    # K折交叉验证
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=666)
    for train_index, test_index in skf.split(train_df['text'], train_df['label_ft']):
        train_df[['text', 'label_ft']].iloc[train_index].to_csv('data/fastText_train.csv', index=None, header=None, sep='\t')

        model = fasttext.train_supervised('data/fastText_train.csv', lr=lr, wordNgrams=wordNgrams, verbose=2,
                                          minCount=minCount, epoch=epoch, loss=loss)
        models.append(model)

        train_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[train_index]['text']]
        train_score = f1_score(train_df['label'].values[train_index].astype(str), train_pred, average='macro')
        # print('Train length: ', len(train_pred))
        print('Train score: ', train_score)
        train_scores.append(train_score)

        val_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[test_index]['text']]
        val_score = f1_score(train_df['label'].values[test_index].astype(str), val_pred, average='macro')
        # print('Val length: ', len(val_pred))
        print('Val score', val_score)
        val_scores.append(val_score)

    print('mean train score: ', np.mean(train_scores))
    print('mean val score: ', np.mean(val_scores))
    train_time = time.time()
    print('Train time: {:.2f}s'.format(train_time - start_time))

    return models

def fasttext_kfold_predict(models, n_splits):

    pred_list = []

    start_time = time.time()
    # 预测并保存
    test_df = pd.read_csv('data/test_a.csv')

    # 消耗时间较长
    for model in models:
        test_pred = [model.predict(x)[0][0].split('__')[-1] for x in test_df['text']]
        pred_list.append(test_pred)

    test_pred_label = pd.DataFrame(pred_list).T.apply(lambda row: np.argmax(np.bincount([row[i] for i in range(n_splits)])), axis=1)
    test_pred_label.columns='label'

    test_pred_label.to_csv('data/test_fastText_ridgeclassifier.csv', index=False)
    print('Test predict saved.')
    end_time = time.time()
    print('Predict time:{:.2f}s'.format(end_time - start_time))


if __name__ == '__main__':
  nrows = 200000
  train_num = int(nrows * 0.7)
  n_splits = 3
  lr=0.1
  wordNgrams=2
  minCount=1
  epoch=25
  loss='hs'
  dim=200
  models = fasttext_kfold_model(nrows, train_num, n_splits, lr=lr, wordNgrams=wordNgrams, minCount=minCount, epoch=epoch, loss=loss, dim=dim)
  fasttext_kfold_predict(models, n_splits=n_splits)


Train score:  0.9624907616345785
Val score 0.9091945705382832
Train score:  0.9632814673743758
Val score 0.9123353579952134
Train score:  0.9622628114148392
Val score 0.9059789133997882
mean train score:  0.9626783468079312
mean val score:  0.9091696139777617
Train time: 1538.12s
Test predict saved.
Predict time:54.81s
