In [0]:
# Colab magic: select the TensorFlow 2.x runtime; run this before TensorFlow is imported.
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!cp /content/drive/My\ Drive/Datasets/DaGuan/TextClassification/new_data.7z ./
!7z x new_data.7z > /null

In [0]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

In [0]:
# Load the train and test splits from the extracted new_data/ directory.
df_train = pd.read_csv('new_data/train_set.csv')
df_test = pd.read_csv('new_data/test_set.csv')

In [0]:
# Stack train and test rows into one frame so the corpus-wide steps below
# (Word2Vec training, tokenizer fitting) see every document.
# sort=False keeps columns in their original order and silences the pandas
# FutureWarning raised when the two frames' columns are not aligned.
df = pd.concat([df_train, df_test], axis=0, sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [0]:
# Sanity check: (rows, columns) of the combined train + test frame.
df.shape

(204554, 4)

In [0]:
# Split each space-separated document into its list of tokens (Word2Vec input format).
sentences = [doc.split(" ") for doc in df['word_seg'].to_list()]

In [0]:
# Word2Vec hyper-parameters (gensim 3.x keyword names), grouped by purpose.
w2v_params = dict(
    # embedding size, context window, and minimum token frequency
    size=200,
    window=5,
    min_count=2,
    # architecture and objective settings
    sg=0,
    cbow_mean=1,
    hs=0,
    negative=5,
    ns_exponent=0.75,
    sample=0.001,
    # optimisation schedule
    alpha=0.025,
    min_alpha=0.0001,
    iter=10,
    # execution
    seed=2019,
    workers=12,
)
# Train word vectors on the tokenised train + test corpus.
model = Word2Vec(sentences=sentences, **w2v_params)

In [0]:
# Persist the trained Word2Vec model to disk so it can be reloaded without retraining.
model.save('word2vec.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Quick sanity check: display the learned vector for one sample token.
model.wv['816903']

array([-9.11200121e-02,  6.01657450e-01,  1.19949512e-01,  1.44471455e+00,
       -1.96722901e+00, -5.77473819e-01,  3.42355758e-01,  2.17954350e+00,
        8.14185143e-01,  1.15457559e+00, -1.00908613e+00,  1.30161107e-01,
       -3.43487948e-01,  7.21720934e-01,  7.22807586e-01,  9.73198056e-01,
       -1.33312917e+00,  1.66020834e+00,  7.18180835e-01,  1.72288013e+00,
        9.36558023e-02, -1.72128290e-01,  7.30379879e-01,  1.56291556e+00,
       -3.61094624e-01, -1.22902024e+00, -4.39798594e-01,  9.66652185e-02,
       -1.23460698e+00,  1.28045660e-02,  7.29311466e-01,  2.91048624e-02,
        1.08619976e+00, -4.14549321e-01,  6.43083572e-01, -1.33042768e-01,
       -9.41847086e-01,  4.18553501e-02,  1.13867855e+00, -3.45869005e-01,
        1.68908083e+00,  7.86846340e-01, -1.50119352e+00,  1.48496300e-01,
       -1.23894715e+00, -5.36717892e-01, -8.91485810e-01, -1.19540855e-01,
       -1.06919289e+00,  1.50878143e+00,  1.27489781e+00, -1.59103706e-01,
       -3.98129791e-01,  

## 数据准备

In [0]:
import tensorflow as tf

In [0]:
# Fit a Keras tokenizer on every document (train + test).
# lower=False and filters="" are passed so tokens are neither lowercased nor
# stripped of any characters; num_words=50000 is the tokenizer's vocabulary cap.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=50000,
    lower=False,
    filters=""
)
tokenizer.fit_on_texts(df['word_seg'].tolist())

In [0]:
# Word -> integer index mapping built by the fitted tokenizer.
vocab = tokenizer.word_index

In [0]:
# Encode each split's documents as lists of integer word ids.
train_, test_ = (
    tokenizer.texts_to_sequences(frame['word_seg'].values)
    for frame in (df_train, df_test)
)

In [0]:
# 95th percentile of the encoded training-sequence lengths.
np.percentile([len(seq) for seq in train_], 95)

1822.199999999997

In [0]:
# Bring every sequence in both splits to exactly 1800 ids, padding and
# truncating at the front ('pre') and filling with 0.
train_, test_ = (
    tf.keras.preprocessing.sequence.pad_sequences(
        seqs, maxlen=1800, padding='pre', truncating='pre', value=0.0
    )
    for seqs in (train_, test_)
)

In [0]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
# Map the raw class values to integer ids with `lb` (kept for decoding
# predictions later), then one-hot encode them for the softmax output.
lb = LabelEncoder()
train_label = to_categorical(lb.fit_transform(df_train['class'].values))

## 词嵌入

In [0]:
# Reload the Word2Vec model saved to 'word2vec.model'; this rebinds the name `model`.
model = Word2Vec.load('word2vec.model')

In [0]:
# Build the embedding matrix: row i holds the vector of the word whose
# tokenizer index is i. One extra row is allocated so index len(vocab) is valid;
# any row the loop never assigns (index 0 when word indices start at 1) stays
# all-zero.
embedding_dim = model.wv.vector_size  # take the width from the model, not a literal
count = 0  # number of vocabulary words that have a pretrained vector
embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))
# Dedicated, seeded generator so the random fallback vectors are reproducible.
rng = np.random.RandomState(2019)
for word, i in vocab.items():
    # Test membership on the KeyedVectors (`model.wv`): `word in model` is a
    # deprecated Word2Vec method in gensim 3.x and is removed in gensim 4.
    if word in model.wv:
        count += 1
        embedding_matrix[i] = model.wv[word]
    else:
        # Word has no pretrained vector: use small zero-mean random noise.
        unknown_vec = rng.random_sample(embedding_dim) * 0.5
        embedding_matrix[i] = unknown_vec - unknown_vec.mean()

  after removing the cwd from sys.path.


## 模型构建

In [0]:
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, Dense, BatchNormalization, Activation, Dropout, GRU

In [0]:
def build_model(sequence_length, embedding_weight, class_num):
    """Build and compile a two-layer bidirectional GRU text classifier.

    Args:
        sequence_length: Number of word ids in each (padded) input sequence.
        embedding_weight: 2-D array of pretrained embeddings; its first
            dimension is used as the vocabulary size and its second as the
            embedding width. Loaded into a non-trainable Embedding layer.
        class_num: Number of target classes, i.e. the width of the softmax
            output layer.

    Returns:
        A tf.keras Model compiled with categorical cross-entropy loss, the
        Adam optimizer and an accuracy metric.
    """
    content = Input(shape=(sequence_length, ), dtype='int32')
    embedding = Embedding(
        name='word_embedding',
        input_dim=embedding_weight.shape[0],
        weights=[embedding_weight],
        output_dim=embedding_weight.shape[1],
        trainable=False
    )
    x = SpatialDropout1D(0.2)(embedding(content))
    x = Bidirectional(GRU(200, return_sequences=True))(x)
    x = Bidirectional(GRU(200, return_sequences=True))(x)

    # Summarise the GRU output sequence with both average and max pooling over time.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])

    # Two dense blocks (Dense -> BatchNorm -> ReLU); dropout after the first only.
    x = Dense(1000)(conc)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(500)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)

    # Size the output layer from class_num instead of a hard-coded 19 so the
    # parameter is actually honoured.
    output = Dense(class_num, activation='softmax')(x)
    model = tf.keras.models.Model(inputs=content, outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [0]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import gc

In [None]:
# K-fold cross-validation: train a fresh model per fold, keep the out-of-fold
# probabilities for the training set and each fold's probabilities for the
# test set.
n_splits = 10
n_classes = train_label.shape[1]  # width of the one-hot label matrix
seq_len = train_.shape[1]         # padded sequence length
kf = KFold(n_splits=n_splits, shuffle=True, random_state=2019)
train_pre_matrix = np.zeros((df_train.shape[0], n_classes))
test_pre_matrix = np.zeros((n_splits, df_test.shape[0], n_classes))
cv_scores = []

for i, (train_index, valid_index) in enumerate(kf.split(train_)):
    x_train, x_valid = train_[train_index, :], train_[valid_index, :]
    y_train, y_valid = train_label[train_index], train_label[valid_index]
    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(64)
    # valid_ds and test_ds are not shuffled, so prediction rows stay aligned
    # with valid_index and with df_test respectively.
    valid_ds = tf.data.Dataset.from_tensor_slices((x_valid, y_valid)).batch(64)
    # The zero array is a placeholder target so the dataset has the same
    # (inputs, targets) structure as the others; it is not used as real labels.
    test_ds = tf.data.Dataset.from_tensor_slices((test_, np.zeros((test_.shape[0], n_classes)))).batch(64)

    model = build_model(seq_len, embedding_matrix, n_classes)
    model.fit(train_ds, epochs=30, validation_data=valid_ds, verbose=1)

    # Decode predicted and true one-hot rows back to the original class values.
    valid_prob = model.predict(valid_ds)
    valid_pred = lb.inverse_transform(np.argmax(valid_prob, axis=1))
    valid_true = lb.inverse_transform(np.argmax(y_valid, axis=1))
    # Keep the score under its own name: assigning it to `f1_score` would
    # shadow the sklearn function and make the next fold's call fail.
    fold_f1 = f1_score(valid_true, valid_pred, average='macro')
    cv_scores.append(fold_f1)
    print("F1 score", fold_f1)

    train_pre_matrix[valid_index, :] = valid_prob
    test_pre_matrix[i, :, :] = model.predict(test_ds)

    # Release the fold's model before building the next one.
    del model
    gc.collect()
    tf.keras.backend.clear_session()

print("Mean F1 score", np.mean(cv_scores))
np.save('test.npy', test_pre_matrix)

## 结果融合

In [0]:
# Average the per-fold class probabilities, pick the most likely class for
# each test document, decode it back to the original class value and write
# the submission file.
res = np.load('test.npy')
res_mean = np.mean(res, axis=0)
# Fixed typo: the original referenced the undefined name `res_maean`.
test_pred = lb.inverse_transform(np.argmax(res_mean, axis=1))
df_test['class'] = test_pred
df_test[['id', 'class']].to_csv('submission.csv', index=False, header=True, encoding='utf8')