In [1]:
import os
import keras
import numpy as np
import pandas as pd
import multiprocess as mp
import jieba.posseg as pseg
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [2]:
TRAIN_CSV_PATH = 'train.csv'
TEST_CSV_PATH = 'test.csv'
TOKENIZED_TRAIN_CSV_PATH = 'tokenized_train.csv'
TOKENIZED_TEST_CSV_PATH = 'tokenized_test.csv'

In [3]:
train = pd.read_csv(TRAIN_CSV_PATH, index_col='id')
train.head(3)

Unnamed: 0_level_0,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated


In [4]:
train = train.loc[:, ['title1_zh', 'title2_zh', 'label']]
train.head(3)

Unnamed: 0_level_0,title1_zh,title2_zh,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,unrelated
3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,unrelated
1,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,unrelated


In [5]:
train.isna().any()

title1_zh    False
title2_zh     True
label        False
dtype: bool

In [6]:
train.title2_zh.fillna('UNKNOWN', inplace=True)
train.isna().any()

title1_zh    False
title2_zh    False
label        False
dtype: bool

In [7]:
def jieba_tokenizer(text):
    words = pseg.cut(text)
    return ' '.join([word for word, flag in words if flag != 'x'])

def process(data):
    res = data.apply(jieba_tokenizer)
    return res

def check_merge_idx(data, res):
    assert((data.index == res.index).all(), 'Something error when merge data')

def parallelize(data, func):
    cores = partitions = mp.cpu_count()
    data_split = np.array_split(data, partitions)
    pool = mp.Pool(cores)
    res = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    check_merge_idx(data, res)
    return res

  assert((data.index == res.index).all(), 'Something error when merge data')


In [8]:
np.all(train.index == train.title1_zh.index)

True

In [9]:
if os.path.exists(TOKENIZED_TRAIN_CSV_PATH):
    print('Use prepared tokenized train data')
    train = pd.read_csv(TOKENIZED_TRAIN_CSV_PATH, index_col='id')
else:
    print('Start to training')
    train['title1_tokenized'] = parallelize(train.loc[:, 'title1_zh'], process)
    train['title2_tokenized'] = parallelize(train.loc[:, 'title2_zh'], process)
    train.to_csv('tokenized_train.csv', index=True)

Use prepared tokenized train data


In [10]:
train.fillna('UNKNOWN', inplace=True)
train.head(3)

Unnamed: 0_level_0,title1_zh,title2_zh,label,title1_tokenized,title2_tokenized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,unrelated,2017 养老保险 又 新增 两项 农村 老人 人人 可 申领 你 领到 了 吗,警方 辟谣 鸟巢 大会 每人 领 5 万 仍 有 老人 坚持 进京
3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,unrelated,你 不 来 深圳 早晚 你 儿子 也 要 来 不出 10 年 深圳 人均 GDP 将 超 香港,深圳 GDP 首 超 香港 深圳 统计局 辟谣 只是 差距 在 缩小
1,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,unrelated,你 不 来 深圳 早晚 你 儿子 也 要 来 不出 10 年 深圳 人均 GDP 将 超 香港,GDP 首 超 香港 深圳 澄清 还 差 一点点


In [11]:
corpus_x1 = train.title1_tokenized
corpus_x2 = train.title2_tokenized
corpus = pd.concat([corpus_x1, corpus_x2])
corpus.shape

(641104,)

In [12]:
pd.DataFrame(corpus.iloc[:5], columns=['title'])

Unnamed: 0_level_0,title
id,Unnamed: 1_level_1
0,2017 养老保险 又 新增 两项 农村 老人 人人 可 申领 你 领到 了 吗
3,你 不 来 深圳 早晚 你 儿子 也 要 来 不出 10 年 深圳 人均 GDP 将 超 香港
1,你 不 来 深圳 早晚 你 儿子 也 要 来 不出 10 年 深圳 人均 GDP 将 超 香港
2,你 不 来 深圳 早晚 你 儿子 也 要 来 不出 10 年 深圳 人均 GDP 将 超 香港
9,用 大蒜 鉴别 地沟油 的 方法 怎么 鉴别 地沟油


In [13]:
corpus.isna().any()

False

In [14]:
MAX_NUM_WORDS = 10000
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(corpus)
x1_train = tokenizer.texts_to_sequences(corpus_x1)
x2_train = tokenizer.texts_to_sequences(corpus_x2)

In [15]:
len(x1_train)

320552

In [16]:
x1_train[:1]

[[217, 1268, 32, 1178, 5967, 25, 489, 2877, 116, 5559, 4, 1850, 2, 13]]

In [17]:
for seq in x1_train[:1]:
    print([tokenizer.index_word[idx] for idx in seq])

['2017', '养老保险', '又', '新增', '两项', '农村', '老人', '人人', '可', '申领', '你', '领到', '了', '吗']


In [18]:
MAX_SEQUENCE_LENGTH = 20
x1_train = keras.preprocessing.sequence.pad_sequences(x1_train, maxlen=MAX_SEQUENCE_LENGTH)
x2_train = keras.preprocessing.sequence.pad_sequences(x2_train, maxlen=MAX_SEQUENCE_LENGTH)

In [19]:
for seq in x1_train + x2_train:
    assert len(seq) == 20

In [20]:
train.label[:5]

id
0    unrelated
3    unrelated
1    unrelated
2    unrelated
9       agreed
Name: label, dtype: object

In [21]:
label_to_index = {
    'unrelated': 0, 
    'agreed': 1, 
    'disagreed': 2
}
y_train = train.label.apply(lambda x: label_to_index[x])
y_train = np.asarray(y_train).astype('float32')
y_train[:5]

array([0., 0., 0., 0., 1.], dtype=float32)

In [22]:
y_train = keras.utils.to_categorical(y_train)
y_train[:5]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.]], dtype=float32)

In [23]:
VALIDATION_RATIO = 0.1
RANDOM_STATE = 9527

x1_train, x1_val, x2_train, x2_val, y_train, y_val = train_test_split(
    x1_train, x2_train, y_train, test_size=VALIDATION_RATIO, random_state=RANDOM_STATE)

In [24]:
print('Training Set')
print('-' * 10)
print(f'x1_train: {x1_train.shape}')
print(f'x2_train: {x2_train.shape}')
print(f'y_train : {y_train.shape}')
print('-' * 10)
print(f'x1_val:   {x1_val.shape}')
print(f'x2_val:   {x2_val.shape}')
print(f'y_val :   {y_val.shape}')
print('-' * 10)
print('Test Set')

Training Set
----------
x1_train: (288496, 20)
x2_train: (288496, 20)
y_train : (288496, 3)
----------
x1_val:   (32056, 20)
x2_val:   (32056, 20)
y_val :   (32056, 3)
----------
Test Set


In [25]:
NUM_CLASSES = 3
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 20
NUM_EMBEDDING_DIM = 256
NUM_LSTM_UNITS = 128

top_input = keras.Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
bm_input = keras.Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')

embedding_layer = keras.layers.Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
top_embedded = embedding_layer(top_input)
bm_embedded = embedding_layer(bm_input)

shared_lstm = keras.layers.LSTM(NUM_LSTM_UNITS)
top_output = shared_lstm(top_embedded)
bm_output = shared_lstm(bm_embedded)

merged = keras.layers.concatenate([top_output, bm_output], axis=-1)
dense = keras.layers.Dense(units=NUM_CLASSES, activation='softmax')
predictions = dense(merged)

model = keras.Model(inputs=[top_input, bm_input], outputs=predictions)
model.summary()

2022-04-23 16:45:24.313273: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 256)      2560000     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 128)          197120      embedding[0][0]              

In [26]:
lr = 1e-3
opt = Adam(learning_rate=lr, decay=lr/50)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [27]:
BATCH_SIZE = 512
NUM_EPOCHS = 50

history = model.fit(
    x=[x1_train, x2_train], 
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=([x1_val, x2_val], y_val),
    shuffle=True
)

Epoch 1/50


2022-04-23 16:45:24.757301: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [32]:
if os.path.exists(TOKENIZED_TEST_CSV_PATH):
    print('Use prepared tokenized test data')
    test = pd.read_csv(TOKENIZED_TEST_CSV_PATH, index_col='id')
else:
    print('Use raw test data')
    test = pd.read_csv(TEST_CSV_PATH, index_col='id')
    test = test.loc[:, ['title1_zh', 'title2_zh']]
    test.fillna('UNKNOWN', inplace=True)
    test['title1_tokenized'] = parallelize(test.loc[:, 'title1_zh'], process)
    test['title2_tokenized'] = parallelize(test.loc[:, 'title2_zh'], process)
    test.to_csv('tokenized_test.csv', index=True)

Use prepared tokenized test data


In [33]:
test.fillna('UNKNOWN', inplace=True)
test.head(3)

Unnamed: 0_level_0,title1_zh,title2_zh,title1_tokenized,title2_tokenized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
321187,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,萨拉 赫 人气 爆棚 埃及 总统大选 未 参选 获 百万 选票 现任 总统 压力 山 大,辟谣 里昂 官方 否认 费 基尔 加盟 利物浦 难道 是 价格 没 谈拢
321190,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,萨达姆 被捕 后 告诫 美国 的 一句 话 发人深思,10 大 最 让 美国 人 相信 的 荒诞 谣言 如 蜥蜴人 掌控 着 美国
321189,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,萨达姆 此项 计划 没有 此国 破坏 的话 美国 还 会 对 伊拉克 发动战争 吗,萨达姆 被捕 后 告诫 美国 的 一句 话 发人深思


In [34]:
x1_test = tokenizer.texts_to_sequences(test.title1_tokenized)
x2_test = tokenizer.texts_to_sequences(test.title2_tokenized)

x1_test = keras.preprocessing.sequence.pad_sequences(x1_test, maxlen=MAX_SEQUENCE_LENGTH)
x2_test = keras.preprocessing.sequence.pad_sequences(x2_test, maxlen=MAX_SEQUENCE_LENGTH)    

predictions = model.predict([x1_test, x2_test])
predictions[:5]

array([[9.9999952e-01, 1.9164672e-19, 4.2101283e-07],
       [1.0000000e+00, 5.6914282e-13, 9.1929320e-12],
       [9.8432922e-01, 1.5668621e-02, 2.2136023e-06],
       [1.0000000e+00, 4.8881038e-13, 4.5146115e-10],
       [9.9999809e-01, 1.8821100e-10, 1.8697018e-06]], dtype=float32)

In [35]:
index_to_label = {v: k for k, v in label_to_index.items()}
test['Category'] = [index_to_label[idx] for idx in np.argmax(predictions, axis=1)]
submission = test.loc[:, ['Category']].reset_index()
submission.columns = ['Id', 'Category']
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,Id,Category
0,321187,unrelated
1,321190,unrelated
2,321189,unrelated
3,321193,unrelated
4,321191,unrelated
