In [1]:
import os
import numpy as np
import pandas as pd
import warnings
from tqdm import tqdm
import random
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold

from keras import backend as K
from keras.preprocessing import text, sequence
from keras import Model
from keras.layers import Conv1D, Embedding, Input, Bidirectional, CuDNNLSTM, Dense, Concatenate, Masking, LSTM, SpatialDropout1D
from keras.layers import BatchNormalization, Dropout, Activation
from keras.layers import GlobalMaxPool1D, GlobalAveragePooling1D, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.layers import Subtract, Multiply
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, Callback
from keras.utils import to_categorical
from keras_radam import RAdam
from keras_lookahead import Lookahead

# os.environ["CUDA_VISIBLE_DEVICES"] = '1'
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
def fix_seed(seed):
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)

seed = 2021
fix_seed(seed)
# gpus = tf.config.experimental.list_physical_devices('GPU')
# print(gpus)
# for gpu in gpus:
#     tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
df_train = pd.read_csv('fact_checking_train.csv', sep='\t')
df_train['claim'] = df_train['author'] +' '+ df_train['claim'] 
df_test = pd.read_csv('fact_checking_test.csv', sep='\t')
df_test['claim'] = df_test['author'] +' '+ df_test['claim'] 
evidence = pd.read_csv('evidence.csv',sep='\t')
evidence.columns = ['ID','claim']
evidence['author'] = 'NaN'
evidence['label'] = -1
label_2_v = {'pants-fire':0,'false':1,'barely-true':2,'half-true':3,'mostly-true':4,'true':5}
df_train['label'] = df_train['label'].map(label_2_v)

df_data = evidence.append(df_train)
df_data = df_data.append(df_test)
# df_data = df_train.append(df_test)
df_data = df_data.reset_index(drop=True)
df_data.shape

(39799, 4)

In [4]:
df_train.head()

Unnamed: 0,ID,author,claim,label
0,0,Joe Biden,"Joe Biden Sanders’ “Medicare for All” plan ""wo...",1
1,1,Hillary Clinton,"Hillary Clinton McCain ""still thinks it's okay...",1
2,2,Facebook posts,Facebook posts Says a video shows Iranian rock...,1
3,3,Tom Barrett,"Tom Barrett ""No one on my staff has ever been ...",1
4,4,City of Atlanta,City of Atlanta Tyler Perry’s plan to turn a m...,3


In [5]:
I1 = np.load("I1.npy")

In [6]:
I1[0:5]

array([[ 6576, 19510,  1769, 13226,  2348],
       [19163, 15663,   747, 17304, 17501],
       [17224, 10359, 12239,   368,  7150],
       [  973, 12778, 13214,   545, 15582],
       [18714,  8887, 16241, 11511, 13224]], dtype=int64)

In [7]:
max_words_num = None
seq_len = 2000
seq_len = 200
embedding_dim = 32
col = 'claim'

print('Generate seqs')
os.makedirs('seqs', exist_ok=True)
seq_path = 'seqs/seqs_{}_{}.npy'.format(max_words_num, seq_len)
word_index_path = 'seqs/word_index_{}_{}.npy'.format(max_words_num, seq_len)
if not os.path.exists(seq_path) or not os.path.exists(word_index_path):
    tokenizer = text.Tokenizer(num_words=max_words_num, lower=False, filters='')
#     tokenizer.fit_on_texts(df_data[col].values.tolist())
    tokenizer.fit_on_texts(df_train[col].values.tolist())
    seqs = sequence.pad_sequences(tokenizer.texts_to_sequences(df_data[col].values.tolist()), maxlen=seq_len,
                                  padding='post', truncating='pre')
    word_index = tokenizer.word_index
        
    np.save(seq_path, seqs)
    np.save(word_index_path, word_index)

else:
    seqs = np.load(seq_path)
    word_index = np.load(word_index_path, allow_pickle=True).item()

embedding = np.zeros((len(word_index) + 1, embedding_dim))


env = seqs[0:20006].copy()
seqs = seqs[20006:].copy()

Generate seqs


In [8]:
embedding.shape

(43021, 32)

In [9]:
df_data['label'].value_counts()


-1.0    20006
 1.0     4462
 3.0     3171
 2.0     2980
 4.0     2898
 5.0     2256
 0.0     2233
Name: label, dtype: int64

In [10]:
os.makedirs('model', exist_ok=True)
os.makedirs('sub', exist_ok=True)
os.makedirs('prob', exist_ok=True)

In [11]:
all_index = [i for i in range(18000)]
# test_index = df_data[df_data['label'].isnull()].index.tolist()

In [12]:
def build_model(emb, seq_len):
    emb_layer = Embedding(
        input_dim=emb.shape[0],
        output_dim=emb.shape[1],
        input_length=seq_len,
    )
    
    seq = Input(shape=(seq_len, ))
    seq_emb = emb_layer(seq)
    
    seq_emb = SpatialDropout1D(rate=0.5)(seq_emb)

    lstm = Bidirectional(CuDNNLSTM(50, return_sequences=True))(seq_emb)
#     lstm = Bidirectional(LSTM(200, return_sequences=True))(seq_emb)
    
    
    lstm_avg_pool = GlobalAveragePooling1D()(lstm)
    lstm_max_pool = GlobalMaxPooling1D()(lstm)
    x = Concatenate()([lstm_avg_pool, lstm_max_pool])
    
    x = Dropout(0.5)(Activation(activation='relu')(BatchNormalization()(Dense(128)(x))))
    out = Dense(6, activation='softmax')(x)
    
    model = Model(inputs=seq, outputs=out)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Lookahead(RAdam()), metrics=['accuracy'])

    return model

def build_model_multi_input(emb, seq_len):
    emb_layer = Embedding(
        input_dim=emb.shape[0],
        output_dim=emb.shape[1],
        input_length=seq_len,
    )
    
    seq1 = Input(shape=(seq_len, ))
    seq2 = Input(shape=(seq_len, ))
    seq_emb1 = emb_layer(seq1)
    seq_emb2 = emb_layer(seq2)
    
    shared_lstm = Bidirectional(CuDNNLSTM(50, return_sequences=True))
    
    seq_emb1 = SpatialDropout1D(rate=0.5)(seq_emb1)
    seq_emb2 = SpatialDropout1D(rate=0.5)(seq_emb2)
    
    lstm1 = shared_lstm(seq_emb1)
    lstm2 = shared_lstm(seq_emb2)    
    
    lstm_avg_pool1 = GlobalAveragePooling1D()(lstm1)
    lstm_max_pool1 = GlobalMaxPooling1D()(lstm1)
    lstm_avg_pool2 = GlobalAveragePooling1D()(lstm2)
    lstm_max_pool2 = GlobalMaxPooling1D()(lstm2)
    lstm_multiply_1 = Multiply()([lstm_avg_pool1,lstm_avg_pool2])
    lstm_multiply_2 = Multiply()([lstm_max_pool1,lstm_max_pool2])
    lstm_subtract_1 = Subtract()([lstm_avg_pool1,lstm_avg_pool2])
    lstm_subtract_2 = Subtract()([lstm_max_pool1,lstm_max_pool2])
    
    
    x = Concatenate()([lstm_avg_pool1, lstm_max_pool1, lstm_avg_pool2, lstm_max_pool2,
                      lstm_multiply_1, lstm_multiply_2, lstm_subtract_1, lstm_subtract_2])
    
    x = Dropout(0.5)(Activation(activation='relu')(BatchNormalization()(Dense(128)(x))))
    out = Dense(6, activation='softmax')(x)
    
    model = Model(inputs=[seq1, seq2], outputs=out)

#     sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy'])

    return model

In [13]:
model = build_model_multi_input(embedding, seq_len)
print(model.summary())

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 32)      1376672     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dro

In [14]:
class Evaluator(Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.best_val_f1 = 0.
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def evaluate(self):
        y_true = self.y_val
        y_pred = self.model.predict(self.x_val).argmax(axis=1)
        f1 = f1_score(y_true, y_pred, average='macro')
        return f1

    def on_epoch_end(self, epoch, logs=None):
        val_f1 = self.evaluate()
        if val_f1 > self.best_val_f1:
            self.best_val_f1 = val_f1
        logs['val_f1'] = val_f1
        print(f'val_f1: {val_f1:.5f}, best_val_f1: {self.best_val_f1:.5f}')

In [15]:
bs = 2
monitor = 'val_f1'

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for fold_id, (train_index, val_index) in enumerate(kfold.split(all_index, df_train['label'])):
    train_x = seqs[train_index]
    val_x = seqs[val_index]
    
    train_env = env[I1[train_index][:,0]]
    val_env = env[I1[val_index][:,0]]

    label = df_train['label'].values
    train_y = label[train_index]
    val_y = label[val_index]
    
    model_path = 'model/lstm_{}.h5'.format(fold_id)
    checkpoint = ModelCheckpoint(model_path, monitor=monitor, verbose=1, save_best_only=True, mode='max', save_weights_only=True)
    earlystopping = EarlyStopping(monitor=monitor, patience=5, verbose=1, mode='max')
    reduce_lr = ReduceLROnPlateau(monitor=monitor, factor=0.5, patience=2, mode='max', verbose=1)
    
#     model = build_model(embedding, seq_len)
#     model.fit(train_x, train_y, batch_size=bs, epochs=30,
#               validation_data=(val_x, val_y),
#               callbacks=[Evaluator(validation_data=(val_x, val_y)), checkpoint, reduce_lr, earlystopping], verbose=1, shuffle=True)
    
    model = build_model_multi_input(embedding, seq_len)
    model.fit([train_x,train_env], train_y, batch_size=bs, epochs=30,
              validation_data=([val_x,val_env], val_y),
              callbacks=[Evaluator(validation_data=([val_x,val_env], val_y)), checkpoint, reduce_lr, earlystopping], verbose=1, shuffle=True)
    
    break


Train on 16200 samples, validate on 1800 samples
Epoch 1/30
val_f1: 0.23529, best_val_f1: 0.23529

Epoch 00001: val_f1 improved from -inf to 0.23529, saving model to model/lstm_0.h5
Epoch 2/30
val_f1: 0.17754, best_val_f1: 0.23529

Epoch 00002: val_f1 did not improve from 0.23529
Epoch 3/30
val_f1: 0.24795, best_val_f1: 0.24795

Epoch 00003: val_f1 improved from 0.23529 to 0.24795, saving model to model/lstm_0.h5
Epoch 4/30
val_f1: 0.24433, best_val_f1: 0.24795

Epoch 00004: val_f1 did not improve from 0.24795
Epoch 5/30
val_f1: 0.18762, best_val_f1: 0.24795

Epoch 00005: val_f1 did not improve from 0.24795

Epoch 00005: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.
Epoch 6/30
val_f1: 0.31163, best_val_f1: 0.31163

Epoch 00006: val_f1 improved from 0.24795 to 0.31163, saving model to model/lstm_0.h5
Epoch 7/30
val_f1: 0.28337, best_val_f1: 0.31163

Epoch 00007: val_f1 did not improve from 0.31163
Epoch 8/30

InternalError: Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 32, 50, 1, 200, 2, 50] 
	 [[{{node training/Adam/gradients/bidirectional_2_1/CudnnRNN_1_grad/CudnnRNNBackprop}}]]

In [None]:
env[I1[train_index][:,0]].shape