In [None]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GlobalAveragePooling1D, Lambda
from keras.layers import concatenate
from keras.models import Model
from keras.layers.normalization.batch_normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K

from BertEmbeddings import BertEmbeddings
from transformers import RobertaTokenizer, TFRobertaModel
from transformers import DebertaV2Tokenizer, TFDebertaV2Model

In [None]:
# ---------------------------------------------------------------------------
# Run configuration: data files, sequence limits, randomly drawn network
# sizes / dropout rates, and the embedding + network selectors.
# ---------------------------------------------------------------------------

# The name below should be changed according to different courses
file = "COMP 301 Summer 2021.csv"
# Strip only the final extension (os.path.splitext) so that a name containing
# additional dots, e.g. "COMP 301 v1.2.csv", still yields the full stem.
_file_stem = os.path.splitext(file)[0]
Train_Data_File = _file_stem + " (train).csv"
Test_Data_File = _file_stem + " (test).csv"
Max_Sequence_Length = 60   # maxlen handed to pad_sequences
Max_Num_Words = 200000     # num_words handed to the Keras Tokenizer

Validation_Split_Ratio = 0.2  # fraction of the training rows held out for validation

# Hyper-parameters are drawn at random on every run (no seed is set here).
Num_Lstm = np.random.randint(175, 275)
Num_Dense = np.random.randint(100, 150)
Rate_Drop_Lstm = 0.15 + np.random.rand() * 0.25
Rate_Drop_Dense = 0.15 + np.random.rand() * 0.25

act_f = 'relu'  # activation of the dense layers in the LSTM model

# embedding_method:
# 1. Bert (will use Bert large)
# 2. DeBerta (will use DeBerta-v2-xlarge)
# 3. GloVe
# 4. RoBerta (will use RoBerta-large)
# 5. Word2Vec
embedding_model = "Bert"

# Support neural network:
# LSTM
# CNN
neural_network = "LSTM"

In [None]:
# Process text in dataset: read the train / test CSV files and pull out the
# columns the rest of the notebook works with.
print('Processing text dataset')

df_train = pd.read_csv(Train_Data_File, encoding='utf-8')

# An empty cell is read by pandas as NaN (a float), which cannot be tokenised
# as text later on; coerce every post to a (possibly empty) string.
train_content = df_train['Post'].fillna('').astype(str).tolist()
train_labels = df_train['Incomplete?'].tolist()

df_test = pd.read_csv(Test_Data_File, encoding='utf-8')

test_content = df_test['Post'].fillna('').astype(str).tolist()
test_ids = df_test['ID'].tolist()

In [None]:
# Tokenize words in all sentences.
# The vocabulary is fitted on the train and test posts together.
tokenizer = Tokenizer(num_words=Max_Num_Words)
tokenizer.fit_on_texts(train_content + test_content)

# Integer id sequences for each post
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(posts) for posts in (train_content, test_content)
)

word_index = tokenizer.word_index
print('{} unique tokens are found'.format(len(word_index)))

# Training set: pad every sequence to Max_Sequence_Length, labels as an array
train_data = pad_sequences(train_sequences, maxlen=Max_Sequence_Length)
train_labels = np.array(train_labels)
for caption, tensor in (('Shape of train data tensor:', train_data),
                        ('Shape of train labels tensor:', train_labels)):
    print(caption, tensor.shape)

# Test set: pad every sequence to Max_Sequence_Length, ids as an array
test_data = pad_sequences(test_sequences, maxlen=Max_Sequence_Length)
test_ids = np.array(test_ids)
for caption, tensor in (('Shape of test data tensor:', test_data),
                        ('Shape of test ids tensor:', test_ids)):
    print(caption, tensor.shape)

In [None]:
# Word embedding: build the word -> vector lookup for the selected
# `embedding_model`. GloVe / Bert / RoBerta / DeBerta fill `embeddings_index`;
# Word2Vec keeps the loaded gensim vectors in `word2vec`.
if embedding_model == "GloVe":
    Embedding_Dim = 300 # Dimension of GloVe-embedding
    Embedding_File = '/Users/gubow/COMP 691H/Find Duplicates Project/glove.840B.300d.txt'
    # Create word embedding dictionary from 'glove.840B.300d.txt'
    print('Creating GloVe word embedding dictionary...')

    embeddings_index = {}
    # `with` closes the file even if a line fails to parse.
    with open(Embedding_File, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # The last Embedding_Dim fields are the coefficients; whatever
            # precedes them (possibly several fields) is joined into the token.
            word = ''.join(values[:-Embedding_Dim])
            coefs = np.asarray(values[-Embedding_Dim:], dtype='float32')
            embeddings_index[word] = coefs

    print('Found {} word vectors of glove.'.format(len(embeddings_index)))

elif embedding_model == "Word2Vec":
    Embedding_Dim = 300 # Dimension of Word2Vec-embedding
    Embedding_File = '/Users/gubow/COMP 691H/Find Duplicates Project/GoogleNews-vectors-negative300.bin'
    print('Creating Word2Vec word embedding dictionary...')

    # Embedding_File is already a full path, so it is passed straight to the
    # loader (the previous `datapath(...)` wrapper was never imported).
    word2vec = KeyedVectors.load_word2vec_format(Embedding_File, binary=True)
    print('Found %s word vectors of word2vec' % len(word2vec))

elif embedding_model == "Bert":
    Embedding_Dim = 1024 # Dimension of Bert-embedding
    print('Creating Bert word embedding dictionary...')
    bert_embeddings = BertEmbeddings(model_name = 'bert-large-uncased-whole-word-masking')
    # An earlier variant stored output[0]['embeddings_map'] values per word;
    # it was abandoned because it produced a tensor too large for a single
    # computer. The hidden-state sum below is used instead.
    embeddings_index = {}
    for word in word_index:
        output = bert_embeddings([word])
        # Convert once, then add up the last four entries of hidden_states[0].
        hidden_states = np.array(output[0]['hidden_states'])[0]
        result = hidden_states[-1] + hidden_states[-2] + hidden_states[-3] + hidden_states[-4]
        embeddings_index[word] = result
    print('Found {} word vectors of bert.'.format(len(embeddings_index)))

elif embedding_model == "RoBerta":
    Embedding_Dim = 1024 # Dimension of RoBerta-embedding
    print('Creating RoBerta word embedding dictionary...')
    roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    roberta_model = TFRobertaModel.from_pretrained('roberta-large')
    embeddings_index = {}
    for word in word_index:
        inputs = roberta_tokenizer(word, return_tensors="tf")
        outputs = roberta_model(inputs)
        # Drop the first and last positions of the sequence (presumably the
        # special start / end tokens — TODO confirm); one row per remaining token.
        result = np.array(outputs.last_hidden_state[0][1:-1])
        embeddings_index[word] = result
    print('Found {} word vectors of RoBerta.'.format(len(embeddings_index)))

elif embedding_model == "DeBerta":
    Embedding_Dim = 1536 # Dimension of DeBerta-embedding
    print('Creating DeBerta word embedding dictionary...')
    deberta_tokenizer = DebertaV2Tokenizer.from_pretrained('kamalkraj/deberta-v2-xlarge')
    deberta_model = TFDebertaV2Model.from_pretrained('kamalkraj/deberta-v2-xlarge')
    embeddings_index = {}
    for word in word_index:
        inputs = deberta_tokenizer(word, return_tensors="tf")
        outputs = deberta_model(inputs)
        # Same slicing as the RoBerta branch: first and last positions removed.
        result = np.array(outputs.last_hidden_state[0][1:-1])
        embeddings_index[word] = result
    print('Found {} word vectors of DeBerta.'.format(len(embeddings_index)))

else:
    # Fail here with a clear message instead of a NameError on Embedding_Dim later.
    raise ValueError(
        'Unknown embedding_model {!r}; expected one of '
        'Bert, DeBerta, GloVe, RoBerta, Word2Vec'.format(embedding_model)
    )

In [None]:
# Extra feature columns that are fed to the model alongside the text.
# Select the same three columns from the train and the test frame.
leaks, test_leaks = (
    frame[['Time Feature 1', 'Time Feature 2', 'Reason Feature 1']]
    for frame in (df_train, df_test)
)

# Fit one scaler on the train and test rows stacked together, then apply it
# to each set separately.
ss = StandardScaler()
ss.fit(np.vstack((leaks, test_leaks)))
leaks, test_leaks = ss.transform(leaks), ss.transform(test_leaks)

In [None]:
# Create embedding matrix for embedding layer: row i holds the pretrained
# vector of the word whose tokenizer id is i; rows without a vector stay zero.
print('Preparing embedding matrix')

# One extra row because no word in word_index is expected to use id 0
# (Keras Tokenizer ids start at 1 — see the Tokenizer documentation).
num_words = min(Max_Num_Words, len(word_index))+1

embedding_matrix = np.zeros((num_words, Embedding_Dim))


def _lookup_embedding(word):
    """Return the pretrained vector for `word`, or None if there is none.

    Dispatches on the notebook-level `embedding_model` selector.
    """
    if embedding_model in ("GloVe", "Bert"):
        return embeddings_index.get(word)
    if embedding_model == "Word2Vec":
        try:
            return word2vec.get_vector(word)
        except KeyError:
            # Word is not part of the Word2Vec vocabulary.
            return None
    if embedding_model in ("RoBerta", "DeBerta"):
        # These entries hold one row per sub-token; use the first row.
        token_vectors = embeddings_index.get(word)
        if token_vectors is None or len(token_vectors) == 0:
            return None
        return token_vectors[0]
    return None


for word, i in word_index.items():
    # word_index lists every token seen, which can exceed the capped matrix;
    # skip ids that have no row instead of raising IndexError.
    if i >= num_words:
        continue
    embedding_vector = _lookup_embedding(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Number of all-zero rows (the format string previously lacked its placeholder,
# so the count was never printed).
print('Null word embeddings: {}'.format(np.sum(np.sum(embedding_matrix, axis=1) == 0)))

In [None]:
# Train Validation split
# Shuffle the row indices once and cut them at the (1 - ratio) mark.
perm = np.random.permutation(len(train_data))
split_at = int(len(train_data)*(1-Validation_Split_Ratio))
idx_train, idx_val = perm[:split_at], perm[split_at:]

# Training portion: token ids, extra features, labels
data_train, leaks_train, labels_train = (
    train_data[idx_train], leaks[idx_train], train_labels[idx_train]
)

# Validation portion: token ids, extra features, labels
data_val, leaks_val, labels_val = (
    train_data[idx_val], leaks[idx_val], train_labels[idx_val]
)

# Uniform sample weights for the validation rows
weight_val = np.ones(len(labels_val))

In [None]:
if neural_network == "LSTM":
    # Build, compile and train the LSTM model: an embedded token sequence goes
    # through one LSTM, is concatenated with a dense projection of the extra
    # features, and ends in a single sigmoid unit.

    # The embedding layer containing the word vectors (frozen)
    emb_layer = Embedding(
        input_dim=num_words,
        output_dim=Embedding_Dim,
        weights=[embedding_matrix],
        input_length=Max_Sequence_Length,
        trainable=False
    )

    # LSTM layer; the same rate is used for input and recurrent dropout
    lstm_layer = LSTM(Num_Lstm, dropout=Rate_Drop_Lstm, recurrent_dropout=Rate_Drop_Lstm)

    # Define inputs
    seq = Input(shape=(Max_Sequence_Length,), dtype='int32')

    # Run inputs through embedding
    emb = emb_layer(seq)

    # Run through LSTM layers
    lstm = lstm_layer(emb)

    # Extra-feature branch. The Dense layer must consume the batch-normalised
    # tensor; it was previously applied to `magic_input`, which silently
    # dropped the normalisation (the CNN model already chains them this way).
    magic_input = Input(shape=(leaks.shape[1],))
    magic_dense = BatchNormalization()(magic_input)
    magic_dense = Dense(int(Num_Dense/2), activation=act_f)(magic_dense)

    merged = concatenate([lstm, magic_dense])
    merged = BatchNormalization()(merged)
    merged = Dropout(Rate_Drop_Dense)(merged)

    merged = Dense(Num_Dense, activation=act_f)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(Rate_Drop_Dense)(merged)

    preds = Dense(1, activation='sigmoid')(merged)
    class_weight = None

    # Train the model
    model = Model(inputs=[seq, magic_input], outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

    # Set early stopping (large patience should be useful)
    early_stopping = EarlyStopping(monitor='val_acc', patience=6)
    bst_model_path = embedding_model + " + " + neural_network + '.h5'
    # NOTE(review): the checkpoint uses its default monitor while early
    # stopping watches 'val_acc' — confirm the two are meant to differ.
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)
    hist = model.fit(
        [data_train, leaks_train], labels_train,
        validation_data=([data_val, leaks_val], labels_val, weight_val),
        epochs=200, batch_size=2048, shuffle=True,
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint]
    )
        
if neural_network == "CNN":
    # Build, compile and train the CNN model: six parallel 1D convolutions of
    # different widths over the embedded sequence, each globally average
    # pooled, concatenated with a dense projection of the extra features and
    # followed by a small MLP ending in a single sigmoid unit.

    # The embedding layer containing the word vectors (frozen)
    emb_layer = Embedding(
        input_dim=num_words,
        output_dim=Embedding_Dim,
        weights=[embedding_matrix],
        input_length=Max_Sequence_Length,
        trainable=False
    )

    # (filters, kernel_size) of each 1D convolution that iterates over the word vectors
    conv_specs = [(128, 1), (128, 2), (128, 3), (128, 4), (32, 5), (32, 6)]
    # Width of the concatenated pooled features (4 * 128 + 2 * 32)
    merged_dim = sum(n_filters for n_filters, _ in conv_specs)

    # Define inputs. The length follows Max_Sequence_Length (it used to be a
    # literal 60, which only matched the embedding layer by coincidence), and
    # the dtype matches the LSTM model's input.
    seq = Input(shape=(Max_Sequence_Length,), dtype='int32')

    # Run inputs through embedding
    emb = emb_layer(seq)

    # Run through CONV + GAP layers
    pooled = []
    for n_filters, width in conv_specs:
        conv = Conv1D(filters=n_filters, kernel_size=width, padding='same', activation='relu')(emb)
        pooled.append(GlobalAveragePooling1D()(conv))

    merge = concatenate(pooled)

    # Both Lambda layers receive the single tensor `merge`: `diff` is its
    # absolute value and `mul` passes it through unchanged.
    # NOTE(review): these look like leftovers from a two-input model — confirm
    # they are still wanted.
    diff = Lambda(lambda x: K.abs(x[0]), output_shape=(merged_dim,))([merge])
    mul = Lambda(lambda x: x[0], output_shape=(merged_dim,))([merge])

    # Extra-feature branch: batch-normalise, then project
    magic_input = Input(shape=(leaks.shape[1],))
    magic_dense = BatchNormalization()(magic_input)
    magic_dense = Dense(64, activation='relu')(magic_dense)

    merge = concatenate([diff, mul, magic_dense])
    class_weight = None

    # The MLP that determines the outcome
    x = Dropout(0.2)(merge)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu')(x)

    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[seq, magic_input], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    # Set early stopping (large patience should be useful)
    early_stopping = EarlyStopping(monitor='val_acc', patience=6)
    bst_model_path = embedding_model + " + " + neural_network + '.h5'
    # NOTE(review): the checkpoint uses its default monitor while early
    # stopping watches 'val_acc' — confirm the two are meant to differ.
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

    hist = model.fit(
        [data_train, leaks_train], labels_train,
        validation_data=([data_val, leaks_val], labels_val, weight_val),
        epochs=200, batch_size=2048, shuffle=True,
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint]
    )

In [None]:
# Now the model training has finished. Save the model for future use
# NOTE(review): this writes to the same path the ModelCheckpoint callback
# saved its weights to, so that file is replaced by the model's current
# state — confirm this is intended.
model.save(bst_model_path) # store model parameters in .h5 file
# Accuracy is higher-is-better, so the best validation score is the maximum
# over the recorded epochs (this previously took the minimum, i.e. the worst).
bst_val_score = max(hist.history['val_acc'])