In [1]:
# Show the GPU attached to this runtime (model, driver/CUDA versions, memory in use).
!nvidia-smi

Sun Nov 29 14:24:48 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    33W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Mount Google Drive: the dataset is read from, and the trained weights are
# written to, folders under /content/drive/My Drive.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [3]:
# Install Hugging Face transformers quietly. No version is pinned, so this
# picks up whatever release is current when the cell runs.
!pip -q install transformers

[K     |████████████████████████████████| 1.3MB 5.7MB/s 
[K     |████████████████████████████████| 2.9MB 25.6MB/s 
[K     |████████████████████████████████| 1.1MB 49.3MB/s 
[K     |████████████████████████████████| 890kB 40.9MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [4]:
import os

import tensorflow as tf
import transformers

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import random as rnd

In [5]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below.
import warnings
warnings.filterwarnings('ignore')

<a name='1'></a>
# Importing the Data

In [6]:
# Load the Quora question pairs and discard every row that has a missing value.
data = pd.read_csv(
    "/content/drive/My Drive/Colab Notebooks/quora_questions.csv"
).dropna(axis=0)

N = len(data)
print('Number of question pairs: ', N)
data.head()

Number of question pairs:  404348


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
def cut_to_max(text, max_len):
    """Keep only the first `max_len` whitespace-separated words of `text`.

    The surviving words are re-joined with single spaces, so leading/trailing
    whitespace and runs of whitespace between words are normalised away.
    """
    return ' '.join(text.split()[:max_len])

# test the func
# cut_to_max(' abc test wewe dfd ddd', 3)

In [None]:
# Cap both question columns at 65 words each.
for column in ('question1', 'question2'):
    data[column] = data[column].apply(lambda question: cut_to_max(question, 65))

In [None]:
# for s in data['question1']:
#     if len(s.split()) > 63:
#         print(s)


In [None]:
# Class balance: how many pairs carry each `is_duplicate` value.
print("dataset labels Distribution")
print(data['is_duplicate'].value_counts())

dataset labels Distribution
0    255042
1    149306
Name: is_duplicate, dtype: int64


We first split the data into a train and test set. The test set will be used later to evaluate our model.

In [8]:
# model selection
# Hold out 25% of the question pairs for evaluation; the fixed seed keeps the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

data_train, data_test = train_test_split(
    data,
    train_size=0.75,
    random_state=0,
)
(len(data_train), len(data_test))

(303261, 101087)

In [None]:
# print("Train dataset Distribution")
# print(data_train.is_duplicate.value_counts())

# print("\n\nTest dataset Distribution")
# print(data_test.is_duplicate.value_counts())

In [9]:
# Pull the question texts and the raw 0/1 labels out of each split as arrays.
train_Q1, train_Q2, t_labels = (
    np.array(data_train[column])
    for column in ('question1', 'question2', 'is_duplicate')
)
test_Q1, test_Q2, te_labels = (
    np.array(data_test[column])
    for column in ('question1', 'question2', 'is_duplicate')
)

# One-hot encode the labels (two classes) to match the model's softmax output.
train_labels = tf.keras.utils.to_categorical(t_labels, num_classes=2)
test_labels = tf.keras.utils.to_categorical(te_labels, num_classes=2)

In [None]:
# Sanity check: the training arrays should agree on their first dimension,
# and the one-hot labels should have two columns.
train_Q1.shape, train_Q2.shape, train_labels.shape, test_Q1.shape

((303261,), (303261,), (303261, 2), (101087,))

In [21]:
# data generator
class data_generator(tf.keras.utils.Sequence):
    """Serves batches of BERT-encoded question pairs to Keras.

    Args:
        sentence_pairs: Array of question pairs, one `(question1, question2)`
            row per sample. It is indexed with an integer array, so it must
            be a NumPy array rather than a list.
        labels: Array of labels, indexed in step with `sentence_pairs`.
        batch_size: Integer batch size.
        shuffle: boolean, whether to reshuffle the sample order when the
            generator is created and again after every epoch.
        include_targets: boolean, whether to include the labels.

    Returns:
        Tuples `([input_ids, attention_mask, token_type_ids], labels)`
        (or just `[input_ids, attention_mask, token_type_ids]`
        if `include_targets=False`).

    Notes:
        * The padded/truncated sequence length is read from the module-level
          name `max_length` each time a batch is requested, so that name must
          be defined before the generator is first used.
        * Only full batches are served: when the number of samples is not a
          multiple of `batch_size`, the trailing samples are skipped.
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load the BERT tokenizer used to encode the text. The cased
        # checkpoint is used, so the input must not be lower-cased.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-cased", do_lower_case=False
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        # Apply the initial shuffle (a no-op when shuffle=False).
        self.on_epoch_end()

    def __len__(self):
        # Number of full batches per epoch; a trailing partial batch is dropped.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Sample positions that make up batch number `idx`.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # batch_encode_plus encodes both questions of a pair together,
        # separated by the [SEP] token. Padding and truncation are requested
        # explicitly; this yields the same fixed-length encodings as the
        # deprecated `pad_to_max_length=True` with implicit truncation.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy arrays.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Labels are attached when the generator feeds training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.shuffle(self.indexes)


In [34]:
# make a model for detecting duplicate question
# max_length = 256
# learning_rate = 0.001
def duplicate_question_detection_model():
    """Build and compile the BERT-based duplicate-question classifier.

    The input sequence length and the Adam learning rate are read from the
    module-level names `max_length` and `learning_rate`; both must be defined
    before this function is called.

    Returns:
        A compiled Keras model that takes
        `[input_ids, attention_masks, token_type_ids]` (each of shape
        `(batch, max_length)`, int32) and outputs a 2-way softmax.
    """
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )

    # Attention masks indicate to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )

    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )

    # Load the pretrained BERT encoder and leave it trainable, so its weights
    # are fine-tuned together with the classification head.
    bert_model = transformers.TFBertModel.from_pretrained('bert-base-cased')
    bert_model.trainable = True

    bert_outputs = bert_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    # Index rather than tuple-unpack: the first element is the per-token
    # sequence output whether the model returns a plain tuple or a
    # dict-like output object (unpacking the latter would yield its keys).
    sequence_output = bert_outputs[0]

    # Classification head: flatten all token vectors, regularise, classify.
    flatten = tf.keras.layers.Flatten()(sequence_output)
    dropout = tf.keras.layers.Dropout(0.5)(flatten)
    output = tf.keras.layers.Dense(2, activation="softmax")(dropout)

    # model object
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    # compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss="categorical_crossentropy",
        metrics=["acc"],
    )

    return model


In [35]:
# Create the model under a distribution strategy scope.
# Hyperparameters read as globals by the data generator (`max_length`) and by
# the model builder (`max_length`, `learning_rate`).
max_length = 128
learning_rate = 5e-5

# Build the model inside a distribution-strategy scope.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = duplicate_question_detection_model()

model.summary()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "functional_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 128)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_4 (TFBertModel)   ((None, 128, 768), ( 108310272   input_ids[0][0]                  
                                                                 attention_masks[0][0] 

In [36]:
# data preparation
batch_size = 32

# Question pairs as (n_samples, 2) string arrays, one per split.
train_pairs = data_train[["question1", "question2"]].values.astype("str")
valid_pairs = data_test[["question1", "question2"]].values.astype("str")

# Training batches are reshuffled every epoch.
train_data = data_generator(
    train_pairs, train_labels, batch_size=batch_size, shuffle=True
)
# The held-out test split doubles as validation data; its order is kept fixed.
valid_data = data_generator(
    valid_pairs, test_labels, batch_size=batch_size, shuffle=False
)

In [41]:
# change learning rate

# Overwrite the learning rate on the already-compiled optimizer (the model was
# compiled with `learning_rate`), then print the value back to confirm it.
tf.keras.backend.set_value(model.optimizer.lr, 1e-7)
print(tf.keras.backend.get_value(model.optimizer.lr))

1e-07


In [42]:
# Train for `epochs` passes over the training generator, scoring the
# validation generator after each pass; metrics are kept in `history`.
# NOTE(review): `workers=-1` is not a documented value for `fit` (the Keras
# docs describe a positive count) — it may not enable parallel loading at all;
# confirm against the installed TensorFlow version.
epochs = 1
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
)



In [29]:
# Persist the current model weights to Google Drive (an existing file is replaced).
model.save_weights(
    '/content/drive/My Drive/Colab Notebooks/saved weights/bert_base_cased_parameter_trainable_2_epoch.h5'
)
# To restore previously saved weights instead:
# model.load_weights('/content/drive/My Drive/Colab Notebooks/saved weights/bert_parameter_trainable_2_epoch.h5')

In [None]:
# Score the model on the validation generator (built from the held-out split).
model.evaluate(valid_data, verbose=1)

In [None]:
# draw curves 
import matplotlib.pyplot as plt
%pylab inline
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='val')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()

plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='val')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# !cp '/content/saved_model_85.h5' '/content/drive/My Drive/Thesis/Duplicate Question/'

# from keras.models import load_model

# trained_model = load_model('/content/drive/My Drive/Thesis/Duplicate Question/saved_model_85.h5')

In [None]:
# Call `fit` again on the same model object with the same settings; `history`
# is overwritten with the metrics of this run. `epochs` must already be
# defined by an earlier cell.
# NOTE(review): `workers=-1` is not a documented value for `fit` — confirm it
# behaves as intended on the installed TensorFlow version.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
)