In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 8.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 66.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 58.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 65.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [2]:
import numpy as np
import pandas
import tensorflow as tf
import transformers
import csv

In [3]:
train, dev, test = [], [], []

In [4]:
with open('/content/pnli_train.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
        train.append(x)
print (len(train))
print (train[:3])

5983
[['Sometimes do exercise.', 'A person typically desire healthy life.', '1'], ['Who eats junk foods.', 'A person typically desire healthy life.', '0'], ['A person is sick.', 'A person typically desire healthy life.', '1']]


In [5]:
with open('/content/pnli_dev.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
        dev.append(x)
print (len(dev))
print (dev[:3])

1055
[['A person is looking for accuracy.', 'A person typically desires accurate results.', '1'], ['A person does not care for accuracy.', 'A person typically desires accurate results.', '0'], ['The person double checks their data.', 'A person typically desires accurate results.', '1']]


In [6]:
with open('/content/pnli_test_unlabeled.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[0] and x[1] will be the sentence pairs.
        test.append(x)
print (len(test))
print (test[:3])

4850
[['The people want to have a romantic and pleasant feel.', 'People typically does desire to smell violets.'], ['The contract is to buy products from you.', 'Getting contract typically cause to make money or spend money.'], ['Train station is closed.', 'Line can typically be used to move train along tracks.']]


In [7]:
results = []

In [8]:
max_length = 128
batch_size = 16
epochs = 3

labels = ["0", "1"]

In [9]:
train_df = pandas.DataFrame(train)
valid_df = pandas.DataFrame(dev)
test_df = pandas.DataFrame(test)

In [10]:
print(f"Total train samples : {train_df.shape[0]}")
print(f"Total validation samples: {valid_df.shape[0]}")
print(f"Total test samples: {test_df.shape[0]}")

Total train samples : 5983
Total validation samples: 1055
Total test samples: 4850


In [11]:
train_df.columns = ['sentence1', 'sentence2', 'label']
valid_df.columns = ['sentence1', 'sentence2', 'label']
test_df.columns = ['sentence1', 'sentence2']

In [12]:
print(train_df.label.value_counts())

1    3145
0    2838
Name: label, dtype: int64


In [13]:
print(valid_df.label.value_counts())

1    554
0    501
Name: label, dtype: int64


In [14]:
train_df["label"] = train_df["label"].apply(lambda x: 0 if x == "0" else x == "1")
y_train = tf.keras.utils.to_categorical(train_df.label, num_classes=2)

valid_df["label"] = valid_df["label"].apply(lambda x: 0 if x == "0" else x == "1")
y_val = tf.keras.utils.to_categorical(valid_df.label, num_classes=2)

In [15]:
from transformers import AutoTokenizer, TFAutoModel

In [16]:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(
        self,
        is_test,
        shuffle,
        batch_size,
        input,
        labels
    ):
        self.tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
        self.is_test = is_test
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.input = input
        self.indexes = np.arange(len(self.input))
        self.labels = labels
        self.on_epoch_end()

    def on_epoch_end(self):
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

    # Magic method!! :p
    def __getitem__(self, idx):
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        input = self.input[indexes]
        encoded = self.tokenizer.batch_encode_plus(
            input.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        embeddings = np.array(encoded["input_ids"], dtype="int32")
        attentions = np.array(encoded["attention_mask"], dtype="int32")
        segments = np.array(encoded["token_type_ids"], dtype="int32")

        if self.is_test:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [embeddings, attentions, segments], labels
        else:
            return [embeddings, attentions, segments]
    
    def __len__(self):
        return len(self.input) // self.batch_size

In [17]:
train_data = DataGenerator(is_test=True, shuffle=True, batch_size=batch_size, input=train_df[["sentence1", "sentence2"]].values.astype("str"), labels=y_train)
valid_data = DataGenerator(is_test=True, shuffle=True, batch_size=batch_size, input=valid_df[["sentence1", "sentence2"]].values.astype("str"), labels=y_val)

Downloading:   0%|          | 0.00/688 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [21]:
embeddings = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attentions = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_masks")
segments = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="token_type_ids")

bert_model = TFAutoModel.from_pretrained("roberta-large-mnli")

bert_output = bert_model(embeddings, attention_mask=attentions, token_type_ids=segments)
sequence_output = bert_output.last_hidden_state
pooled_output = bert_output.pooler_output

# bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(sequence_output)
avg_pool = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
max_pool = tf.keras.layers.GlobalMaxPooling1D()(sequence_output)
concat = tf.keras.layers.concatenate([avg_pool, max_pool])
dropout = tf.keras.layers.Dropout(0.3)(concat)

output = tf.keras.layers.Dense(2, activation="sigmoid")(dropout)
model = tf.keras.models.Model(inputs=[embeddings, attentions, segments], outputs=output)

Some layers from the model checkpoint at roberta-large-mnli were not used when initializing TFRobertaModel: ['classifier']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-large-mnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [22]:
bert_model.trainable = True

# training params
optimizer = tf.keras.optimizers.Adam(1e-5)
loss_function = "binary_crossentropy"
metrics = ["accuracy"]

model.compile(optimizer = optimizer, loss = loss_function, metrics = metrics)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_masks (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_roberta_model_1 (TFRobertaM  TFBaseModelOutputWi  355359744  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'attention_masks[0][0]',  

In [23]:
history = model.fit(train_data, validation_data=valid_data, epochs=3, use_multiprocessing=True, workers=-1)

Epoch 1/3






Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 2/3
Epoch 3/3


In [32]:
labels = ["0", "1"]

In [33]:
def nli_detection(sentence1, sentence2):
    input = np.array([[str(sentence1), str(sentence2)]])
    test_data = DataGenerator(is_test=False, shuffle=False, batch_size=1, input=input, labels=None)

    probability = model.predict(test_data[0])[0]
    index = np.argmax(probability)
    prediction = labels[index]
    return prediction

In [None]:
for index, row in test_df.iterrows():
  result = nli_detection(row['sentence1'], row['sentence2'])[0]
  print(index)
  results.append(result)

In [None]:
count_one = 0
count_zero = 0
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
      if(x == "0"):
        count_zero += 1
      else:
        count_one += 1
      fp.write(str(x) + '\n')

In [None]:
count_one

1838