<a href="https://colab.research.google.com/github/mkrsteska/BSA2020_Team_Tissot_Project_2/blob/master/code/BERT_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

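# BERT

Fine-tunes a pre-trained BERT model from TensorFlow Hub as a binary classifier on the tweet training data, then writes the test-set predictions to a submission CSV.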

In [1]:
!pip install bert-for-tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/35/5c/6439134ecd17b33fe0396fb0b7d6ce3c5a120c42a4516ba0e9a2d6e43b25/bert-for-tf2-0.14.4.tar.gz (40kB)
[K     |████████                        | 10kB 24.3MB/s eta 0:00:01[K     |████████████████▏               | 20kB 6.1MB/s eta 0:00:01[K     |████████████████████████▎       | 30kB 7.2MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 3.9MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [0]:
import pandas as pd
import numpy as np

import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

#from keras.preprocessing.sequence import pad_sequences

#### BERT Tokenizer

In [0]:
# Fine-tunable BERT encoder (uncased, 24 layers, hidden size 1024) from TF Hub.
BERT_HUB_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(BERT_HUB_URL, trainable=True)

# Read the vocabulary file path and the lower-casing flag from the loaded hub
# module itself, so the tokenizer is configured to match the encoder above.
bert_assets = bert_layer.resolved_object
vocabulary_file = bert_assets.vocab_file.asset_path.numpy()
to_lower_case = bert_assets.do_lower_case.numpy()

BertTokenizer = bert.bert_tokenization.FullTokenizer
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

#### Load the data

In [0]:
# Training split, read straight from the project repository.
TRAIN_CSV_URL = "https://raw.githubusercontent.com/mkrsteska/BSA2020_Team_Tissot_Project_2/master/data/train.csv"
df_train = pd.read_csv(TRAIN_CSV_URL)

In [0]:
# Raw tweet texts (model input) and their labels (model target).
X_train, y_train = df_train["text"].values, df_train["target"].values

#### BERT encode

In [0]:
# Fixed length, in tokens, of every encoded tweet: the encoding cells keep at
# most MAX_LEN - 2 word pieces, add [CLS] and [SEP], then zero-pad up to MAX_LEN.
# The Keras Input layers are declared with this same length.
MAX_LEN = 100

In [0]:
# Tokenize each tweet, keep at most MAX_LEN - 2 word pieces (two slots are
# reserved for the special markers) and wrap the result in [CLS] ... [SEP].
tokenized_tweets = [
    ["[CLS]"] + tokenizer.tokenize(tweet)[:MAX_LEN - 2] + ["[SEP]"]
    for tweet in X_train
]

# Map every token to its index in the BERT vocabulary.
tokenized_tweets_ids = [
    tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_tweets
]

# Right-pad the id sequences with 0 up to MAX_LEN.
all_input_word_ids = [
    token_ids + [0] * (MAX_LEN - len(token_ids))
    for token_ids in tokenized_tweets_ids
]

# Attention mask: 1 over real tokens, 0 over the padding.
all_masks = [
    [1] * len(tokens) + [0] * (MAX_LEN - len(tokens))
    for tokens in tokenized_tweets
]

# Single-sentence inputs: every position belongs to segment 0.
all_segment_ids = [[0] * MAX_LEN for _ in tokenized_tweets]

In [0]:
# Turn the three nested lists into NumPy arrays (one row per tweet) for Keras.
all_input_word_ids, all_masks, all_segment_ids = (
    np.array(rows) for rows in (all_input_word_ids, all_masks, all_segment_ids)
)

#### Model

In [0]:
# Three int32 inputs of length MAX_LEN, created in the order the BERT layer
# is later called with: token ids, attention mask, segment ids.
input_word_ids, input_mask, segment_ids = (
    Input(shape=(MAX_LEN,), dtype=tf.int32, name=input_name)
    for input_name in ("input_word_ids", "input_mask", "segment_ids")
)

In [0]:
bert_inputs = [input_word_ids, input_mask, segment_ids]

# The hub layer yields a pooled vector and the per-token hidden states.
pooled_output, sequence_output = bert_layer(bert_inputs)

# Classify from the hidden state at position 0, i.e. the [CLS] token that the
# encoding cells put at the start of every tweet; pooled_output is not used.
classifier_head = Dense(1, activation='sigmoid')
out = classifier_head(sequence_output[:, 0, :])

model = Model(inputs=bert_inputs, outputs=out)

In [0]:
# `learning_rate` is the documented keyword for the optimizer step size; the
# older `lr` alias is deprecated and is rejected by newer Keras releases.
# The model ends in a single sigmoid unit, hence binary cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [12]:
# Print the layer table and parameter counts to sanity-check the wiring.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 100)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [13]:
# Fine-tune the whole network (the hub layer was created with trainable=True).
# NOTE(review): Keras documents `validation_split` as holding out the *last*
# fraction of the arrays, before shuffling — confirm train.csv is not ordered
# by target.
model.fit(
    x=(all_input_word_ids, all_masks, all_segment_ids),
    y=y_train,
    batch_size=4,
    epochs=2,
    validation_split=0.2,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f37c1afe320>

### Create a submission file

In [0]:
# Test split, read straight from the project repository.
TEST_CSV_URL = "https://raw.githubusercontent.com/mkrsteska/BSA2020_Team_Tissot_Project_2/master/data/test.csv"
df_test = pd.read_csv(TEST_CSV_URL)

In [0]:
# Tweet texts to score, and the row ids that the submission file must carry.
X_test, ids = df_test["text"].values, df_test["id"].to_numpy()

In [0]:
# Encode the test tweets exactly like the training tweets: tokenize, keep at
# most MAX_LEN - 2 word pieces, and wrap the result in [CLS] ... [SEP].
tokenized_tweets_test = [
    ["[CLS]"] + tokenizer.tokenize(tweet)[:MAX_LEN - 2] + ["[SEP]"]
    for tweet in X_test
]

# Map every token to its index in the BERT vocabulary.
tokenized_tweets_ids_test = [
    tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_tweets_test
]

# Right-pad the id sequences with 0 up to MAX_LEN.
all_input_word_ids_test = [
    token_ids + [0] * (MAX_LEN - len(token_ids))
    for token_ids in tokenized_tweets_ids_test
]

# Attention mask: 1 over real tokens, 0 over the padding.
all_masks_test = [
    [1] * len(tokens) + [0] * (MAX_LEN - len(tokens))
    for tokens in tokenized_tweets_test
]

# Single-sentence inputs: every position belongs to segment 0.
all_segment_ids_test = [[0] * MAX_LEN for _ in tokenized_tweets_test]

In [0]:
# Turn the three nested lists into NumPy arrays (one row per tweet) for Keras.
all_input_word_ids_test, all_masks_test, all_segment_ids_test = (
    np.array(rows)
    for rows in (all_input_word_ids_test, all_masks_test, all_segment_ids_test)
)

In [0]:
# Sigmoid outputs for the test tweets; inputs are passed in the same order
# (ids, mask, segments) as during training.
predictions = model.predict(
    x=(all_input_word_ids_test, all_masks_test, all_segment_ids_test),
)

In [0]:
# Take the single output column and round each score to an integer class label.
predictions_class = np.round(predictions[:, 0]).astype(int)

In [0]:
# Two-column submission table (id, predicted target), written without the index.
submission = pd.DataFrame({'id': ids, 'target': predictions_class})
submission.to_csv('16. Submission_BERT_2.csv', index=False)