In [58]:
# Install third-party packages into the notebook runtime via shell escapes.
# NOTE(review): none of these installs is version-pinned; the recorded output
# below shows bert-for-tf2 0.14.4 being picked up — consider pinning it.
# NOTE(review): `sentencepiece` and `transformers` are not imported by any cell
# visible in this notebook — confirm they are still required.
!pip install sentencepiece
!pip install transformers
!pip install tensorflow-gpu 
!pip install bert-for-tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/35/5c/6439134ecd17b33fe0396fb0b7d6ce3c5a120c42a4516ba0e9a2d6e43b25/bert-for-tf2-0.14.4.tar.gz (40kB)
[K     |████████████████████████████████| 40kB 3.8MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.4-cp36-none-any.whl size=30114 sha256=c71435558658695fc317cf828b3a33ad749d4ea0bf2b3d4810d6f88f93b7d629
  Stored in directory: /root/.cache/pip/wheels/cf/3f/4d/79d7735015a5f523648df90d871ce8e89a7df8185

In [67]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import util
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Embedding, GlobalMaxPooling1D, Dropout, Input
from sklearn.model_selection import StratifiedKFold
import numpy as np
import bert

Load datasets

In [68]:
# Read the train and test splits from CSV files in the working directory.
train_df, test_df = map(pd.read_csv, ('train.csv', 'test.csv'))

BERT

In [69]:
class Bert:
    """Binary text classifier that fine-tunes a TF-Hub BERT encoder with 5-fold CV.

    ``train`` fits one Keras model per stratified fold and keeps them in
    ``self.models``; ``predict`` averages the sigmoid outputs of all of them.
    """

    def __init__(self, max_seq_length, lr, epochs, batch_size):
        """Load the BERT layer and its tokenizer and store the training settings.

        Args:
            max_seq_length: Fixed number of tokens (including ``[CLS]`` and
                ``[SEP]``) every text is truncated or padded to.
            lr: Learning rate handed to the SGD optimizer.
            epochs: Maximum number of epochs for each fold.
            batch_size: Batch size passed to ``model.fit``.

        Raises:
            ValueError: If ``max_seq_length`` is smaller than 2, which leaves no
                room for the two special tokens.
        """
        if max_seq_length < 2:
            raise ValueError('max_seq_length must be at least 2 to hold [CLS] and [SEP]')

        self.bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=True)
        self.max_seq_length = max_seq_length

        # Build the tokenizer from the assets bundled with the hub module.
        # These must be read from self.bert_layer: a bare `bert_layer` name is not
        # defined here and would only resolve through leftover kernel state.
        vocab_file = self.bert_layer.resolved_object.vocab_file.asset_path.numpy()
        do_lower_case = self.bert_layer.resolved_object.do_lower_case.numpy()
        self.tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.models = []

    def encode(self, texts):
        """Convert raw texts into the three fixed-length BERT input arrays.

        Each text is tokenized, truncated to ``max_seq_length - 2`` tokens,
        wrapped in ``[CLS]``/``[SEP]`` and right-padded with id 0.

        Args:
            texts: Iterable of strings.

        Returns:
            Tuple ``(token_ids, input_mask, segment_ids)`` of NumPy arrays, one
            row per text and ``max_seq_length`` columns. The mask is 1 on real
            tokens and 0 on padding; segment ids are all 0.
        """
        all_tokens = []
        all_masks = []
        all_segments = []

        for text in texts:
            # Leave two slots free for the [CLS] and [SEP] markers.
            pieces = self.tokenizer.tokenize(text)[:self.max_seq_length - 2]
            input_sequence = ['[CLS]'] + pieces + ['[SEP]']
            pad_len = self.max_seq_length - len(input_sequence)

            token_ids = self.tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
            pad_mask = [1] * len(input_sequence) + [0] * pad_len
            segment_ids = [0] * self.max_seq_length

            all_tokens.append(token_ids)
            all_masks.append(pad_mask)
            all_segments.append(segment_ids)

        return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

    def _build_model(self):
        """Build and compile a Keras model: BERT encoder + one sigmoid unit.

        The classifier head reads the first position (``[CLS]``) of the
        sequence output. The model is compiled with SGD (momentum 0.8) and
        binary cross-entropy, and its summary is printed.

        Returns:
            The compiled ``tf.keras`` model.
        """
        # NOTE(review): self.bert_layer is one trainable layer object reused by
        # every model built here, so all folds share (and keep updating) the same
        # encoder weights. Confirm this is intended; creating a fresh
        # hub.KerasLayer per fold would isolate the folds at the cost of memory.
        input_word_ids = Input(shape=(self.max_seq_length,), dtype=tf.int32, name='input_word_ids')
        input_mask = Input(shape=(self.max_seq_length,), dtype=tf.int32, name='input_mask')
        segment_ids = Input(shape=(self.max_seq_length,), dtype=tf.int32, name='segment_ids')

        # The pooled output is not used; only the per-token sequence output is.
        _pooled_output, sequence_output = self.bert_layer([input_word_ids, input_mask, segment_ids])
        clf_output = sequence_output[:, 0, :]
        out = Dense(1, activation='sigmoid')(clf_output)

        model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
        optimizer = SGD(learning_rate=self.lr, momentum=0.8)
        model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        model.summary()

        return model

    def train(self, df):
        """Train one model per stratified fold and store them in ``self.models``.

        Args:
            df: DataFrame with a ``text`` column (strings) and a ``target``
                column (class labels used for stratification and as the
                training target).

        Returns:
            None. The fitted models are available as ``self.models``.
        """
        # Start from an empty ensemble so re-running the training cell does not
        # keep appending models from an earlier run.
        self.models = []

        # Tokenise the whole frame once; each fold only needs row subsets.
        encoded = self.encode(df['text'].str.lower())
        targets = df['target'].values

        skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
        for fold, (trn_idx, val_idx) in enumerate(skf.split(df['text'], df['target'])):

            print('\nFold {}\n'.format(fold))

            model = self._build_model()

            # split() yields positional indices, so index the arrays by position
            # (label-based .loc would pick wrong rows on a non-default index).
            X_trn_encoded = tuple(arr[trn_idx] for arr in encoded)
            y_trn = targets[trn_idx]
            X_val_encoded = tuple(arr[val_idx] for arr in encoded)
            y_val = targets[val_idx]

            # Stop a fold once the validation loss has not improved for one epoch.
            es = tf.keras.callbacks.EarlyStopping(patience=1)
            history = model.fit(
                X_trn_encoded, y_trn,
                validation_data=(X_val_encoded, y_val),
                epochs=self.epochs,
                batch_size=self.batch_size,
                callbacks=[es],
            )
            # util is a project-local module; presumably this plots the training curves.
            util.plot_history(history)
            self.models.append(model)

    def predict(self, X):
        """Average the predictions of all trained fold models.

        Args:
            X: DataFrame with a ``text`` column.

        Returns:
            Float array of shape ``(len(X), 1)`` holding the mean sigmoid output
            across ``self.models``.

        Raises:
            ValueError: If no model has been trained yet.
        """
        if not self.models:
            # Without this guard the method would silently return all zeros.
            raise ValueError('No trained models available; call train() before predict().')

        X_test_encoded = self.encode(X['text'].str.lower())
        y_pred = np.zeros((X_test_encoded[0].shape[0], 1))

        for model in self.models:
            y_pred += model.predict(X_test_encoded) / len(self.models)

        return y_pred
                        

In [None]:
bert_model = Bert(max_seq_length=128, lr=0.0001, epochs=5, batch_size=32)
# train() returns None and keeps the fitted fold models on bert_model.models,
# so its result is not bound to a name (a `model = ...` here would just be None).
bert_model.train(train_df)


Fold 0

Model: "functional_29"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer_3 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0

In [52]:
# Re-read the test split and score it with the trained fold models.
test_df = pd.read_csv(filepath_or_buffer='test.csv')
y_pred = bert_model.predict(X=test_df)

In [53]:
# Inspect the array returned by bert_model.predict (NumPy abbreviates long arrays).
y_pred

array([[0.86688328],
       [0.99650809],
       [0.99524671],
       ...,
       [0.9958142 ],
       [0.99711007],
       [0.99538848]])

In [54]:
final_df = pd.read_csv('sample_submission.csv')
# y_pred is a 2-D column of probabilities; fail early with a clear message if it
# does not line up with the submission template.
assert len(final_df) == len(y_pred), 'prediction count does not match sample_submission.csv'
# Round to hard 0/1 labels and flatten so a plain 1-D vector is assigned,
# instead of relying on pandas to coerce an (n, 1) array into a column.
final_df['target'] = np.round(y_pred).astype('int').ravel()
final_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [55]:
# Count how many submission rows fall into each predicted class.
final_df['target'].value_counts()

0    1979
1    1284
Name: target, dtype: int64

In [56]:
# Write the submission first so the file exists even when the download step is skipped.
final_df.to_csv('bert.csv', index=False)

# google.colab only exists inside Colab; guard the import so the notebook still
# runs end-to-end elsewhere (the CSV is then simply left in the working directory).
try:
    from google.colab import files
except ImportError:
    print('Not running on Colab; bert.csv was written to the working directory.')
else:
    files.download('bert.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
%tensorflow_version 2.x

import tensorflow as tf
print(tf.__version__)
print(tf.test.gpu_device_name())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

2.2.0
/device:GPU:0
Num GPUs Available:  1
