In [0]:
# https://androidkt.com/simple-text-classification-using-bert-in-tensorflow-keras-2-0/

In [2]:
!pip install bert-for-tf2



In [3]:
!pip install sentencepiece



In [0]:
%tensorflow_version 2.x

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow.keras.models import  Model
from tqdm import tqdm
import numpy as np
from collections import namedtuple
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)

TensorFlow Version: 2.2.0-rc3
Hub version:  0.8.0


In [0]:

bert_layer=hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=True)

MAX_SEQ_LEN=128
input_word_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                    name="segment_ids")

pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

In [0]:
x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = tf.keras.layers.Dropout(0.2)(x)
out = tf.keras.layers.Dense(6, activation="sigmoid", name="dense_output")(x)

model = tf.keras.models.Model(
      inputs=[input_word_ids, input_mask, segment_ids], outputs=out)


model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])


In [8]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [0]:
FullTokenizer=bert.bert_tokenization.FullTokenizer

vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy()

do_lower_case=bert_layer.resolved_object.do_lower_case.numpy()

tokenizer=FullTokenizer(vocab_file,do_lower_case)

In [0]:
def get_masks(tokens, max_seq_length):
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

In [0]:
def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens,)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [0]:
import os
os.environ['KAGGLE_USERNAME'] = "jeongunkuk" # username from the json file
os.environ['KAGGLE_KEY'] = "e965f9b6b69ec1b182beef7519b4f4cc" # key from the json file
# kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

In [13]:
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
sample_submission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
test_labels.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [14]:
!unzip train.csv.zip
!unzip test.csv.zip	


Archive:  train.csv.zip
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
Archive:  test.csv.zip
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [15]:
!ls

sample_data		   test.csv	 test_labels.csv.zip  train.csv.zip
sample_submission.csv.zip  test.csv.zip  train.csv


In [0]:
import pandas as pd

df=pd.read_csv('train.csv')

df = df.sample(frac=1)


In [17]:
train_sentences = df["comment_text"].fillna("CVxTz").values
print(train_sentences[0:5])
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = df[list_classes].values
print(train_y[0:5])

["Retirement \n\nWhy is Owen's retirement from football not mentioned? He hasn't played a game since 2005."
 'Another RfC on naming \n\nPlease see the further RfC here.'
 '"\n\n Please Help! \n\nPlease help!  I saw that you said ""It seems to me is that permanent magnet synchronous generator is just another name for magneto (generator). Maybe we should just change that page to redirect to this page."" on 17 October 2013. If you still hold this opinion, could you say so here: Talk:Magneto_(generator)#RFC_on_the_Status_of_This_Article ?  Actually, even if you opinion has changed, I still solicit your comment.    "'
 'SEED-WAAS\nIf the category link at the bottom of the page is a red link, that means that the category in question does not exist. Which, in turn, means that the article is not properly categorized, and thus has to be retagged as  again. Could you please take a few minutes to find an appropriate category that actually exists before attempting to replace the tag with a categor

In [0]:
def create_single_input(sentence,MAX_LEN):
  
  stokens = tokenizer.tokenize(sentence)
  
  stokens = stokens[:MAX_LEN]
  
  stokens = ["[CLS]"] + stokens + ["[SEP]"]
 
  ids = get_ids(stokens, tokenizer, MAX_SEQ_LEN)
  masks = get_masks(stokens, MAX_SEQ_LEN)
  segments = get_segments(stokens, MAX_SEQ_LEN)

  return ids,masks,segments

In [0]:
def create_input_array(sentences):

  input_ids, input_masks, input_segments = [], [], []

  for sentence in tqdm(sentences,position=0, leave=True):
  
    ids,masks,segments=create_single_input(sentence,MAX_SEQ_LEN-2)

    input_ids.append(ids)
    input_masks.append(masks)
    input_segments.append(segments)

  return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]

In [20]:
inputs=create_input_array(train_sentences)

100%|██████████| 159571/159571 [03:04<00:00, 862.98it/s]


In [0]:
model.fit(inputs,train_y,epochs=1,batch_size=64,validation_split=0.2,shuffle=True) #batch_size=32

  81/1995 [>.............................] - ETA: 5:15:14 - loss: 0.1949 - accuracy: 0.6827

In [0]:
test_df=pd.read_csv("test.csv")

test_sentences = test_df["comment_text"].fillna("CVxTz").values

test_inputs=create_input_array(test_sentences[110:150])

print(model.predict(test_inputs))
