In [1]:
%%capture
!python -m spacy download en_core_web_sm
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import spacy

# Load the spaCy pipeline named 'en_core_web_sm' (downloaded by the shell
# command at the top of this cell).
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# section — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Fetch the "imdb_reviews" dataset through TFDS as two splits (train, test).
# as_supervised=True makes every example a 2-tuple instead of a dict, and
# with_info=True additionally returns the dataset metadata as `ds_info`.
# NOTE(review): shuffle_files=True is passed without a fixed read seed, so the
# example order may differ between runs — confirm whether that matters for the
# seeded train/validation split made later.
(ds_train,ds_test),ds_info = tfds.load(
    name="imdb_reviews",
    split=["train","test"],
    shuffle_files=True,
    as_supervised=True,
    with_info=True
)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete61WHE8/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete61WHE8/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete61WHE8/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [3]:
# Materialise both splits as pandas DataFrames; later cells read their
# 'text' and 'label' columns.
df_train = tfds.as_dataframe(ds_train, ds_info)
df_test = tfds.as_dataframe(ds_test, ds_info)

In [4]:
def clean_entry(text_list):
  """Convert raw review entries to text and run them through the cleaning chain.

  Args:
    text_list: iterable of raw entries. ``bytes``/``bytearray`` values are
      decoded as UTF-8 (undecodable bytes become U+FFFD); any other value is
      converted with ``str()`` and kept in full.

  Returns:
    The result of ``html_term_remover`` applied to the list of text strings.
  """
  str_list = []
  for text in text_list:
    if isinstance(text, (bytes, bytearray)):
      # Decode instead of slicing ``str(text)``: the repr of a bytes object
      # (b'...') keeps the opening quote and renders newlines / non-ASCII
      # bytes as escape text such as \n or \xc3, which would survive as stray
      # letters once the backslash is stripped further down the chain.
      str_text = text.decode("utf-8", errors="replace")
    else:
      # Already text (or another type): keep the full string rather than
      # cutting off its first and last character.
      str_text = str(text)
    str_list.append(str_text)
  return html_term_remover(str_list)

def prepare_for_ai(df_col):
  """Clean one column of reviews.

  Converts ``df_col`` to a plain list with ``.tolist()`` and returns whatever
  ``clean_entry`` produces for that list.
  """
  return clean_entry(df_col.tolist())

def html_term_remover(df_list: list):
  """Strip HTML markup from every string, then continue the cleaning chain.

  Args:
    df_list: list of strings that may contain HTML tags.

  Returns:
    The result of ``apply_re`` applied to the tag-free strings.
  """
  return_list = []
  for i in df_list:
    b_soup = BeautifulSoup(i, 'html.parser')
    # Join the text fragments with a space: without a separator, words on
    # either side of a removed tag are glued together ("a<br />b" -> "ab").
    # Surplus spaces are collapsed later by the split/join in remove_integer.
    return_list.append(b_soup.get_text(separator=" "))
  return apply_re(return_list)

def apply_re(str_list):
  """Filter each string down to a small character whitelist.

  Every character other than ASCII letters, digits, the space, and the
  punctuation marks . , ? ! is deleted. The filtered list is then passed on
  to ``remove_integer`` and that function's result is returned.
  """
  filtered = [re.sub("[^0-9A-Za-z .,?!]", "", entry) for entry in str_list]
  return remove_integer(filtered)

def remove_integer(str_list):
  """Drop whitespace-separated tokens that are exactly one digit.

  Each string is split on whitespace, tokens equal to "0" .. "9" are
  discarded (multi-digit tokens such as "10" are kept), and the remaining
  tokens are re-joined with single spaces. The resulting list is handed to
  ``return_lower_text`` and that function's result is returned.
  """
  single_digits = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
  cleaned = []
  for sentence in str_list:
    kept_tokens = [token for token in sentence.split() if token not in single_digits]
    cleaned.append(' '.join(kept_tokens))
  return return_lower_text(cleaned)

def return_lower_text(str_list):
  """Return a new list holding the lower-cased version of every string."""
  return [entry.lower() for entry in str_list]



In [5]:
# Features: cleaned review texts produced by the cleaning chain (plain lists).
# Labels: the 'label' columns, kept as pandas Series at this point.
x_train = prepare_for_ai(df_train['text'])
x_test = prepare_for_ai(df_test['text'])
y_train = df_train['label']
y_test = df_test['label']

In [6]:
# Quick check of the container type returned by prepare_for_ai.
type(x_train)

list

In [7]:
# Hold out 20% of the training data as a validation set (random_state=45).
# NOTE(review): this rebinds x_train / y_train, so re-running this cell
# without first re-running the cell that builds them splits the already
# reduced training set again.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=45)

In [8]:
# Separate *_bert copies of the text collections, plus the label Series
# converted to plain Python lists; these names feed the tokenisation and
# training cells.
x_train_bert = x_train.copy()
x_val_bert = x_val.copy()
x_test_bert = x_test.copy()
y_train_bert = y_train.values.tolist()
y_test_bert = y_test.values.tolist()
y_val_bert = y_val.values.tolist()

In [9]:
%%capture
!pip install transformers
import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig
import tensorflow as tf
import keras
#tf.random.set_seed(45)

In [10]:
# Tokenizer for the "bert-base-uncased" checkpoint; the same checkpoint name
# is used when the classification model is loaded.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
def _encode_for_bert(texts, max_length=512):
  """Tokenise review strings into fixed-length BERT inputs.

  Each text is encoded on its own with special tokens added, truncated to
  ``max_length`` tokens and padded up to ``max_length``.

  ``is_split_into_words`` is intentionally NOT passed: that flag declares the
  input to be a pre-tokenised list of words, whereas every item here is one
  raw string.

  Args:
    texts: iterable of review strings.
    max_length: fixed sequence length of every encoded example.

  Returns:
    Tuple ``(input_ids, attention_masks)``: two parallel lists holding one
    list of ints per input text.
  """
  input_ids, attention_masks = [], []
  for text in texts:
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])
  return input_ids, attention_masks


# One call per split replaces three copy-pasted encoding loops.
x_train_input_id, x_train_mask = _encode_for_bert(x_train_bert)
x_test_input_id, x_test_mask = _encode_for_bert(x_test_bert)
x_val_input_id, x_val_mask = _encode_for_bert(x_val_bert)


In [12]:
from transformers import TFBertForSequenceClassification, BertConfig, TFBertModel

In [41]:
from keras import backend as K

In [42]:
# Clear the Keras backend session (global state) before the model is created.
K.clear_session()

In [43]:
# Load the pretrained "bert-base-uncased" weights into a sequence
# classification model configured for two labels.
bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)

In [44]:
# Print the per-layer parameter overview of the loaded model.
bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Freeze only the first top-level layer (the BERT main layer in the summary),
# leaving the dropout and classifier layers trainable.
for frozen_layer in bert_model.layers[:1]:
    frozen_layer.trainable = False

# Print index, layer object and trainable flag for every top-level layer to
# confirm which parts will be updated during training.
for i, layer in enumerate(bert_model.layers):
    print(i)
    print(layer)
    print(layer.trainable)

0
<transformers.models.bert.modeling_tf_bert.TFBertMainLayer object at 0x7f3cb37bbfd0>
False
1
<keras.layers.core.dropout.Dropout object at 0x7f3cb36ba6d0>
True
2
<keras.layers.core.dense.Dense object at 0x7f3cb36baa90>
True


In [17]:
#input_ids_layer = tf.keras.layers.Input(shape=(375,), name="input_token", dtype='int32')
#input_masks_layer = tf.keras.layers.Input(shape=(375,), name="attention", dtype='int32')

#embedding_layer = bert_model(input_ids_layer, attention_mask=input_masks_layer)[0]
#output_ = tf.keras.layers.Dense(1, activation=None,use_bias=False)(embedding_layer)

#model = tf.keras.Model(inputs=[input_ids_layer, input_masks_layer], outputs=[output_])

In [18]:
#i = -1
#for layer in model.layers:
#  i += 1
#  if i < 3:
#    layer.trainable=False
#  else:
#    layer.trainable=True


In [19]:
#i = -1
#for layer in model.layers:
#  i += 1
#  print(layer)
#  print(layer.trainable)

In [20]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam

In [46]:
# Integer class labels against raw model outputs: from_logits=True tells the
# loss that the model output has not been passed through softmax.
loss = SparseCategoricalCrossentropy(from_logits=True)
# Reported under the name 'accuracy' in the training logs.
metric = SparseCategoricalAccuracy(name='accuracy')
# Initial learning rate 2e-3; later training cells change it on the optimizer.
optimizer = Adam(learning_rate=2e-3, epsilon=1e-08)

In [47]:
# Attach optimizer, loss and metric; layer trainable flags must already have
# their final values when this runs, because compile fixes what gets trained.
bert_model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [23]:
# Convert the token ids, attention masks and labels of all three splits to
# int32 NumPy arrays (each name is rebound to its array version).
x_train_input_id = np.asarray(x_train_input_id, dtype='int32')
x_train_mask = np.asarray(x_train_mask, dtype='int32')
x_val_input_id = np.asarray(x_val_input_id, dtype='int32')
x_val_mask = np.asarray(x_val_mask, dtype='int32')
x_test_input_id = np.asarray(x_test_input_id, dtype='int32')
x_test_mask = np.asarray(x_test_mask, dtype='int32')
y_train_bert = np.asarray(y_train_bert, dtype='int32')
y_test_bert = np.asarray(y_test_bert, dtype='int32')
y_val_bert = np.asarray(y_val_bert, dtype='int32')

In [24]:
# Reshape every label array into a single column, i.e. shape (n, 1).
y_train_bert = np.reshape(y_train_bert, (-1, 1))
y_test_bert = np.reshape(y_test_bert, (-1, 1))
y_val_bert= np.reshape(y_val_bert, (-1, 1))

In [48]:
# Staged training expressed as data instead of nine copy-pasted fit cells.
# Each entry is (learning_rate, epochs); a learning rate of None keeps the
# value the optimizer currently has (the first stage originally ran without
# touching it). The order and values reproduce the original sequence of runs.
LR_SCHEDULE = [
    (None, 2),
    (5e-4, 2),
    (5e-5, 1),
    (2e-6, 1),
    (2e-4, 1),
    (2e-3, 1),
    (5e-4, 1),
    (5e-7, 1),
    (1e-8, 1),
]

# One History object per stage. Previously every fit call overwrote
# `train_history`, so only the metrics of the final stage survived.
train_histories = []
for stage_lr, stage_epochs in LR_SCHEDULE:
    if stage_lr is not None:
        # Change the learning rate in place on the compiled optimizer.
        K.set_value(bert_model.optimizer.learning_rate, stage_lr)
    # `train_history` still ends up holding the History of the last stage.
    train_history = bert_model.fit(
        x=[x_train_input_id, x_train_mask],
        y=y_train_bert,
        batch_size=32,
        epochs=stage_epochs,
        validation_data=([x_val_input_id, x_val_mask], y_val_bert),
    )
    train_histories.append(train_history)

