## Bert Code

In [1]:

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


## Now load pandas and tensorflow

In [2]:
import tensorflow as tf
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import cdist
import sklearn
from sklearn.model_selection import train_test_split
import math as math
from tensorflow.keras.models import Sequential, save_model, load_model
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam, Ftrl, Adamax, SGD, Adadelta, Nadam, Optimizer, RMSprop, Adagrad
from tensorflow.keras.preprocessing.text import Tokenizer as tf_tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.callbacks import CSVLogger
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import TensorBoard
from keras.callbacks import LambdaCallback
from tensorflow.keras import backend as K
from keras.activations import relu, sigmoid,softmax, tanh, elu, selu,hard_sigmoid
import skopt
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_convergence
from skopt.plots import plot_objective, plot_evaluations
from skopt.plots import plot_histogram, plot_objective_2D
from skopt.utils import use_named_args
print("\nTensorFlow version is ",tf.__version__)
print("\nKeras version is ", tf.keras.__version__)
print("\nSciKit-optimize version is",  skopt.__version__)

Using TensorFlow backend.



TensorFlow version is  2.3.1

Keras version is  2.4.0

SciKit-optimize version is 0.8.1


## Get the Stanford IMDB dataset and remove the unlabeled reviews


In [3]:

URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz", 
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

# The shutil module offers a number of high-level 
# operations on files and collections of files.
import os
import shutil
# Create main directory path ("/aclImdb")
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
# Create sub directory path ("/aclImdb/train")
train_dir = os.path.join(main_dir, 'train')
# Remove unsup folder since this is a supervised learning task
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)
# View the final train folder
print(os.listdir(train_dir))

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
['urls_unsup.txt', 'neg', 'urls_pos.txt', 'urls_neg.txt', 'pos', 'unsupBow.feat', 'labeledBow.feat']


## This is the twitter data set, with the training and testing set ups


In [4]:


file = '/Users/mutecypher/Documents/UW_work/GitHub/twitter-project/Data Files/twitter_sentiment_learn.csv'

learning_df = pd.read_csv(file)
print(learning_df.shape)
print(learning_df.head())

x = learning_df['text'].to_list()

y_l1 = learning_df[["NEG", "NEU", "POS"]]

y_l1['Sent']= 0

i =0
for i in range(len(y_l1)):
    if (y_l1.loc[i,'NEG'] > y_l1.loc[i,'NEU']) and  (y_l1.loc[i,'NEG'] > y_l1.loc[i,'POS']):
        y_l1.loc[i,'Sent']= -1
    elif (y_l1.loc[i,'NEU'] >= y_l1.loc[i,'NEG']) and  (y_l1.loc[i,'NEU'] >= y_l1.loc[i,'POS']):
        y_l1.loc[i, 'Sent']= 0
    else:
        y_l1.loc[i, 'Sent']= 1
    i += 1

y = y_l1['Sent']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.15)

# Convert to numpy arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)

##x_train = np.array(x_train)
##x_test = np.array(x_test)
##print(x[1])

print("Train-set size: ", len(x_train))
print("Test-set size:  ", len(x_test))

data_text = x_train + x_test

## can be played with 

num_words = 30000

learn_tokenizer = tf_tokenizer(num_words=num_words)

##%%time
learn_tokenizer.fit_on_texts(data_text)

x_train_tokens = learn_tokenizer.texts_to_sequences(x_train)

x_test_tokens = learn_tokenizer.texts_to_sequences(x_test)


num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = math.floor(max_tokens)

pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)

x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

(385042, 6)
   Unnamed: 0          TWID       NEG       NEU       POS  \
0           0  7.680000e+17  0.049398  0.861395  0.089207   
1           1  7.680000e+17  0.006598  0.046810  0.946591   
2           2  7.680000e+17  0.032333  0.850945  0.116722   
3           3  7.680000e+17  0.008090  0.042331  0.949579   
4           4  7.680000e+17  0.009325  0.940488  0.050187   

                                                text  
0  #Incredible #India #Atulya #Bharat - Land of S...  
1  RT @KendallHuntRPD: The #firstdayofschool for ...  
2  RT @abbiesf_: Kate wrights figure is all I wan...  
3  Josh Jenkins is looking forward to TAB Breeder...  
4  Robert Pattinson Gets Ready to Hop on a Plane ...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Train-set size:  327285
Test-set size:   57757


In [5]:
print(len(y_l1))
print(y.head())
print(y_l1.loc[0,'NEG'])

385042
0    0
1    1
2    0
3    1
4    0
Name: Sent, dtype: int64
0.049398487


In [6]:
# We create a training dataset and a validation 
# dataset from our "aclImdb/train" directory with a 80/20 split.
train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='training', seed=123)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='validation', seed=123)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


## make a pandas dataframe for the training data

In [7]:
for i in train.take(1):
  train_feat = i[0].numpy()
  train_lab = i[1].numpy()

train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")
train.head()


Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,Canadian director Vincenzo Natali took the art...,1
1,I gave this film 10 not because it is a superb...,1
2,I admit to being somewhat jaded about the movi...,1
3,"For a long time, 'The Menagerie' was my favori...",1
4,A truly frightening film. Feels as if it were ...,0


## Now a pandas dataframe for the testing data


In [8]:
for j in test.take(1):
  test_feat = j[0].numpy()
  test_lab = j[1].numpy()

test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")
test.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,I can't believe that so much talent can be was...,0
1,This movie blows - let's get that straight rig...,0
2,"The saddest thing about this ""tribute"" is that...",0
3,I'm only rating this film as a 3 out of pity b...,0
4,Something surprised me about this movie - it w...,1


## Now all the input and tokenization

In [9]:

def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

  validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
  return train_InputExamples, validation_InputExamples

  train_InputExamples, validation_InputExamples = convert_data_to_examples(train, 
                                                                           test, 
                                                                           'DATA_COLUMN', 
                                                                           'LABEL_COLUMN')
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length = True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'


## Now call the above functions

In [10]:


train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



## Configure BERT and the models
## and make the callbacks 

In [11]:
callbackx = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', 
                                             patience =0,
                                            restore_best_weights = True)
        
class myCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('val_accuracy') > 0.90):
      print("\nReached 90% validation accuracy, so slowing the learning rate.")
      optimizer = Adam(learning_rate = 5e-6)
      self.model.stop_training = False
    if(logs.get('accuracy') > 0.97):
      print("\nReached 97% training accuracy, so slowing the learning rate more.")
      optimizer = Adam(learning_rate = 1e-5)
      self.model.stop_training = False

call_it = myCallback()

In [12]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('val_accuracy')])

## model.fit(train_data, epochs=5, validation_data=validation_data, callbacks = [call_it, callbackx])

model.fit(x = x_train_pad,
                        y = y_train,
                        epochs=6,
                        batch_size=512,
                        validation_split=0.2
                        ## callbacks=[call_it, callbackx]
                         )

Epoch 1/6
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


InvalidArgumentError:  Received a label value of -1 which is outside the valid range of [0, 2).  Label values: 0 0 0 0 0 0 1 0 0 1 1 0 1 -1 0 1 0 0 0 0 1 1 0 0 0 0 -1 1 0 -1 1 0 0 -1 0 1 0 -1 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 -1 0 1 0 0 0 -1 0 0 0 0 0 -1 0 0 1 0 1 0 0 0 1 0 1 1 1 0 0 0 1 1 1 0 0 1 0 0 0 1 0 -1 0 0 1 1 1 1 1 1 1 0 1 0 0 0 0 1 0 1 0 0 0 1 -1 1 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0 0 0 1 1 -1 1 0 0 1 0 0 -1 -1 0 0 0 1 -1 0 1 1 0 0 1 0 1 1 0 1 1 0 0 0 1 1 0 0 0 1 0 0 -1 0 0 1 1 0 0 1 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 -1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 -1 0 0 0 0 1 1 1 0 1 1 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 -1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 1 1 1 0 0 1 0 0 1 1 0 -1 -1 0 0 0 0 0 1 1 0 1 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 -1 1 1 1 0 1 0 0 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 1 0 0 0 0 1 0 -1 1 0 0 1 1 0 0 0 0 1 1 0 -1 1 1 0 0 1 1 1 0 0 1 1 1 -1 0 0 0 0 0 0 1 0 1 0 0 -1 1 0 0 0 1 1 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 1 -1 0 0 1
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-12-4a783b1ac020>:11) ]] [Op:__inference_train_function_32644]

Function call stack:
train_function


## Now some sentences to predict sentiment


In [None]:
pred_sentences = ['This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good',
                  'One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie',
                 'At no point can I say I loved this movie.', 'No complaint about this movie could be justified, just pure perfection.']


## Tokenize and then sentimentize

In [None]:
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
  print(pred_sentences[i], ": \n", labels[label[i]])
