In [None]:
# Reference tutorial: https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671

In [1]:
%%capture
! pip install shap
! pip install transformers

In [1]:
from google.colab import drive

# Mount Google Drive; the CSV files read later in this notebook live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
import shap

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

from modules.utils import *
import tensorflow as tf

In [3]:
# Column names assigned to the header-less Sentiment140 CSV files.
col_names = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Single place to change when the data lives somewhere other than the mounted Drive folder.
DATA_DIR = "/content/drive/MyDrive/data"

df_train = pd.read_csv(f"{DATA_DIR}/Sentiment140-train.csv", encoding="latin-1", header=None, names=col_names)
df_test = pd.read_csv(f"{DATA_DIR}/Sentiment140-test.csv", encoding="latin-1", header=None, names=col_names)
# The Dublin file is read with its own header row, so its column names come from the file itself.
df_dublin = pd.read_csv(f"{DATA_DIR}/citypulse.dublin_city_council.test.csv", encoding="latin-1")

In [4]:
# Tag each frame with the split it belongs to (adds a 'split' column in place),
# then stack both frames into one.
for frame, split_name in ((df_train, "train"), (df_test, "test")):
    frame['split'] = split_name
df_all = pd.concat([df_train, df_test])

In [5]:
# Work on a 10k-row subset of the training data to keep fine-tuning fast.
# A seeded random sample is used instead of the first 10k rows: a head slice depends on
# file order and would contain a single class if the file is sorted by sentiment.
# NOTE(review): the published Sentiment140 training file is ordered by polarity —
# confirm whether this copy is too.
TRAIN_SUBSET_SIZE = 10000
TRAIN_SUBSET_SEED = 42
df_train = (
    df_train
    .sample(n=min(TRAIN_SUBSET_SIZE, len(df_train)), random_state=TRAIN_SUBSET_SEED)
    .reset_index(drop=True)
)

In [6]:
# Map the raw sentiment codes to the label values used downstream. Both helpers come from
# modules.utils; their exact mappings are not visible in this notebook.
# NOTE: each line rewrites its column in place, so run this cell once per fresh kernel.
df_all['sentiment'] = df_all['sentiment'].apply(convert_sentiment)
# df_train / df_test (not df_all) are the frames later turned into the training and
# validation datasets, so they need the same conversion as df_all.
df_train['sentiment'] = df_train['sentiment'].apply(convert_sentiment)
df_test['sentiment'] = df_test['sentiment'].apply(convert_sentiment)
df_dublin['sentiment'] = df_dublin['sentiment'].apply(convert_sentiment_dublin)

In [7]:
# Model and tokenizer are loaded from the same checkpoint name.
PRETRAINED_CHECKPOINT = "bert-base-uncased"
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Print the layer / parameter-count table of the loaded classifier.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [9]:
#df_train['text_decode'] = df_train['text'].str.decode("utf-8")
#df_test['text_decode'] = df_test['text'].str.decode("utf-8")

In [10]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
    """Wrap every row of two DataFrames in a transformers ``InputExample``.

    Args:
        train: DataFrame whose rows become the training examples.
        test: DataFrame whose rows become the validation examples.
        DATA_COLUMN: name of the column whose value is used as ``text_a``.
        LABEL_COLUMN: name of the column whose value is used as ``label``.

    Returns:
        A tuple ``(train_InputExamples, validation_InputExamples)`` holding the
        results of ``DataFrame.apply(..., axis=1)`` on ``train`` and ``test``.
    """
    def _row_to_example(row):
        # guid is a bookkeeping id that is unused here; text_b is left empty
        # because only one text per example is supplied.
        return InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN],
        )

    train_InputExamples = train.apply(_row_to_example, axis=1)
    validation_InputExamples = test.apply(_row_to_example, axis=1)
    return train_InputExamples, validation_InputExamples

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as an unbatched ``tf.data.Dataset``.

    Args:
        examples: iterable of objects exposing ``text_a`` and ``label``.
        tokenizer: object providing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        max_length: every sequence is truncated or padded to this many tokens.

    Returns:
        A ``tf.data.Dataset`` built from a generator that yields ``(inputs, label)``
        pairs, where ``inputs`` is a dict with the keys ``"input_ids"``,
        ``"attention_mask"`` and ``"token_type_ids"`` (declared as int32) and
        ``label`` is declared as an int64 scalar.
    """
    features = []  # InputFeatures collected here, replayed by the generator below

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # upper bound used for both truncation and padding
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated; this is the replacement the
            # library's own deprecation warning asks for.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Yield one (inputs dict, label) pair per tokenized example.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    # Positional arguments after `gen` are the output types and output shapes.
    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


# Column names handed to convert_data_to_examples: the first supplies each example's
# text (text_a), the second its label.
DATA_COLUMN = 'text'
LABEL_COLUMN = 'sentiment'

In [11]:
BATCH_SIZE = 128    # examples per batch for both datasets
TRAIN_PASSES = 2    # how many times the training set is repeated within one Keras epoch

train_InputExamples, validation_InputExamples = convert_data_to_examples(df_train, df_test, DATA_COLUMN, LABEL_COLUMN)

train_examples = list(train_InputExamples)
train_data = convert_examples_to_tf_dataset(train_examples, tokenizer)
# The shuffle buffer spans the whole training set so every pass is uniformly shuffled;
# a small fixed buffer only mixes neighbouring rows and preserves most of the file order.
# max(1, ...) keeps the buffer size valid even if the training set is empty.
train_data = (
    train_data
    .shuffle(max(1, len(train_examples)))
    .batch(BATCH_SIZE)
    .repeat(TRAIN_PASSES)
)

# Validation data is only batched: it is not shuffled or repeated.
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(BATCH_SIZE)

The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).


In [12]:
# Number of rows in the Dublin evaluation set.
df_dublin.shape[0]

3000

In [13]:
#df_sample = df_dublin[:128].reset_index(drop=True)

In [14]:
def _dublin_row_to_example(row):
    """Build one InputExample from a df_dublin row (text_a and label only)."""
    return InputExample(
        guid=None,  # bookkeeping id, unused in this case
        text_a=row[DATA_COLUMN],
        text_b=None,
        label=row[LABEL_COLUMN],
    )

dublin_InputExamples = df_dublin.apply(_dublin_row_to_example, axis=1)

# Tokenize and batch without shuffling, so the dataset keeps df_dublin's row order.
dublin_data = convert_examples_to_tf_dataset(list(dublin_InputExamples), tokenizer).batch(128)

The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).


In [16]:
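# Baseline prediction before fine-tuning (disabled); the output shown below is left over from an earlier run.
#predictions_bert = model.predict(dublin_data)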

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [17]:
#predictions_bert

TFSequenceClassifierOutput([('logits', array([[-0.4306022 , -0.12708119],
                                    [-0.37862143, -0.12721029],
                                    [-0.40378064, -0.15274249],
                                    ...,
                                    [-0.40356782, -0.25548655],
                                    [-0.37640423, -0.14789182],
                                    [-0.402286  , -0.17748238]], dtype=float32))])

In [18]:
# tf_predictions = tf.nn.softmax(predictions_bert[0], axis=-1)

# label = tf.argmax(tf_predictions, axis=1)
# label = label.numpy()

In [19]:
# df_dublin['sentiment_pred'] = label

In [20]:
# df_dublin['sentiment_pred'].value_counts()

1    2966
0      34
Name: sentiment_pred, dtype: int64

In [21]:
# y_hat = df_dublin["sentiment"]
# y_pred = df_dublin["sentiment_pred"]
# print(metrics.confusion_matrix(y_hat, y_pred))
# print(metrics.classification_report(y_hat, y_pred))
# print("Accuracy Score: %.3f" % metrics.accuracy_score(y_hat, y_pred))

[[  0   6 994]
 [  0  20 980]
 [  0   8 992]]
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      1000
           0       0.59      0.02      0.04      1000
           1       0.33      0.99      0.50      1000

    accuracy                           0.34      3000
   macro avg       0.31      0.34      0.18      3000
weighted avg       0.31      0.34      0.18      3000

Accuracy Score: 0.337


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [None]:
# Fine-tuning setup. The loss is configured with from_logits=True, i.e. it expects the
# model's raw logits rather than probabilities.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

model.fit(train_data, epochs=1, validation_data=validation_data)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [None]:
# Run the model over the batched Dublin dataset; the first element of the result
# is what the next cell turns into class predictions.
predictions_bert = model.predict(dublin_data)

In [None]:
# Turn the first element of the prediction output into per-class probabilities,
# then pick the most probable class index for every row as a NumPy array.
logits = predictions_bert[0]
tf_predictions = tf.nn.softmax(logits, axis=-1)

label = tf.argmax(tf_predictions, axis=1).numpy()

In [None]:
# Store the predicted class index per row; this relies on `label` having one entry
# per df_dublin row, in the same order.
df_dublin['sentiment_pred_train'] = label

In [None]:
# Compare the Dublin ground truth with the fine-tuned model's predictions.
# NOTE: despite its name, y_hat holds the ground-truth labels; y_pred holds the predictions.
# NOTE(review): the earlier report in this notebook lists true classes -1/0/1 while the
# predictions only take the values 0/1 — confirm the two label spaces are meant to line up.
y_hat = df_dublin["sentiment"]
y_pred = df_dublin["sentiment_pred_train"]
print(metrics.confusion_matrix(y_hat, y_pred))
# zero_division=0 states explicitly what to report for classes that are never predicted,
# which avoids the "ill-defined ... set to 0.0" warning without changing the reported values.
print(metrics.classification_report(y_hat, y_pred, zero_division=0))
print("Accuracy Score: %.3f" % metrics.accuracy_score(y_hat, y_pred))

In [None]:
# Show a few Dublin texts next to their predicted class.
# `pred_sentences` and `labels` were not defined anywhere in the visible notebook, so
# they are derived here: `label` was computed from df_dublin, so the sentences are
# taken from the same frame in the same order.
# NOTE(review): assumes class index 0 = negative and 1 = positive — TODO confirm
# against the mapping used by convert_sentiment.
labels = ['Negative', 'Positive']
pred_sentences = df_dublin[DATA_COLUMN].tolist()
N_PREVIEW = 10  # print only a preview instead of every row

for sentence, class_idx in zip(pred_sentences[:N_PREVIEW], label[:N_PREVIEW]):
    print(sentence, ": \n", labels[class_idx])