<a href="https://colab.research.google.com/github/pcampos119104/twitter-sentiment-analysis-br/blob/master/twitter_sentiment_analysis_br.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TODOs 



*  Follow PEP 8
*  Find out how to access a directory in a GCP bucket
*  Find out how to update TensorFlow to 2.x without breaking bert-tf
*  Why is `tf.keras.utils.get_file` not extracting the zip file?
*  Split the data with `from sklearn.model_selection import train_test_split`



# Imports and installations


In [0]:
# Install the package that provides the `bert` module imported in the next cell.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
!pip install bert-tensorflow

In [0]:
'''
try:
  # %tensorflow_version only exists on colab
  %tensorflow_version 2.x
except Exception:
  print('exception')
  pass
'''
import os
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import bert 
from bert import run_classifier
from bert import optimization 
from bert.tokenization import FullTokenizer
from bert.modeling import BertConfig, BertModel
from google.colab import auth
from google.cloud import storage


# Data 

Configure the GCP bucket


In [0]:
# https://cloud.google.com/storage/docs/downloading-objects#storage-download-object-python

def download_blob(project_id, bucket_name, source_blob_name,
                  destination_file_name=None) -> None:
    """Downloads a blob from the GCP bucket to the local filesystem.

    Args:
     project_id: Project id on GCP
     bucket_name: Your bucket name
     source_blob_name: Storage object name; may contain a "directory"
       prefix such as ``data/tweets.csv``
     destination_file_name: Local path to write the blob to. Defaults to
       ``source_blob_name``, which is what this function always used.

    Returns:
     None
    """
    if destination_file_name is None:
        destination_file_name = source_blob_name

    # A blob name with a prefix maps to a nested local path; the download
    # opens that path for writing, so the parent directory has to exist.
    parent_dir = os.path.dirname(destination_file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    storage_client = storage.Client(project_id)

    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

    print(
        "Blob {} downloaded.".format(
            source_blob_name
        )
    )

In [0]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location


# Project id on GCP
PROJECT_ID = 'dl-sentimentbr-119104'

# Set the data directory name on GCP bucket.
INPUT_DIR = 'data'
# Set the name of the file with the data
FILE_NAME = "tweets_dl_sentimentbr.csv" 
# Whether or not need to extract the file
DO_EXTRACT= False 

# Set the working directory name on GCP bucket.
OUTPUT_DIR = 'dl_sentimentbr'
# Whether or not to clear/delete the directory and create a new one
DO_DELETE = True 
# Set bucket name to retrieve dataset and save model.
BUCKET = 'pcampos119104' 

# From here on both names hold full gs:// URLs instead of bare directory names.
OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
INPUT_DIR = 'gs://{}/{}'.format(BUCKET, INPUT_DIR)
# Colab credentials are needed before tf.gfile can touch the bucket.
auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except tf.errors.NotFoundError:
    # Doesn't matter if the directory didn't exist. Any other failure
    # (permissions, network, Ctrl-C) is no longer silently swallowed.
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('Model output directory: {}'.format(OUTPUT_DIR))


In [0]:
# Fetch the dataset object from the bucket; it is written locally under the same name.
download_blob(PROJECT_ID, BUCKET, FILE_NAME)

In [0]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv(FILE_NAME)

In [0]:
# Column names, dtypes and non-null counts of the raw dataset.
df.info()

In [0]:
# Peek at a single row to see what one example looks like.
df.head(1)


In [0]:
def split_data(train_size, dataframe):
  """Split the first `train_size` rows into train, dev and test sets.

  The split is positional (no shuffling): train rows come first, then dev,
  then test. Below 400000 examples the proportions are 60/20/20, otherwise
  98/1/1.

  Args:
    train_size: total number of examples to use from the dataset, shared by
      all three splits (despite the name). Values larger than the dataset
      are clamped to `len(dataframe)`.
    dataframe: the dataset to be sliced.
  Returns:
    A `(data_train, data_dev, data_test)` tuple of consecutive,
    non-overlapping slices of `dataframe`; dev and test have equal length.
  Raises:
    ValueError: if `train_size` is negative.
  """
  if train_size < 0:
    raise ValueError(
        'train_size must be non-negative, got {}'.format(train_size))

  # Without the clamp, asking for more rows than exist pushes the dev/test
  # slice bounds past the end of the frame and they come back silently empty.
  train_size = min(train_size, len(dataframe))

  if train_size < 400000:
    ntrain = int(train_size * 0.6)
    ndev_test = int(train_size * 0.2)
  else:
    ntrain = int(train_size * 0.98)
    ndev_test = int(train_size * 0.01)

  dev_end = ntrain + ndev_test
  data_train = dataframe[:ntrain]
  data_dev = dataframe[ntrain:dev_end]
  data_test = dataframe[dev_end:dev_end + ndev_test]

  return data_train, data_dev, data_test

In [0]:
# Total number of examples drawn from the dataset (shared by all three splits).
train_size = 500000

data_train, data_dev, data_test = split_data(train_size, df)

# Per-column non-null counts of each split, in train / dev / test order.
for split_frame in (data_train, data_dev, data_test):
  print(split_frame.count())

In [0]:
# Sanity check that dev and test are different slices: compare the tweets
# position by position and print which outcomes occur (only False expected).
# NOTE(review): this is a positional comparison, so it needs equally sized
# splits and does not detect a tweet that appears at different positions.
same_position = data_dev['tweet_text'].values == data_test['tweet_text'].values
matching_tweets = data_dev['tweet_text'].where(same_position)
print(matching_tweets.notna().unique())

# Building and exporting a TF-Hub module 

In [0]:
# TODO Refactor to a function
MODEL_URL = "https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-large-portuguese-cased/bert-large-portuguese-cased_tensorflow_checkpoint.zip"
MODEL_PATH_AND_NAME = "/content/model.zip"

VOCAB_URL = "https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-large-portuguese-cased/vocab.txt"
VOCAB_PATH_AND_NAME = "/content/vocab.txt"

model = tf.keras.utils.get_file(
    fname=MODEL_PATH_AND_NAME, 
    origin=MODEL_URL, 
    extract=True)

vocab_path = tf.keras.utils.get_file(
    fname=VOCAB_PATH_AND_NAME, 
    origin=VOCAB_URL)


!unzip {model}
!rm -r {model}
!ls -l

In [0]:
# Great code copied from https://colab.research.google.com/drive/1ofSfThTBlWjOx5dqXmdsIol-MdiqCyZC#scrollTo=cOyKrgZRRqZe
def build_module_fn(config_path, vocab_path, do_lower_case=True):
    """Build the graph-defining function for a BERT TF-Hub module.

    Args:
        config_path: path of the BERT config JSON file.
        vocab_path: path of the vocabulary file.
        do_lower_case: casing flag exposed through the "tokenization_info"
            signature.

    Returns:
        A function taking `is_training` that builds the model and registers
        the "tokens" and "tokenization_info" signatures.
    """

    def bert_module_fn(is_training):
        """Spec function for a token embedding module."""
        # The three int32 [batch, seq] inputs double as the signature's input map.
        feeds = {
            feed_name: tf.placeholder(dtype=tf.int32, shape=[None, None], name=feed_name)
            for feed_name in ("input_ids", "input_mask", "segment_ids")
        }

        bert_config = BertConfig.from_json_file(config_path)
        encoder = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=feeds["input_ids"],
            input_mask=feeds["input_mask"],
            token_type_ids=feeds["segment_ids"])

        # Last encoder layer gives per-token outputs; the pooled output is per sequence.
        model_outputs = {
            "pooled_output": encoder.get_pooled_output(),
            "sequence_output": encoder.all_encoder_layers[-1],
        }

        config_asset = tf.constant(value=config_path, dtype=tf.string, name="config_file")
        vocab_asset = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
        casing_flag = tf.constant(do_lower_case)

        # Register both files as assets of the graph (config first, then vocab).
        for asset in (config_asset, vocab_asset):
            tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset)

        tokenization_outputs = {
            "vocab_file": vocab_asset,
            "do_lower_case": casing_flag,
        }

        hub.add_signature(name="tokens", inputs=feeds, outputs=model_outputs)
        hub.add_signature(name="tokenization_info", inputs={}, outputs=tokenization_outputs)

    return bert_module_fn

In [0]:
# Great code copied from https://colab.research.google.com/drive/1ofSfThTBlWjOx5dqXmdsIol-MdiqCyZC#scrollTo=cOyKrgZRRqZe
config_path = "/content/bert_config.json"
vocab_path = "/content/vocab.txt"

# Two graph variants: the training one is tagged "train", the other has no tags.
tags_and_args = [
    ({"train"} if training else set(), {"is_training": training})
    for training in (True, False)
]

module_fn = build_module_fn(config_path, vocab_path, do_lower_case=False)
spec = hub.create_module_spec(module_fn, tags_and_args=tags_and_args)

# Write the module to ./bert-module using the weights of the given checkpoint.
spec.export(
    "bert-module",
    checkpoint_path="/content/model.ckpt-1000000")

# Using the exported module 

In [0]:
BERT_MODEL_HUB = "/content/bert-module"

def create_tokenizer_from_hub_module(model_hub) -> FullTokenizer:
  """Get the vocab file and casing info from the Hub module.

  Args:
    model_hub: location of the Hub module. Either a local path or a
      URL-style handle such as `https://...` or `gs://...`.

  Returns:
    A FullTokenizer built from the vocab file and casing flag that the
    module publishes through its "tokenization_info" signature.
  """
  # Only local paths are made absolute: abspath would turn "https://..." or
  # "gs://..." into a bogus path under the current working directory.
  if '://' in model_hub:
    module_handle = model_hub
  else:
    module_handle = os.path.abspath(model_hub)

  with tf.Graph().as_default():
    
    bert_module = hub.Module(module_handle)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])

  return FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module(BERT_MODEL_HUB)

In [0]:
# Sanity check: tokenize one tweet from the dev split (row position 3).
tokenizer.tokenize(data_dev.iloc[3]['tweet_text'])

# Data Preprocessing 

Transform the data into the input format expected by the BERT model.

In [0]:
# List the available columns before picking the text and label columns.
data_train.columns

In [0]:
# Column whose value is passed as `text_a` of each InputExample.
DATA_COLUMN = 'tweet_text'
# Column whose value is passed as the `label` of each InputExample.
LABEL_COLUMN = 'sentiment'
# NOTE(review): assumes the sentiment column only holds 0 and 1 — confirm against the data.
label_list = [0, 1]

In [0]:
def _to_input_examples(dataframe):
  """Wrap every row of `dataframe` in a single-sentence BERT InputExample.

  The text comes from DATA_COLUMN and the label from LABEL_COLUMN; `guid`
  and `text_b` are left as None.
  """
  def row_to_example(row):
    return bert.run_classifier.InputExample(
        guid=None,
        text_a=row[DATA_COLUMN],
        text_b=None,
        label=row[LABEL_COLUMN])

  return dataframe.apply(row_to_example, axis=1)


train_InputExamples = _to_input_examples(data_train)
dev_InputExamples = _to_input_examples(data_dev)
test_InputExamples = _to_input_examples(data_test)


In [0]:
# Upper bound, in tokens, on the length of every sequence fed to BERT.
MAX_SEQ_LENGTH = 64

# Turn each split's InputExamples into InputFeatures, in train / dev / test order.
train_features, dev_features, test_features = (
    bert.run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    for examples in (train_InputExamples, dev_InputExamples, test_InputExamples)
)

# Creating a model 

In [0]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model on top of the exported BERT module.

  Args:
    is_predicting: when True, no loss is built, dropout is skipped and only
      the predictions are returned.
    input_ids: token id tensor fed to the module's "tokens" signature.
    input_mask: attention mask tensor fed to the "tokens" signature.
    segment_ids: segment id tensor fed to the "tokens" signature.
    labels: integer class ids; only used when `is_predicting` is False.
    num_labels: number of target classes.

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is True, otherwise
    `(loss, predicted_labels, log_probs)`.
  """

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Create our own classification layer to fine-tune on the sentiment data.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting. It is skipped when predicting:
    # applying it at inference time randomly zeroes features and makes the
    # predictions non-deterministic.
    if not is_predicting:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
    # If we're predicting, we want predicted labels and the probabilities.
    if is_predicting:
      return (predicted_labels, log_probs)

    # Convert labels into one-hot encoding. Done after the predict return so
    # that labels are only required for train/eval.
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    # If we're train/eval, compute loss between predicted and actual label
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)


# Building the preprocessing text pipeline 

# Predicting

# Publishing 
