<a href="https://colab.research.google.com/github/phillipinseoul/StanceDetection/blob/master/bert_stance_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Copyright 2018 The TensorFlow Hub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [None]:
# Set tensorflow version to 1.x
# The cells below rely on TF1-style APIs (tf.Session, tf.contrib, tf.gfile,
# tf.logging), so the Colab runtime is pinned to the 1.x line first.
%tensorflow_version 1.x

# Set up TPU environment.

In [None]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf
from google.colab import auth

# NOTE: `run_classifier` and `run_classifier_with_tfhub` are deliberately NOT
# imported in this cell. Both live in the `bert` checkout, which is only cloned
# and appended to sys.path in a later cell, so importing them here fails with
# ModuleNotFoundError on a fresh runtime. `run_classifier` is imported in the
# model-setup cell once the checkout is importable; `run_classifier_with_tfhub`
# is not referenced by any cell visible in this notebook.

# Fail fast, with an actionable message, when no TPU is attached to the runtime.
assert 'COLAB_TPU_ADDR' in os.environ, (
    'ERROR: Not connected to a TPU runtime; in Colab choose '
    'Runtime > Change runtime type > TPU, then re-run this cell.')
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# Authenticate the Colab user so the TPU can be granted access to GCS.
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  # NOTE(review): /content/adc.json is presumably written by
  # auth.authenticate_user() above -- confirm if this file is ever missing.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

# Prepare & Import BERT module

In [None]:
# Remove any previous `bert` checkout so the clone in the next cell starts clean.
!rm -rf bert

In [None]:
import sys
import tensorflow_hub as hub

!test -d bert || git clone https://github.com/shalmolighosh/bert/
if not 'bert' in sys.path:
  sys.path += ['bert']

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the task data is read from
# "My Drive/BERT/Data" underneath this mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# List the task data folders available on Drive (the space in "My Drive" is
# backslash-escaped for the shell).
!ls gdrive/My\ Drive/BERT/Data

Atheism  CC  FM  HC  LA


# Prepare for training


* Specify the task and locate (or download) its training data.
* Specify the BERT pretrained model.
* Specify the GCS bucket and create the output directory for model checkpoints and eval results.

In [None]:
# Sanity check that the gsutil CLI is available in this runtime.
!gsutil help

In [None]:
TASK = 'HC'
assert TASK in ('MRPC', 'CoLA', 'Atheism','CC','HC','LA','FM','ALL'), 'Only (MRPC, CoLA) are demonstrated here.'

# Download glue data.
if TASK=='MRPC' or TASK=='CoLA':
  ! test -d download_glue_repo || git clone https://gist.github.com/60c2bdb54d156a41194446737ce03e2e.git download_glue_repo
  !python download_glue_repo/download_glue_data.py --data_dir='glue_data' --tasks=$TASK
  TASK_DATA_DIR = 'glue_data/' + TASK
elif TASK!='ALL':
  TASK_DATA_DIR = 'gdrive/My\ Drive/BERT/Data/' + TASK
else:
  TASK_DATA_DIR = 'gdrive/My\ Drive/BERT/Data/'

print('***** Task data directory: {} *****'.format(TASK_DATA_DIR))
!ls $TASK_DATA_DIR

# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT large model
BERT_MODEL = 'uncased_L-24_H-1024_A-16'
BERT_PRETRAINED_DIR = 'gs://cloud-tpu-checkpoints/bert/' + BERT_MODEL
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
!gsutil ls $BERT_PRETRAINED_DIR

# BUCKET = 'bert-large-pair' #@param {type:"string"}
BUCKET = 'bert-stance-detection'    # Changed to my own bucket in Google Cloud Storage
assert BUCKET, 'Must specify an existing GCS bucket name'
OUTPUT_DIR = 'gs://{}/bert/models/{}/{}_new'.format(BUCKET,BERT_MODEL ,TASK)
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

***** Task data directory: gdrive/My\ Drive/BERT/Data/HC *****
test_preprocessed.csv  train_preprocessed.csv
***** BERT pretrained directory: gs://cloud-tpu-checkpoints/bert/uncased_L-24_H-1024_A-16 *****
gs://cloud-tpu-checkpoints/bert/uncased_L-24_H-1024_A-16/bert_config.json
gs://cloud-tpu-checkpoints/bert/uncased_L-24_H-1024_A-16/bert_model.ckpt.data-00000-of-00001
gs://cloud-tpu-checkpoints/bert/uncased_L-24_H-1024_A-16/bert_model.ckpt.index
gs://cloud-tpu-checkpoints/bert/uncased_L-24_H-1024_A-16/bert_model.ckpt.meta
gs://cloud-tpu-checkpoints/bert/uncased_L-24_H-1024_A-16/checkpoint
gs://cloud-tpu-checkpoints/bert/uncased_L-24_H-1024_A-16/vocab.txt
***** Model output directory: gs://bert-stance-detection/bert/models/uncased_L-24_H-1024_A-16/HC_new *****


In [None]:
#!gsutil cp gs://bert-final/bert/models/Atheism/* gs://bert-large-pair/bert/models/uncased_L-24_H-1024_A-16/Atheism

# Setup task specific BERT model and TPU running configuration

In [None]:
import modeling
import optimization
import run_classifier
import tokenization

# Stance targets: data-folder name -> topic text handed to SemProcessor.
full_forms = {'HC' : 'hillary clinton', 'CC' : 'climate change is a concern','Atheism' : 'Atheism', 'LA' : 'Legalisation of Abortion', 'FM' : 'Feminist Movement'}

# Point TASK_DATA_DIR at the Drive data for stance tasks. For any other task
# (the GLUE ones) the directory chosen in the task-selection cell is kept, so
# the downloaded glue_data/ path is no longer overwritten.
if TASK == 'ALL':
  TASK_DATA_DIR = 'gdrive/My Drive/BERT/Data/'
elif TASK in full_forms:
  TASK_DATA_DIR = 'gdrive/My Drive/BERT/Data/' + TASK

# Model Hyper Parameters
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 11
WARMUP_PROPORTION = 0.1
MAX_SEQ_LENGTH = 128

# Model configuration
SAVE_CHECKPOINTS_STEPS = 1000
ITERATIONS_PER_LOOP = 1000
NUM_TPU_CORES = 8
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
# Lower-casing must match the checkpoint: only the "uncased" models expect it.
DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

# Data processor class for each supported task (keyed by lower-cased task name).
processors = {
  "cola": run_classifier.ColaProcessor,
  "mnli": run_classifier.MnliProcessor,
  "mrpc": run_classifier.MrpcProcessor,
  "hc"  : run_classifier.SemProcessor,
  "atheism" : run_classifier.SemProcessor,
  "fm" : run_classifier.SemProcessor,
  "cc" : run_classifier.SemProcessor,
  "la" : run_classifier.SemProcessor,
  "all" : run_classifier.SemProcessor
}
print(processors[TASK.lower()])

tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=ITERATIONS_PER_LOOP,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))


if TASK == 'ALL':
  # Pool the training sets of every stance target, pairing each tweet with its
  # topic (use_pair=True). TASK_DATA_DIR ends with '/', so appending the folder
  # name yields the per-target directory.
  train_examples = []
  for key,value in full_forms.items():
    processor = run_classifier.SemProcessor(use_pair=True, topic = value)
    label_list = processor.get_labels()
    train_examples += processor.get_train_examples(TASK_DATA_DIR+key)
elif TASK in full_forms:
  # Single stance target.
  processor = processors[TASK.lower()](use_pair=False,\
                                       topic=full_forms[TASK])
  label_list = processor.get_labels()
  train_examples = processor.get_train_examples(TASK_DATA_DIR)
else:
  # GLUE task: there is no stance topic, so full_forms[TASK] would raise KeyError.
  # NOTE(review): assumes the fork keeps the no-argument constructors of the
  # GLUE processors -- confirm against the cloned run_classifier.py.
  processor = processors[TASK.lower()]()
  label_list = processor.get_labels()
  train_examples = processor.get_train_examples(TASK_DATA_DIR)

# An empty training set would silently produce a zero-step training run.
assert train_examples, 'No training examples found under {}'.format(TASK_DATA_DIR)
print("Number of train examples :",len(train_examples))


# Total optimisation steps across all epochs, and the share used for LR warm-up.
num_train_steps = int(
    len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Model function
model_fn = run_classifier.model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels=len(label_list),
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=True,
    use_one_hot_embeddings=True)

# TPU Estimator
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size = PREDICT_BATCH_SIZE)


In [None]:
import csv

def read_csv(input_file, quotechar=None):
  """Reads a comma separated value file into a list of rows.

  Args:
    input_file: Path of the CSV file; opened through tf.gfile.Open.
    quotechar: Optional quote character forwarded to csv.reader. When None
      (the default) the csv module's default quoting rules are used.

  Returns:
    A list with one entry per CSV record, each entry being the list of cell
    strings of that record.
  """
  # Only forward quotechar when the caller supplied one: passing None through
  # would change csv.reader's quoting behaviour instead of keeping its default.
  reader_kwargs = {} if quotechar is None else {"quotechar": quotechar}
  with tf.gfile.Open(input_file, "r") as f:
    rows = []
    for row in csv.reader(f, **reader_kwargs):
      if sys.version_info[0]==2:
        # Python 2 yields byte strings; decode each cell to unicode.
        row = list(unicode(cell, 'utf-8') for cell in row)
      rows.append(row)
    return rows

In [None]:
# Load the raw training CSV for manual inspection; `lines` is only looked at
# in the next cell and is not fed into training.
lines = read_csv(TASK_DATA_DIR+'/train_preprocessed.csv')

In [None]:
# Show the second parsed row (index 1) of the training file.
lines[1]

# Train the model!

In [None]:
# Announce what is actually being trained; the previous banner quoted timings
# for MRPC/CoLA on BERT base, which do not describe this run.
print('Fine-tuning {} on task {}. Please wait...'.format(BERT_MODEL, TASK))

# Convert the training examples into model input features
# (MAX_SEQ_LENGTH is passed as the sequence length).
train_features = run_classifier.convert_examples_to_features(
    train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

print('***** Started training at {} *****'.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(train_examples)))
print('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
# Printed (rather than sent through tf.logging) so it shows up next to the
# other run statistics regardless of the TF logging verbosity.
print('  Num steps = {}'.format(num_train_steps))

train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)

estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print('***** Finished training at {} *****'.format(datetime.datetime.now()))

# Evaluate the model

In [None]:
# Collect the dev examples. 'ALL' concatenates the dev sets of every stance
# target (pairing tweets with their topic); any other task reuses the
# processor that was built for training.
if TASK == 'ALL':
  eval_examples = []
  full_forms = {'HC' : 'hillary clinton', 'CC' : 'climate change is a concern','Atheism' : 'Atheism', 'LA' : 'Legalisation of Abortion', 'FM' : 'Feminist Movement'}
  for task_key, topic in full_forms.items():
    processor = run_classifier.SemProcessor(use_pair=True, topic=topic)
    eval_examples.extend(processor.get_dev_examples(TASK_DATA_DIR + task_key))
else:
  eval_examples = processor.get_dev_examples(TASK_DATA_DIR)

eval_features = run_classifier.convert_examples_to_features(
    eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

print('***** Started evaluation at {} *****'.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(eval_examples)))
print('  Batch size = {}'.format(EVAL_BATCH_SIZE))

# Only whole batches are evaluated: the trailing partial batch is dropped on
# the TPU, so the reported metrics are slightly off.
eval_steps = len(eval_examples) // EVAL_BATCH_SIZE

eval_input_fn = run_classifier.input_fn_builder(
    features=eval_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=True)

result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))

# Echo every metric and persist the same lines next to the model checkpoints.
output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
with tf.gfile.GFile(output_eval_file, "w") as writer:
  print("***** Eval results *****")
  for key in sorted(result):
    value_text = str(result[key])
    print('  {} = {}'.format(key, value_text))
    writer.write("%s = %s\n" % (key, value_text))


In [None]:
# Run prediction over the same input pipeline that was used for evaluation.
# The result is consumed by iterating over it in the next cell.
preds = estimator.predict(input_fn=eval_input_fn)

In [None]:
# Materialise every prediction into a list so it can be indexed repeatedly.
all_preds = list(preds)

In [None]:
import numpy as np
# Predicted class index of the first example: position of its largest probability.
np.argmax(all_preds[0]['probabilities'])

0

In [None]:
# Alias the dev examples as `test`; the scoring cells below index into it.
test = eval_examples
# Show the text_a field of the first example.
test[0].text_a

In [None]:
import numpy as np

# Per-class tallies for labels 0 and 1 only; label 2 never gets a row.
# Columns: [0] correct predictions, [1] times predicted, [2] times it is gold.
matrix = np.zeros((2, 3), dtype=int)

for idx, prediction in enumerate(all_preds):
  gold = int(test[idx].label)
  pred = np.argmax(prediction['probabilities'])
  if gold < 2:
    matrix[gold, 2] += 1
  if pred >= 2:
    continue
  matrix[pred, 1] += 1
  if gold == pred:
    matrix[gold, 0] += 1

print(matrix)
# correct / (predicted + gold) is half of a class's F1, so adding the two
# classes gives their average F1. The 1e-5 term avoids division by zero.
correct, predicted, actual = matrix[:, 0], matrix[:, 1], matrix[:, 2]
half_f1 = correct / (predicted + actual + 1e-5)
print("fscore - ", half_f1[0] + half_f1[1])

[[172 295 172]
 [  0   0  45]]
fscore -  0.3683083432910419


In [None]:
# Class index -> readable stance name.
labels_dict = ["oppose", "support", "neutral"]

# Build three parallel columns, each starting with its header entry.
tweets = ["tweet"]
gold_labels = ["correct"]
for example in test:
  tweets.append(example.text_a)
  gold_labels.append(labels_dict[int(example.label)])

pred_labels = ["predicted"]
pred_labels.extend(
    labels_dict[np.argmax(single_pred['probabilities'])] for single_pred in all_preds)

In [None]:
# Check the header entries that were prepended to each column.
tweets[0],gold_labels[0],pred_labels[0]

In [None]:
# ex) Save the prediction result into "HC.csv"
# One tab-separated row per example: tweet, gold label, predicted label.
# zip() stops at the shortest column, so rows without a prediction are omitted.
# NOTE(review): cells are written as-is; a tweet containing a tab or newline
# would presumably break the row layout -- confirm the preprocessed data has none.
rows = list(zip(tweets, gold_labels, pred_labels))
np.savetxt('{}.csv'.format(TASK), rows, delimiter='\t', fmt='%s')

In [None]:
# Confirm where the notebook is running and that the predictions file was written.
!pwd      # Get current directory
!ls       # Get contents

/content
adc.json  bert	gdrive	HC.csv	sample_data


In [None]:
import pandas as pd
# Read the tab-separated predictions file back in to inspect it as a table.
df = pd.read_csv("{}.csv".format(TASK),sep='\t')

# First rows of the predictions table.
df.head()

In [None]:
# Last rows of the predictions table.
df.tail()

In [None]:
from google.colab import files
# Download the predictions file from the current directory. The file is named
# after TASK when it is saved, so the name is derived from TASK here as well
# instead of being hard-coded to 'HC.csv'.
files.download('{}.csv'.format(TASK))

In [None]:
# Keep a copy of the predictions on Drive, named after the current TASK
# (previously hard-coded to HC.csv, which mislabels results for other tasks).
df.to_csv('gdrive/My Drive/BERT/{}.csv'.format(TASK))

In [None]:
# Drive folder for saved results.
# NOTE(review): not referenced by any cell visible in this notebook.
SAVE_DATA_DIR = 'gdrive/My Drive/BERT/'