In [None]:
import sys
from pathlib import Path

!pip install -q fire
!git clone -q https://github.com/mdda/worldtree_corpus.git
!cd worldtree_corpus/ && git checkout textgraphs && git pull
!cp -a worldtree_corpus/textgraphs .

# Setup data
path_data = Path("worldtree_corpus_textgraphs2019sharedtask_withgraphvis")
!wget -q -nc http://cognitiveai.org/dist/{path_data}.zip
!unzip -qn {path_data}.zip

In [None]:
# Run OptimizedTFIDF method on dev set
# --recurse_tfidf=False selects this variant (the IterativeTFIDF cell passes True);
# only the dev split is enabled, train and test are switched off.
!python textgraphs/run_ranking.py \
--path_data={path_data} \
--recurse_tfidf=False \
--do_train=False \
--do_dev=True \
--do_test=False \

In [None]:
# Run IterativeTFIDF method on dev set
# Same invocation as the OptimizedTFIDF cell except --recurse_tfidf=True;
# only the dev split is enabled, train and test are switched off.
!python textgraphs/run_ranking.py \
--path_data={path_data} \
--recurse_tfidf=True \
--do_train=False \
--do_dev=True \
--do_test=False \

### Setup for BERT

In [None]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

# Fail early when the runtime has no TPU attached: the TPU endpoint is read
# from the COLAB_TPU_ADDR environment variable.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)
# Colab form parameter. When True, the checkpoint-setup cell removes any
# existing OUTPUT_DIR before training.
DELETE_EXISTING_MODEL = True #@param ["True", "False"] {type:"raw"}

# Authenticate the Colab user, then hand the credentials to the TPU session.
from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  # NOTE(review): /content/adc.json is presumably written by
  # auth.authenticate_user() above -- confirm.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.
    
    
# Clone the BERT fork only if bert_repo/ is not there yet, then bring it up to date.
!test -d bert_repo || git clone https://github.com/chiayewken/bert bert_repo
!cd bert_repo && git pull

# STS-B dataset is always needed as a reference for data format
# (write_data reads its column names from glue_data/STS-B/train.tsv by default).
# The download script comes from a gist, cloned once into download_glue_repo/.
!test -d download_glue_repo || git clone https://gist.github.com/60c2bdb54d156a41194446737ce03e2e.git download_glue_repo
!python download_glue_repo/download_glue_data.py --data_dir='glue_data' --tasks=STS
    
# Colab form parameter choosing the task. Names listed in glue_tasks are
# downloaded with the GLUE script; anything else is treated as a custom task.
TASK = "textgraphs" #@param ["textgraphs", "STS-B", "MNLI"]
glue_tasks = {'MNLI', 'MRPC', 'CoLA', 'STS-B'}

if TASK in glue_tasks:
    # Download glue data.
    # The download script is given the part of the name before the first "-".
    _task = TASK.split("-")[0]  # STS-B -> STS
    !python download_glue_repo/download_glue_data.py --data_dir='glue_data' --tasks=$_task

    TASK_DATA_DIR = 'glue_data/' + TASK
    print('***** Task data directory: {} *****'.format(TASK_DATA_DIR))
    !ls $TASK_DATA_DIR
else:
    # Custom task: no GLUE data directory is associated with it.
    print("Warning: custom task")
    TASK_DATA_DIR = None

# Colab form parameter: name of the GCS bucket that receives model output.
BUCKET = 'YOUR_BUCKET' #@param {type:"string"}
# Reject an empty value and also the untouched form placeholder. GCS bucket
# names cannot contain uppercase letters, so 'YOUR_BUCKET' can never be a real
# bucket; without this check the mistake only surfaces inside the GCS call below.
assert BUCKET and BUCKET != 'YOUR_BUCKET', 'Must specify an existing GCS bucket name'
# NOTE: the literal True is formatted into the path ("..._regressor_True").
OUTPUT_DIR = 'gs://{}/bert-tfhub/models/{}_regressor_{}'.format(BUCKET, TASK, True)
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT base model
# BERT_MODEL is a Colab form parameter; the same name is used for the TF-Hub
# URL here and for the GCS checkpoint directory in the checkpoint-setup cell.
BERT_MODEL = "uncased_L-12_H-768_A-12" #@param ["uncased_L-12_H-768_A-12", "uncased_L-24_H-1024_A-16"]
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_' + BERT_MODEL + '/1'

In [None]:
import sys
# Make the modules inside the cloned bert_repo/ checkout importable.
sys.path.append("bert_repo")
# import run_scorer
# run_regressor is imported under the run_scorer alias, so the helpers below
# refer to it as run_scorer.
import run_regressor as run_scorer
import csv
import pandas as pd

def read_tsv(file, first_line_header=True):
    """Read a TSV file into a DataFrame via ``run_scorer.DataProcessor._read_tsv``.

    Args:
        file: Path handed unchanged to ``DataProcessor._read_tsv``.
        first_line_header: When True, the first row returned by the reader
            becomes the column names and the remaining rows become the data.
            When False, every row is data and pandas assigns default columns.

    Returns:
        A ``pandas.DataFrame`` built from the rows the reader returned.
    """
    rows = run_scorer.DataProcessor._read_tsv(file)
    if not first_line_header:
        return pd.DataFrame(rows)
    body = rows[1:]
    header = rows[0]
    return pd.DataFrame(body, columns=header)

def write_tsv(df, file):
    """Write ``df`` to ``file`` as tab-separated text.

    The index is not written and no field quoting is applied
    (``csv.QUOTE_NONE``).
    """
    tsv_options = dict(sep="\t", index=False, quoting=csv.QUOTE_NONE)
    df.to_csv(file, **tsv_options)
    
def write_data(texts_a, texts_b, scores, file, reference_file="glue_data/STS-B/train.tsv"):
    """Write text pairs and scores to ``file`` using the columns of ``reference_file``.

    The output has exactly the columns of the reference TSV, in the same
    order. The ``index``, ``sentence1``, ``sentence2`` and ``score`` columns
    are filled from the arguments; every other reference column is filled
    with ``None``.

    Args:
        texts_a: Sequence of first texts (written to ``sentence1``).
        texts_b: Sequence of second texts (written to ``sentence2``).
        scores: Sequence of scores (written to ``score``).
        file: Destination passed to ``write_tsv``.
        reference_file: TSV whose header defines the output column layout.

    Raises:
        AssertionError: If the three inputs differ in length, or if the
            reference file lacks one of the columns this function fills.
    """
    assert len(texts_a) == len(texts_b) == len(scores), "inputs must have equal length"
    num_rows = len(scores)
    ref_columns = read_tsv(reference_file).columns
    data = {
        "index": list(range(num_rows)),
        "sentence1": texts_a,
        "sentence2": texts_b,
        "score": scores,
    }
    missing = [col for col in data if col not in ref_columns]
    assert not missing, "reference file lacks columns: {}".format(missing)

    placeholder = [None] * num_rows
    # Select by membership, not truthiness: `data.get(col) or placeholder`
    # raises ValueError when an input is a pandas Series or numpy array.
    data_list = [data[col] if col in data else placeholder for col in ref_columns]
    # Materialise the rows so the DataFrame constructor receives a plain list.
    df = pd.DataFrame(list(zip(*data_list)), columns=ref_columns)
    print(df.shape)
    assert all(a == b for a, b in zip(ref_columns, df.columns))
    write_tsv(df, file)

In [None]:
# Setup task specific model and TPU running config.
# Location of the pretrained checkpoint for the selected BERT_MODEL.
BERT_PRETRAINED_DIR = 'gs://cloud-tpu-checkpoints/bert/' + BERT_MODEL 
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
!gsutil ls $BERT_PRETRAINED_DIR

# Paths of the individual checkpoint artefacts inside that directory.
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')

# Point OUTPUT_DIR at the "bert-checkpoints" tree instead of "bert-tfhub".
# Re-running this line is harmless: the replace is a no-op the second time.
OUTPUT_DIR = OUTPUT_DIR.replace('bert-tfhub', 'bert-checkpoints')
if DELETE_EXISTING_MODEL:
    if tf.io.gfile.exists(OUTPUT_DIR):  # Delete and reset model_dir every time
        tf.io.gfile.rmtree(OUTPUT_DIR)
    tf.io.gfile.mkdir(OUTPUT_DIR)
!gsutil ls $OUTPUT_DIR

### Train BERT and do re-ranking

In [None]:
%%time
# Run IterativeTFIDF method on train/eval to generate data for BERT
# Both the train and dev splits are enabled; test stays off.
# NOTE(review): the next cell reads df_scores_train.csv and df_scores_dev.csv,
# presumably written by this run -- confirm.
!python textgraphs/run_ranking.py \
--path_data={path_data} \
--recurse_tfidf=True \
--do_train=True \
--do_dev=True \
--do_test=False \

In [None]:
import pandas as pd

def write_textgraph_scores_data(df_scores, out_file):
    """Export a scores DataFrame to ``out_file`` through ``write_data``.

    Args:
        df_scores: DataFrame exposing ``text_q``, ``text_e`` and ``score``
            columns; each is converted to a plain list before writing.
        out_file: Destination path forwarded to ``write_data`` as ``file``.
    """
    first_texts = df_scores.text_q.tolist()
    second_texts = df_scores.text_e.tolist()
    score_values = df_scores.score.tolist()
    write_data(
        texts_a=first_texts,
        texts_b=second_texts,
        scores=score_values,
        file=out_file,
    )
# Start from an empty data/ directory on every run.
!rm -rf data
!mkdir data
# The dev scores are copied to serve as the "test" split as well.
# NOTE(review): presumably because the ranking run above used --do_test=False
# and therefore produced no df_scores_test.csv -- confirm.
!cp df_scores_dev.csv df_scores_test.csv
# Convert each split's scores CSV into data/<mode>.tsv.
for mode in ["train", "dev", "test"]:
    write_textgraph_scores_data(pd.read_csv(f"df_scores_{mode}.csv"), f"data/{mode}.tsv")
!ls -lh data

In [None]:
%%time
# Save snapshot of codebase for reproducibility
# Both code trees are zipped and copied next to the model output in OUTPUT_DIR.
!zip -qr bert_repo.zip bert_repo
!zip -qr textgraphs.zip textgraphs
!gsutil cp textgraphs.zip bert_repo.zip {OUTPUT_DIR}

# Train the regressor on data/ and write predictions; evaluation is off.
# The task name is STS-B because data/*.tsv was written with the STS-B column
# layout. --init_checkpoint resolves to the same path as INIT_CHECKPOINT.
# NOTE(review): --save_checkpoints_steps=99999 presumably suppresses
# intermediate checkpoints -- confirm against run_regressor.py.
!python bert_repo/run_regressor.py \
--task_name=STS-B \
--data_dir=data \
--bert_config_file=$CONFIG_FILE \
--output_dir=$OUTPUT_DIR \
--vocab_file=$VOCAB_FILE \
--do_train=True \
--do_eval=False \
--do_predict=True \
--use_tpu=True \
--tpu_name=$TPU_ADDRESS \
--eval_batch_size=32 \
--predict_batch_size=32 \
--save_checkpoints_steps=99999 \
--init_checkpoint=$BERT_PRETRAINED_DIR/bert_model.ckpt \

In [None]:
# BERT Re-ranking and re-scoring
# Runs the ranking script on the dev split only, pointing --bert_output_dir at
# the OUTPUT_DIR the BERT training cell wrote to.
# NOTE(review): --recurse_tfidf is not passed here, so the script's default
# applies -- confirm that is the intended base ranking.
!python textgraphs/run_ranking.py \
--path_data={path_data} \
--do_train=False \
--do_dev=True \
--do_test=False \
--bert_output_dir={OUTPUT_DIR} \