<a href="https://colab.research.google.com/github/getChan/data_campus/blob/master/NLP/BERT_Text_Classification_(Full).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
! pip install bert-tensorflow



In [0]:
import math
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import pickle
import bert
import os
from bert import run_classifier
from bert import optimization
from bert import tokenization

tf.logging.set_verbosity(tf.logging.ERROR)


def create_tokenizer_from_hub_module(bert_model_hub):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_model_hub)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])

        print("Using BERT from %s" %bert_model_hub)
        print("with vocab size=%d and do_lower_case=%s." %(len(vocab_file), str(do_lower_case)))

    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)


def make_features(dataset, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN):
    input_example = dataset.apply(lambda x: bert.run_classifier.InputExample(guid=None,
                                                                             text_a=x[DATA_COLUMN],
                                                                             text_b=None,
                                                                             label=x[LABEL_COLUMN]), axis=1)
    features = bert.run_classifier.convert_examples_to_features(input_example, label_list, MAX_SEQ_LENGTH, tokenizer)
    return features


def create_model(bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Creates a classification model."""

    bert_module = hub.Module(
        bert_model_hub,
        trainable=True)
    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs,
        signature="tokens",
        as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]

    with tf.variable_scope("output_layer"):
        layer_out = tf.layers.dense(
            inputs=output_layer,
            units=num_labels,
            use_bias=False,
            kernel_initializer=tf.initializers.variance_scaling()
        )
        predicted_labels = tf.squeeze(tf.argmax(layer_out, axis=-1, output_type=tf.int32))

        if is_predicting:
            return predicted_labels, layer_out
        else:
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels,
                logits=layer_out
            )
            loss = tf.reduce_mean(loss)

            return loss, predicted_labels, layer_out


# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(bert_model_hub, num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
    """Returns `model_fn` closure for TPUEstimator."""

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

        # TRAIN and EVAL
        if not is_predicting:

            (loss, predicted_labels, log_probs) = create_model(
                bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

            train_op = bert.optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

            # Calculate evaluation metrics.
            def metric_fn(label_ids, predicted_labels):
                accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
                f1_score = tf.contrib.metrics.f1_score(
                    label_ids,
                    predicted_labels)
                auc = tf.metrics.auc(
                    label_ids,
                    predicted_labels)
                recall = tf.metrics.recall(
                    label_ids,
                    predicted_labels)
                precision = tf.metrics.precision(
                    label_ids,
                    predicted_labels)
                true_pos = tf.metrics.true_positives(
                    label_ids,
                    predicted_labels)
                true_neg = tf.metrics.true_negatives(
                    label_ids,
                    predicted_labels)
                false_pos = tf.metrics.false_positives(
                    label_ids,
                    predicted_labels)
                false_neg = tf.metrics.false_negatives(
                    label_ids,
                    predicted_labels)
                return {
                    "eval_accuracy": accuracy,
                    "f1_score": f1_score,
                    "auc": auc,
                    "precision": precision,
                    "recall": recall,
                    "true_positives": true_pos,
                    "true_negatives": true_neg,
                    "false_positives": false_pos,
                    "false_negatives": false_neg
                }

            eval_metrics = metric_fn(label_ids, predicted_labels)

            if mode == tf.estimator.ModeKeys.TRAIN:
                return tf.estimator.EstimatorSpec(mode=mode,
                                                  loss=loss,
                                                  train_op=train_op)
            else:
                return tf.estimator.EstimatorSpec(mode=mode,
                                                  loss=loss,
                                                  eval_metric_ops=eval_metrics)
        else:
            (predicted_labels, log_probs) = create_model(
                bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

            predictions = {
                'probabilities': log_probs,
                'labels': predicted_labels
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Return the actual model function in the closure
    return model_fn


def estimator_builder(bert_model_hub, OUTPUT_DIR, SAVE_SUMMARY_STEPS, SAVE_CHECKPOINTS_STEPS, label_list, LEARNING_RATE,
                      num_train_steps, num_warmup_steps, BATCH_SIZE):
    # Specify outpit directory and number of checkpoint steps to save
    run_config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        save_summary_steps=SAVE_SUMMARY_STEPS,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

    model_fn = model_fn_builder(
        bert_model_hub=bert_model_hub,
        num_labels=len(label_list),
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": BATCH_SIZE})
    return estimator, model_fn, run_config


def run_on_dfs(train, test, data_column, label_column,
               max_seq_length=128,
               batch_size=32,
               learning_rate=2e-5,
               num_train_epochs=3,
               warmup_proportion=0.1,
               save_summary_steps=100,
               save_checkpoint_steps=10000,
               bert_model_hub="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
               output_dir="output"):
    label_list = train[label_column].unique().tolist()

    tokenizer = create_tokenizer_from_hub_module(bert_model_hub)

    train_features = make_features(train, label_list, max_seq_length, tokenizer, data_column, label_column)
    test_features = make_features(test, label_list, max_seq_length, tokenizer, data_column, label_column)

    steps_per_epoch = math.ceil(len(train_features) / batch_size)

    num_train_steps = int(len(train_features) / batch_size * num_train_epochs)
    num_warmup_steps = int(num_train_steps * warmup_proportion)

    estimator, model_fn, run_config = estimator_builder(
        bert_model_hub,
        output_dir,
        save_summary_steps,
        save_checkpoint_steps,
        label_list,
        learning_rate,
        num_train_steps,
        num_warmup_steps,
        batch_size)

    train_input_fn = bert.run_classifier.input_fn_builder(
        features=train_features,
        seq_length=max_seq_length,
        is_training=True,
        drop_remainder=False)

    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=max_seq_length,
        is_training=False,
        drop_remainder=False)

    results = []
    for epoch in range(num_train_epochs):
        estimator.train(input_fn=train_input_fn, steps=steps_per_epoch)

        print("End of epoch %d." %(epoch + 1))

        result_dict = estimator.evaluate(input_fn=test_input_fn, steps=None)
        print(result_dict)
        results.append(result_dict)

    return results, estimator


def pretty_print(result):
    df = pd.DataFrame([result]).T
    df.columns = ["values"]
    return df

In [0]:
def load_data(data_file):
    data = pd.read_csv(data_file)

    # Only use the top quartile as polite, and bottom quartile as impolite. Discard the rest.
    quantiles = data["Normalized Score"].quantile([0.25, 0.5, 0.75])
    # print(quantiles)

    for i in range(len(data)):
        score = data.loc[i, "Normalized Score"]
        if score <= quantiles[0.25]:
            # Bottom quartile (impolite).
            data.loc[i, "Normalized Score"] = 0
        elif score >= quantiles[0.75]:
            # Top quartile (polite).
            data.loc[i, "Normalized Score"] = 1
        else:
            # Neutral.
            data.loc[i, "Normalized Score"] = 2

    data["Normalized Score"] = data["Normalized Score"].astype(int)

    # Discard neutral examples.
    data = data[data["Normalized Score"] < 2]
    
    data.sample(frac=1).reset_index(drop=True)
    n_test = len(data) // 10
    test_data = data[:n_test]
    train_data = data[n_test:]
    
    print("Data loaded successfully. Train=%d, test=%d, total=%d." % (len(train_data), len(test_data), len(train_data) + len(test_data)))
    print("Some train samples:")
    print(train_data.head())
    print("Some test samples:")
    print(test_data.head())

    return train_data, test_data

In [0]:
if not os.path.exists("Stanford_politeness_corpus.zip"):
  !wget http://www.cs.cornell.edu/~cristian/Politeness_files/Stanford_politeness_corpus.zip

if not os.path.exists("Stanford_politeness_corpus/wikipedia.annotated.csv"):
  !unzip Stanford_politeness_corpus.zip

train_data, test_data = load_data("Stanford_politeness_corpus/wikipedia.annotated.csv")

params = {
    "data_column": "Request",
    "label_column": "Normalized Score",
    "batch_size": 16,
    "num_train_epochs": 3,
    "bert_model_hub": "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1"
}

tf.logging.set_verbosity(tf.logging.INFO)
result, estimator = run_on_dfs(train_data, test_data, **params)
print(result)

Data loaded successfully. Train=1961, test=217, total=2178.
Some train samples:
     Community      Id  ...         TurkId5  Normalized Score
460  Wikipedia  621480  ...  A1Y3Z92RE62NPS                 1
462  Wikipedia  146267  ...  A3IHLWMZNBLUR4                 1
463  Wikipedia   84242  ...   AIPK94CUWL45W                 1
464  Wikipedia  487517  ...  A1F4D2PZ7NNWTL                 1
466  Wikipedia  629492  ...  A2WZQ92N4809N1                 1

[5 rows x 14 columns]
Some test samples:
   Community      Id  ...         TurkId5  Normalized Score
0  Wikipedia  629705  ...  A15DM9BMKZZJQ6                 0
1  Wikipedia  244336  ...  A3TFQK7QK8X6LM                 1
5  Wikipedia  214411  ...  A1Y3Z92RE62NPS                 1
8  Wikipedia  177439  ...  A29B522D0BX6HN                 0
9  Wikipedia  341534  ...  A28TXBSZPWMEU9                 0

[5 rows x 14 columns]


I0801 08:55:03.547953 140393886611328 saver.py:1499] Saver not created because there are no variables in the graph to restore
I0801 08:55:04.198145 140393886611328 run_classifier.py:774] Writing example 0 of 1961
I0801 08:55:04.200582 140393886611328 run_classifier.py:461] *** Example ***
I0801 08:55:04.205874 140393886611328 run_classifier.py:462] guid: None
I0801 08:55:04.210916 140393886611328 run_classifier.py:464] tokens: [CLS] Thanks . As an aside , since this did turn out to be fact ##ual , just very hard to source , do you think the community would count ##enan ##ce an un ##block request from B ##la ##ab ##la if he accepted some strict un ##block conditions ( such as packing in the ' systemic bias ' thing , discussing his edit ##s in a less confrontation ##al manner etc ) ? [SEP]


Using BERT from https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1
with vocab size=76 and do_lower_case=False.


I0801 08:55:04.214390 140393886611328 run_classifier.py:465] input_ids: 101 5749 119 1249 1126 4783 117 1290 1142 1225 1885 1149 1106 1129 1864 4746 117 1198 1304 1662 1106 2674 117 1202 1128 1341 1103 1661 1156 5099 25191 2093 1126 8362 27467 4566 1121 139 1742 6639 1742 1191 1119 3134 1199 9382 8362 27467 2975 113 1216 1112 16360 1107 1103 112 27410 15069 112 1645 117 10751 1117 14609 1116 1107 170 1750 14002 1348 4758 3576 114 136 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I0801 08:55:04.215842 140393886611328 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I0801 08:55:04.218999 140393886611328 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0