# These CSV files use the Excel dialect. Each row contains the information for a single data practice. The columns are:
1. (A) annotation ID (a globally unique identifier for a data practice)
2. (B) batch ID (name of a batch in the annotation tool; often indicates who the annotators were)
3. (C) annotator ID
4. (D) policy ID (this corresponds to the numeric prefixes in the policy filename, as found in other directories)
5. (E) segment ID (the zero-indexed, sequential identifier of the policy segment; e.g., the first segment in a policy's text is segment zero)
6. (F) category name
7. (G) attribute-value pairs (represented as JSON)
8. (H) policy URL
9. (I) date

In [None]:
import pandas as pd
import os

# Read the three CSV exports used in this notebook, in a fixed order.
filename = "all_prettyprints.csv"
op115_dataset, segment_dataset, pretty_df = (
    pd.read_csv(csv_name)
    for csv_name in ("opp115_allpols.csv", "pol_segments.csv", filename)
)
# Keep a 20-row window of the annotation table for quick inspection.
subdata = op115_dataset.iloc[10000:10020]

In [None]:
# Show the column names of each loaded table, separated by a blank line.
print(op115_dataset.columns)
for frame in (segment_dataset, pretty_df):
    print("\n", frame.columns)

Pretty Print Uniquified
=======================

The pretty_print_uniquified/ directory contains a derivative of the data in the pretty_print/ directory. Within each CSV file (policy), practices are consolidated into sets on the basis of equivalent pretty print representations. Each line represents one such set.

The CSV files in this directory contain the following columns:
1. (A) Pretty print for the privacy practices in the set
2. (B) Number of practices in the set
3. (C) A (Python-formatted) list of tuples, with each tuple representing a unique policy practice. Each tuple contains these members in order:
        (a) annotation ID
        (b) policy segment ID
        (c) annotator ID

# Pretty Print

The CSV files in this directory contain the following columns:
1. (A) annotation ID
2. (B) policy segment ID
3. (C) annotator ID
4. (D) pretty print string

# My BERT

In [None]:
# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
# And pandas for data import + sklearn because you always need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split

# BERT 
Trying to use this towardsdatascience tutorial:
https://towardsdatascience.com/multi-label-multi-class-text-classification-with-bert-transformer-and-keras-c6355eccb63a

In [None]:
#https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1 (BERT Tiny)

In [1]:
# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
# And pandas for data import + sklearn because you always need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
import os
# Name of the BERT model to use
#model_name = 'bert-base-uncased'

# Paths into the local "uncased_L-2_H-128_A-2" directory, resolved against the
# current working directory: the config JSON, the directory itself, and the two
# checkpoint files (index and data shard).
path_to_tiny_cofig = os.path.join(os.getcwd(),"uncased_L-2_H-128_A-2","bert_config.json")
paty_to_dir = os.path.join(os.getcwd(),"uncased_L-2_H-128_A-2")
path_to_weights = os.path.join(os.getcwd(),"uncased_L-2_H-128_A-2","bert_model.ckpt.index")
path_to_weights2 = os.path.join(os.getcwd(),"uncased_L-2_H-128_A-2","bert_model.ckpt.data-00000-of-00001")


# Max length of tokens
max_length = 100
# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(path_to_tiny_cofig)
config.output_hidden_states = False
# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = paty_to_dir, config = config)
# Load the Transformers BERT model
# NOTE(review): the recorded output of this cell is
#   TypeError: ('Keyword argument not understood:', 'from_tf')
# so this call fails as written. Presumably the raw TF checkpoint has to be
# converted to a transformers-compatible format first — TODO confirm against the
# transformers documentation before relying on this cell.
transformer_model = TFBertModel.from_pretrained(path_to_weights2, config = config, from_tf = True)

TypeError: ('Keyword argument not understood:', 'from_tf')

In [8]:
# Retry with the checkpoint index file instead of the data shard.
# NOTE(review): the recorded output shows the same TypeError for `from_tf`,
# so the keyword itself is the problem, not the file that is passed.
TFBertModel.from_pretrained(path_to_weights, config = config, from_tf=True)

TypeError: ('Keyword argument not understood:', 'from_tf')

In [None]:
# Directory (under the working directory) that holds the model files,
# plus the config file and tokenizer assets inside it.
path2 = "small_bert_bert_en_uncased_L-2_H-128_A-2_1"
path_to_dir2 = os.path.join(os.getcwd(), path2)
path_to_config = os.path.join(path_to_dir2, "bert_config.json")
tokpath = os.path.join(path_to_dir2, "assets")

# Upper bound on the number of tokens kept per example
max_length = 100

# Model configuration; intermediate hidden states are not requested
config = BertConfig.from_pretrained(path_to_config)
config.output_hidden_states = False

# Fast tokenizer, read from the assets directory
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=tokpath,
    config=config,
)

# BERT encoder, read from the model directory
transformer_model = TFBertModel.from_pretrained(path_to_dir2, config=config)



In [None]:
# Tokenize one sample sentence to inspect what the tokenizer returns.
sample_sentence = "hello I would like to talk to you about a little something"
tokenizer_options = dict(
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True,
)
x = tokenizer(text=sample_sentence, **tokenizer_options)

print(x)

In [None]:
# Take the first layer of the loaded Keras model; it is later called directly
# on the input dict when the classifier is assembled.
# NOTE: a later cell runs `import bert`, which rebinds this name to a module.
bert = transformer_model.layers[0]

In [None]:
# Build your model input
# One int32 tensor with `max_length` entries per example, wrapped in a dict
# keyed by the input's name.
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}

In [None]:
# Import data from csv
data = pd.read_csv('complaints.csv')
# Select required columns
data = data[['Consumer complaint narrative', 'Product', 'Issue']]
# Remove a row if any of the three remaining columns are missing
data = data.dropna()
# Remove rows, where the label is present only ones (can't be split)
data = data.groupby('Issue').filter(lambda x : len(x) > 1)
data = data.groupby('Product').filter(lambda x : len(x) > 1)
# Set your model output as categorical and save in new label col
data['Issue_label'] = pd.Categorical(data['Issue'])
data['Product_label'] = pd.Categorical(data['Product'])
# Transform your output to numeric
data['Issue'] = data['Issue_label'].cat.codes
data['Product'] = data['Product_label'].cat.codes
# Split into train and test - stratify over Issue
data, data_test = train_test_split(data, test_size = 0.2, stratify = data[['Issue']])

In [None]:
# Peek at the first two rows of the training split.
data[:2]

In [None]:
# Work on at most the first 5000 rows of the training split.
subdata = data[:5000]
# Size of the full training split; placed last so the notebook displays it.
len(data)

In [None]:
# Work on at most the first 10000 rows of the test split.
subdata_test = data_test[:10000]
# Size of the full test split; placed last so the notebook displays it.
len(data_test)

In [None]:
# Largest product class code that occurs in the test subset.
max(subdata_test["Product"])

In [None]:
# Largest product class code that occurs in the full test split.
max(data_test["Product"])

In [None]:
# These lines were bare text inside a code cell, which is a SyntaxError and
# stops "Run All". They are kept as data instead.
# Presumably these are example values of the `Issue` column — TODO confirm.
sample_issue_labels = [
    "Getting a loan",
    "Struggling to pay mortgage",
    "Incorrect information on your report",
    "Improper use of your report",
    "Problem caused by your funds being low",
]
sample_issue_labels

In [None]:
# Issue-label counts among the first five training rows (first 20 entries shown).
data[:5].Issue_label.value_counts()[:20]

In [None]:
# Load the Transformers BERT model as a layer in a Keras model
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)

In [None]:
# Then build your model output
issue = Dense(units=len(subdata.Issue_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='issue')(pooled_output)
product = Dense(units=len(subdata.Product_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='product')(pooled_output)
outputs = {'issue': issue, 'product': product}
# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
# Take a look at the model
model.summary()

In [None]:
# Set an optimizer
# NOTE(review): `decay` is kept from the original; newer Keras optimizers may
# reject this argument — confirm against the installed TensorFlow version.
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics, one entry per output head
loss = {'issue': CategoricalCrossentropy(from_logits = True), 'product': CategoricalCrossentropy(from_logits = True)}
metric = {'issue': CategoricalAccuracy('accuracy'), 'product': CategoricalAccuracy('accuracy')}

# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# Ready output data for the model.
# num_classes is pinned to the full category set: without it, to_categorical
# sizes the one-hot matrix from the largest code present in the subset, which
# can be narrower than the output heads (those are sized from value_counts() on
# the categorical column, i.e. one unit per category).
y_issue = to_categorical(
    subdata['Issue'],
    num_classes=len(subdata['Issue_label'].cat.categories))
y_product = to_categorical(
    subdata['Product'],
    num_classes=len(subdata['Product_label'].cat.categories))

# Tokenize the input (takes some time).
# padding='max_length' always yields exactly `max_length` columns, matching the
# fixed Input(shape=(max_length,)); padding=True only pads to the longest
# sequence in the batch.
x = tokenizer(
    text=subdata['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

# Fit the model
history = model.fit(
    x={'input_ids': x['input_ids']},
    y={'issue': y_issue, 'product': y_product},
    validation_split=0.2,
    batch_size=64,
    epochs=1)

In [None]:
# Width of one row of the one-hot encoded product targets.
len(y_product[:2][0])

In [None]:
#######################################
### ----- Evaluate the model ------ ###
# Ready test data.
# num_classes is pinned to the full category set so the one-hot width does not
# depend on which class codes happen to occur in the test subset.
test_y_issue = to_categorical(
    subdata_test['Issue'],
    num_classes=len(subdata_test['Issue_label'].cat.categories))
test_y_product = to_categorical(
    subdata_test['Product'],
    num_classes=len(subdata_test['Product_label'].cat.categories))
# padding='max_length' always yields exactly `max_length` columns, matching the
# fixed-size model input; padding=True only pads to the longest sequence.
test_x = tokenizer(
    text=subdata_test['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

# Run evaluation (only the first five test examples are scored here)
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids'][:5]},
    y={'issue': test_y_issue[:5], 'product': test_y_product[:5]}
)

# BERT 0.5

Trying to build a multi-label classification model, following this article: 
https://towardsdatascience.com/building-a-multi-label-text-classifier-using-bert-and-tensorflow-f188e0ecdc5d

In [None]:
import os
import collections
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

In [None]:
import bert
#from bert import bert_run_classifier
from bert import bert_tokenization
#from bert import bert_tokenization
#from bert import bert_modeling

In [None]:
# `bert_tokenization` exposes tokenizer classes rather than a module-level
# `tokenize` function, so a tokenizer object has to be created first.
# BasicTokenizer needs no vocabulary file, which is enough for a quick check.
bert_tokenization.BasicTokenizer(do_lower_case=True).tokenize("adf adfad ad")

In [None]:
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.modeling.models
import official.nlp.optimization

In [None]:
class InputExample:
    """Container for one BERT input: an id, one or two texts, and its labels.

    Adapted from:
    https://towardsdatascience.com/building-a-multi-label-text-classifier-using-bert-and-tensorflow-f188e0ecdc5d
    """

    def __init__(self, uid, text_a, text_b=None, labels=None):
        # Every constructor argument is stored unchanged on the instance.
        self.uid, self.text_a = uid, text_a
        self.text_b, self.labels = text_b, labels

In [None]:
def create_examples(df, labels_available = True, num_labels = 10):
    """Create BERT model inputs from the rows of a dataframe.

    Column 0 of each row is used as the example id, column 2 as the text and
    columns 3 onward as the labels; column 1 is not read.

    Args:
        df: dataframe whose ``.values`` rows follow the layout above.
        labels_available: when False, every example gets an all-zero
            placeholder label list instead of the row's label columns.
        num_labels: length of that placeholder list (defaults to 10, the
            value that used to be hard-coded).

    Returns:
        A list with one ``InputExample`` per row, in row order.

    Taken from this page:
    https://towardsdatascience.com/building-a-multi-label-text-classifier-using-bert-and-tensorflow-f188e0ecdc5d
    """
    examples = []
    for row in df.values:
        # A fresh placeholder list per example, so examples never share one list
        labels = row[3:] if labels_available else [0] * num_labels
        examples.append(InputExample(uid = row[0], text_a = row[2], labels = labels))

    return examples

In [None]:
# Positional split: the first 90% of `train` is used for training, the rest for
# validation. No shuffling happens here.
# NOTE(review): `train` is not defined in the cells visible here — make sure it
# is created before this cell runs.
TRAIN_VAL_RATIO = 0.9
LEN = len(train)
SIZE_TRAIN = int(LEN * TRAIN_VAL_RATIO)

x_train, x_val = train[:SIZE_TRAIN], train[SIZE_TRAIN:]

# Only the training part is converted to examples here.
train_examples = create_examples(x_train)

In [None]:
# FullTokenizer.tokenize is an instance method: calling it on the class binds
# the sentence to `self` and fails with a missing-argument TypeError. Build a
# tokenizer from a vocabulary file first.
# NOTE(review): assumes the local BERT-Tiny directory contains vocab.txt —
# adjust the path if the vocabulary lives elsewhere.
vocab_file = os.path.join(os.getcwd(), "uncased_L-2_H-128_A-2", "vocab.txt")
full_tokenizer = official.nlp.bert.tokenization.FullTokenizer(vocab_file, do_lower_case=True)
full_tokenizer.tokenize("this man is a god")

In [None]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np

tf.get_logger().setLevel('ERROR')

In [None]:
import tensorflow_text as text  # Registers the ops.
import tensorflow as tf
import tensorflow_hub as hub

# text_input = ["This is a sample sentence."]
# TF-Hub handles for the text preprocessing model and the BERT encoder
preprocess_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1"
encoder_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"  # bert_en_uncased_L-2_H-128_A-2/1

# Scalar string input: one raw sentence per example
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

# Raw text -> dict with keys: 'input_mask', 'input_type_ids', 'input_word_ids'
preprocessor = hub.KerasLayer(preprocess_handle)
encoder_inputs = preprocessor(text_input)

# Trainable encoder applied to the preprocessed inputs
encoder = hub.KerasLayer(encoder_handle, trainable=True)
outputs = encoder(encoder_inputs)

sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].
pooled_output = outputs["pooled_output"]      # [batch_size, 768].

In [None]:
import bert
FullTokenizer = bert.bert_tokenization.FullTokenizer
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow
import math

In [None]:
max_seq_length = 128  # Your choice here.

# Three int32 inputs of length max_seq_length, created in the order the BERT
# layer is called with them below.
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=input_name)
    for input_name in ("input_word_ids", "input_mask", "segment_ids")
)

# Trainable BERT layer from TF-Hub; it is unpacked into a pooled output and a
# per-token sequence output.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True,
)
bert_inputs = [input_word_ids, input_mask, segment_ids]
pooled_output, sequence_output = bert_layer(bert_inputs)

# Keras model exposing both BERT outputs
model = Model(inputs=bert_inputs, outputs=[pooled_output, sequence_output])


In [None]:
def get_masks(tokens, max_seq_length):
    """Build the padding mask: 1 per token, then 0 up to ``max_seq_length``.

    Raises:
        IndexError: if there are more tokens than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    n_padding = max_seq_length - n_tokens
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_tokens
    mask.extend([0] * n_padding)
    return mask


def get_segments(tokens, max_seq_length):
    """Build segment ids: 0 up to and including the first "[SEP]", 1 afterwards.

    The result is padded with 0 up to ``max_seq_length``.

    Raises:
        IndexError: if there are more tokens than ``max_seq_length``.
    """
    n_padding = max_seq_length - len(tokens)
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")

    segment_ids = []
    seen_separator = False
    for tok in tokens:
        # The separator itself still belongs to the segment it closes.
        segment_ids.append(1 if seen_separator else 0)
        if tok == "[SEP]":
            seen_separator = True
    return segment_ids + [0] * n_padding


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids and pad with 0 up to ``max_seq_length``.

    Args:
        tokens: list of token strings.
        tokenizer: object providing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of the returned list.

    Returns:
        A list of exactly ``max_seq_length`` ids.

    Raises:
        IndexError: if there are more ids than ``max_seq_length``. This is the
            same check get_masks and get_segments perform; without it an
            over-long list was returned silently, so the three inputs could
            end up with different lengths.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    input_ids = token_ids + [0] * (max_seq_length - len(token_ids))
    return input_ids

In [None]:
from transformers import AutoTokenizer
# The encoder used with this tokenizer is the *uncased* English BERT-base from
# TF-Hub, so the tokenizer must use the uncased BERT-base vocabulary; ids from a
# cased vocabulary would index the wrong embeddings.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [None]:
# Call the tokenizer on one sentence and print what it returns.
s = "This is a nice sentence."
tok = tokenizer(s)
print(tok)


In [None]:
# Ask TensorFlow for its GPU device name and stop unless it is the first GPU.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [None]:
s = "This is a nice sentence."
# tokenizer(s) returns an encoding object, which cannot be concatenated with a
# list; tokenizer.tokenize(s) returns the list of token strings needed here.
stokens = tokenizer.tokenize(s)
stokens = ["[CLS]"] + stokens + ["[SEP]"]

# The three model inputs, each padded to max_seq_length
input_ids = get_ids(stokens, tokenizer, max_seq_length)
input_masks = get_masks(stokens, max_seq_length)
input_segments = get_segments(stokens, max_seq_length)

# Feed a batch of one example as int32 arrays (one array per model input);
# nested Python lists are not reliably accepted by predict().
pool_embs, all_embs = model.predict([
    np.asarray([input_ids], dtype=np.int32),
    np.asarray([input_masks], dtype=np.int32),
    np.asarray([input_segments], dtype=np.int32),
])