# BERT & TF

In [19]:
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
import pickle as pk
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from collections import defaultdict
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split

# Params for bert model and tokenization
# TF-Hub handle of an uncased BERT module (12 layers, hidden size 768, 12 attention heads, per the handle name).
# NOTE(review): bert_path is not referenced by any cell visible in this part of the notebook — confirm it is still needed.
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

### Load Data - Latest Data Split - Blind

In [4]:
# Load the pre-split train and test article tables.
# The paths are relative, so the notebook must be run from the directory that contains Data/.
data_train = pd.read_csv("Data/latest/articles_train.csv")
data_test = pd.read_csv("Data/latest/articles_test.csv")

In [7]:
# Preview the first rows to check which columns were loaded.
data_train.head()

Unnamed: 0,content_id,month,day,year,date,content_source_desc,content_title_clean,content_body_clean,blind_mean_rating,blind_rating_count,blind_ratings
0,2932,11,2,2017,2017-11-02,The New York Times,A Tax Cut That Lifts the Economy? Opinions Are...,Yet if the House plan resolves some longstandi...,3.177778,45,"[4.5, 1.5, 0.5, 4.5, 1.0, 4.0, 3.5, 3.5, 1.5, ..."
1,2870,11,1,2017,2017-11-01,Fox News,"Tom Tancredo enters Colorado governor's race, ...",Former U.S. Rep. Tom Tancredo announced Tuesda...,2.375,16,"[3.5, 4.0, 3.0, 2.5, 0.5, 3.0, 0.5, 0.5, 4.5, ..."
2,2869,11,1,2017,2017-11-01,The New York Times,Panel Recommends Opioid Solutions but Puts No ...,President Trump’s bipartisan commission on th...,3.916667,6,"[5.0, 4.0, 4.5, 3.5, 2.0, 4.5]"
3,2864,11,1,2017,2017-11-01,Fox News,"Trump vows to end non merit-base immigration, ...",President Trump vowed Wednesday to scrap the f...,2.1,5,"[2.0, 2.5, 4.0, 0.5, 1.5]"
4,2868,11,1,2017,2017-11-01,Breitbart,WATCH: Congress Holds Hearing on Banning Abort...,Congress will hold a hearing Wednesday on a bi...,2.428571,21,"[2.0, 1.0, 0.5, 1.0, 3.5, 5.0, 2.0, 0.5, 2.5, ..."


In [13]:
#clean data
def clean_data(text):
    """Normalize one article body into a space-joined string of lemmas.

    Steps, in order: lowercase; remove bracketed spans, ASCII punctuation,
    every word that contains a digit, and curly quotes / ellipsis characters;
    tokenize; drop English stopwords; POS-tag the remaining tokens and
    lemmatize each one using the WordNet part of speech matching its tag.

    Args:
        text: raw article text. Non-string values (for example NaN, which
            pandas uses for a missing body) are treated as empty text.

    Returns:
        The cleaned text as a single string; "" when nothing survives.
    """
    # A missing body would otherwise fail on .lower(); treat it as empty text.
    if not isinstance(text, str):
        return ""

    #remove punctuation, digits, extra stuff. make lowercase
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \w, \d) from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)

    #lemma it - include POS tag in order to lemma it better
    # Map the first letter of the POS tag to a WordNet category; anything
    # that is not an adjective, verb or adverb is lemmatized as a noun.
    tag_map = defaultdict(lambda : wordnet.NOUN)
    tag_map['J'] = wordnet.ADJ
    tag_map['V'] = wordnet.VERB
    tag_map['R'] = wordnet.ADV
    lemmatizer = WordNetLemmatizer()
    textTokens = word_tokenize(text)

    #remove stopwords
    # Build the stopword set once per call instead of re-reading the stopword
    # list for every token; set membership is also a constant-time lookup.
    english_stopwords = set(stopwords.words('english'))
    word_tokens_nostop = [w for w in textTokens if w not in english_stopwords]

    #now lemma
    lemmas = [lemmatizer.lemmatize(tok, tag_map[tag[0]]) for tok, tag in pos_tag(word_tokens_nostop)]
    return " ".join(lemmas)

### Get clean body

In [14]:
# Clean every training article body with clean_data (one call per row).
clean_body_train = data_train["content_body_clean"].apply(clean_data)
# Regression target: the mean blind rating of each article.
y_train = data_train['blind_mean_rating']

In [18]:
# Apply the same cleaning to the held-out test articles.
clean_body_test = data_test["content_body_clean"].apply(clean_data)
# Regression target for the test split.
y_test = data_test['blind_mean_rating']

In [16]:
# Upper bound on the word ids the tokenizer will emit; also used as the
# Embedding layer's input_dim further down.
VOCAB_SIZE = 29000

# Build the word index from the cleaned training bodies.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(clean_body_train)

# Note: the tokenizer's word_index does not respect VOCAB_SIZE (it holds
# every word seen), but the limit is applied by later methods
# (for example, when you call texts_to_sequences).
# Also note that '0' is a reserved index for padding.
print("Word index", len(tokenizer.word_index))

Word index 29377


### Split train and val

In [22]:
# Hold out 15% of the cleaned training articles for validation; random_state makes the split repeatable.
# NOTE(review): the tokenizer was fit on all of clean_body_train before this split, so the
# validation articles contributed to the word index.
X_train, X_val, y_train_t, y_val = train_test_split(clean_body_train, y_train, test_size=0.15, shuffle=True, random_state=3)

In [23]:
# Use the texts_to_sequences utility to turn the training, validation
# and test article bodies into lists of integer word ids.
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_val = tokenizer.texts_to_sequences(X_val)
sequences_test = tokenizer.texts_to_sequences(clean_body_test)

### Choose max sequence length

In [24]:
# Number of word ids in each vectorized training article.
train_word_lengths = [len(seq) for seq in sequences_train]
words_length = np.array(train_word_lengths)

# Report the upper percentiles of article length to guide the choice of
# the maximum sequence length.
for pct in (90, 95, 99, 100):
    print("# of Words in the {} percentile:".format(pct), np.percentile(words_length, pct))

# of Words in the 90 percentile: 764.6
# of Words in the 95 percentile: 1100.7999999999997
# of Words in the 99 percentile: 1879.800000000003
# of Words in the 100 percentile: 9317.0


In [25]:
# Fixed length every sequence is padded or truncated to. 3000 is above the
# 99th-percentile training length printed above but below the longest article.
MAX_SEQ_LEN = 3000

### Pad

In [26]:
# Bring every sequence to exactly MAX_SEQ_LEN ids. pad_sequences is called with its
# defaults, which pad with 0 and truncate at the front of the sequence.
padded_train = tf.keras.preprocessing.sequence.pad_sequences(sequences_train, maxlen=MAX_SEQ_LEN)
padded_val = tf.keras.preprocessing.sequence.pad_sequences(sequences_val, maxlen=MAX_SEQ_LEN)
padded_test = tf.keras.preprocessing.sequence.pad_sequences(sequences_test, maxlen=MAX_SEQ_LEN)

In [27]:
# Inspect one padded example.
padded_train[0]

array([   0,    0,    0, ...,   20, 3553,  415])

### tf.dataset and batch

In [28]:
# Number of examples per batch produced by create_dataset.
BATCH_SIZE = 32
# Size of the shuffle buffer used by create_dataset.
BUFFER_SIZE = 1000

In [30]:
# This method will create a dataset that yields two elements per step:
# - a batch of padded body
# - a batch of ratings
def create_dataset(body, ratings, shuffle=True):
    """Build a batched, prefetching tf.data pipeline of (body, rating) pairs.

    Args:
        body: padded integer sequences, one row per article.
        ratings: target ratings aligned row-for-row with ``body``.
        shuffle: when True (the default, matching the previous behavior)
            examples are shuffled with a buffer of BUFFER_SIZE before
            batching. Pass False for validation/test data where a stable
            order is preferable.

    Returns:
        A tf.data.Dataset yielding (body_batch, ratings_batch) tuples of up
        to BATCH_SIZE examples each.
    """
    dataset = tf.data.Dataset.from_tensor_slices((body, ratings))
    # Shuffle (optionally) and batch
    if shuffle:
        dataset = dataset.shuffle(BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE)
    # Let tf.data choose how many batches to prepare ahead of the consumer.
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

In [31]:
# Wrap each split in a batched tf.data pipeline.
# NOTE(review): create_dataset is called the same way for all three splits, so the
# validation and test sets are shuffled as well as the training set.
train_ds = create_dataset(padded_train, y_train_t)
val_ds = create_dataset(padded_val, y_val)
test_ds = create_dataset(padded_test, y_test)

### Initial LSTM model

In [32]:
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input, LSTM
from tensorflow.keras.models import Model, Sequential

In [36]:
def relu_advanced(x):
    """ReLU activation whose output is capped at 5.

    Used as the activation of the model's final Dense layer.
    """
    capped = K.relu(x, max_value=5)
    return capped

In [46]:
# Input: one padded sequence of MAX_SEQ_LEN word ids per article.
body_input = Input(shape=(MAX_SEQ_LEN,), dtype='int32')
# Learn a 64-dimensional embedding per word id, then summarize the whole
# sequence with the final state of a 64-unit LSTM.
embedded_body = Embedding(input_dim=VOCAB_SIZE, output_dim=64, input_length=MAX_SEQ_LEN)(body_input)
encoded_body = LSTM(64)(embedded_body)

# Concatenate other features (disabled placeholder; it refers to names that
# are not defined in this notebook)
#merged = tf.keras.layers.concatenate([encoded_image, encoded_question])

# dense layers
dense1 = Dense(128, activation="relu")(encoded_body)
dense2 = Dense(32, activation="relu")(dense1)

# Regression head: a single unit whose output is clamped to [0, 5] by relu_advanced
output = Dense(1, activation=relu_advanced)(dense2)

# The final model maps padded article bodies to a predicted rating
model1 = Model(inputs=body_input, outputs=output)

In [47]:
# Print the layer-by-layer architecture and parameter counts.
model1.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 3000)]            0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 3000, 64)          1856000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_7 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_8 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 33        
Total params: 1,901,505
Trainable params: 1,901,505
Non-trainable params: 0
_________________________________________________

In [50]:
# Train with Adam on mean squared error between predicted and actual ratings.
# NOTE(review): the target (blind_mean_rating) is continuous, so 'accuracy' is not a
# meaningful metric here; an error metric such as mean absolute error would be more
# informative. The training loop reads this metric as result[1], so change both together.
model1.compile(optimizer='adam', 
              loss='mean_squared_error',
              metrics=['accuracy'])

In [51]:
# Utilities to help us record metrics.
# Each tracker is a running mean: it is fed one value per batch, read once
# per epoch with .result(), then reset by the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.Mean(name='val_accuracy')

In [48]:
# code to evaluate on validation set
def evaluate(max_steps=None):
    """Accumulate validation loss and metric into val_loss / val_accuracy.

    Iterates over val_ds and, for each batch, asks the compiled model for its
    loss and first metric, feeding them to the running-mean trackers. The
    caller reads the trackers with .result() and resets them.

    Args:
        max_steps: maximum number of validation batches to evaluate;
            None (the default) evaluates every batch in val_ds.
    """
    for steps, (body_batch, y_batch) in enumerate(val_ds):
        if max_steps is not None and steps >= max_steps:
            break
        # test_on_batch returns [loss, metric] for the compiled model, the
        # same layout train_on_batch uses in the training loop. Passing labels
        # and predictions straight to a Mean tracker would instead treat the
        # predictions as sample weights and average the labels.
        result = model1.test_on_batch(x=body_batch, y=y_batch)
        # Record metrics after each batch
        val_loss(result[0])
        val_accuracy(result[1])

### Train Model

In [44]:
# Make sure the folder that receives the per-epoch weight checkpoints exists.
checkpoint_dir="checkpoints/"
if not os.path.exists(checkpoint_dir):
    print("Creating a checkpoints directory")
    os.makedirs(checkpoint_dir)
else:
    print("Checkpoints folder already exists")

import time

Checkpoints folder already exists


In [None]:
# Per-epoch history, one plain float per epoch, consumed by plot().
train_loss_history, train_acc_history = [], []
val_loss_history, val_acc_history = [], []

# Number of passes over the training set.
epochs = 31

# Training loop
for epoch in range(epochs):

    start = time.time()

    # Train for one epoch
    for body_batch, y_batch in train_ds:
        # train_on_batch returns [loss, metric] for the compiled model.
        result = model1.train_on_batch(x=body_batch, y=y_batch)

        # Record metrics after each batch, so the epoch value is the mean
        # over all batches rather than just the last batch's result.
        train_loss(result[0])
        train_accuracy(result[1])

    # Evaluate for a few steps
    evaluate(max_steps=100)

    # Read the running means once, as plain Python floats, so the printed
    # values and the stored history are guaranteed to agree.
    epoch_train_loss = float(train_loss.result())
    epoch_train_acc = float(train_accuracy.result()) * 100
    epoch_val_loss = float(val_loss.result())
    epoch_val_acc = float(val_accuracy.result()) * 100

    # Print progress
    template = 'Epoch {}, Loss: {:.2f}, Accuracy: {:.2f}, Val Loss {:.2f}, Val Accuracy {:.2f}, Time: {:.1f} secs'
    print(template.format(epoch,
                        epoch_train_loss,
                        epoch_train_acc,
                        epoch_val_loss,
                        epoch_val_acc,
                        time.time() - start))

    # Record history
    train_loss_history.append(epoch_train_loss)
    train_acc_history.append(epoch_train_acc)
    val_loss_history.append(epoch_val_loss)
    val_acc_history.append(epoch_val_acc)

    # Reset the metrics for the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    val_loss.reset_states()
    val_accuracy.reset_states()

    # Save a checkpoint after each epoch
    cpNum="cp-epoch-"+str(epoch)+".ckpt"
    checkpoint_path = os.path.join(checkpoint_dir, cpNum)
    print("Saving weights")
    model1.save_weights(checkpoint_path)

In [None]:
def _plot_train_val_curves(train_values, val_values, title, ylabel):
    """Draw one train-vs-validation line chart on its own new figure."""
    # A dedicated figure per chart keeps the curves from being drawn onto
    # whatever pyplot figure happened to be current.
    fig, ax = plt.subplots()
    epochs = range(len(train_values))
    ax.plot(epochs, train_values, color='blue', label='Train')
    ax.plot(epochs, val_values, color='orange', label='Val')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()
    return fig


def plot(train_loss_h, train_acc_h, val_loss_h, val_acc_h):
    """Plot per-epoch accuracy and loss curves for training and validation.

    Produces two figures: accuracy first, then loss. Requires
    matplotlib.pyplot to be available as ``plt`` in the notebook.

    Args:
        train_loss_h: training loss, one value per epoch.
        train_acc_h: training accuracy, one value per epoch.
        val_loss_h: validation loss, one value per epoch.
        val_acc_h: validation accuracy, one value per epoch.
    """
    _plot_train_val_curves(train_acc_h, val_acc_h,
                           'Training and validation accuracy', 'Accuracy')
    _plot_train_val_curves(train_loss_h, val_loss_h,
                           'Training and validation loss', 'Loss')