<a href="https://colab.research.google.com/github/northpr/tensorflow-resources/blob/main/natural_language_processing/skimlit_pubmed_w_bert_glove.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## SkimLit with pretrained models (BERT and GloVe)

Source: [github](https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/extras/solutions/%F0%9F%9B%A0_09_Milestone_Project_2_SkimLit_%F0%9F%93%84%F0%9F%94%A5_Exercise_Solutions.ipynb)

# Download data and preprocess

In [1]:
!wget https://raw.githubusercontent.com/northpr/helper_function/master/tensorflow/helper_tensorflow.py

--2022-10-09 14:40:25--  https://raw.githubusercontent.com/northpr/helper_function/master/tensorflow/helper_tensorflow.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7159 (7.0K) [text/plain]
Saving to: ‘helper_tensorflow.py.1’


2022-10-09 14:40:25 (76.7 MB/s) - ‘helper_tensorflow.py.1’ saved [7159/7159]



In [2]:
import tensorflow as tf
from tensorflow.keras import layers

!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git
!ls pubmed-rct

fatal: destination path 'pubmed-rct' already exists and is not an empty directory.
PubMed_200k_RCT
PubMed_200k_RCT_numbers_replaced_with_at_sign
PubMed_20k_RCT
PubMed_20k_RCT_numbers_replaced_with_at_sign
README.md


In [3]:
import os

# Work with the 20k subset in which numbers are replaced by the "@" sign
data_dir = "pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

# Every entry of the dataset directory, prefixed with the directory path
filenames = [f"{data_dir}{entry}" for entry in os.listdir(data_dir)]
filenames

['pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt',
 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt',
 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt']

In [4]:
# Create function to read the lines of a document
def get_lines(filename):
  """Read a text file and return all of its lines.

  Args:
      filename: path of the text file to read.

  Returns:
      A list of strings, one per line, each keeping its trailing newline
      (when the file has one for that line).
  """
  with open(filename, mode="r") as text_file:
    lines = text_file.readlines()
  return lines

    # Creating a preprocessing function that returns a dictionary
def preprocess_text_with_line_numbers(filename):
  """Returns a list of dictionaries of abstract line data.

  Reads the file line by line. A line starting with "###" (abstract ID) or a
  whitespace-only line closes the abstract collected so far; every other line
  is treated as a tab-separated "label, sentence" pair of the current abstract.
  An abstract that is still open at the end of the file is emitted as well.

  Args:
      filename: a string of the target text file to read and extract line data
      from.

  Returns:
      A list of dictionaries (in file order), each containing a line from an
      abstract, the line's label, the line's zero-based position in the
      abstract and the zero-based index of the abstract's last line
      (i.e. number of lines minus one). For example:

      [{"target": "CONCLUSION",
        "text": "the study couldn't have gone better, turns out people are kinder than you think",
        "line_number": 8,
        "total_lines": 8}]

  Raises:
      IndexError: if a labelled line does not contain a tab character.
  """
  abstract_samples = [] # one dict per labelled line, across all abstracts
  pending_lines = [] # raw labelled lines of the abstract currently being read

  def _flush_pending():
    """Turn the buffered abstract into line dicts, then empty the buffer."""
    abstract_line_split = "".join(pending_lines).splitlines()
    last_line_index = len(abstract_line_split) - 1 # counting starts from 0
    for abstract_line_number, abstract_line in enumerate(abstract_line_split):
      target_text_split = abstract_line.split("\t") # split target label from text
      abstract_samples.append({
          "target": target_text_split[0],
          "text": target_text_split[1].lower(),
          "line_number": abstract_line_number,
          "total_lines": last_line_index,
      })
    # Emptying the buffer here means a second blank line (or the final flush
    # below) can never emit the same abstract twice.
    pending_lines.clear()

  # Iterate over the file handle directly so the whole file is never held in memory
  with open(filename, "r") as f:
    for line in f:
      if line.startswith("###") or line.isspace(): # ID line or end of abstract
        _flush_pending()
      else: # the line contains a labelled sentence
        pending_lines.append(line)

  # The file may end without a trailing blank line; keep that last abstract too
  _flush_pending()

  return abstract_samples

In [5]:
# Get data from file and preprocess it
%%time
train_samples = preprocess_text_with_line_numbers(data_dir + "train.txt")
val_samples = preprocess_text_with_line_numbers(data_dir + "dev.txt") 
test_samples = preprocess_text_with_line_numbers(data_dir + "test.txt")

len(train_samples), len(val_samples), len(test_samples)

CPU times: user 679 ms, sys: 104 ms, total: 783 ms
Wall time: 1e+03 ms


(180040, 30212, 30135)

In [6]:
import pandas as pd

# Build one DataFrame per split from the lists of per-line dictionaries
train_df, val_df, test_df = [
    pd.DataFrame(split_samples)
    for split_samples in (train_samples, val_samples, test_samples)
]
train_df.head(n=14)

Unnamed: 0,target,text,line_number,total_lines
0,OBJECTIVE,to investigate the efficacy of @ weeks of dail...,0,11
1,METHODS,a total of @ patients with primary knee oa wer...,1,11
2,METHODS,outcome measures included pain reduction and i...,2,11
3,METHODS,pain was assessed using the visual analog pain...,3,11
4,METHODS,secondary outcome measures included the wester...,4,11
5,METHODS,"serum levels of interleukin @ ( il-@ ) , il-@ ...",5,11
6,RESULTS,there was a clinically relevant reduction in t...,6,11
7,RESULTS,the mean difference between treatment arms ( @...,7,11
8,RESULTS,"further , there was a clinically relevant redu...",8,11
9,RESULTS,these differences remained significant at @ we...,9,11


In [7]:
# Pull the sentence text of every split out of its DataFrame as a plain Python list
train_sentences = train_df["text"].to_list()
val_sentences = val_df["text"].to_list()
test_sentences = test_df["text"].to_list()
tuple(len(split) for split in (train_sentences, val_sentences, test_sentences))

(180040, 30212, 30135)

## OneHotEncoder

In [8]:

# One hot encoding the labels
from sklearn.preprocessing import OneHotEncoder

# scikit-learn renamed the `sparse` argument to `sparse_output` (1.2) and later
# removed the old name, so try the new keyword first and fall back for older installs.
try:
  one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
  one_hot_encoder = OneHotEncoder(sparse=False)

# Fit on the training labels only, then reuse the same mapping for val/test
train_labels_one_hot = one_hot_encoder.fit_transform(train_df["target"].to_numpy().reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.transform(val_df["target"].to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_df["target"].to_numpy().reshape(-1, 1))

# Check what training labels look like
train_labels_one_hot

array([[0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

## LabelEncoder

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers.
# The label -> integer mapping is fitted on the training split and reused for val/test.
label_encoder = LabelEncoder()

train_labels_encoded = label_encoder.fit_transform(
    train_df["target"].to_numpy())
val_labels_encoded = label_encoder.transform(
    val_df["target"].to_numpy())
test_labels_encoded = label_encoder.transform(
    test_df["target"].to_numpy())

# Inspect the integer-encoded training labels
train_labels_encoded

array([3, 2, 2, ..., 4, 1, 1])

In [10]:
# The fitted LabelEncoder exposes the label strings it saw; the class count follows from them
class_names = label_encoder.classes_
num_classes = len(class_names)
num_classes, class_names

(5, array(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'],
       dtype=object))

# Using GloVe embeddings

In [None]:
# Loading the pre-trained embeddings 
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2022-10-09 14:40:31--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-10-09 14:40:31--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-10-09 14:40:31--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2

In [None]:
# Getting the path of the glove embedding (using 100D)
import numpy as np
glove_path = 'glove.6B.100d.txt'

# Maps each word to its pretrained vector (word --> 1-D float32 array)
embedding_index = {}

# Open with an explicit encoding so parsing does not depend on the platform's
# default locale (the vocabulary contains non-ASCII tokens).
with open(glove_path, encoding="utf-8") as f:
  for line in f:

    # First whitespace-separated field is the word, the rest are its coefficients
    word , coefs = line.split(maxsplit = 1)
    coefs = np.fromstring(coefs , 'f' , sep = ' ')

    # Adding the coefs to our embedding dict
    embedding_index[word] = coefs

print(f'Found {len(embedding_index)} word vectors')

In [None]:
# Make function to split sentence into characters
def split_chars(text):
  """Return `text` with a single space inserted between consecutive characters."""
  separator = " "
  return separator.join(text)

# Turn every sentence of the train/validation splits into its space-separated character form
train_chars = list(map(split_chars, train_sentences))
val_chars = list(map(split_chars, val_sentences))

In [None]:
from tensorflow.keras.layers import TextVectorization

# Word-level vectorizer: vocabulary capped at 68k tokens (the vocab size from the
# paper itself) and every sentence mapped to a fixed length of 56 token ids.
text_vectorizer = TextVectorization(
    output_sequence_length=56,
    max_tokens=68000,
)

# Learn the vocabulary from the training sentences
text_vectorizer.adapt(train_sentences)

In [None]:
# Vocabulary learned by the vectorizer; show its size and its first 20 tokens
text_vocab = text_vectorizer.get_vocabulary()
len(text_vocab), text_vocab[:20]

### For the GloVe embedding

In [None]:
# Getting the dict mapping word --> index (position of the word in the vocabulary list)
word_index_text = dict(zip(text_vocab , range(len(text_vocab))))

# Preview a handful of entries instead of echoing the whole vocabulary-sized dict
list(word_index_text.items())[:10]

In [None]:
# Creating a function that will give us an embedding matrix
def get_glove_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings=None):
  """Build a matrix whose row `i` holds the pretrained vector of the word with index `i`.

  Args:
      num_tokens: number of rows of the matrix; must be larger than every
        index in `word_index`.
      embedding_dim: number of columns, i.e. the length of each pretrained vector.
      word_index: dict mapping word -> row index.
      embeddings: optional dict mapping word -> vector. When omitted, the
        notebook-level `embedding_index` is used (the original behaviour).

  Returns:
      A tuple `(embedding_matrix, hits, misses)`: the `(num_tokens, embedding_dim)`
      array, the number of words found in the lookup, and the number not found.
      Rows of words that are not found stay all-zero.
  """
  lookup = embedding_index if embeddings is None else embeddings

  # Defining the hits and misses here
  hits, misses = 0, 0

  # Prepare the embedding matrix
  embedding_matrix = np.zeros((num_tokens, embedding_dim))
  for word, i in word_index.items():
    embedding_vector = lookup.get(word)
    if embedding_vector is None:
      misses += 1 # no pretrained vector: keep the zero row
      continue
    embedding_matrix[i] = embedding_vector
    hits += 1

  return embedding_matrix, hits, misses

In [None]:
embedding_dim = 100 # matches the 100-dimensional GloVe file loaded above
# NOTE(review): two extra rows are reserved on top of the vocabulary size; the
# vocabulary list may already contain the padding/OOV tokens — confirm they are needed.
num_tokens_text = len(text_vocab) + 2

sentence_embedding_matrix, hits_, misses_ = get_glove_embedding_matrix(
    num_tokens_text,
    embedding_dim,
    word_index_text,
)

print(f'Hits: {hits_} and Misses: {misses_} for the sentence embedding matrix')


In [None]:
from tensorflow.keras.layers import Embedding

# Frozen embedding layer for sentence tokens, initialised from the GloVe matrix
sen_embedding_layer = Embedding(
    input_dim=num_tokens_text,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(sentence_embedding_matrix),
    trainable=False,
    name="glove_embedding",
)

In [None]:
# Checking the sentence before and after all the vectorization and embedding
import random

# randrange excludes its upper bound; randint(0, n) includes n and could
# therefore index one past the end of the list.
ran_num = random.randrange(len(train_sentences))
random_sentence = train_sentences[ran_num]

# Run each layer once and reuse the result for both the value and the shape printout
vectorized_sentence = text_vectorizer(random_sentence)
embedded_sentence = sen_embedding_layer(vectorized_sentence)

print(f"Example of train sentence:\n{random_sentence}")

print("=======\n")
print(f"Text vectorization:\n{vectorized_sentence}")
print(f"Text vectorization shape: {vectorized_sentence.shape}")

print("=======\n")
print(f"Embedded Text:\n{embedded_sentence}\n")
print(f"Embedded text shape: {embedded_sentence.shape}")


In [None]:
# Pre-vectorize the train/validation sentences into integer token-id arrays.
# The sentences are passed as a column (one sentence per row).
train_sen_vectors = text_vectorizer(np.array(train_sentences).reshape(-1, 1)).numpy()
val_sen_vectors = text_vectorizer(np.array(val_sentences).reshape(-1, 1)).numpy()

# Pair token ids with the integer-encoded labels, then batch and prefetch
# (performance optimization). These datasets feed the models that take token ids as input.
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((train_sen_vectors, train_labels_encoded))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds = (
    tf.data.Dataset
    .from_tensor_slices((val_sen_vectors, val_labels_encoded))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

train_ds, val_ds

In [None]:
# Shape of the whole vectorized training array
np.shape(train_sen_vectors)

In [None]:
# Shape of a single vectorized sentence
np.shape(train_sen_vectors[0])

### Model 0: Glove embedding test with sentences

In [None]:
# Raw-string datasets: sentences paired with one-hot labels, batched and prefetched
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_sentences, train_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_sentences, val_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Sentences and one-hot labels should have the same number of entries
tuple(map(len, (train_sentences, train_labels_one_hot)))

In [None]:
tf.random.set_seed(123)

# Model 0 takes raw strings: vectorization and the GloVe embedding happen inside the model
inputs = layers.Input(shape=(1,), dtype="string")
text_vector = text_vectorizer(inputs)
glove_emb = sen_embedding_layer(text_vector)

# Three Conv1D stages with average pooling in between, then a global average
x = layers.Conv1D(filters=128, kernel_size=5, activation="relu", padding="same")(glove_emb)
x = layers.AveragePooling1D(pool_size=5, padding="same")(x)
x = layers.Conv1D(filters=64, kernel_size=5, activation="relu", padding="same")(x)
x = layers.AveragePooling1D(pool_size=5, padding="same")(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu", padding="same")(x)
x = layers.GlobalAveragePooling1D()(x)

# Classification head: one probability per class
x = layers.Dense(units=128, activation="relu")(x)
x = layers.Dropout(rate=0.5)(x)
outputs = layers.Dense(units=len(class_names), activation="softmax")(x)

model_0 = tf.keras.Model(inputs=inputs, outputs=outputs, name="conv1d_global_average_pooling")
model_0.summary()

In [None]:
# Labels are one-hot vectors here, hence CategoricalCrossentropy
model_0.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=["accuracy"],
)

model_0.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=5,
)

In [None]:
# Evaluate model 0 on the validation dataset
evaluate_0 = model_0.evaluate(x=valid_dataset)

### Model 1: Glove embedding with GlobalAveragePooling

In [None]:
tf.random.set_seed(123)

# Model 1 takes token ids directly: the data is already `int64` because it was vectorized beforehand
inputs = layers.Input(shape=(None,), dtype="int64")
glove_emb = sen_embedding_layer(inputs)

# Three Conv1D stages with average pooling in between, then a global average
x = layers.Conv1D(filters=128, kernel_size=5, activation="relu", padding="same")(glove_emb)
x = layers.AveragePooling1D(pool_size=5, padding="same")(x)
x = layers.Conv1D(filters=64, kernel_size=5, activation="relu", padding="same")(x)
x = layers.AveragePooling1D(pool_size=5, padding="same")(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu", padding="same")(x)
x = layers.GlobalAveragePooling1D()(x)

# Classification head: one probability per class
x = layers.Dense(units=128, activation="relu")(x)
x = layers.Dropout(rate=0.5)(x)
outputs = layers.Dense(units=len(class_names), activation="softmax")(x)

model_1 = tf.keras.Model(inputs=inputs, outputs=outputs, name="conv1d_global_average_pooling")
model_1.summary()

In [None]:
# Labels are integer class ids here, hence SparseCategoricalCrossentropy
model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

model_1.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
)

In [None]:
# Evaluate model 1 on the validation dataset
evaluate_1 = model_1.evaluate(x=val_ds)

### Model 2: Glove embedding with MaxPooling

In [None]:
tf.random.set_seed(123)

# Model 2 takes token ids directly: the data is already `int64` because it was vectorized beforehand
inputs = layers.Input(shape= (None,), dtype="int64")
glove_emb = sen_embedding_layer(inputs)

# Three Conv1D stages with max pooling in between, then a global max
x = layers.Conv1D(128, 5, activation="relu", padding="same")(glove_emb)
x = layers.MaxPooling1D(5, padding="same")(x)
x = layers.Conv1D(64, 5, activation="relu", padding="same")(x)
x = layers.MaxPooling1D(5, padding="same")(x)
x = layers.Conv1D(32, 5, activation="relu", padding="same")(x)
x = layers.GlobalMaxPooling1D()(x)

# Classification head: one probability per class
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(len(class_names), activation="softmax")(x)

# The name now reflects the max-pooling architecture (it previously said "global_average_pooling")
model_2 = tf.keras.Model(inputs, outputs, name="conv1d_global_max_pooling")
model_2.summary()

In [None]:
# Labels are integer class ids here, hence SparseCategoricalCrossentropy
model_2.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

model_2.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
)

In [None]:
# Evaluate model 2 on the validation dataset
evaluate_2 = model_2.evaluate(x=val_ds)