Train finetuned Image model, code by Ruben van Heusden

https://github.com/RubenvanHeusden

In [None]:
"""
"""

import os
import argparse
import tensorflow
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from metricutils import *

# Diagnostic only: report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tensorflow.config.list_physical_devices('GPU')))

# https://stackoverflow.com/questions/30811918/saving-dictionary-of-numpy-arrays


class ImageModelWiedemann:
    """Binary page classifier built on top of an ImageNet-pretrained VGG16.

    After construction the instance exposes:

    * ``model`` -- the compiled end-to-end classifier ending in a single
      sigmoid unit.
    * ``intermediate_activation`` -- a second model over the same layers
      whose output is the first dense layer of the head (taken before its
      LeakyReLU).
    """

    def __init__(self, learning_rate=0.00001):
        # VGG16 without its fully connected top, for 300x300 RGB inputs.
        backbone = VGG16(weights='imagenet', include_top=False,
                         input_shape=(300, 300, 3))

        # Keep the first 13 layers fixed; the remaining VGG16 layers stay
        # trainable so they can adapt to the dataset.
        for frozen_layer in backbone.layers[:13]:
            frozen_layer.trainable = False

        # Classification head: flatten -> dropout -> 512 -> dropout -> 256.
        features = Flatten()(backbone.output)
        features = Dropout(0.5)(features)
        first_dense = Dense(512)(features)
        hidden = LeakyReLU()(first_dense)
        hidden = Dropout(0.5)(hidden)
        hidden = Dense(256)(hidden)
        hidden = LeakyReLU()(hidden)

        # Single sigmoid unit producing the prediction score.
        score = Dense(1, activation="sigmoid")(hidden)

        classifier = Model(backbone.input, score)
        # Optimisation set-up as described in the original Wiedemann paper.
        classifier.compile(loss='binary_crossentropy',
                           optimizer=Nadam(learning_rate=learning_rate),
                           metrics=['AUC'])

        self.intermediate_activation = Model(backbone.input, first_dense)
        self.intermediate_activation.compile()

        self.model = classifier

    def train(self, train_data, num_epochs=20):
        """Fit the classifier on ``train_data`` for ``num_epochs`` epochs."""
        self.model.fit(train_data, epochs=num_epochs)

    def predict(self, test_data):
        """Return the classifier's output for ``test_data`` (verbose mode)."""
        return self.model.predict(test_data, verbose=True)

    def store_vectors(self):
        """Placeholder; currently does nothing."""
        pass


def prepare_df_for_model(dataframe):
    """Add the image filename column and stringify the labels, in place.

    A ``png`` column of the form ``<name>-<page>.png`` is added and the
    ``label`` column is converted to strings. The same (mutated) dataframe
    object is returned.
    """
    page_suffix = dataframe.page.astype(str) + '.png'
    dataframe['png'] = dataframe.name + '-' + page_suffix
    dataframe['label'] = dataframe['label'].astype(str)
    return dataframe


def prepare_test_streams(test_subdataframe, png_folder,
                         batch_size):
    """Build an unshuffled, label-free image iterator for one stream.

    Images named in the ``png`` column are read from ``png_folder``,
    resized to 300x300 and passed through the VGG16 ``preprocess_input``.
    ``class_mode=None`` means the iterator yields images only.
    """
    image_generator = ImageDataGenerator(
        preprocessing_function=preprocess_input)

    flow_options = dict(
        dataframe=test_subdataframe,
        directory=png_folder,
        x_col='png',
        y_col='label',
        target_size=(300, 300),
        class_mode=None,
        batch_size=batch_size,
        # Order must be preserved so outputs line up with the pages.
        shuffle=False,
        seed=42,
        validate_filenames=True,
    )
    return image_generator.flow_from_dataframe(**flow_options)


def main(args):
    """Train (or load) the image model, predict every test stream and save results.

    :param args: parsed command-line namespace providing ``train_dataframe``,
        ``test_dataframe``, ``train_png_folder``, ``test_png_folder``,
        ``learning_rate``, ``batch_size``, ``num_epochs``, ``save_path`` and
        ``from_trained``.

    Results are written to a directory named
    ``text_trained_on_<train corpus>_tested_on_<test corpus>_IMAGE_CNN``:
    ``vectors.npy``, ``predictions.json``, ``gold_standard.json`` and
    ``raw_scores.json``. Finally ``evaluation_report`` is called on the gold
    standard and the predictions.
    """
    # Imported locally because this cell does not import them at the top;
    # the original relied on them leaking in from another import.
    import json

    import numpy as np
    import pandas as pd

    train_dataframe = prepare_df_for_model(pd.read_csv(args.train_dataframe))
    test_dataframe = prepare_df_for_model(pd.read_csv(args.test_dataframe))

    # We either want to train our own model and save it, or use a
    # model we trained ourselves and only run the prediction step.
    model = ImageModelWiedemann(learning_rate=args.learning_rate)
    if args.from_trained:
        loaded = load_model(args.save_path)
        model.model = loaded
        # The vector extractor built in the constructor belongs to the
        # freshly initialised network; rebuild it on the loaded model so the
        # stored vectors come from the trained weights. The first Dense layer
        # is the 512-unit layer used by ImageModelWiedemann.
        first_dense = next(layer for layer in loaded.layers
                           if isinstance(layer, Dense))
        model.intermediate_activation = Model(loaded.input,
                                              first_dense.output)
    else:
        # The training generator is only needed when we actually train.
        train_gen = ImageDataGenerator(
            preprocessing_function=preprocess_input).flow_from_dataframe(
            dataframe=train_dataframe,
            directory=args.train_png_folder,
            x_col='png',
            y_col='label',
            target_size=(300, 300),
            class_mode='binary',
            batch_size=args.batch_size,
            shuffle=True,
            seed=42,
            validate_filenames=True)
        model.train(train_data=train_gen, num_epochs=args.num_epochs)
        model.model.save(args.save_path)

    stream_predictions = {}
    vector_outputs = {}
    raw_outputs = {}

    for doc_id, stream in test_dataframe.groupby('name'):
        # Work on a copy instead of assigning into the groupby slice.
        sorted_stream = stream.assign(
            page=stream['page'].astype(int)).sort_values(by='page')

        test_data = prepare_test_streams(sorted_stream, args.test_png_folder,
                                         args.batch_size)

        # Flatten to 1-D so a single-page stream still yields a list
        # (squeeze() would collapse it to a scalar).
        out = np.asarray(model.predict(test_data)).reshape(-1)
        stream_predictions[doc_id] = np.round(out).astype(int).tolist()
        raw_outputs[doc_id] = out.tolist()

        vector_outputs[doc_id] = model.intermediate_activation.predict(
            test_data)

    test_dataframe['label'] = test_dataframe['label'].astype(int)

    # The corpus name is taken from the third-to-last path component.
    corpus_train = args.train_dataframe.split('/')[-3]
    corpus_test = args.test_dataframe.split('/')[-3]
    gold_standard = get_ground_truth_from_dataframe(test_dataframe,
                                                    'label')

    save_name = "text_trained_on_%s_tested_on_%s_IMAGE_CNN" % (corpus_train, corpus_test)
    # The directory must exist before anything is written into it.
    os.makedirs(save_name, exist_ok=True)

    # A dict of arrays is stored as a pickled object array by np.save.
    np.save(os.path.join(save_name, 'vectors.npy'), vector_outputs)

    with open(os.path.join(save_name, 'predictions.json'), 'w') as f:
        json.dump(stream_predictions, f)

    with open(os.path.join(save_name, 'gold_standard.json'), 'w') as f:
        json.dump(gold_standard, f)

    with open(os.path.join(save_name, 'raw_scores.json'), 'w') as f:
        json.dump(raw_outputs, f)

    evaluation_report(gold_standard, stream_predictions)


def str2bool(value):
    """Parse a command-line boolean.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    ``"False"``) as ``True``. This parser maps the usual false spellings to
    ``False`` and keeps every other non-empty value ``True``, so existing
    invocations keep working.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() not in ('', 'false', 'f', 'no', 'n', '0')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument('--train_dataframe', type=str, required=True)
    parser.add_argument('--test_dataframe', type=str, required=True)
    parser.add_argument('--train_png_folder', type=str, required=True)
    parser.add_argument('--test_png_folder', type=str, required=True)
    parser.add_argument('--learning_rate', type=float, default=0.00001)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--num_epochs', type=int, default=20)
    # Path the model is saved to after training, or loaded from when
    # --from_trained is set.
    parser.add_argument('--save_path', type=str)
    parser.add_argument('--from_trained', type=str2bool, default=False)

    arguments = parser.parse_args()
    main(arguments)

Train finetuned Text model, code by Ruben van Heusden

In [None]:
"""
This file contains a PyTorch style dataloader that can be used to load in the
data from different WOB data sources, for easier use with the different models
that are used in this project.

Because we have two modalities for each page (both visual and textual) we
construct the dataset in such a way that we can use one dataloader for
the whole dataset, and just select what information we want to use based on
which model we are training.

For some of the models that we are using we need to specify a specific
pipeline for the preprocessing of the image, because we need to do some
resampling / rescaling, such as for the VGG16 model.

Although the exact structure of the dataset does not have to be specified
in advance (train, val and test split) we will require this for the first versions
of the algorithms, just so that we know for sure that we actually do the
proper things for the first two corpora. Later we can then make a more
complicated version that will allow us a bit more freedom in how we interact
with the datasets.

"""

import os
import pandas as pd
from PIL import Image
# Local imports
import metricutils


def load_text_dataframe(dataframe_path: str, nan_fill_value: str = ''):
    """
    Load the csv text OCR dataset and normalise it for later use.

    The pages are sorted by stream name and page number so that each stream
    is in ascending page order and lines up with the gold standard data,
    the index is reset, and missing text entries are filled.

    :param dataframe_path: string specifying the path to the dataframe
    that contains the text of the pages and the gold standard.
    :param nan_fill_value: string specifying what value to use if a value
    is missing in the text entry of a page.
    :return: the loaded dataframe, sorted by ``name`` and ``page`` with a
    fresh 0..n-1 index and no missing values in the ``text`` column.
    """
    ocr_dataframe = pd.read_csv(dataframe_path)

    # As in principle these pages could be unordered, make sure every
    # stream is ordered in ascending order by the page number.
    ocr_dataframe['page'] = ocr_dataframe['page'].astype(int)
    # Sorting by name first keeps the pages of one stream together.
    ocr_dataframe = ocr_dataframe.sort_values(by=['name', 'page'])
    ocr_dataframe = ocr_dataframe.reset_index(drop=True)

    # Assign the filled column back: an in-place fillna on the column
    # accessor operates on a temporary object under pandas Copy-on-Write
    # and would leave the dataframe unchanged.
    ocr_dataframe['text'] = ocr_dataframe['text'].fillna(nan_fill_value)

    return ocr_dataframe


In [None]:
"""
For now, remove any of the references to the multi class options,
as we will not really use this for our experiments.

"""

import pandas as pd
from tqdm import tqdm
import argparse
import re, math
import numpy as np
import json

# Diagnostic only: show which numpy version is in use.
print(np.__version__)
from gensim.models.fasttext import load_facebook_model

# Load the fastText model once at import time; `ft` is read as a module-level
# global by TextFeatureGenerator.text_to_embedding.
# NOTE(review): the path is an absolute, machine-specific location.
ft = load_facebook_model(
    "/ivi/ilps/personal/rheusde/WOOIR/models/cc.nl.300.bin")
print('---Fasttext model has been loaded---')

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import *
from tensorflow.keras.utils import *

# Diagnostic only: report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

from dataloading import *
from metricutils import *


def get_data_instances(df):
    """Return one ``[label, text]`` pair per dataframe row, in row order."""
    return [[row['label'], row['text']] for _, row in df.iterrows()]


def simple_tokenizer(textline: str):
    """Split a line of text into word and punctuation tokens.

    Anything starting with ``http`` up to the next whitespace is replaced by
    the literal ``URL``. Runs of word characters, ``#`` and ``-`` form one
    token; every other run forms its own token. Tokens are stripped and
    whitespace-only tokens are dropped.
    """
    without_urls = re.sub(r'http\S+', 'URL', textline)
    pattern = re.compile(r'[#\w-]+|[^#\w-]+', re.UNICODE)

    tokens = []
    for chunk in pattern.findall(without_urls.strip()):
        stripped = chunk.strip()
        if stripped != '':
            tokens.append(stripped)
    return tokens


class TextFeatureGenerator(Sequence):
    """Keras ``Sequence`` yielding batches of word embeddings and labels.

    ``text_data`` is an indexable collection of ``[label, text]`` pairs.
    Every text is tokenised, trimmed or padded to ``sequence_length`` tokens
    and mapped to vectors from the module-level fastText model ``ft``.

    :param text_data: indexable collection of ``[label, text]`` pairs.
    :param batch_size: number of pairs per batch.
    :param sequence_length: number of tokens every text is trimmed/padded to.
    :param embedding_dims: length of the zero vector used for padding;
        NOTE(review): presumably must match the fastText vector size.
    """

    def __init__(self, text_data, batch_size=32, sequence_length=150,
                 embedding_dims=300):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.text_data = text_data
        self.indices = np.arange(len(self.text_data))
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.embedding_dims = embedding_dims

    def __len__(self):
        """Number of batches; the last one may be smaller."""
        return math.ceil(len(self.text_data) / self.batch_size)

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs."""
        np.random.shuffle(self.indices)

    def __getitem__(self, idx):
        """Return ``(embeddings, labels)`` for batch number ``idx``."""
        inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        return self.process_text_data(inds)

    def process_text_data(self, inds):
        """Embed the texts at positions ``inds`` and collect their labels."""
        word_embeddings = [self.text_to_embedding(self.text_data[index][1])
                           for index in inds]
        output_labels = [self.text_data[index][0] for index in inds]
        return np.array(word_embeddings), np.array(output_labels)

    def text_to_embedding(self, textsequence):
        """Return a list of ``sequence_length`` word vectors for one text."""
        tokens = simple_tokenizer(textsequence)

        # Too long: keep the first and the last half of the allowed length.
        if len(tokens) > self.sequence_length:
            half_idx = self.sequence_length // 2
            tokens = tokens[:half_idx] + tokens[len(tokens) - half_idx:]

        # Real tokens are looked up (lower-cased) in the fastText model.
        vectors = [ft.wv[token.lower()] for token in tokens]

        # Pad with explicit float zero vectors rather than a sentinel string,
        # so a literal "PADDING_TOKEN" word in the text is embedded normally.
        words_to_pad = self.sequence_length - len(tokens)
        padding = np.zeros(self.embedding_dims, dtype=np.float32)
        vectors.extend(padding for _ in range(words_to_pad))

        return vectors


class TextModelWiedemann:
    """Bidirectional-GRU + multi-width CNN binary text classifier.

    The compiled Keras model is available as ``self.model``; it takes inputs
    of shape ``(nb_sequence_length, nb_embedding_dims)`` and ends in a single
    sigmoid unit.
    """

    def __init__(self, nb_embedding_dims=300, nb_sequence_length=150):

        filter_sizes = (3, 4, 5)

        model_input_tp = Input(shape=(nb_sequence_length, nb_embedding_dims))
        gru_block_tp = Bidirectional(
            GRU(128, dropout=0.5, return_sequences=True))(
            model_input_tp)
        # One convolution branch per kernel size, each max-pooled over time.
        conv_blocks_tp = []
        for sz in filter_sizes:
            conv = Conv1D(
                filters=200,
                kernel_size=sz,
                padding="same",
                strides=1
            )(gru_block_tp)
            conv = LeakyReLU()(conv)
            conv = GlobalMaxPooling1D()(conv)
            conv = Dropout(0.5)(conv)
            conv_blocks_tp.append(conv)
        model_concatenated_tp = concatenate(conv_blocks_tp)
        model_concatenated_tp = Dense(128)(model_concatenated_tp)
        model_concatenated_tp = LeakyReLU()(model_concatenated_tp)

        model_output = Dense(1, activation="sigmoid")(model_concatenated_tp)

        # combine final model
        model = Model(model_input_tp, model_output)
        model.compile(loss='binary_crossentropy', optimizer='nadam',
                      metrics=['accuracy'])

        self.model = model

    def train(self, train_data, batch_size, num_epochs):
        """Fit the model on ``[label, text]`` pairs for ``num_epochs`` epochs."""
        self.model.fit(TextFeatureGenerator(train_data, batch_size=batch_size),
                       epochs=num_epochs)

    @staticmethod
    def _scores_to_lists(scores):
        """Turn model output into ``(rounded int labels, raw scores)`` lists.

        The output is flattened to 1-D first, so a stream with a single page
        yields one-element lists instead of bare scalars.
        """
        flat = np.asarray(scores).reshape(-1)
        return flat.round().astype(int).tolist(), flat.tolist()

    def predict(self, test_dataframe, batch_size):
        """Predict every stream (grouped by ``name``) of ``test_dataframe``.

        :return: ``(all_stream_predictions, raw_predictions)``; both are
            dicts keyed by stream name, holding the rounded 0/1 predictions
            and the raw sigmoid scores respectively, one entry per page.
        """
        all_stream_predictions = {}
        raw_predictions = {}

        for name, sub_df in tqdm(test_dataframe.groupby("name")):
            scores = self.model.predict(
                TextFeatureGenerator(get_data_instances(sub_df),
                                     batch_size=batch_size))

            final_predictions, raw_scores = self._scores_to_lists(scores)
            all_stream_predictions[name] = final_predictions
            raw_predictions[name] = raw_scores

        return all_stream_predictions, raw_predictions


def main(args):
    """Train (or load) the text model, predict the test streams and save results.

    :param args: parsed command-line namespace providing ``train_dataframe``,
        ``test_dataframe``, ``num_epochs``, ``save_path``, ``batch_size`` and
        ``do_train``.

    Writes ``predictions.json``, ``gold_standard.json`` and
    ``raw_scores.json`` into a directory named
    ``text_trained_on_<train corpus>_tested_on_<test corpus>`` and then calls
    ``evaluation_report``.
    """
    # Imported locally: this cell uses `os` but never imports it itself.
    import os

    # Our first step here is to set up the model from its class; when we do
    # not train, the saved model replaces the freshly built one.
    text_model = TextModelWiedemann()
    if not args.do_train:
        text_model.model = load_model(args.save_path)
    text_model.ft = ft
    print('Model has been loaded')

    # Now we also have to load the datasets. The training set is only read
    # when it is actually needed.
    train_dataframe = None
    if args.do_train:
        train_dataframe = load_text_dataframe(args.train_dataframe)
    test_dataframe = load_text_dataframe(args.test_dataframe)
    print('data has been loaded')
    gold_standard_dict = get_ground_truth_from_dataframe(test_dataframe,
                                                         col='label')

    # Force the first page of every stream to 1 in the gold standard;
    # empty streams are skipped instead of raising an IndexError.
    for val in gold_standard_dict.values():
        if len(val) > 0:
            val[0] = 1

    print('training about to start')
    if args.do_train:
        train_instances = get_data_instances(train_dataframe)
        text_model.train(train_instances, batch_size=args.batch_size,
                         num_epochs=args.num_epochs)
        text_model.model.save(args.save_path)
    print('training has finised')
    prediction_dict, raw_dict = text_model.predict(test_dataframe,
                                                   batch_size=args.batch_size)

    # The corpus name is taken from the third-to-last path component.
    corpus_train = args.train_dataframe.split('/')[-3]
    corpus_test = args.test_dataframe.split('/')[-3]

    save_name = "text_trained_on_%s_tested_on_%s" % (corpus_train, corpus_test)

    # Create the output directory if it does not exist yet.
    os.makedirs(save_name, exist_ok=True)

    with open(os.path.join(save_name, 'predictions.json'), 'w') as f:
        json.dump(prediction_dict, f)

    with open(os.path.join(save_name, 'gold_standard.json'), 'w') as f:
        json.dump(gold_standard_dict, f)

    with open(os.path.join(save_name, 'raw_scores.json'), 'w') as f:
        json.dump(raw_dict, f)

    evaluation_report(gold_standard_dict, prediction_dict)


def str2bool(value):
    """Parse a command-line boolean.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    ``"False"``) as ``True``. This parser maps the usual false spellings to
    ``False`` and keeps every other non-empty value ``True``, so existing
    invocations keep working.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() not in ('', 'false', 'f', 'no', 'n', '0')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument('--train_dataframe', type=str, required=True)
    parser.add_argument('--test_dataframe', type=str, required=True)
    parser.add_argument('--num_epochs', type=int, default=20)
    # Path the model is saved to after training, or loaded from otherwise.
    parser.add_argument('--save_path', type=str)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--do_train', type=str2bool, default=False)

    arguments = parser.parse_args()
    main(arguments)

Getting finetuned Vector representations of CNN

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pss_model import compile_model_singlepage,ValidationCheckpoint,ImageFeatureGenerator
from tqdm import tqdm
from PIL import Image 
import os
from tensorflow.keras.models import load_model, Model
import numpy as np
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
import argparse 
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def prepare_df_for_model(dataframe):
    """Add a ``png`` filename column and cast ``label`` to str, in place.

    The filename has the form ``<name>-<page>.png``. The mutated dataframe
    itself is returned.
    """
    page_as_text = dataframe.page.astype(str)
    dataframe['png'] = dataframe.name + '-' + (page_as_text + '.png')
    dataframe['label'] = dataframe['label'].astype(str)
    return dataframe

def prepare_test_streams(test_subdataframe, png_folder,
                         batch_size):
    """Build an unshuffled, label-free image iterator for one stream.

    Images named in the ``png`` column are read from ``png_folder``,
    resized to 224x224 and passed through the VGG16 ``preprocess_input``.
    ``class_mode=None`` means the iterator yields images only.
    """
    image_generator = ImageDataGenerator(
        preprocessing_function=preprocess_input)

    flow_options = dict(
        dataframe=test_subdataframe,
        directory=png_folder,
        x_col='png',
        y_col='label',
        target_size=(224, 224),
        class_mode=None,
        batch_size=batch_size,
        # Keep page order so the vectors line up with the pages.
        shuffle=False,
        seed=42,
        validate_filenames=True,
    )
    return image_generator.flow_from_dataframe(**flow_options)

def main(model_path=None, data_csv='test_data.csv', png_folder='test',
         output_dir='out/ft_test', layer_name='dense', batch_size=256):
    """Export per-stream activation vectors of a saved (fine-tuned) model.

    For every stream (rows grouped by ``name``) the pages are sorted by page
    number, run through the model truncated at ``layer_name`` and the
    resulting array is saved as ``<output_dir>/<name>.npy``.

    :param model_path: path of the saved Keras model. When ``None`` the
        module-level variable ``path_to_model`` is used, as before.
    :param data_csv: csv with ``name``, ``page`` and ``label`` columns.
    :param png_folder: directory containing the page images.
    :param output_dir: directory the ``.npy`` files are written to; it is
        created when missing.
    :param layer_name: name of the layer whose output is exported.
    :param batch_size: batch size used for prediction.
    """
    # Raise PIL's pixel limit for large page scans.
    Image.MAX_IMAGE_PIXELS = 1000000000

    if model_path is None:
        # NOTE(review): `path_to_model` is not defined in this cell; it is
        # expected to exist at module/notebook level.
        model_path = path_to_model

    model = load_model(model_path)
    to_vector_model = Model(inputs=model.input,
                            outputs=model.get_layer(layer_name).output)

    page_table = prepare_df_for_model(pd.read_csv(data_csv))

    # np.save does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    for doc_id, stream in tqdm(page_table.groupby('name')):
        # Work on a copy instead of assigning into the groupby slice.
        sorted_stream = stream.assign(
            page=stream['page'].astype(int)).sort_values(by='page')

        page_images = prepare_test_streams(sorted_stream, png_folder,
                                           batch_size)
        vectors = to_vector_model.predict(page_images)

        full_path = '{}/{}.npy'.format(output_dir, doc_id)
        np.save(full_path, vectors)


main()

Getting pretrained Vector representations of VGG16

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pss_model import compile_model_singlepage,ValidationCheckpoint,ImageFeatureGenerator
from tqdm import tqdm
from PIL import Image 
import os
from tensorflow.keras.models import load_model, Model
import numpy as np
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
import argparse 
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def prepare_df_for_model(dataframe):
    """Derive the image filename per row and make the labels strings.

    Mutates ``dataframe``: adds ``png`` (``<name>-<page>.png``) and converts
    ``label`` to str. Returns the same object.
    """
    stem = dataframe.name + '-'
    dataframe['png'] = stem + dataframe.page.astype(str) + '.png'
    dataframe['label'] = dataframe['label'].astype(str)
    return dataframe

def prepare_test_streams(test_subdataframe, png_folder,
                         batch_size):
    """Return an ordered image-only iterator over the pages of one stream.

    Files listed in the ``png`` column are loaded from ``png_folder`` at
    224x224 and preprocessed with the VGG16 ``preprocess_input``; no labels
    are yielded (``class_mode=None``) and the order is not shuffled.
    """
    loader = ImageDataGenerator(preprocessing_function=preprocess_input)
    subtest_generator = loader.flow_from_dataframe(
        dataframe=test_subdataframe,
        directory=png_folder,
        x_col='png',
        y_col='label',
        target_size=(224, 224),
        class_mode=None,
        batch_size=batch_size,
        shuffle=False,
        seed=42,
        validate_filenames=True,
    )
    return subtest_generator

def main(data_csv='test_data.csv', png_folder='test',
         output_dir='out/vgg_test', layer_name='fc2', batch_size=256):
    """Export per-stream activation vectors of the ImageNet-pretrained VGG16.

    For every stream (rows grouped by ``name``) the pages are sorted by page
    number, run through VGG16 truncated at ``layer_name`` and the resulting
    array is saved as ``<output_dir>/<name>.npy``.

    :param data_csv: csv with ``name``, ``page`` and ``label`` columns.
    :param png_folder: directory containing the page images.
    :param output_dir: directory the ``.npy`` files are written to; it is
        created when missing.
    :param layer_name: name of the VGG16 layer whose output is exported.
    :param batch_size: batch size used for prediction.
    """
    # Raise PIL's pixel limit for large page scans.
    Image.MAX_IMAGE_PIXELS = 1000000000

    model_vgg16 = VGG16(weights='imagenet', include_top=True,
                        input_shape=(224, 224, 3))
    to_vector_model = Model(inputs=model_vgg16.input,
                            outputs=model_vgg16.get_layer(layer_name).output)

    page_table = prepare_df_for_model(pd.read_csv(data_csv))

    # np.save does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    for doc_id, stream in tqdm(page_table.groupby('name')):
        # Work on a copy instead of assigning into the groupby slice.
        sorted_stream = stream.assign(
            page=stream['page'].astype(int)).sort_values(by='page')

        page_images = prepare_test_streams(sorted_stream, png_folder,
                                           batch_size)
        vectors = to_vector_model.predict(page_images)

        full_path = '{}/{}.npy'.format(output_dir, doc_id)
        np.save(full_path, vectors)


main()

Get finetuned text vectors

In [None]:
import pandas as pd
from tqdm import tqdm
import argparse
import re, math
import numpy as np
import json

# Diagnostic only: show which numpy version is in use.
print(np.__version__)
from gensim.models.fasttext import load_facebook_model

# Load the fastText model once at import time; `ft` is read as a module-level
# global by TextFeatureGenerator.text_to_embedding.
# The path is relative, so it is resolved against the working directory.
ft = load_facebook_model(
    "cc.nl.300.bin")
print('---Fasttext model has been loaded---')

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import *
from tensorflow.keras.utils import *

# Diagnostic only: report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

from dataloading import *
from metricutils import *

def get_data_instances(df):
    """Collect ``[label, text]`` for every row of ``df``, keeping row order."""
    pairs = []
    for _, record in df.iterrows():
        pairs += [[record['label'], record['text']]]
    return pairs


def simple_tokenizer(textline: str):
    """Tokenise one line of text.

    URLs (``http`` followed by non-whitespace) become the token ``URL``;
    runs of word characters, ``#`` and ``-`` are kept together, all other
    runs become separate tokens. Empty tokens are discarded after stripping.
    """
    url_free = re.sub(r'http\S+', 'URL', textline).strip()
    raw_chunks = re.findall(r'[#\w-]+|[^#\w-]+', url_free, re.UNICODE)
    stripped_chunks = (chunk.strip() for chunk in raw_chunks)
    return [chunk for chunk in stripped_chunks if chunk != '']


class TextFeatureGenerator(Sequence):
    """Keras ``Sequence`` yielding batches of word embeddings and labels.

    ``text_data`` is an indexable collection of ``[label, text]`` pairs.
    Every text is tokenised, trimmed or padded to ``sequence_length`` tokens
    and mapped to vectors from the module-level fastText model ``ft``.

    :param text_data: indexable collection of ``[label, text]`` pairs.
    :param batch_size: number of pairs per batch.
    :param sequence_length: number of tokens every text is trimmed/padded to.
    :param embedding_dims: length of the zero vector used for padding;
        NOTE(review): presumably must match the fastText vector size.
    """

    def __init__(self, text_data, batch_size=32, sequence_length=150,
                 embedding_dims=300):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.text_data = text_data
        self.indices = np.arange(len(self.text_data))
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.embedding_dims = embedding_dims

    def __len__(self):
        """Number of batches; the last one may be smaller."""
        return math.ceil(len(self.text_data) / self.batch_size)

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs."""
        np.random.shuffle(self.indices)

    def __getitem__(self, idx):
        """Return ``(embeddings, labels)`` for batch number ``idx``."""
        start = idx * self.batch_size
        inds = self.indices[start:start + self.batch_size]
        return self.process_text_data(inds)

    def process_text_data(self, inds):
        """Embed the texts at positions ``inds`` and collect their labels."""
        word_embeddings = []
        output_labels = []
        for index in inds:
            label, text = self.text_data[index][0], self.text_data[index][1]
            word_embeddings.append(self.text_to_embedding(text))
            output_labels.append(label)
        return np.array(word_embeddings), np.array(output_labels)

    def text_to_embedding(self, textsequence):
        """Return a list of ``sequence_length`` word vectors for one text."""
        tokens = simple_tokenizer(textsequence)

        # Too long: keep the first and the last half of the allowed length.
        if len(tokens) > self.sequence_length:
            half_idx = self.sequence_length // 2
            tokens = tokens[:half_idx] + tokens[len(tokens) - half_idx:]

        # Real tokens are looked up (lower-cased) in the fastText model.
        vectors = [ft.wv[token.lower()] for token in tokens]

        # Pad with explicit float zero vectors rather than a sentinel string,
        # so a literal "PADDING_TOKEN" word in the text is embedded normally.
        words_to_pad = self.sequence_length - len(tokens)
        padding = np.zeros(self.embedding_dims, dtype=np.float32)
        vectors.extend(padding for _ in range(words_to_pad))

        return vectors


class TextModelWiedemann:
    """Bidirectional-GRU + multi-width CNN binary text classifier.

    The compiled Keras model is available as ``self.model``; callers may
    replace it with another model (for example one truncated at an
    intermediate layer) before calling ``predict``.
    """

    def __init__(self, nb_embedding_dims=300, nb_sequence_length=150):

        filter_sizes = (3, 4, 5)

        model_input_tp = Input(shape=(nb_sequence_length, nb_embedding_dims))
        gru_block_tp = Bidirectional(
            GRU(128, dropout=0.5, return_sequences=True))(
            model_input_tp)
        # One convolution branch per kernel size, each max-pooled over time.
        conv_blocks_tp = []
        for sz in filter_sizes:
            conv = Conv1D(
                filters=200,
                kernel_size=sz,
                padding="same",
                strides=1
            )(gru_block_tp)
            conv = LeakyReLU()(conv)
            conv = GlobalMaxPooling1D()(conv)
            conv = Dropout(0.5)(conv)
            conv_blocks_tp.append(conv)
        model_concatenated_tp = concatenate(conv_blocks_tp)
        model_concatenated_tp = Dense(128)(model_concatenated_tp)
        model_concatenated_tp = LeakyReLU()(model_concatenated_tp)

        model_output = Dense(1, activation="sigmoid")(model_concatenated_tp)

        # combine final model
        model = Model(model_input_tp, model_output)
        model.compile(loss='binary_crossentropy', optimizer='nadam',
                      metrics=['accuracy'])

        self.model = model

    def train(self, train_data, batch_size, num_epochs):
        """Fit the model on ``[label, text]`` pairs for ``num_epochs`` epochs."""
        self.model.fit(TextFeatureGenerator(train_data, batch_size=batch_size),
                       epochs=num_epochs)

    def predict(self, test_dataframe, batch_size, save=True):
        """Run ``self.model`` on every stream (grouped by ``name``).

        :param test_dataframe: dataframe with ``name``, ``label`` and
            ``text`` columns.
        :param batch_size: batch size used for prediction.
        :param save: when true, each stream's output is written to
            ``out/text_predictions/<name>.npy``.
        :return: dict mapping stream name to the model output for that
            stream (the original returned ``None``).
        """
        # Imported locally: this cell does not import `os` itself.
        import os

        if save:
            # np.save does not create missing directories.
            os.makedirs('out/text_predictions', exist_ok=True)

        all_stream_predictions = {}

        for name, sub_df in tqdm(test_dataframe.groupby("name")):
            predictions = self.model.predict(
                TextFeatureGenerator(get_data_instances(sub_df),
                                     batch_size=batch_size))
            all_stream_predictions[name] = predictions

            if save:
                np.save('out/text_predictions/{}.npy'.format(name), np.array(predictions))

        return all_stream_predictions
            

def main(save_path, test_df, to_vec = True):
    """Load a saved text model and run it over every stream of ``test_df``.

    :param save_path: path of the saved Keras model.
    :param test_df: path of the csv with the page texts.
    :param to_vec: when true the model is truncated at the layer named
        ``dense`` so its activations are produced; otherwise the loaded
        model is used unchanged.
    """
    # The wrapper is only used for its predict loop; its model is replaced.
    wrapper = TextModelWiedemann()
    loaded = load_model(save_path)

    if not to_vec:
        wrapper.model = loaded
    else:
        dense_output = loaded.get_layer('dense').output
        wrapper.model = Model(inputs=loaded.input, outputs=dense_output)
    wrapper.ft = ft
    print('Model has been loaded')

    # Load the pages that have to be run through the model.
    pages = load_text_dataframe(test_df)
    print('data has been loaded')

    wrapper.predict(pages, batch_size=256)

        
# Hard-coded inputs for this run: saved model path and page csv.
sp = 'text_cnn_model'
td = 'data_fixed.csv'
# to_vec=False: the loaded model is used as-is, so its final outputs (not the
# 'dense' layer activations) are what gets predicted and saved.
main(sp,td,False)