In [1]:
import os
import numpy as np
import pickle
from tqdm import tqdm
import sys

In [2]:
class MyClass:
    """Thin wrapper that stores a single value on the ``param`` attribute.

    In this notebook it wraps the list of blocks belonging to one PDF
    before that list is handed to ``save_object`` for pickling.
    """

    def __init__(self, param):
        # The value is stored as given: no copy and no validation.
        self.param = param

def save_object(path_to_batch, information, obj, name):
    """Pickle ``obj`` to ``<path_to_batch>sequence_<information>/<name>.pickle``.

    The destination directory is created when it does not exist yet, so the
    first save into a fresh batch folder succeeds. Saving is best-effort:
    any failure is reported on stdout and is not re-raised.

    Args:
        path_to_batch: Path prefix of the batch. It is concatenated directly
            with ``"sequence_"``, so it should include the trailing separator.
        information: Suffix of the ``sequence_<information>`` sub-directory.
        obj: Object to serialize with ``pickle``.
        name: File stem; converted with ``str()`` before use.
    """
    target_file = path_to_batch + "sequence_" + information + "/" + str(name) + ".pickle"
    try:
        # Without this, open() fails for every file until someone creates
        # the sequence_<information> folder by hand.
        os.makedirs(os.path.dirname(target_file), exist_ok=True)
        with open(target_file, "wb") as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as ex:
        print("Error during pickling object (Possibly unsupported):", ex)

In [3]:
def generate_multimodal_data_sequences(path_to_batch, batch_name):
    """Split a batch's multimodal data into one pickled sequence per PDF.

    Loads ``multimodal_<batch_name>.npy`` and ``paths_<batch_name>.npy`` from
    ``path_to_batch``. The first ``/``-separated component of each path is
    used as the PDF name. Consecutive rows sharing a PDF name are collected
    into a list of dicts keyed ``multimodal_feature_1`` ...
    ``multimodal_feature_N``, wrapped in ``MyClass`` and written through
    ``save_object`` under the name ``sequence_multimodal_<pdf name>``.

    Rows of the same PDF are only grouped when they are adjacent; a PDF name
    that reappears later starts a new group saved under the same file name.

    Args:
        path_to_batch: Path prefix of the batch. It is concatenated directly
            with the file names, so it should include the trailing separator.
        batch_name: Suffix identifying the ``.npy`` files of the batch.

    Raises:
        AssertionError: If the number of paths differs from ``data.shape[0]``.
    """
    data = np.load(path_to_batch + "multimodal_" + batch_name + ".npy", allow_pickle = True)
    file_names = list(np.load(path_to_batch + "paths_" + batch_name + ".npy", allow_pickle = True))

    print('Checking alignment of data ...')

    assert len(file_names) == data.shape[0], \
        "paths and multimodal data do not have the same number of rows"
    print(str(len(file_names)) + " == " + str(data.shape[0]))

    if not file_names:
        # An empty batch has no first file name to start a group from.
        print('No blocks found, nothing to extract.')
        return

    print('Extracting file names ...')

    # Keep only the leading path component, which identifies the PDF.
    file_names = [name.split('/')[0] for name in tqdm(file_names)]

    print('Done.')
    print('Extracting sequences ...')

    current_pdf_blocks = []
    current_pdf_name = file_names[0]
    for idx, pdf_name in enumerate(tqdm(file_names)):

        if pdf_name != current_pdf_name:
            # The PDF changed: flush the finished sequence and start a new one.
            data_sequence_obj = MyClass(current_pdf_blocks)
            save_object(path_to_batch, "multimodal", data_sequence_obj, "sequence_" + "multimodal_" + current_pdf_name)
            current_pdf_name = pdf_name
            current_pdf_blocks = []

        # Feature keys are numbered from 1.
        current_pdf_blocks.append({
            'multimodal_feature_' + str(position): value
            for position, value in enumerate(data[idx, :], start=1)
        })

    # The last PDF is never followed by a name change, so flush it here.
    data_sequence_obj = MyClass(current_pdf_blocks)
    save_object(path_to_batch, "multimodal", data_sequence_obj, "sequence_multimodal_"+ current_pdf_name)

    print('Done.')

In [4]:
def generate_sequential_information_sequences(path_to_batch, batch_name):
    """Split a batch's sequential features into one pickled sequence per PDF.

    Loads four per-block feature arrays and ``paths_<batch_name>.npy`` from
    ``path_to_batch``. The first ``/``-separated component of each path is
    used as the PDF name. Consecutive blocks sharing a PDF name are collected
    into a list of dicts with the keys ``is_previous_block_on_different_page``,
    ``normalized_page_no``, ``normalized_x_distance_previous_block`` and
    ``normalized_y_distance_previous_block``, wrapped in ``MyClass`` and
    written through ``save_object`` under the name
    ``sequence_sequential_information_<pdf name>``.

    Args:
        path_to_batch: Path prefix of the batch. It is concatenated directly
            with the file names, so it should include the trailing separator.
        batch_name: Suffix identifying the ``.npy`` files of the batch.

    Raises:
        AssertionError: If any feature array's length differs from the
            number of paths.
    """
    # Output feature key -> stem of the .npy file that stores it.
    feature_file_stems = {
        'is_previous_block_on_different_page': "is_last_block_on_different_page_",
        'normalized_page_no': "normalized_page_no_",
        'normalized_x_distance_previous_block': "normalized_X_distance_with_previous_block_",
        'normalized_y_distance_previous_block': "normalized_Y_distance_with_previous_block_",
    }
    features = {
        key: np.load(path_to_batch + stem + batch_name + ".npy", allow_pickle = True)
        for key, stem in feature_file_stems.items()
    }

    file_names = list(np.load(path_to_batch + "paths_" + batch_name + ".npy", allow_pickle = True))

    print('Checking alignment of data ...')

    # Every array is indexed with the same block index below, so each one
    # must line up with the paths, not only the first.
    for key, values in features.items():
        assert len(file_names) == len(values), \
            "paths and '" + key + "' do not have the same length"
    print(" == ".join([str(len(file_names))] + [str(len(values)) for values in features.values()]))

    if not file_names:
        # An empty batch has no first file name to start a group from.
        print('No blocks found, nothing to extract.')
        return

    print('Extracting file names ...')

    # Keep only the leading path component, which identifies the PDF.
    file_names = [name.split('/')[0] for name in tqdm(file_names)]

    print('Done.')
    print('Extracting sequences ...')

    current_pdf_blocks = []
    current_pdf_name = file_names[0]
    for idx, pdf_name in enumerate(tqdm(file_names)):

        if pdf_name != current_pdf_name:
            # The PDF changed: flush the finished sequence and start a new one.
            data_sequence_obj = MyClass(current_pdf_blocks)
            save_object(path_to_batch, "sequential_information", data_sequence_obj, "sequence_sequential_information_" + current_pdf_name)
            current_pdf_name = pdf_name
            current_pdf_blocks = []

        current_pdf_blocks.append({key: values[idx] for key, values in features.items()})

    # The last PDF is never followed by a name change, so flush it here.
    data_sequence_obj = MyClass(current_pdf_blocks)
    save_object(path_to_batch, "sequential_information", data_sequence_obj, "sequence_sequential_information_"+ current_pdf_name)

    print('Done.')

In [5]:
def generate_labels_sequences(path_to_batch, batch_name):
    """Split a batch's labels into one pickled label sequence per PDF.

    Loads ``labels_<batch_name>.npy`` and ``paths_<batch_name>.npy`` from
    ``path_to_batch``. The first ``/``-separated component of each path is
    used as the PDF name. Labels of consecutive blocks sharing a PDF name are
    collected into a list, wrapped in ``MyClass`` and written through
    ``save_object`` under the name ``sequence_labels_<pdf name>``.

    Args:
        path_to_batch: Path prefix of the batch. It is concatenated directly
            with the file names, so it should include the trailing separator.
        batch_name: Suffix identifying the ``.npy`` files of the batch.

    Raises:
        AssertionError: If the number of paths differs from the number of
            labels.
    """
    labels = np.load(path_to_batch + "labels_" + batch_name + ".npy", allow_pickle = True)
    file_names = list(np.load(path_to_batch + "paths_" + batch_name + ".npy", allow_pickle = True))

    print('Checking alignment of data ...')

    assert len(file_names) == len(labels), \
        "paths and labels do not have the same length"
    # Report both sides of the comparison (paths vs. labels).
    print(str(len(file_names)) + " == " + str(len(labels)))

    if not file_names:
        # An empty batch has no first file name to start a group from.
        print('No blocks found, nothing to extract.')
        return

    print('Extracting file names ...')

    # Keep only the leading path component, which identifies the PDF.
    file_names = [name.split('/')[0] for name in tqdm(file_names)]

    print('Done.')
    print('Extracting sequences ...')

    current_pdf_blocks = []
    current_pdf_name = file_names[0]
    for idx, pdf_name in enumerate(tqdm(file_names)):

        if pdf_name != current_pdf_name:
            # The PDF changed: flush the finished sequence and start a new one.
            data_sequence_obj = MyClass(current_pdf_blocks)
            save_object(path_to_batch, "labels", data_sequence_obj, "sequence_labels_" + current_pdf_name)
            current_pdf_name = pdf_name
            current_pdf_blocks = []

        current_pdf_blocks.append(labels[idx])

    # The last PDF is never followed by a name change, so flush it here.
    data_sequence_obj = MyClass(current_pdf_blocks)
    save_object(path_to_batch, "labels", data_sequence_obj, "sequence_labels_" + current_pdf_name)

    print('Done.')

In [None]:
# NOTE(review): the `...` values below look like placeholders. They should be
# replaced with the real batch names and the directory that contains the batch
# folders before this cell is run.
batch_names = ['batch_1', ...]
path_to_batches = ...

# Each entry pairs the progress header with the generator it announces;
# the order is the order in which the steps run for every batch.
extraction_steps = (
    ("\t\tExtraction of multimodal data sequences :", generate_multimodal_data_sequences),
    ("\t\tExtraction of sequential information sequences :", generate_sequential_information_sequences),
    ("\t\tExtraction of label sequences :", generate_labels_sequences),
)

for batch_name in batch_names:
    print("\tExtraction of sequences for \"" + batch_name + "\" : ")
    for step_header, generate_sequences in extraction_steps:
        print(step_header)
        # Every batch lives in its own sub-directory named after the batch.
        generate_sequences(path_to_batches + "/" + batch_name + "/", batch_name)