# In this Notebook, I'll preprocess the data and generate a plug_and_play pickle file for it
-------------------------------------------------------------------------------------------------------------------
# Technology used: basic preprocessing tools

### usual utility cells

In [1]:
#!pip install numpy

In [2]:
#!pip install keras

In [3]:
#!pip install tensorflow --user

In [62]:
# packages used for processing: 
import numpy as np

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# import the Text preprocessing helper to obtain the lists of field_name:content_word pairs
from Summary_Generator.Text_Preprocessing_Helpers.utils import *
from Summary_Generator.Tensorflow_Graph.utils import *
from Summary_Generator.Text_Preprocessing_Helpers.pickling_tools import *

In [5]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        function to execute a shell command and see it's 
        output in the python console
        @params
        cmd = the command to be executed along with the arguments
              ex: ['ls', '../input']
    '''
    print(check_output(cmd).decode("utf8"))

In [6]:
# check the structure of the project directory
#exec_command(['ls', '..'])

In [7]:
np.random.seed(3) # set this seed for a device independant consistent behaviour

In [8]:
''' Set the constants for the script '''

# various paths of the files
data_path = "../Data" # the data path

data_files_paths = {
    "table_content": os.path.join(data_path, "train.box"),
    "nb_sentences" : os.path.join(data_path, "train.nb"),
    "train_sentences": os.path.join(data_path, "train.sent")
}

base_model_path = "Models"
plug_and_play_data_file = os.path.join(data_path, "plug_and_play.pickle")

# constants for the preprocessing script
train_percentage = 95

## Extract the data from the related files and properly structure it

In [9]:
field_content_words, field_words, content_words = prepare_input_data(data_files_paths['table_content'])

In [10]:
field_content_words

[['type pope',
  'name michael',
  'name iii',
  'name of',
  'name alexandria',
  'title 56th',
  'title pope',
  'title of',
  'title alexandria',
  'title &',
  'title patriarch',
  'title of',
  'title the',
  'title see',
  'title of',
  'title st.',
  'title mark',
  'image <none>',
  'caption <none>',
  'enthroned 25',
  'enthroned april',
  'enthroned 880',
  'ended 16',
  'ended march',
  'ended 907',
  'predecessor shenouda',
  'predecessor i',
  'successor gabriel',
  'successor i',
  'ordination <none>',
  'consecration <none>',
  'birthdate <none>',
  'birthname <none>',
  'birthplace egypt',
  'deathdate 16',
  'deathdate march',
  'deathdate 907',
  'buried monastery',
  'buried of',
  'buried saint',
  'buried macarius',
  'buried the',
  'buried great',
  'nationality egyptian',
  'religion coptic',
  'religion orthodox',
  'religion christian',
  'residence saint',
  'residence mark',
  "residence 's",
  'residence church',
  'feastday 16',
  'feastday march',
  'feas

In [11]:
field_words

['type',
 'name',
 'name',
 'name',
 'name',
 'title',
 'title',
 'title',
 'title',
 'title',
 'title',
 'title',
 'title',
 'title',
 'title',
 'title',
 'title',
 'image',
 'caption',
 'enthroned',
 'enthroned',
 'enthroned',
 'ended',
 'ended',
 'ended',
 'predecessor',
 'predecessor',
 'successor',
 'successor',
 'ordination',
 'consecration',
 'birthdate',
 'birthname',
 'birthplace',
 'deathdate',
 'deathdate',
 'deathdate',
 'buried',
 'buried',
 'buried',
 'buried',
 'buried',
 'buried',
 'nationality',
 'religion',
 'religion',
 'religion',
 'residence',
 'residence',
 'residence',
 'residence',
 'feastday',
 'feastday',
 'feastday',
 'feastday',
 'feastday',
 'feastday',
 'feastday',
 'feastday',
 'feastday',
 'feastday',
 'almamater',
 'signature',
 'articletitle',
 'articletitle',
 'articletitle',
 'articletitle',
 'articletitle',
 'name',
 'name',
 'image',
 'imagesize',
 'caption',
 'fullname',
 'fullname',
 'education',
 'nationality',
 'playingstyle',
 'birthdate',
 'b

In [12]:
content_words

['pope',
 'michael',
 'iii',
 'of',
 'alexandria',
 '56th',
 'pope',
 'of',
 'alexandria',
 '&',
 'patriarch',
 'of',
 'the',
 'see',
 'of',
 'st.',
 'mark',
 '<none>',
 '<none>',
 '25',
 'april',
 '880',
 '16',
 'march',
 '907',
 'shenouda',
 'i',
 'gabriel',
 'i',
 '<none>',
 '<none>',
 '<none>',
 '<none>',
 'egypt',
 '16',
 'march',
 '907',
 'monastery',
 'of',
 'saint',
 'macarius',
 'the',
 'great',
 'egyptian',
 'coptic',
 'orthodox',
 'christian',
 'saint',
 'mark',
 "'s",
 'church',
 '16',
 'march',
 '-lrb-',
 '20',
 'baramhat',
 'in',
 'the',
 'coptic',
 'calendar',
 '-rrb-',
 '<none>',
 '<none>',
 'pope',
 'michael',
 'iii',
 'of',
 'alexandria',
 'hui',
 'jun',
 '<none>',
 '<none>',
 '<none>',
 'hui',
 'jun',
 '<none>',
 '<none>',
 '<none>',
 '<none>',
 '<none>',
 '<none>',
 '<none>',
 '<none>',
 '<none>',
 '<none>',
 'hui',
 'jun',
 'okan',
 'Ã–ztÃ¼rk',
 '<none>',
 'okan',
 'Ã–ztÃ¼rk',
 '1.85',
 '30',
 'november',
 '1977',
 'merzifon',
 ',',
 'turkey',
 'yeni',
 'malatyaspo

In [13]:
# check if all the three lists are proper by printing them out
print("Field_content_words: ", field_content_words[1])
print("Field_words: ", field_words[:10])
print("Content_words: ", content_words[:10])

Field_content_words:  ['name hui', 'name jun', 'image <none>', 'imagesize <none>', 'caption <none>', 'fullname hui', 'fullname jun', 'education <none>', 'nationality <none>', 'playingstyle <none>', 'birthdate <none>', 'birthplace <none>', 'deathdate <none>', 'deathplace <none>', 'height <none>', 'weight <none>', 'medaltemplates <none>', 'articletitle hui', 'articletitle jun']
Field_words:  ['type', 'name', 'name', 'name', 'name', 'title', 'title', 'title', 'title', 'title']
Content_words:  ['pope', 'michael', 'iii', 'of', 'alexandria', '56th', 'pope', 'of', 'alexandria', '&']


In [14]:
# extract only the lenghts of the field_content_words and delete the field_content_words in order 
# to free up resources
pair_lengths = map(lambda x: len(x), field_content_words)
print(pair_lengths)
del field_content_words

<map object at 0x0000011C3B6311F0>


In [15]:
data_files_paths['nb_sentences']

'../Data\\train.nb'

In [16]:
data_files_paths['train_sentences']

'../Data\\train.sent'

In [17]:
def prepare_input_labels(nb_file_path, sent_file_path):
    '''
        function for concatenating the label sentences in the file according to the numbers.
        implemented by Animesh
        @param
        nb_file_path => the path pointing to the location of the train.nb file
        sent_file_path => the path pointing to the location of the train.sent file
        @return
        aligned_labels_sequences
    '''
    # generate lists of numbers and raw label_sentences
    nums = open(nb_file_path, 'r').readlines(); sents = open(sent_file_path, 'r').readlines()

    # make the nums integers
    nums = map(int, nums);

    # make sure the number of sentences and the nums match
    assert sum(nums) == len(sents), "Length mismatch between the train.nb and train.sent files"

    # run a simple loop to concatenate sentences belonging to one single training example
    label_sents = [] # initialize to empty list
    for num in nums:
        count = 0; sent = '<start>' # initialize counter and sentence
        while count < num.rstrip():
            sent += ' ' + sents.pop(0).strip(); count += 1
        # add the sentence to the label_sents list
        label_sents.append(sent + ' <eos>')

    # return the aligned_labels_sequences
    return label_sents

In [18]:
nums = open(data_files_paths['nb_sentences'], 'r').readlines(); 

In [19]:
nums

['3\n', '1\n', '2\n', '2\n', '2\n', '2\n', '5\n', '1\n', '1\n', '4\n']

In [20]:
sents = open(data_files_paths['train_sentences'], 'r').readlines()

In [21]:
sents

['pope michael iii of alexandria -lrb- also known as khail iii -rrb- was the coptic pope of alexandria and patriarch of the see of st. mark -lrb- 880 -- 907 -rrb- .\n',
 'in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community .\n',
 'this building was at one time believed to have later become the site of the cairo geniza .\n',
 'hui jun is a male former table tennis player from china .\n',
 'okan Ã–ztÃ¼rk -lrb- born 30 november 1977 -rrb- is a turkish professional footballer .\n',
 'he currently plays as a striker for yeni malatyaspor .\n',
 'marie stephan , -lrb- born march 14 , 1996 -rrb- is a professional squash player who represents france .\n',
 'she reached a career-high world ranking of world no. 101 in july 2015 .\n',
 'leonard l. martino is a former democratic member of the pennsylvania house of representatives .\n',
 'he was born in butler to michael a

In [22]:
    # run a simple loop to concatenate sentences belonging to one single training example
    label_sents = [] # initialize to empty list
    for num in nums:
        count = 0; sent = '<start>' # initialize counter and sentence
        while count < int(num):
            sent += ' ' + sents.pop(0).strip(); count += 1
        # add the sentence to the label_sents list
        label_sents.append(sent + ' <eos>')

    # return the aligned_labels_sequences
label_sents

['<start> pope michael iii of alexandria -lrb- also known as khail iii -rrb- was the coptic pope of alexandria and patriarch of the see of st. mark -lrb- 880 -- 907 -rrb- . in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community . this building was at one time believed to have later become the site of the cairo geniza . <eos>',
 '<start> hui jun is a male former table tennis player from china . <eos>',
 '<start> okan Ã–ztÃ¼rk -lrb- born 30 november 1977 -rrb- is a turkish professional footballer . he currently plays as a striker for yeni malatyaspor . <eos>',
 '<start> marie stephan , -lrb- born march 14 , 1996 -rrb- is a professional squash player who represents france . she reached a career-high world ranking of world no. 101 in july 2015 . <eos>',
 '<start> leonard l. martino is a former democratic member of the pennsylvania house of representatives . he was b

In [23]:
label_sentences = label_sents
#prepare_input_labels(data_files_paths['nb_sentences'], data_files_paths['train_sentences'])

In [24]:
label_sentences

['<start> pope michael iii of alexandria -lrb- also known as khail iii -rrb- was the coptic pope of alexandria and patriarch of the see of st. mark -lrb- 880 -- 907 -rrb- . in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community . this building was at one time believed to have later become the site of the cairo geniza . <eos>',
 '<start> hui jun is a male former table tennis player from china . <eos>',
 '<start> okan Ã–ztÃ¼rk -lrb- born 30 november 1977 -rrb- is a turkish professional footballer . he currently plays as a striker for yeni malatyaspor . <eos>',
 '<start> marie stephan , -lrb- born march 14 , 1996 -rrb- is a professional squash player who represents france . she reached a career-high world ranking of world no. 101 in july 2015 . <eos>',
 '<start> leonard l. martino is a former democratic member of the pennsylvania house of representatives . he was b

In [25]:
# label_sentences are concatenated properly to obtain the decoder sentences.
for sent in label_sentences[:3]: print(sent + '\n')

<start> pope michael iii of alexandria -lrb- also known as khail iii -rrb- was the coptic pope of alexandria and patriarch of the see of st. mark -lrb- 880 -- 907 -rrb- . in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community . this building was at one time believed to have later become the site of the cairo geniza . <eos>

<start> hui jun is a male former table tennis player from china . <eos>

<start> okan Ã–ztÃ¼rk -lrb- born 30 november 1977 -rrb- is a turkish professional footballer . he currently plays as a striker for yeni malatyaspor . <eos>



In [26]:
train_data_field, field_dict, rev_field_dict, vocab_size_field = prepare_tokenizer(field_words)

In [27]:
print(vocab_size_field, len(rev_field_dict), len(field_dict))
train_data_field[:3]

106 106 106


array([[62],
       [ 2],
       [ 2]])

In [28]:
def group_tokenized_sequences(flat_seq, lengths):
    '''
        funtion to group the seqs together to original form after tokenization
        implemented by Animesh
        @param
        flat_seq => flat list of words (in order)
        lengths => list of lengths of each sequence in the dataset
        @return => grouped_seq
    '''

    # check if the lengths and the field_seq and the content_seq lengths are compatible
    #assert sum(lengths) == len(flat_seq), "Lengths are not compatible"

    # perform the grouping:
    grouped_seqs = [] # initialize to empty list
    for length in lengths:
        count = 0; temp_grouped_seq = [] # initialize counter and storer list
        while(count < int(length)):
            temp_grouped_seq.append(flat_seq.pop(0))
            count += 1

        # add the so contructed lists to the main groupings
        grouped_seqs.append(temp_grouped_seq)

    # finally return the so created lists
    return grouped_seqs

In [29]:
#sum(pair_lengths) == len(field_seq), "Lengths are not compatible"

In [31]:
pair_lengths

<map at 0x11c3b6311f0>

In [32]:
# use the group function to bring the data together:
field_seq = np.squeeze(train_data_field).tolist()
field_sequences = group_tokenized_sequences(field_seq, pair_lengths)

In [33]:
print(field_dict)

{1: 'years', 2: 'name', 3: 'birthplace', 4: 'birthdate', 5: 'caption', 6: 'articletitle', 7: 'image', 8: 'clubs', 9: 'deathplace', 10: 'teams', 11: 'label', 12: 'associatedacts', 13: 'title', 14: 'feastday', 15: 'caps', 16: 'goals', 17: 'statlabel', 18: 'deathdate', 19: 'beatifieddate', 20: 'residence', 21: 'pcupdate', 22: 'titles', 23: 'buried', 24: 'coach', 25: 'yearsactive', 26: 'spouse', 27: 'successor', 28: 'almamater', 29: 'fullname', 30: 'position', 31: 'predecessor', 32: 'religion', 33: 'imagesize', 34: 'debutteam', 35: 'statyear', 36: 'genre', 37: 'enthroned', 38: 'ended', 39: 'height', 40: 'event', 41: 'dateofhighestranking', 42: 'dateofcurrentranking', 43: 'updated', 44: 'team', 45: 'statvalue', 46: 'origin', 47: 'veneratedin', 48: 'beatifiedby', 49: 'patronage', 50: 'nationality', 51: 'weight', 52: 'medaltemplates', 53: 'currentclub', 54: 'plays', 55: 'website', 56: 'highestranking', 57: 'currentranking', 58: 'occupation', 59: 'number', 60: 'debutdate', 61: 'college', 62: '

In [34]:
# print some slices of the field_sequences and the content_sequences:
print(field_sequences[:2])

[[62, 2, 2, 2, 2, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 7, 5], [37, 37, 37, 38, 38, 38, 31, 31, 27, 27, 63, 64, 4, 65, 3, 18, 18, 18, 23, 23, 23, 23, 23, 23, 50, 32, 32, 32, 20, 20, 20, 20, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 28, 66, 6, 6, 6, 6, 6, 2, 2, 7, 33, 5, 29, 29, 67, 50, 68, 4, 3, 18, 9, 39, 51, 52, 6, 6, 2, 2, 7, 29, 29, 39, 4, 4, 4, 3, 3, 3, 53, 53, 69, 30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


In [35]:
field_sequences

[[62, 2, 2, 2, 2, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 7, 5],
 [37,
  37,
  37,
  38,
  38,
  38,
  31,
  31,
  27,
  27,
  63,
  64,
  4,
  65,
  3,
  18,
  18,
  18,
  23,
  23,
  23,
  23,
  23,
  23,
  50,
  32,
  32,
  32,
  20,
  20,
  20,
  20,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  28,
  66,
  6,
  6,
  6,
  6,
  6,
  2,
  2,
  7,
  33,
  5,
  29,
  29,
  67,
  50,
  68,
  4,
  3,
  18,
  9,
  39,
  51,
  52,
  6,
  6,
  2,
  2,
  7,
  29,
  29,
  39,
  4,
  4,
  4,
  3,
  3,
  3,
  53,
  53,
  69,
  30,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  15,
  15,
  15,
  15,
  15,
  15,
  15,
  15,
  15,
  15,
  15,
  16,
  16,
  16,
  16,
  16,
  16],
 [16,
  16,
  16,
  16,
  16,
  21,
  21,
  21,
  21,
  21,
  21,
  21,
  21,
  6,
  6,
 

## Check if the defined pad_sequences function works properly

In [36]:
padded_field_sequences = pad_sequences(field_sequences)
print("Length of padded_sequences: ", padded_field_sequences.shape)

Length of padded_sequences:  (9, 99)


## Perform structuring of the label_sentences and the content_words in order to create a unified vocabulary of it (for copy mechanism):

Step 1: convert the label_sentences into a single flat list (order preserved) in order to tokenize it

In [44]:
def print_iterator(it):
    for x in it:
        print(x, end=' ')
    print('')  # for new line

In [45]:
# extract the length information from the label_sentences
label_sentences_lengths = map(lambda x: len(x.split()), label_sentences)
print_iterator(label_sentences_lengths)

87 14 26 35 29 35 94 33 49 59 


In [50]:
from functools import reduce
''' Warning: This is a huge map - reduce operation. And may take a long time to execute '''
label_words_list = reduce(lambda x,y: x + y, map(lambda x: x.split(), label_sentences))
print(label_words_list[:10])

['<start>', 'pope', 'michael', 'iii', 'of', 'alexandria', '-lrb-', 'also', 'known', 'as']


Step 2: store the lengths of the label_words_list and the content words in order to generate a unified vocabulary

In [51]:
content_words_label_words_split_point = len(content_words)

In [52]:
# concatenate the content_words and the label_words_list
unified_sequence = content_words + label_words_list
print("total length: ", len(unified_sequence))

total length:  994


In [53]:
# now use the tokenizer for this purpose:
temp, content_label_dict, rev_content_label_dict, vocab_size_content_label = prepare_tokenizer(unified_sequence)

In [54]:
# now again split the two lists separately and finally group them together to obtain the final stuff
content_seq = temp[: content_words_label_words_split_point]
label_seq = temp[content_words_label_words_split_point: ]

In [55]:
# use the group tokenized sequences function to restructure the tokenized input
content_seq = np.squeeze(content_seq).tolist()
label_seq = np.squeeze(label_seq).tolist()

content_sequences, label_sequences = (group_tokenized_sequences(content_seq, pair_lengths),
                                          group_tokenized_sequences(label_seq, label_sentences_lengths))

## Create the metadata file for the tensorboard_projector:

In [56]:
# metadata file path => Models/Metadata/
metadata_path = os.path.join(base_model_path, "Metadata")
print(metadata_path)

Models\Metadata


In [57]:
create_dot_vocab(field_dict, os.path.join(metadata_path, "fields.vocab"))

The file already exists:  Models\Metadata\fields.vocab


In [58]:
create_dot_vocab(content_label_dict, os.path.join(metadata_path, "content_labels.vocab"))

The file already exists:  Models\Metadata\content_labels.vocab


## Finally, perform the pickling of the Processed data

In [59]:
# create the structured dictionary to pickle in the pickle file:
pickling_data = {
    # ''' Input structured data: '''
    
    # field_encodings and related data:
    'field_encodings': field_sequences,
    'field_dict': field_dict,
    'field_rev_dict': rev_field_dict,
    'field_vocab_size': vocab_size_field,
    
    # content encodings and related data:
    'content_encodings': content_sequences,
    
    # ''' Label summary sentences: '''
    
    # label encodings and related data:
    'label_encodings': label_sequences,
    
    # V union C related data:
    'content_union_label_dict': content_label_dict,
    'rev_content_union_label_dict': rev_content_label_dict,
    'content_label_vocab_size': vocab_size_content_label
}

use the function from this repository -> https://github.com/akanimax/machine-learning-helpers to perform pickling and unpickling. The code has been taken exactly and packaged in the Text_Preprocessing_Helpers module of this implementation

In [60]:
# pickle the above defined dictionary at the plug_and_play_data_file path
pickleIt(pickling_data, plug_and_play_data_file)

The pickle file already exists: ../Data\plug_and_play.pickle


Thus, the purpose of this notebook is now complete. We can directly use this pickled data and start building the tensorflow graph to go forward.

## See you in the graph building module! Asta la vista!