In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
import re
import os
import xml.etree.ElementTree as ET

from visuals import *

def get_structed_data(data_path):
    all_data = {}
    for filename in glob(data_path):
        tree = ET.parse(filename)
        root = tree.getroot()
        tmp = []
        ids = []
        pos_tag = []
        writer_id = root.attrib['writer-id']
            
        for part in root.findall('handwritten-part'):
            for line in part.findall('line'):
                for word in line.findall('word'):
                    tmp.append(word.attrib['text'].rstrip())
                    ids.append(word.attrib['id'].rstrip())
                    pos_tag.append(word.attrib['tag'].rstrip())
        assert(len(tmp) == len(ids) == len(pos_tag))
        all_data[filename.split('/')[-1].split('.')[0]] = [tmp, ids, pos_tag]
#         print(all_data)
    return all_data


def create_dataframe(all_data):
    dat = pd.DataFrame(all_data).T.reset_index()
    dat.columns = ['filename', 'meta', 'ids', 'pos_tag']
    dat['folder'] = dat.filename.map(lambda x: x.split('-')[0])
    dat['meta'] = dat.meta.map(lambda x: np.array([i.replace('&quot;', '"') for i in x]))
    dat['document'] = dat.filename.map(lambda x: re.sub(r'[A-Za-z]', '', x.split('-')[-1]))
    dat['document'] = dat['folder'] + '-' + dat['document']
    return dat


def main_meta(data_path, save=False, save_path='../../data/preprocessed'): 
    all_data = get_structed_data(data_path)
    dat = create_dataframe(all_data)
    # to CSV 
    if save:
        print('saving data to {}'.format(save_path))
        dat.to_csv(os.path.join(save_path, 'meta.csv'), index=False)
        dat.to_json(os.path.join(save_path, 'meta.json'))

In [6]:
dat.head()

Unnamed: 0,filename,meta,ids,pos_tag,folder,document
0,a01-000u,"[A, MOVE, to, stop, Mr., Gaitskell, from, nomi...","[a01-000u-00-00, a01-000u-00-01, a01-000u-00-0...","[AT, NN, TO, VB, NPT, NP, IN, VBG, DTI, AP, NN...",a01,a01-000
1,a01-000x,"[A, MOVE, to, stop, Mr., Gaitskell, from, nomi...","[a01-000x-00-00, a01-000x-00-01, a01-000x-00-0...","[AT, NN, TO, VB, NPT, NP, IN, VBG, DTI, AP, NN...",a01,a01-000
2,a01-003,"[Though, they, may, gather, some, Left-wing, s...","[a01-003-00-00, a01-003-00-01, a01-003-00-02, ...","[CS, PP3AS, MD, VB, DTI, JJB, NN, ,, AT, JJ, N...",a01,a01-003
3,a01-003u,"[Though, they, may, gather, some, Left-wing, s...","[a01-003u-00-00, a01-003u-00-01, a01-003u-00-0...","[CS, PP3AS, MD, VB, DTI, JJB, NN, ,, AT, JJ, N...",a01,a01-003
4,a01-003x,"[Though, they, may, gather, some, Left-wing, s...","[a01-003x-00-00, a01-003x-00-01, a01-003x-00-0...","[CS, PP3AS, MD, VB, DTI, JJB, NN, ,, AT, JJ, N...",a01,a01-003


In [1]:
data_path = '../../data/raw/word_level'
meta_data_path = '../../data/preprocessed/meta.csv'
meta_json_data_path = '../../data/preprocessed/meta.json'

In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
from glob import glob
import re
import os
from sklearn.model_selection import train_test_split


def duplicate_row(df, col_name):
    """When cell contents are lists, create a row for each element in the list"""
    series = df.apply(lambda x: pd.Series(x[col_name]),axis=1).stack().reset_index(level=1, drop=True)
    series.name = col_name
    return series
    
def create_word_level_df(df, cols=[]):
    """Combine multiple Series into a pandas DataFrame"""
    meta_series = duplicate_row(df, 'meta')
    id_series = duplicate_row(df, 'ids')
    pos_series = duplicate_row(df, 'pos_tag')
    df = df.drop(['meta', 'ids', 'pos_tag'], axis=1).join(pd.concat([meta_series, id_series, pos_series], axis=1))
    return df.rename(columns={'meta': 'token', 'ids': 'image_name'}).reset_index()

def absoluteFilePaths(directory):
    """Walk filepaths"""
    for dirpath,_,filenames in os.walk(directory):
        for f in filenames:
            yield os.path.join(dirpath, f)

def create_image_path(df, data_path):
    """Create dictionary for mapping of word to data path"""
    all_paths = [i for i in absoluteFilePaths(data_path)]
    all_path_endings = [i.split('/')[-1].split('.')[0] for i in all_paths]
    all_path_dict = defaultdict(lambda: 0, dict(zip(all_path_endings, all_paths)))
    df['image_path'] = df['image_name'].map(lambda x: all_path_dict[x])
    return df


def main_word_level(meta_json_data_path, image_data_path, test_size=0.2, save=False, 
                                        save_path='../../data/preprocessed'):
    meta = pd.read_json(meta_json_data_path)
    meta.drop(index=494, axis=1, inplace=True)
    word_level_df = create_word_level_df(meta)
    word_level_df = create_image_path(word_level_df, image_data_path)
    # split to train / test 
    train, test = train_test_split(word_level_df, test_size=test_size, random_state=100)
    if save:
        train.to_csv(os.path.join(save_path, 'word_level_train.csv'), index=False)
        test.to_csv(os.path.join(save_path, 'word_level_test.csv'), index=False)
    
    
if __name__ == '__main__':
    image_data_path = '../../data/raw/word_level'
    meta_json_data_path = '../../data/preprocessed/meta.json'    
    main_word_level(meta_json_data_path, image_data_path, save=False, test_size=0.2)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [6]:
word_level_df.head()

Unnamed: 0,index,filename,folder,document,token,image_name,pos_tag,image_path
0,0,d05-040,d05,d05-040,Or,d05-040-00-00,CC,../../data/raw/word_level/d05/d05-040/d05-040-...
1,0,d05-040,d05,d05-040,when,d05-040-00-01,WRB,../../data/raw/word_level/d05/d05-040/d05-040-...
2,0,d05-040,d05,d05-040,he,d05-040-00-02,PP3A,../../data/raw/word_level/d05/d05-040/d05-040-...
3,0,d05-040,d05,d05-040,found,d05-040-00-03,VBD,../../data/raw/word_level/d05/d05-040/d05-040-...
4,0,d05-040,d05,d05-040,cock-fighting,d05-040-00-04,NN,../../data/raw/word_level/d05/d05-040/d05-040-...


In [7]:
train, test = train_test_split(word_level_df, test_size=0.2, random_state=100)

In [8]:
train.head()

Unnamed: 0,index,filename,folder,document,token,image_name,pos_tag,image_path
64105,860,e04-011,e04,e04-011,complete,e04-011-07-06,VB,../../data/raw/word_level/e04/e04-011/e04-011-...
80300,1070,b06-036,b06,b06-036,must,b06-036-01-02,MD,../../data/raw/word_level/b06/b06-036/b06-036-...
86781,1158,e07-101,e07,e07-101,is,e07-101-00-06,BEZ,../../data/raw/word_level/e07/e07-101/e07-101-...
89113,1188,h07-075,h07,h07-075,for,h07-075-06-01,IN,../../data/raw/word_level/h07/h07-075/h07-075-...
78107,1041,g02-073,g02,g02-073,produce,g02-073-03-04,VB,../../data/raw/word_level/g02/g02-073/g02-073-...


In [2]:
!ls ../../data/preprocessed/

example.txt          meta_json.csv        word_level_test.csv
meta.csv             meta_json.json       word_level_train.csv
meta.json            word_level_meta.csv


In [3]:
test = pd.read_csv('../../data/preprocessed/word_level_test.csv')

In [4]:
test.head()

Unnamed: 0,index,filename,folder,document,token,image_name,pos_tag,image_path
0,1111,b01-027,b01,b01-027,of,b01-027-00-05,INO,../../data/raw/word_level/b01/b01-027/b01-027-...
1,68,p03-185,p03,p03-185,.,p03-185-08-06,.,../../data/raw/word_level/p03/p03-185/p03-185-...
2,1043,h01-024,h01,h01-024,as,h01-024-02-01,IN,../../data/raw/word_level/h01/h01-024/h01-024-...
3,1162,h07-060b,h07,h07-060,the,h07-060b-03-03,ATI,../../data/raw/word_level/h07/h07-060b/h07-060...
4,1168,g06-050d,g06,g06-050,strong,g06-050d-04-02,JJ,../../data/raw/word_level/g06/g06-050d/g06-050...


In [11]:
import pandas as pd
import numpy as np
import codecs
from nltk.tokenize import sent_tokenize
from collections import Counter, defaultdict
import re
import dill as pickle
import random
import os

def create_full_path(wiki_path, dataset):
    """Joins to paths together for wiki dataset"""
    return os.path.join(wiki_path, 'wiki.{}.tokens'.format(dataset))

def create_data(wiki_path, dataset, available_image_letters, save=False, target_sep=['<<', '>>'], 
                threshold_length=3, iteration_threshold=30, out_path='../../data/processed/'):
    """
    Loads and searchs for useable target word (one we have an image for) and saves wiki text dataset
    
    Args:
    
    
    Returns:
    
    """
    dataset_name = wiki_path.split('/')[-1]
    data_path = create_full_path(wikitext2_path, 'train')
    
    random.seed(100)
    X = []
    y = []
    raw = []
    bad_words = ['=', '戦', '場', 'の', 'ヴ', 'ァ', 'ル', 'キ', 'ュ', 'リ', 'ア', '戦場のヴァルキュリア3']
    N = 0
    bad_lines = 0
    counter = Counter()
    
    with codecs.open(data_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.rstrip():
                if not any(bad_word in line for bad_word in bad_words):
                    for sent in sent_tokenize(line):
                        split_sent = sent.split()
                        found = True
                        iteration = 0
                        while (found == True):
                            iteration += 1
                            random_int = random.randint(0, len(split_sent)-1)
                            target = split_sent[random_int]
                            if (len(target) >= threshold_length) and (target in available_image_letters):
#                                 X.append(split_sent[:random_int] + ['<TARGET>'] + split_sent[random_int+1:])
                                X.append(split_sent[:random_int] + [''.join([target_sep[0]] + [target] + [target_sep[1]])] 
                                             + split_sent[random_int+1:])
                                y.append((target, random_int))
                                raw.append(split_sent)
                                found = False
                                # get word counts 
                                for token in split_sent:
                                    counter[token] += 1

                            if (iteration >= iteration_threshold):
                                found = False
                                bad_lines += 1
                if N % 5000 == 0:
                    print('processed {} lines'.format(N))
#                 if N >= 100: break
                N += 1
        print('number of skipped lines', bad_lines)
    if save:
        print('Save Path', out_path + '----' + dataset_name + '-' + dataset)
        np.save(out_path + 'X_' + dataset_name + '-' + dataset, X)
        np.save(out_path + 'y_' + dataset_name + '-' + dataset, y)
        np.save(out_path + 'raw' + dataset_name + '-' + dataset, raw)


    return X, y, raw, counter


def main_wiki(wikitext_path, dataset, save=False):
    print('Reading word level meta from {}'.format(dataset))
    word_level_meta_path = '../../data/preprocessed/word_level_{}.csv'.format(dataset)
    # TODO if dataset = 'test' concat test and validation sets. 
    word_level_df = pd.read_csv(word_level_meta_path)
    available_image_letters = word_level_df.token.values.tolist()
    print('first 10 available image letters: ', available_image_letters[:10])
    print('Building Dataset')
    X, y, raw, counter = create_data(wikitext_path, dataset, available_image_letters, save=save)
    return X, y, raw, counter

if __name__ == '__main__':
    wikitext2_path = '../../data/raw/language_model/wikitext-2'
    wikitext103_path = '../../data/raw/language_model/wikitext-103'
    word_level_meta_path_all = '../../data/preprocessed/word_level_meta.csv'
    word_level_meta_path_train = '../../data/preprocessed/word_level_train.csv'
    word_level_meta_path_test = '../../data/preprocessed/word_level_test.csv'
    
    X, y, raw, counter = main_wiki(wikitext2_path, dataset='test', save=False)
    

['of', '.', 'as', 'the', 'strong', "Kennedy's", 'couple', 'that', 'whole', 'commented']
processed 0 lines
processed 5000 lines
processed 10000 lines
processed 15000 lines
processed 20000 lines
number of skipped lines 3189


In [12]:
len(X)

77567

In [13]:
len(y)

77567

In [15]:
# list(zip(X, y))[:10]

In [16]:
word_level_meta_path

NameError: name 'word_level_meta_path' is not defined