In [3]:
import os.path
import re
import sys
import numpy as np
import json
import time
from six.moves import urllib
import matplotlib as mpl
from pprint import pprint
import pandas as pd

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Preprocessing - BM Caption dataset

- Remove duplication by image url  - DONE
- Remove materials tags - DONE
- Remove dates tags - DONE
- Remove parenthesis stuff - DONE
- Remove images with short/non-descriptive captions  - DONE


In [2]:
#Load the data we pre-grabbed from the SPARQL BM endpoint(http://collection.britishmuseum.org/sparql)
with open('/data/captioning/bm_prints_urls_captions.json') as f:
    data = json.load(f)

In [3]:
img_arr = data['results']['bindings']
len(img_arr)

782550

In [20]:
pprint(img_arr[0:20])

[{u'caption': {u'type': u'literal',
               u'value': u'From the animal legend; antelope to left, another(?) lying centre. 1912\nWoodcut'},
  u'print': {u'type': u'uri',
             u'value': u'http://collection.britishmuseum.org/id/object/PPA8330'},
  u'url': {u'type': u'uri',
           u'value': u'http://www.britishmuseum.org/collectionimages/AN00107/AN00107870_001_l.jpg'}},
 {u'caption': {u'type': u'literal',
               u'value': u'From the animal legend; antelope to left, another(?) lying centre. 1912\nWoodcut'},
  u'print': {u'type': u'uri',
             u'value': u'http://collection.britishmuseum.org/id/object/P_1982-0724-21'},
  u'url': {u'type': u'uri',
           u'value': u'http://www.britishmuseum.org/collectionimages/AN00107/AN00107870_001_l.jpg'}},
 {u'caption': {u'type': u'literal',
               u'value': u'From the animal legend; antelope to left, another(?) lying centre. 1912\nWoodcut'},
  u'print': {u'type': u'uri',
             u'value': u'http://collec

In [4]:
start = time.time()

# Filter by unqiue uris
from pandas.io.json import json_normalize

df = json_normalize(img_arr)
df.drop('caption.type', axis=1, inplace=True)
df.drop('print.type', axis=1, inplace=True)
df.drop('url.type', axis=1, inplace=True)
df.head()

print "Time:" + str(time.time() - start)

Time:52.4418330193


In [5]:
df.head()

Unnamed: 0,caption.value,print.value,url.value
0,"From the animal legend; antelope to left, anot...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...
1,"From the animal legend; antelope to left, anot...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...
2,"From the animal legend; antelope to left, anot...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...
3,Woodblock print. Giant lantern in the Kaminari...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...
4,Woodblock print. Giant lantern in the Kaminari...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...


In [37]:
# Drop duplicates according to caption.value and url.value
start = time.time()

print len(df)
df.drop_duplicates(subset=['caption.value', 'url.value'], inplace=True)
print len(df)

print "Time:" + str(time.time() - start)

782550
258103
Time:8.21036314964


In [40]:
df_count = df.groupby(['url.value']).size().reset_index().rename(columns={0:'count'})

In [53]:
# look if some objects have more than 1 captions
df_count['count'].value_counts()

1    253884
2      2072
3        11
4         6
6         3
Name: count, dtype: int64

In [6]:
#Drop further dup urls
df.drop_duplicates(subset=['url.value'], inplace=True)
print len(df)

255976


In [7]:
# get shortering data to work with
df_tiny = df[:5].copy()

In [8]:
df_tiny['capt_length'] = df_tiny['caption.value'].map(lambda x: len(x.split()))
df_tiny

Unnamed: 0,caption.value,print.value,url.value,capt_length
0,"From the animal legend; antelope to left, anot...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,12
3,Woodblock print. Giant lantern in the Kaminari...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,10
6,Woodblock print. Spring night at Ginza.,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,6
9,Le Mont de Neuville; street winding to right b...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,35
12,Monochrome woodblock print. Subodai standing. ...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,10


In [9]:
## Add caption length
df['capt_length'] = df['caption.value'].map(lambda x: len(x.split()))
df.tail(5)

Unnamed: 0,caption.value,print.value,url.value,capt_length
782535,Woodblock print. Popular culture. Girl behind ...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,35
782538,Woodblock print. Facsimile reproduction. The k...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,32
782541,Woodblock print. Itinerant noodle vendor. Acco...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,9
782544,Woodblock print. Popular culture. Lady attendi...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,10
782547,Woodblock print. Straw dragon on a bamboo pole...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,17


In [137]:
#reset index 
df.reset_index(drop=True)

Unnamed: 0,caption.value,print.value,url.value,capt_length
0,"From the animal legend; antelope to left, anot...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,12
1,Woodblock print. Giant lantern in the Kaminari...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,10
2,Woodblock print. Spring night at Ginza.,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,6
3,Le Mont de Neuville; street winding to right b...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,35
4,Monochrome woodblock print. Subodai standing. ...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,10
5,Colour woodblock print with powdered mica back...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,22
6,Woodblock print. Beauty with unrolled obi?,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,6
7,Colour woodblock print with gold leaf and powd...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,24
8,"Illustrated book, orihon (folding album), 5 vo...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,31
9,"Illustrated book, folding album, 2 vols. Varie...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,11


In [10]:
#plot occurrences of each length

from collections import Counter
c = Counter(df['capt_length'])
plt.plot(*zip(*sorted(c.items())))
plt.xlim(1,10)

NameError: name 'plt' is not defined

In [11]:
#There's roughly around 1k of words less than 5. Remove them.
df = df[df['capt_length'] > 5]
df.reset_index(drop=True)

Unnamed: 0,caption.value,print.value,url.value,capt_length
0,"From the animal legend; antelope to left, anot...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,12
1,Woodblock print. Giant lantern in the Kaminari...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,10
2,Woodblock print. Spring night at Ginza.,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,6
3,Le Mont de Neuville; street winding to right b...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,35
4,Monochrome woodblock print. Subodai standing. ...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,10
5,Colour woodblock print with powdered mica back...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,22
6,Woodblock print. Beauty with unrolled obi?,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,6
7,Colour woodblock print with gold leaf and powd...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,24
8,"Illustrated book, orihon (folding album), 5 vo...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,31
9,"Illustrated book, folding album, 2 vols. Varie...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,11


In [None]:
list(df['caption.value'].sample(100))

In [13]:
from dateutil.parser import parse

def is_date(string):
    try: 
        parse(string)
        return True
    except (TypeError, ValueError):
        return False

In [14]:
# We will extract the date + the material from the description and put into another column. 

is_date_ct = 0
got_material = 0

def stripExtras(capt):
    global is_date_ct
    global got_material
    arr = capt['caption.value'].split('\n')
    mat = 'n/a'
    orig = capt['caption.value'].strip()
    if len(arr) > 1: 
        mat = arr[-1]
        orig = '\n'.join(arr[:-1]).strip()
        got_material+=1
        
    arr = orig.split('.')
    date = 'n/a'
    if len(arr) > 1 and len(arr[-1]) > 0:
        if (is_date(arr[-1])):
            is_date_ct += 1
            date = arr[-1]
            orig = '.'.join(arr[:-1]).strip()       
        
    return pd.Series([orig, mat, date])

# def stripDates(capt):
#     #print capt
#     arr = capt['caption.value'].split('.')
#     res = 'n/a'
#     orig = capt['caption.value'].strip()
#     if len(arr) > 1:
#         print "Date: ", arr[-1]
#         print "\n\n"
#         #if (is_date(arr[-1])):
#         #    res = arr[-1]
#         #    orig = '.'.join(arr[:-1]).strip()
        
#    return pd.Series([orig, res])

df[['caption.value_cleaned', 'ext_dates', 'ext_materials']] = df.apply(stripExtras, axis=1)
df.head(10)

Unnamed: 0,caption.value,print.value,url.value,capt_length,caption.value_cleaned,ext_dates,ext_materials
0,"From the animal legend; antelope to left, anot...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,12,"From the animal legend; antelope to left, anot...",Woodcut,1912.0
3,Woodblock print. Giant lantern in the Kaminari...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,10,Woodblock print. Giant lantern in the Kaminari...,,
6,Woodblock print. Spring night at Ginza.,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,6,Woodblock print. Spring night at Ginza.,,
9,Le Mont de Neuville; street winding to right b...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,35,Le Mont de Neuville; street winding to right b...,"Charcoal (?), with watercolour, over monotype",1899.0
12,Monochrome woodblock print. Subodai standing. ...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,10,Monochrome woodblock print. Subodai standing. ...,,
15,Colour woodblock print with powdered mica back...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,22,Colour woodblock print with powdered mica back...,,
18,Woodblock print. Beauty with unrolled obi?,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,6,Woodblock print. Beauty with unrolled obi?,,
21,Colour woodblock print with gold leaf and powd...,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,24,Colour woodblock print with gold leaf and powd...,,
24,"Illustrated book, orihon (folding album), 5 vo...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,31,"Illustrated book, orihon (folding album), 5 vo...",,
27,"Illustrated book, folding album, 2 vols. Varie...",http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,11,"Illustrated book, folding album, 2 vols. Varie...",,


In [15]:
print float(is_date_ct)/len(df)
print float(got_material)/len(df)

#mistake
df.rename(columns = {'ext_dates': 'materials'}, inplace = True)
df.rename(columns = {'ext_materials': 'dates'}, inplace = True)

0.486805825968
0.921739609268


In [16]:
list(df.sample(10)['caption.value_cleaned'])

[u"Portrait of the actor John Fawcett, in character as Caleb Quotem, in Lee's 'Throw Physic to the Dogs'; bust, looking upwards to the right, wearing wig and embroidered waistcoat; in oval; after De Wilde",
 u'(For description see other impression)',
 u'No.15: a tall young officer wearing a small-sword and holding a pike',
 u'Portrait of the medallist Joseph Roettier, half-length; in an oval; fourth state with extended inscription in cartouche below portrait; after Nicolas de Largilli\xe8re',
 u'Portrait of French dramatist Cr\xe9billon, bust-length, in profile to the right, on dark ground and in oval frame; tablet in the lower part',
 u"Portrait of a woman, probably Emma Johnston, the artist's wife, seen almost whole-length, sleeping in a chair.  c",
 u'Portrait of Jacob Roelants, nearly half-length to right, looking towards the viewer; wearing a  skull cap, ruff, dark doublet and wide ribbon for an order around his neck; in ornate frame, coat of arms with crest and cartouche for insc

In [17]:
start = time.time()

#use regex to extract out parentehsis stuff
has_paren_ct = 0

import re
def stripParen(input):
    global has_paren_ct
    s = input['caption.value_cleaned']
    out = re.sub('\(.*?\)', '', s).strip()
    out = re.sub('\[.*?\]', '', out).strip()
    if s != out:
        has_paren_ct += 1
    return pd.Series([out])


df[['caption.value_cleaned2']] = df.apply(stripParen, axis=1)
print list(df.head(5)['caption.value_cleaned'])
print list(df.head(5)['caption.value_cleaned2'])
print "Time:" + str(time.time() - start)
print "Percentage affected:" + str(float(has_paren_ct)/len(df))

[u'From the animal legend; antelope to left, another(?) lying centre', u'Woodblock print. Giant lantern in the Kaminari-mon of Senso-ji temple.', u'Woodblock print. Spring night at Ginza.', u'Le Mont de Neuville; street winding to right beyond, flanked by buildings on both sides, at right figure in doorway and beyond figure standing in middle of road', u'Monochrome woodblock print. Subodai standing. Signed, sealed, inscribed and marked.']
[u'From the animal legend; antelope to left, another lying centre', u'Woodblock print. Giant lantern in the Kaminari-mon of Senso-ji temple.', u'Woodblock print. Spring night at Ginza.', u'Le Mont de Neuville; street winding to right beyond, flanked by buildings on both sides, at right figure in doorway and beyond figure standing in middle of road', u'Monochrome woodblock print. Subodai standing. Signed, sealed, inscribed and marked.']
Time:77.8564760685
Percentage affected:0.193340177233


In [18]:
print list(df[6000:6010]['caption.value_cleaned'])
print list(df[6000:6010]['caption.value_cleaned2'])

[u"Plate V: Joseph, wearing a loin-cloth and shackled, stands gesticulating before Pharaoh and his queen, enthroned with a harp player kneeling beside them at r; two guards, holding Joseph's chains, and an ibis at left", u'Business card for the mould-maker Malzieux; with decorative border around inscription, of foliage, casts and tools, figures and animals', u'Woman standing at a window, whole-length; wearing very full skirts.', u'View over rolling landscape to town, rright, with tree to left', u'Woman standing at a window, whole-length; wearing very full skirts.', u'Sheet 21 (sig. l): the page is divided into two halves, in the upper compartment God is adored by angels and kings, below the temple of God (titled) is placed above the ark of the covenant, next to wwhich two angels are swinging censers; inscriptions', u'Hunting scene; four horsemen in medieval costume in the background, with a fox chased by hound in the foreground', u'A Gothic tabernacle, shown in elevation and (on a sepa

In [21]:
store = pd.HDFStore('/data/captioning/bm_data_clean.h5')

In [22]:
df_to_store = df[['print.value', 'url.value', 'caption.value_cleaned2', 'materials', 'dates']]
df_to_store.head(5)

Unnamed: 0,print.value,url.value,caption.value_cleaned2,materials,dates
0,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,"From the animal legend; antelope to left, anot...",Woodcut,1912.0
3,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Woodblock print. Giant lantern in the Kaminari...,,
6,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Woodblock print. Spring night at Ginza.,,
9,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Le Mont de Neuville; street winding to right b...,"Charcoal (?), with watercolour, over monotype",1899.0
12,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Monochrome woodblock print. Subodai standing. ...,,


In [23]:
store['df'] = df_to_store.rename(index=str, columns={'caption.value_cleaned2': 'captions'}).reset_index(drop=True)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['print.value', 'url.value', 'captions', 'materials', 'dates']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [24]:
store['df']

Unnamed: 0,print.value,url.value,captions,materials,dates
0,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,"From the animal legend; antelope to left, anot...",Woodcut,1912
1,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Woodblock print. Giant lantern in the Kaminari...,,
2,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Woodblock print. Spring night at Ginza.,,
3,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Le Mont de Neuville; street winding to right b...,"Charcoal (?), with watercolour, over monotype",1899
4,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Monochrome woodblock print. Subodai standing. ...,,
5,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Colour woodblock print with powdered mica back...,,
6,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Woodblock print. Beauty with unrolled obi?,,
7,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Colour woodblock print with gold leaf and powd...,,
8,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,"Illustrated book, orihon , 5 vols . Instructio...",,
9,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,"Illustrated book, folding album, 2 vols. Varie...",,


In [25]:
store.close()

In [4]:
df2 = pd.read_hdf('/data/captioning/bm_data_clean.h5', 'df')
df2

Unnamed: 0,print.value,url.value,captions,materials,dates
0,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,"From the animal legend; antelope to left, anot...",Woodcut,1912
1,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Woodblock print. Giant lantern in the Kaminari...,,
2,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Woodblock print. Spring night at Ginza.,,
3,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Le Mont de Neuville; street winding to right b...,"Charcoal (?), with watercolour, over monotype",1899
4,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Monochrome woodblock print. Subodai standing. ...,,
5,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Colour woodblock print with powdered mica back...,,
6,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Woodblock print. Beauty with unrolled obi?,,
7,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,Colour woodblock print with gold leaf and powd...,,
8,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,"Illustrated book, orihon , 5 vols . Instructio...",,
9,http://collection.britishmuseum.org/id/object/...,http://www.britishmuseum.org/collectionimages/...,"Illustrated book, folding album, 2 vols. Varie...",,


In [6]:
list(df2.sample(100)['url.value'])

[u'http://www.britishmuseum.org/collectionimages/AN00862/AN00862742_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN00082/AN00082427_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN00976/AN00976435_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN00086/AN00086856_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN00342/AN00342448_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN00118/AN00118567_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN01016/AN01016233_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN00156/AN00156783_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN00980/AN00980032_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN01022/AN01022508_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN00572/AN00572858_001_l.jpg',
 u'http://www.britishmuseum.org/collectionimages/AN00062/AN00062217_001_l.jpg',
 u'http://www.britishmuseum.org/collecti