# BL books metadata

This notebook contains the code that was used to create a sample of books from the British Library 19th century books collection.

In [27]:
# imports

import pandas as pd
import json, os, shutil, codecs

In [2]:
# load the semicolon-separated list of books into a data frame

filename = "MicrosoftBooks_filtered_list.csv"
df = pd.read_csv(filename, sep=";")

In [3]:
# preview the first three records to check that the file was parsed as expected
df.head(3)

Unnamed: 0,Aleph system no.,Country code,Language code (008),Language code (041),DDC,Personal author,Corporate author,Title,Edition,Imprint,Series,Subjects,Other personal authors,Other corporate authors,DOM ID,Type,Genre
0,14602826,|||,und,,,"Yearsley, Ann, 1753-1806.",,[Poems on several occasions. [With a prefatory...,The fourth edition. MS. note.,"London, 1786.",,,"More, Hannah, 1745-1833",,lsidyv3d812d61,poet,Poetry
1,14602830,|||,und,,,"A., T.",,A Satyr against Vertue. (A poem: supposed to b...,,"London, 1679.",,,"Oldham, John, 1653-1683.",,lsidyv2eb1b8fa,poet,Poetry
2,14602831,|||,und,,,,,"The Aeronaut, a poem; founded almost entirely,...",,"Dublin : Richard Milliken, 1816.",,Dublin (Ireland),,,lsidyv2eb0f8a4,poet,Poetry


In [4]:
# number of records per value of the "Language code (008)" column, most frequent first
df["Language code (008)"].value_counts()

eng    19250
und    18877
fre     3833
ger     3161
spa      769
ita      642
rus      582
mul      576
dut      551
hun      256
swe      244
dan      231
pol      175
por       75
lat       74
cze       58
gre       45
fin       19
scc       12
gle        5
rum        3
ice        3
lit        3
scr        3
slv        2
bul        2
slo        2
nno        2
Name: Language code (008), dtype: int64

In [5]:
# number of records per genre (records without a genre are not counted by value_counts)
df["Genre"].value_counts()

Prose     10532
Poetry     6150
Drama      1647
Music       980
Name: Genre, dtype: int64

In [6]:
# keep only the records whose 008 language code is "eng"
is_english = df["Language code (008)"] == "eng"
df_eng = df[is_english]

In [7]:
# genre distribution within the English-language subset
df_eng["Genre"].value_counts()

Prose     3795
Poetry     876
Music      192
Drama      124
Name: Genre, dtype: int64

In [8]:
# number of books to draw from each genre
how_many = 120

# draw an equally sized sample from every genre of the English subset;
# the fixed random_state makes the draw repeatable
samples_by_genre = {
    genre: df_eng[df_eng["Genre"] == genre].sample(how_many, random_state=42)
    for genre in ("Prose", "Poetry", "Music", "Drama")
}
df_prose = samples_by_genre["Prose"]
df_poetry = samples_by_genre["Poetry"]
df_music = samples_by_genre["Music"]
df_drama = samples_by_genre["Drama"]

# stack the four samples in the order prose, poetry, music, drama
df_sampled = pd.concat([df_prose, df_poetry, df_music, df_drama])

In [9]:
# non-null values per column of the sample (shows which metadata fields are sparsely filled)
df_sampled.count()

Aleph system no.           480
Country code               480
Language code (008)        480
Language code (041)          1
DDC                          0
Personal author            408
Corporate author            10
Title                      480
Edition                     52
Imprint                    480
Series                       3
Subjects                    26
Other personal authors     127
Other corporate authors      0
DOM ID                     480
Type                       480
Genre                      480
dtype: int64

In [10]:
# keep only the first pdf identifier of every book:
# the "DOM ID" value is cut at the first " -- " separator

df_sampled["first_pdf"] = df_sampled["DOM ID"].map(lambda dom_id: dom_id.split(" -- ", 1)[0])

In [11]:
# array of the first-pdf identifiers of the sampled books, used below to select their metadata
pdfs = df_sampled["first_pdf"].values

In [14]:
# load metadata

metadata_file = "book_data.json"
# read through a context manager so the file handle is closed once parsing is done
with open(metadata_file) as metadata_handle:
    meta = json.load(metadata_handle)
# keep only the books that have a "pdf" entry
meta = [b for b in meta if "pdf" in b]

In [15]:
# pdf ids are set to "0" when a book has only one pdf, and start at "1" when it
# has more than one; the function below makes "1" the first pdf in both cases

def correct_pdf_ids(book):
    """Rename the "0" entry of ``book["pdf"]`` to "1", modifying the book in place.

    A book whose ``pdf`` mapping has no "0" key is left untouched.
    Nothing is returned.
    """
    pdf_ids = book["pdf"]
    if "0" not in pdf_ids:
        return
    pdf_ids["1"] = pdf_ids.pop("0")
# apply the correction to every book; a plain loop is used because the function
# works by side effect only, and assigning to `_` would shadow IPython's last-output variable
for book in meta:
    correct_pdf_ids(book)

In [16]:
# inspect the first metadata record to see which fields are available
meta[0]

{'datefield': '[1888]',
 'imgs': {'0': {'000006': ['11193640604'],
   '000021': ['11291327843'],
   '000004': ['11194557546'],
   '000016': ['11195895503'],
   '000011': ['11193211526'],
   '000010': ['11105407186', '11102797916'],
   '000012': ['11290477144'],
   '000037': ['11290757205'],
   '000035': ['11290567494'],
   '000027': ['11196186866'],
   '000033': ['11193563513'],
   '000032': ['11105696613'],
   '000031': ['11289757376'],
   '000030': ['11289521295'],
   '000023': ['11289589926'],
   '000014': ['11291300706'],
   '000015': ['11100321335'],
   '000024': ['11291438044'],
   '000007': ['11104733396'],
   '000025': ['11289879506']}},
 'shelfmarks': ['British Library HMNTS 10347.cc.13.(4.)'],
 'publisher': 'A. Heywood & Son',
 'title': ['A Gossip about Old Manchester. With illustrations. [Signed: A.]'],
 'edition': '',
 'flickr_url_to_book_images': 'http://www.flickr.com/photos/britishlibrary/tags/sysnum000000037',
 'place': 'Manchester',
 'issuance': 'monographic',
 'author

In [19]:
# keep the metadata of the sampled books only:
# a book is kept when its pdf "1" is one of the sampled first-pdf identifiers

meta_sampled = []
for book in meta:
    book_pdfs = book["pdf"]
    if "1" not in book_pdfs:
        continue
    if book_pdfs["1"] in pdfs:
        meta_sampled.append(book)

In [20]:
# number of sampled books that were found in the metadata
# NOTE(review): this can be lower than the size of df_sampled — presumably some first pdfs have no metadata entry; confirm
len(meta_sampled)

452

In [24]:
# get the full text files

local_folder = "sample"
remote_folder = "../BL_books/json" # add yours

# shutil.copyfile does not create missing directories, so make sure the
# destination folder exists before anything is copied into it
fulltext_folder = os.path.join(local_folder, "full_texts")
if not os.path.isdir(fulltext_folder):
    os.makedirs(fulltext_folder)

for book in meta_sampled:
    # the source files sit in sub-folders named after the first four characters of the identifier
    foldername = book["identifier"][:4]
    filename = book["identifier"]+"_01_text.json"
    remote_filename = os.path.join(remote_folder, foldername, filename)
    # store the location of the local copy in the book's metadata record
    book["fulltext_filename"] = os.path.join(fulltext_folder, filename)
    shutil.copyfile(remote_filename, book["fulltext_filename"])

In [31]:
# export metadata for the sample

# the output folder must exist before the files can be written
if not os.path.isdir(local_folder):
    os.makedirs(local_folder)

# write through context managers so both files are flushed and closed as soon as they are complete
with codecs.open(os.path.join(local_folder, "book_data_sample.json"), "w", "utf8") as json_file:
    json.dump(meta_sampled, json_file)
with codecs.open(os.path.join(local_folder, "MicrosoftBooks_filtered_list_sample.csv"), "w", "utf8") as csv_file:
    df_sampled.to_csv(csv_file, index=False, sep=";")