## Loading in data

In [1]:
import pandas as pd
import numpy as np

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

INFO:rdflib:RDFLib Version: 4.2.1


In [2]:
import os
import re
import gzip
import tarfile
import urllib
import xml.etree.cElementTree as ElementTree
try:
	import cPickle as pickle
except ImportError:
	import pickle

PICKLEFILE = '/tmp/md.pickle.gz'  # Gzipped pickle cache written by readmetadata()
RDFFILES = '/tmp/rdf-files.tar.bz2'  # The catalog downloaded from Gutenberg
RDFURL = r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
# Keys present in every record built by parsemetadata().
META_FIELDS = ('id', 'author', 'title', 'downloads', 'formats', 'type', 'LCC',
		'subjects', 'authoryearofbirth', 'authoryearofdeath', 'language')
# XML namespaces of the RDF catalog, interpolated as '{%(prefix)s}tag'.
NS = dict(
		pg='http://www.gutenberg.org/2009/pgterms/',
		dc='http://purl.org/dc/terms/',
		dcam='http://purl.org/dc/dcam/',
		rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#')
# A run of line breaks together with the spaces/tabs around it.
# A plain raw string is used instead of ur'...': the ur prefix is a syntax
# error on Python 3, and this pure-ASCII pattern matches both byte and
# unicode strings on Python 2 anyway.
LINEBREAKRE = re.compile(r'[ \t]*[\n\r]+[ \t]*')
# Matches e.g. "EBook #123", "Etext# 123" or "Etext 123#"; the digits end up
# in whichever of the two named groups matched.
ETEXTRE = re.compile(r'''
	e(text|b?ook)
	\s*
	(\#\s*(?P<etextid_front>\d+)
	|
	(?P<etextid_back>\d+)\s*\#)
	''', re.IGNORECASE | re.VERBOSE)


def readmetadata():
	"""Read/create cached metadata dump of Gutenberg catalog.

	If PICKLEFILE exists it is unpickled and returned as-is. Otherwise the
	catalog is read through getrdfdata(), every document that contains a
	pg:ebook element is converted with parsemetadata(), and the resulting
	list is written to PICKLEFILE before it is returned.

	Returns:
		A list of dictionaries, one per ebook, with the following fields:
		id (int): Gutenberg identifier of text
		author (unicode): Last name, First name
		authoryearofbirth (int): year of birth of the author
		authoryearofdeath (int): year of death of the author
		title (unicode): title of work
		subjects (set of str): descriptive subjects; a subject may be
			hierarchical, e.g:
			'England -- Social life and customs -- 19th century -- Fiction'
		LCC (set of str): Library of Congress Classifications, e.g., 'PS'
		language (list of str): list of two letter language codes.
		type (str): 'Text', 'Sound', ...
		formats (dict of str, str pairs): keys are MIME types, values are URLs.
		downloads (int): the number of times this ebook has been
			downloaded from the Gutenberg site in the last 30 days.
	Fields that are not part of the metadata are set to None.
	http://www.gutenberg.org/wiki/Gutenberg:Help_on_Bibliographic_Record_Page
	"""
	if os.path.exists(PICKLEFILE):
		# NOTE: unpickling can execute arbitrary code; PICKLEFILE must only
		# ever be a cache file written by this function.
		with gzip.open(PICKLEFILE, 'rb') as cachefile:
			return pickle.load(cachefile)

	metadata = []
	for xml in getrdfdata():
		ebook = xml.find(r'{%(pg)s}ebook' % NS)
		if ebook is None:
			continue
		result = parsemetadata(ebook)
		if result is not None:
			metadata.append(result)
	# The with-block guarantees the gzip stream is flushed and closed, so the
	# cache is not left truncated.
	with gzip.open(PICKLEFILE, 'wb') as cachefile:
		pickle.dump(metadata, cachefile, protocol=-1)
	return metadata


def getrdfdata():
	"""Downloads (if necessary) and parses the Project Gutenberg RDF catalog.

	The archive is fetched from RDFURL only when RDFFILES does not exist yet.

	Yields:
		xml.etree.ElementTree.ElementTree: The parsed document of one
		archive member, i.e. one etext meta-data definition.
	"""
	if not os.path.exists(RDFFILES):
		# urlretrieve moved to urllib.request in Python 3.
		try:
			from urllib import urlretrieve
		except ImportError:
			from urllib.request import urlretrieve
		urlretrieve(RDFURL, RDFFILES)
	with tarfile.open(RDFFILES) as archive:
		for tarinfo in archive:
			member = archive.extractfile(tarinfo)
			if member is None:
				# extractfile() returns None for members that are neither
				# regular files nor links (e.g. directories); there is
				# nothing to parse in that case.
				continue
			yield ElementTree.parse(member)


def parsemetadata(ebook):
	"""Parses an etext meta-data definition to extract fields.

	Args:
		ebook (xml.etree.ElementTree.Element): An ebook meta-data definition.
	Returns:
		dict: One key per name in META_FIELDS. 'subjects' and 'LCC' are
		always sets and 'formats' is always a dict (possibly empty); any
		other field that is absent from the definition is None.
	"""
	result = dict.fromkeys(META_FIELDS)
	# get etext no: the last path component of the rdf:about attribute
	about = ebook.get('{%(rdf)s}about' % NS)
	result['id'] = int(os.path.basename(about))
	# author (only the first dc:creator is used)
	creator = ebook.find('.//{%(dc)s}creator' % NS)
	if creator is not None:
		name = creator.find('.//{%(pg)s}name' % NS)
		if name is not None:
			result['author'] = safeunicode(name.text, encoding='utf-8')
		birth = creator.find('.//{%(pg)s}birthdate' % NS)
		if birth is not None:
			result['authoryearofbirth'] = int(birth.text)
		death = creator.find('.//{%(pg)s}deathdate' % NS)
		if death is not None:
			result['authoryearofdeath'] = int(death.text)
	# title
	title = ebook.find('.//{%(dc)s}title' % NS)
	if title is not None:
		result['title'] = fixsubtitles(
				safeunicode(title.text, encoding='utf-8'))
	# subject lists
	result['subjects'], result['LCC'] = set(), set()
	for subject in ebook.findall('.//{%(dc)s}subject' % NS):
		scheme = subject.find('.//{%(dcam)s}memberOf' % NS)
		value = subject.find('.//{%(rdf)s}value' % NS)
		if scheme is None or value is None:
			# Without a scheme or a value there is nothing to record;
			# skipping avoids an AttributeError on a missing rdf:value.
			continue
		res = scheme.get('{%(rdf)s}resource' % NS)
		if res == ('%(dc)sLCSH' % NS):
			result['subjects'].add(value.text)
		elif res == ('%(dc)sLCC' % NS):
			result['LCC'].add(value.text)
	# formats: MIME type -> URL; files without a format value are skipped
	result['formats'] = {}
	for fileelem in ebook.findall('.//{%(pg)s}file' % NS):
		mimetype = fileelem.find('{%(dc)s}format//{%(rdf)s}value' % NS)
		if mimetype is not None:
			result['formats'][mimetype.text] = fileelem.get(
					'{%(rdf)s}about' % NS)
	# type
	booktype = ebook.find('.//{%(dc)s}type//{%(rdf)s}value' % NS)
	if booktype is not None:
		result['type'] = booktype.text
	# languages (None rather than an empty list when there are none)
	lang = ebook.findall('.//{%(dc)s}language//{%(rdf)s}value' % NS)
	result['language'] = [a.text for a in lang] or None
	# download count
	downloads = ebook.find('.//{%(pg)s}downloads' % NS)
	if downloads is not None:
		result['downloads'] = int(downloads.text)
	return result


def etextno(lines):
	"""Retrieves the id for an etext.

	The lines are scanned in order and the id from the first line matching
	ETEXTRE is returned.

	Args:
		lines (iter): The lines of the etext to search.
	Returns:
		int: The id of the etext.
	Raises:
		ValueError: If no etext id was found.
	Examples:
		>>> etextno(['Release Date: March 17, 2004 [EBook #11609]'])
		11609
		>>> etextno(['Release Date: July, 2003 [Etext# 4263]'])
		4263
		>>> etextno(['Release Date: November 29, 2003 [Eook #10335]'])
		10335
		>>> etextno(['December, 1998  [Etext 1576#]'])
		1576
		>>> etextno(['Some lines', 'without', 'Any [Etext] Number'])
		Traceback (most recent call last):
			...
		ValueError: no etext-id found
	"""
	for line in lines:
		match = ETEXTRE.search(line)
		if match is None:
			continue
		# The id is either after the '#' (front group) or before it (back
		# group); exactly one of the two alternatives matched.
		front_match = match.group('etextid_front')
		if front_match is not None:
			return int(front_match)
		back_match = match.group('etextid_back')
		if back_match is not None:
			return int(back_match)
		raise ValueError('no regex match (this should never happen)')
	raise ValueError('no etext-id found')


def fixsubtitles(title):
	"""Join the lines of a multi-line title with (semi)colons.
	The first line break becomes ': ', every later one becomes '; '.
	>>> fixsubtitles(u'First Across ...\r\nThe Story of ... \r\n'
	... 'Being an investigation into ...')
	u'First Across ...: The Story of ...; Being an investigation into ...'"""
	# Replace only the first break, then sweep up whatever breaks remain.
	with_colon = LINEBREAKRE.sub(': ', title, count=1)
	with_semicolons = LINEBREAKRE.sub('; ', with_colon)
	return with_semicolons


def safeunicode(arg, *args, **kwargs):
	"""Coerce argument to unicode, if it's not already.

	Args:
		arg: The value to coerce.
		*args, **kwargs: Passed on to the text constructor when a
			conversion is needed, e.g. encoding='utf-8'.
	Returns:
		arg itself when it is already a text string, otherwise the result
		of converting it.
	"""
	# Python 3 has no `unicode` builtin; its text type is `str`.
	try:
		text_type = unicode
	except NameError:
		text_type = str
	if isinstance(arg, text_type):
		return arg
	return text_type(arg, *args, **kwargs)

__all__ = ['readmetadata']

In [3]:
x = readmetadata()

In [4]:
x

[{'LCC': set(),
  'author': None,
  'authoryearofbirth': None,
  'authoryearofdeath': None,
  'downloads': None,
  'formats': {},
  'id': 0,
  'language': None,
  'subjects': set(),
  'title': None,
  'type': 'Text'},
 {'LCC': {'JN', 'KD'},
  'author': u'Anonymous',
  'authoryearofbirth': None,
  'authoryearofdeath': None,
  'downloads': 154,
  'formats': {'application/epub+zip': 'http://www.gutenberg.org/ebooks/10000.epub.noimages',
   'application/rdf+xml': 'http://www.gutenberg.org/ebooks/10000.rdf',
   'application/x-mobipocket-ebook': 'http://www.gutenberg.org/ebooks/10000.kindle.images',
   'application/zip': 'http://www.gutenberg.org/files/10000/10000.zip',
   'text/html': 'http://www.gutenberg.org/ebooks/10000.html.images',
   'text/plain': 'http://www.gutenberg.org/ebooks/10000.txt.utf-8',
   'text/plain; charset=us-ascii': 'http://www.gutenberg.org/files/10000/10000.txt'},
  'id': 10000,
  'language': ['en'],
  'subjects': {'Constitutional history -- England -- Sources', 'Mag

In [5]:
# Pivot the list of per-book records into one list of values per field
# (column-oriented). The first record is skipped by the x[1:] slice.
meta_dict = {}
for record in x[1:]:
    for field, value in record.items():
        meta_dict.setdefault(field, []).append(value)
meta_dict

{'LCC': [{'JN', 'KD'},
  {'PA'},
  {'PR'},
  {'DC'},
  {'BR'},
  {'PS'},
  {'PQ'},
  {'PR'},
  {'PS'},
  {'SK'},
  set(),
  {'HV'},
  {'TX'},
  {'F850.5', 'QH'},
  {'AP'},
  {'AP'},
  {'AP'},
  {'AP'},
  {'AP'},
  {'AP'},
  {'AP'},
  {'PQ'},
  {'AP'},
  {'PR'},
  {'BV'},
  {'PS'},
  {'CT'},
  {'PS'},
  {'AP'},
  {'PZ'},
  {'GV'},
  {'PR'},
  {'PQ'},
  {'F590.3'},
  {'PS'},
  {'AP'},
  {'AP'},
  {'AP'},
  {'AP'},
  {'AP'},
  {'PS'},
  {'PS'},
  {'PR'},
  {'PQ'},
  {'PM'},
  {'PS'},
  {'LB'},
  {'E660'},
  {'F1001'},
  {'PZ'},
  {'PR'},
  {'AP'},
  {'PZ'},
  {'PR'},
  {'PQ'},
  {'PR'},
  {'PR'},
  {'PR'},
  {'PQ'},
  {'TN'},
  {'PN'},
  {'PJ'},
  {'PR'},
  {'BX'},
  {'PZ'},
  {'PQ'},
  {'Q'},
  {'PQ'},
  {'PS'},
  {'HQ'},
  {'PR'},
  {'JK', 'KF'},
  {'PS'},
  {'PS'},
  {'PS'},
  {'PC'},
  {'PQ'},
  {'E151'},
  {'DS'},
  {'TX'},
  {'PE'},
  {'AP'},
  {'E300'},
  {'PR'},
  {'AP'},
  {'D501'},
  {'AP'},
  {'PQ'},
  {'D501'},
  {'PZ'},
  {'PR'},
  {'PS'},
  {'PS'},
  {'PQ'},
  {'PZ'},
  {'PS

In [6]:
meta_df = pd.DataFrame(meta_dict)

In [7]:
meta_df.head(2)

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type
0,"{JN, KD}",Anonymous,,,154.0,{u'text/html': u'http://www.gutenberg.org/eboo...,10000,[en],"{Constitutional history -- England -- Sources,...",The Magna Carta,Text
1,{PA},"Seneca, Lucius Annaeus",,65.0,173.0,{u'application/x-mobipocket-ebook': u'http://w...,10001,[en],"{Claudius, Emperor of Rome, 10 B.C.-54 A.D. --...",Apocolocyntosis,Text


In [8]:
meta_df.shape

(54041, 11)

In [9]:
meta_df = meta_df.sort_values(by = 'id')

In [10]:
meta_df.reset_index(inplace = True, drop=True)
meta_df.head()

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type
0,"{E201, JK}","Jefferson, Thomas",1743.0,1826.0,668.0,{u'text/html': u'http://www.gutenberg.org/eboo...,1,[en],"{United States. Declaration of Independence, U...",The Declaration of Independence of the United ...,Text
1,"{KF, JK}",United States,,,176.0,{u'text/html': u'http://www.gutenberg.org/file...,2,[en],{United States. Constitution. 1st-10th Amendme...,The United States Bill of Rights: The Ten Orig...,Text
2,{E838},"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,26.0,{u'text/html': u'http://www.gutenberg.org/file...,3,[en],{Presidents -- United States -- Inaugural addr...,John F. Kennedy's Inaugural Address,Text
3,{E456},"Lincoln, Abraham",1809.0,1865.0,59.0,{u'text/html': u'http://www.gutenberg.org/file...,4,[en],"{Lincoln, Abraham, 1809-1865. Gettysburg addre...",Lincoln's Gettysburg Address: Given November 1...,Text
4,"{KF, JK}",United States,,,429.0,{u'application/x-mobipocket-ebook': u'http://w...,5,[en],"{United States. Constitution, United States --...",The United States Constitution,Text


In [11]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54041 entries, 0 to 54040
Data columns (total 11 columns):
LCC                  54041 non-null object
author               52065 non-null object
authoryearofbirth    39238 non-null float64
authoryearofdeath    38312 non-null float64
downloads            54040 non-null float64
formats              54041 non-null object
id                   54041 non-null int64
language             54040 non-null object
subjects             54041 non-null object
title                53967 non-null object
type                 54041 non-null object
dtypes: float64(3), int64(1), object(7)
memory usage: 4.5+ MB


## I only want to keep books with type = Text

In [12]:
meta_df['type'].unique()

array(['Text', 'Dataset', 'StillImage', 'MovingImage', 'Sound', 'Image',
       'Collection'], dtype=object)

In [13]:
meta_df[meta_df['type'] == 'Collection']

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type
10801,{},,,,24.0,{u'text/html': u'http://www.gutenberg.org/eboo...,10802,[en],{},"Project Gutenberg ""10K"" DVD",Collection
11219,{},,,,37.0,{u'text/html': u'http://www.gutenberg.org/eboo...,11220,[en],{},"Project Gutenberg ""Best Of"" CD August 2003",Collection
19158,{},,,,69.0,{u'application/rdf+xml': u'http://www.gutenber...,19159,[en],{},Project Gutenberg DVD: The July 2006 Special,Collection


In [14]:
meta_df = meta_df[meta_df['type'] == 'Text']

## Removing the odd row that is mostly null

In [15]:
meta_df.tail(2)

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type
54039,{},"Cavendish, George",,,0.0,{u'text/html; charset=utf-8': u'http://www.gut...,54043,[en],{},The Life of Cardinal Wolsey,Text
54040,{},,,,,{},999999,,{},Piccole anime,Text


In [16]:
meta_df = meta_df[~ meta_df['language'].isnull()]

In [17]:
meta_df.tail(2)

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type
54038,{},"Perley, Martin Van Buren",,,0.0,{u'image/jpeg': u'http://www.gutenberg.org/cac...,54042,[en],{},A Short History of the Salem Village Witchcraf...,Text
54039,{},"Cavendish, George",,,0.0,{u'text/html; charset=utf-8': u'http://www.gut...,54043,[en],{},The Life of Cardinal Wolsey,Text


## Cleaning the language column

I only want books that are exclusively written in English

In [18]:
type(meta_df['language'][0])

list

In [19]:
def clean_language(lang_list):
    """Return 1 if 'en' is the one and only entry of lang_list, else 0."""
    only_english = len(lang_list) == 1 and 'en' in lang_list
    return int(only_english)
    
meta_df['english'] = meta_df['language'].map(clean_language)
meta_df.head(2)

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english
0,"{E201, JK}","Jefferson, Thomas",1743.0,1826.0,668.0,{u'text/html': u'http://www.gutenberg.org/eboo...,1,[en],"{United States. Declaration of Independence, U...",The Declaration of Independence of the United ...,Text,1
1,"{KF, JK}",United States,,,176.0,{u'text/html': u'http://www.gutenberg.org/file...,2,[en],{United States. Constitution. 1st-10th Amendme...,The United States Bill of Rights: The Ten Orig...,Text,1


In [20]:
meta_df = meta_df[meta_df['english'] == 1]

In [21]:
meta_df.shape

(43000, 12)

## Now I'm going to start dealing with the subjects

In [23]:
meta_df['subjects']

#here's what they look like

0        {United States. Declaration of Independence, U...
1        {United States. Constitution. 1st-10th Amendme...
2        {Presidents -- United States -- Inaugural addr...
3        {Lincoln, Abraham, 1809-1865. Gettysburg addre...
4        {United States. Constitution, United States --...
5        {Virginia -- Politics and government -- 1775-1...
6        {Pilgrims (New Plymouth Colony), Massachusetts...
7        {United States -- Politics and government -- 1...
8        {United States -- Politics and government -- 1...
9                                                  {Bible}
10                                               {Fantasy}
11                                               {Fantasy}
12                              {Nonsense verses, English}
13       {Political science -- Handbooks, manuals, etc....
14       {Ship captains -- Fiction, Whaling ships -- Fi...
15       {Peter Pan (Fictitious character) -- Fiction, ...
16       {Church of Jesus Christ of Latter-day Saints -.

In [25]:
type(meta_df.subjects[0])

set

In [67]:
meta_df.subjects[0]

{'United States -- History -- Revolution, 1775-1783 -- Sources',
 'United States. Declaration of Independence'}

In [71]:
import string

def subject_cleaning(subj_set):
    """Break hierarchical subject headings into their individual parts.

    Every subject, e.g. 'United States -- History -- Sources', is split on
    '--' and each part is stripped of surrounding whitespace.

    Args:
        subj_set: iterable of subject strings.
    Returns:
        numpy.ndarray: the sorted, de-duplicated parts of all subjects.
    """
    parts = []
    for subject in subj_set:
        # str.strip() instead of string.strip(): the latter only exists in
        # Python 2's string module.
        parts.extend(part.strip() for part in subject.split('--'))
    return np.unique(parts)
        
meta_df['subjects2'] = meta_df.subjects.map(subject_cleaning)
meta_df.head()

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english,subjects2,subjects_there
0,"{E201, JK}","Jefferson, Thomas",1743.0,1826.0,668.0,{u'text/html': u'http://www.gutenberg.org/eboo...,1,[en],"{United States. Declaration of Independence, U...",The Declaration of Independence of the United ...,Text,1,"[History, Revolution, 1775-1783, Sources, Unit...",1
1,"{KF, JK}",United States,,,176.0,{u'text/html': u'http://www.gutenberg.org/file...,2,[en],{United States. Constitution. 1st-10th Amendme...,The United States Bill of Rights: The Ten Orig...,Text,1,"[Civil rights, Sources, United States, United ...",1
2,{E838},"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,26.0,{u'text/html': u'http://www.gutenberg.org/file...,3,[en],{Presidents -- United States -- Inaugural addr...,John F. Kennedy's Inaugural Address,Text,1,"[1961-1963, Foreign relations, Inaugural addre...",1
3,{E456},"Lincoln, Abraham",1809.0,1865.0,59.0,{u'text/html': u'http://www.gutenberg.org/file...,4,[en],"{Lincoln, Abraham, 1809-1865. Gettysburg addre...",Lincoln's Gettysburg Address: Given November 1...,Text,1,"[Consecration of cemeteries, Gettysburg, Linco...",1
4,"{KF, JK}",United States,,,429.0,{u'application/x-mobipocket-ebook': u'http://w...,5,[en],"{United States. Constitution, United States --...",The United States Constitution,Text,1,"[1783-1789, Politics and government, Sources, ...",1


In [72]:
meta_df['subjects2'][0]

array(['History', 'Revolution, 1775-1783', 'Sources', 'United States',
       'United States. Declaration of Independence'], 
      dtype='|S42')

In [73]:
meta_df['subjects_there'] = meta_df['subjects2'].map(lambda x: 1 if len(x) > 0 else 0)
meta_df[meta_df['subjects_there'] == 0]

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english,subjects2,subjects_there
18458,{PQ},"Colonna, Francesco",,1527.0,99.0,{u'image/jpeg': u'http://www.gutenberg.org/cac...,18459,[en],{},Hypnerotomachia: The Strife of Loue in a Dreame,Text,1,[],0
23099,{AC},"De Morgan, Augustus",1806.0,1871.0,32.0,{u'text/html; charset=iso-8859-1': u'http://ww...,23100,[en],{},"A Budget of Paradoxes, Volume I",Text,1,[],0
24095,{PQ},"Huysmans, J.-K. (Joris-Karl)",1848.0,1907.0,23.0,{u'application/rdf+xml': u'http://www.gutenber...,24096,[en],{},En Route,Text,1,[],0
25301,{PQ},"Daudet, Alphonse",1840.0,1897.0,18.0,{u'text/html; charset=utf-8': u'http://www.gut...,25302,[en],{},Jack: 1877,Text,1,[],0
25344,{PT},"Wassermann, Jakob",1873.0,1934.0,10.0,{u'text/plain; charset=utf-8': u'http://www.gu...,25345,[en],{},The Goose Man,Text,1,[],0
25404,{PQ},"France, Anatole",1844.0,1924.0,28.0,{u'image/jpeg': u'http://www.gutenberg.org/cac...,25405,[en],{},Honey-Bee: 1911,Text,1,[],0
25405,{PQ},"France, Anatole",1844.0,1924.0,12.0,{u'text/html; charset=utf-8': u'http://www.gut...,25406,[en],{},Marguerite,Text,1,[],0
25406,{PQ},"France, Anatole",1844.0,1924.0,10.0,{u'text/plain; charset=utf-8': u'http://www.gu...,25407,[en],{},The Merrie Tales of Jacques Tournebroche: And ...,Text,1,[],0
25407,{PQ},"France, Anatole",1844.0,1924.0,15.0,{u'text/html; charset=utf-8': u'http://www.gut...,25408,[en],{},Child Life In Town And Country: 1909,Text,1,[],0
25408,{PQ},"France, Anatole",1844.0,1924.0,15.0,{u'image/jpeg': u'http://www.gutenberg.org/cac...,25409,[en],{},The Story Of The Duchess Of Cicogne And Of Mon...,Text,1,[],0


## I'm going to remove the rows where there is no author or title

This means that for some reason that book has been made unavailable on Project Gutenberg
I will still have some empty subjects, but not as many.

In [74]:
meta_df = meta_df[(meta_df['title'].notnull()) & (meta_df['author'].notnull())]

In [75]:
meta_df[meta_df['subjects_there'] == 0]

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english,subjects2,subjects_there
18458,{PQ},"Colonna, Francesco",,1527.0,99.0,{u'image/jpeg': u'http://www.gutenberg.org/cac...,18459,[en],{},Hypnerotomachia: The Strife of Loue in a Dreame,Text,1,[],0
23099,{AC},"De Morgan, Augustus",1806.0,1871.0,32.0,{u'text/html; charset=iso-8859-1': u'http://ww...,23100,[en],{},"A Budget of Paradoxes, Volume I",Text,1,[],0
24095,{PQ},"Huysmans, J.-K. (Joris-Karl)",1848.0,1907.0,23.0,{u'application/rdf+xml': u'http://www.gutenber...,24096,[en],{},En Route,Text,1,[],0
25301,{PQ},"Daudet, Alphonse",1840.0,1897.0,18.0,{u'text/html; charset=utf-8': u'http://www.gut...,25302,[en],{},Jack: 1877,Text,1,[],0
25344,{PT},"Wassermann, Jakob",1873.0,1934.0,10.0,{u'text/plain; charset=utf-8': u'http://www.gu...,25345,[en],{},The Goose Man,Text,1,[],0
25404,{PQ},"France, Anatole",1844.0,1924.0,28.0,{u'image/jpeg': u'http://www.gutenberg.org/cac...,25405,[en],{},Honey-Bee: 1911,Text,1,[],0
25405,{PQ},"France, Anatole",1844.0,1924.0,12.0,{u'text/html; charset=utf-8': u'http://www.gut...,25406,[en],{},Marguerite,Text,1,[],0
25406,{PQ},"France, Anatole",1844.0,1924.0,10.0,{u'text/plain; charset=utf-8': u'http://www.gu...,25407,[en],{},The Merrie Tales of Jacques Tournebroche: And ...,Text,1,[],0
25407,{PQ},"France, Anatole",1844.0,1924.0,15.0,{u'text/html; charset=utf-8': u'http://www.gut...,25408,[en],{},Child Life In Town And Country: 1909,Text,1,[],0
25408,{PQ},"France, Anatole",1844.0,1924.0,15.0,{u'image/jpeg': u'http://www.gutenberg.org/cac...,25409,[en],{},The Story Of The Duchess Of Cicogne And Of Mon...,Text,1,[],0


In [76]:
meta_df.shape

(41369, 14)

### Now I'm going to start to see what subjects are tagged in multiple books

In [77]:
# Flatten the per-book subject arrays into one list, then keep the sorted
# distinct values.
all_subjects = [subject
                for book_subjects in meta_df['subjects2']
                for subject in book_subjects]
subject_list = np.unique(all_subjects)
subject_list[1000:1050], len(subject_list)

(array(['Authors, Scandinavian', 'Authors, Scottish',
        'Authors, South African', 'Authors, Swedish', u'Authors, Swiss',
        'Authorship', 'Autobiographical fiction',
        'Autobiographical fiction, American', 'Autobiographies',
        'Autobiography', 'Autographs', 'Automation', 'Automobile driving',
        'Automobile industry and trade', 'Automobile racing',
        'Automobile travel', 'Automobiles', 'Autonomy',
        'Autonomy and independence movements', 'Avarice',
        'Avatars (Religion)', 'Aversion', 'Avila', 'Avila (Spain)',
        'Awards', 'Axholme, Isle of (England)',
        'Ayesha (Fictitious character : Haggard)', 'Ayesha (Schooner)',
        'Azores', 'Aztecs', 'Babism',
        'Babur, Emperor of Hindustan, 1483-1530', 'Babylon (Extinct city)',
        'Babylonia', 'Babysitters', 'Bacchantes',
        'Bach, Johann Sebastian, 1685-1750', 'Bachelors',
        "Bacon's Rebellion, 1676", 'Bacon, Francis, 1561-1626',
        'Baconian theory', 'Bacte

In [78]:
# Start every known subject at zero, then add one for each book whose
# subjects2 entry lists it.
subject_counts = dict.fromkeys(subject_list, 0)

for book_subjects in meta_df['subjects2'].values:
    for subject in book_subjects:
        subject_counts[subject] = subject_counts[subject] + 1

subject_counts

{'Douglas, William Douglas, Earl of, 1423?-1440': 1,
 'Philanthropists': 6,
 'Girondists': 2,
 'Barcelona (Spain)': 1,
 'Poetry': 786,
 'Whittier, John Greenleaf, 1807-1892': 4,
 'Morris, Gouverneur, 1752-1816': 1,
 'Carr-Burdette College': 1,
 'Electroplating': 1,
 'Marriage proposals': 1,
 'Margaret, of Anjou, Queen, consort of Henry VI, King of England, 1430-1482': 2,
 'Vatican Council (1st : 1869-1870 : Basilica di San Pietro in Vaticano)': 1,
 'Mythology, Classical': 17,
 'De Quincey, Thomas, 1785-1859': 3,
 'Hawarden Castle (Wales)': 1,
 'Scots-Irish': 1,
 'Scott, Dred, 1809-1858': 1,
 '1517-1648': 3,
 'Vicksburg (Miss.)': 6,
 'Belgians': 1,
 'Kennedy, John F. (John Fitzgerald), 1917-1963': 12,
 'Kieffer, Henry Martyn, 1845-1930': 1,
 'Drowning': 1,
 'Anne, Queen of Great Britain, 1665-1714': 1,
 'Pride and vanity': 13,
 u'Dvo\u0159\xe1k, Anton\xedn, 1841-1904': 1,
 'Fian, John, d. 1591': 1,
 'Terry, Ellen, Dame, 1847-1928': 2,
 'Secularism': 2,
 'Women heads of state': 1,
 'Youn

#### Some subjects are very specific and some are quite broad. To narrow it down, I'm going to find the subjects tagged in at least 50 books.

In [80]:
# Print every subject tagged in at least 50 books, followed by how many such
# subjects there are. NOTE: the test is `>= 50`, so a subject tagged in
# exactly 50 books is included.
count = 0
for k, v in subject_counts.items():
    if v >= 50:
        count += 1
        # A single pre-formatted string prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('%s \t%s' % (v, k))
print('-------------')
print(count)

# There are 329 subjects that are tagged in > 50 books
# There are 171 subjects that are tagged in > 100 books

786 	Poetry
102 	Antiquities
62 	Criminals
51 	Ghost stories
63 	Home missions
62 	Family life
107 	Charles II, 1660-1685
126 	Short stories, English
55 	Villages
59 	Paranormal fiction
112 	Dogs
177 	Mystery fiction
1829 	19th century
220 	Inheritance and succession
347 	Scotland
85 	Tales
101 	Fathers and daughters
166 	New York (State)
171 	Revolution, 1775-1783
230 	Orphans
249 	Humorous stories
345 	20th century
63 	Adaptations
152 	Diaries
222 	Travel
4105 	History
75 	Intellectual life
85 	Bible
55 	Free thought
419 	Western stories
71 	Twins
71 	Religious aspects
228 	India
51 	Widows
1618 	Great Britain
87 	Shakespeare, William, 1564-1616
56 	Foreign relations
325 	Canada
192 	West (U.S.)
121 	Soldiers
128 	Literature, Modern
82 	Philippines
1031 	Translations into English
53 	Northwest, Canadian
55 	Mississippi River
191 	Essays
96 	Physicians
225 	Personal narratives
548 	Drama
115 	Political fiction
55 	Time travel
354 	Voyages and travels
98 	16th century
56 	Norway
79 	Sh

In [81]:
# Names of the subjects that are tagged in at least 50 books.
subjects_over_50 = [subject
                    for subject, n_books in subject_counts.items()
                    if n_books >= 50]
subjects_over_50

['Poetry',
 'Antiquities',
 'Criminals',
 'Ghost stories',
 'Home missions',
 'Family life',
 'Charles II, 1660-1685',
 'Short stories, English',
 'Villages',
 'Paranormal fiction',
 'Dogs',
 'Mystery fiction',
 '19th century',
 'Inheritance and succession',
 'Scotland',
 'Tales',
 'Fathers and daughters',
 'New York (State)',
 'Revolution, 1775-1783',
 'Orphans',
 'Humorous stories',
 '20th century',
 'Adaptations',
 'Diaries',
 'Travel',
 'History',
 'Intellectual life',
 'Bible',
 'Free thought',
 'Western stories',
 'Twins',
 'Religious aspects',
 'India',
 'Widows',
 'Great Britain',
 'Shakespeare, William, 1564-1616',
 'Foreign relations',
 'Canada',
 'West (U.S.)',
 'Soldiers',
 'Literature, Modern',
 'Philippines',
 'Translations into English',
 'Northwest, Canadian',
 'Mississippi River',
 'Essays',
 'Physicians',
 'Personal narratives',
 'Drama',
 'Political fiction',
 'Time travel',
 'Voyages and travels',
 '16th century',
 'Norway',
 'Ship captains',
 'Science fiction, Amer

In [82]:
# I may decide to make columns for these subjects, so here I clean them up to do so.

def _as_column_name(subject):
    """Drop ( ) , . ' characters, turn spaces into underscores, lowercase."""
    for unwanted in ('(', ')', ',', '.', "'"):
        subject = subject.replace(unwanted, '')
    return subject.replace(' ', '_').lower()

col_names = [_as_column_name(subject) for subject in subjects_over_50]
col_names

['poetry',
 'antiquities',
 'criminals',
 'ghost_stories',
 'home_missions',
 'family_life',
 'charles_ii_1660-1685',
 'short_stories_english',
 'villages',
 'paranormal_fiction',
 'dogs',
 'mystery_fiction',
 '19th_century',
 'inheritance_and_succession',
 'scotland',
 'tales',
 'fathers_and_daughters',
 'new_york_state',
 'revolution_1775-1783',
 'orphans',
 'humorous_stories',
 '20th_century',
 'adaptations',
 'diaries',
 'travel',
 'history',
 'intellectual_life',
 'bible',
 'free_thought',
 'western_stories',
 'twins',
 'religious_aspects',
 'india',
 'widows',
 'great_britain',
 'shakespeare_william_1564-1616',
 'foreign_relations',
 'canada',
 'west_us',
 'soldiers',
 'literature_modern',
 'philippines',
 'translations_into_english',
 'northwest_canadian',
 'mississippi_river',
 'essays',
 'physicians',
 'personal_narratives',
 'drama',
 'political_fiction',
 'time_travel',
 'voyages_and_travels',
 '16th_century',
 'norway',
 'ship_captains',
 'science_fiction_american',
 'marri

## Now I want to start looking at children's books

In [83]:
# List every subject whose name contains "child" (case-insensitive).
for sub in subject_list:
    if 'child' in sub.lower():
        # print(...) with one argument behaves the same on Python 2 and 3.
        print(sub)

Abandoned children
Adopted children
Anorexia in children
Bahai education of children
Bible. Apocrypha. Song of the Three Children
Blind children
Body schema in children
Boxcar children (Fictitious characters)
Catholic children
Child abuse
Child care
Child caregivers
Child development
Child labor
Child psychology
Child rearing
Child soldiers
Child welfare
Child witnesses
Child, Lydia Maria Francis, 1802-1880
Childbirth
Childers, Erskine, 1870-1922
Childhood and youth
Childhood and youth.
Children
Children and adults
Children and animals
Children and death
Children and war
Children in art
Children in literature
Children in the Bible
Children of alcoholics
Children of clergy
Children of divorced parents
Children of military personnel
Children of missionaries
Children of physicians
Children of police
Children of presidents
Children of prisoners
Children of the rich
Children of women prisoners
Children with disabilities
Children with mental disabilities
Children's accidents
Children's books

In [84]:
# Same "child" filter, restricted to the subjects in subjects_over_50.
for sub in subjects_over_50:
    if 'child' in sub.lower():
        # print(...) with one argument behaves the same on Python 2 and 3.
        print(sub)

Children's literature
Children's poetry
Children
Children's periodicals, American
Children's stories


In [85]:
# Show the book count next to every subject whose name contains "juvenile"
# (case-insensitive).
for sub in subject_list:
    if 'juvenile' in sub.lower():
        # A single pre-formatted string prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('%s \t%s' % (subject_counts[sub], sub))

2 	Juvenile
2 	Juvenile Fiction
1 	Juvenile and popular literature
3 	Juvenile delinquency
3 	Juvenile drama
1 	Juvenile fction
3528 	Juvenile fiction
460 	Juvenile literature
111 	Juvenile poetry
4 	Limericks, Juvenile
1 	Participation, Juvenile
1 	Riddles, Juvenile
20 	Wit and humor, Juvenile


In [86]:
def _mentions_juvenile_fiction(subject_set):
    """Return 1 if the set's string form contains 'juvenile fiction', else 0."""
    flattened = str(subject_set).lower()
    if 'juvenile fiction' in flattened:
        return 1
    return 0

# The check runs on the raw `subjects` sets and ignores case.
meta_df['juvenile_fiction'] = meta_df['subjects'].map(_mentions_juvenile_fiction)
meta_df[meta_df['juvenile_fiction'] == 1].head()

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english,subjects2,subjects_there,juvenile_fiction
15,"{PR, PZ}","Barrie, J. M. (James Matthew)",1860.0,1937.0,4502.0,{u'text/html; charset=utf-8': u'http://www.gut...,16,[en],"{Peter Pan (Fictitious character) -- Fiction, ...",Peter Pan,Text,1,"[Fairies, Fantasy, Fiction, Juvenile fiction, ...",1,1
53,{PZ},"Baum, L. Frank (Lyman Frank)",1856.0,1919.0,576.0,{u'text/html; charset=iso-8859-1': u'http://ww...,54,[en],"{Oz (Imaginary place) -- Juvenile fiction, You...",The Marvelous Land of Oz,Text,1,"[Conduct of life, Fantasy, Friendship, Juvenil...",1,1
54,{PZ},"Baum, L. Frank (Lyman Frank)",1856.0,1919.0,2917.0,{u'text/html; charset=iso-8859-1': u'http://ww...,55,[en],{Scarecrow (Fictitious character from Baum) --...,The Wonderful Wizard of Oz,Text,1,"[Courage, Cowardly Lion (Fictitious character)...",1,1
162,{PZ},"Alcott, Louisa May",1832.0,1888.0,157.0,{u'text/html; charset=iso-8859-1': u'http://ww...,163,[en],"{Fairies -- Juvenile poetry, Children's storie...",Flower Fables,Text,1,"[Children's stories, American, Fairies, Fairy ...",1,1
270,"{PR, PZ}","Sewell, Anna",1820.0,1878.0,504.0,{u'text/plain; charset=utf-8': u'http://www.gu...,271,[en],{Great Britain -- History -- 19th century -- J...,Black Beauty,Text,1,"[19th century, Great Britain, Historical ficti...",1,1


In [87]:
meta_df[meta_df['juvenile_fiction'] == 1].shape

(3529, 15)

In [90]:
def _mentions_juvenile_literature(subject_set):
    """Return 1 if the set's string form contains 'juvenile literature', else 0."""
    flattened = str(subject_set).lower()
    if 'juvenile literature' in flattened:
        return 1
    return 0

# The check runs on the raw `subjects` sets and ignores case.
meta_df['juvenile_literature'] = meta_df['subjects'].map(_mentions_juvenile_literature)
meta_df[meta_df['juvenile_literature'] == 1].head()

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english,subjects2,subjects_there,juvenile_fiction,juvenile_literature
528,{ND},"Steedman, Amy",,,168.0,{u'text/html; charset=iso-8859-1': u'http://ww...,529,[en],"{Painters -- Italy, Art -- Juvenile literature}",Knights of Art: Stories of the Italian Painters,Text,1,"[Art, Italy, Juvenile literature, Painters]",1,0,1
572,"{PZ, PR}","Lamb, Mary",1764.0,1847.0,60.0,{u'text/html; charset=iso-8859-1': u'http://ww...,573,[en],"{Shakespeare, William, 1564-1616 -- Stories, p...",Tales from Shakespeare,Text,1,"[Adaptations, Juvenile literature, Shakespeare...",1,0,1
676,{PZ},"Kingsley, Charles",1819.0,1875.0,140.0,{u'text/html': u'http://www.gutenberg.org/file...,677,[en],"{Mythology, Greek -- Juvenile literature}","The Heroes; Or, Greek Fairy Tales for My Children",Text,1,"[Juvenile literature, Mythology, Greek]",1,0,1
694,"{QH, QL}","Kingsley, Charles",1819.0,1875.0,7.0,{u'image/jpeg': u'http://www.gutenberg.org/cac...,695,[en],"{Seashore biology -- Juvenile literature, Natu...","Glaucus; Or, The Wonders of the Shore",Text,1,"[Juvenile literature, Natural history, Seashor...",1,0,1
698,{DA},"Dickens, Charles",1812.0,1870.0,229.0,{u'text/html': u'http://www.gutenberg.org/file...,699,[en],{Great Britain -- History -- Juvenile literature},A Child's History of England,Text,1,"[Great Britain, History, Juvenile literature]",1,0,1


## My plan from here:

I'm going to open the books as shown below. I'm going to add columns for the length of the book (total word count), the diversity of its vocabulary (unique word count), and other features that I'm still figuring out. From there, I will see how it goes. I'm going to start out by trying to predict just whether or not a book is a children's book. Then I will try to get more sophisticated by assigning reading levels to the books. And I will make some charts and graphs. I promise!

In [91]:
text = strip_headers(load_etext(699)).strip()
print(text)

INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


Transcribed from the 1905 Chapman & Hall "Works of Charles Dickens"
edition by David Price, email ccx074@pglaf.org





A CHILD'S HISTORY OF ENGLAND


By CHARLES DICKENS

With Illustrations by F. H. Townsend and others

LONDON: CHAPMAN & HALL, LD.
NEW YORK: CHARLES SCRIBNER'S SONS
1905




CHAPTER I--ANCIENT ENGLAND AND THE ROMANS


If you look at a Map of the World, you will see, in the left-hand upper
corner of the Eastern Hemisphere, two Islands lying in the sea.  They are
England and Scotland, and Ireland.  England and Scotland form the greater
part of these Islands.  Ireland is the next in size.  The little
neighbouring islands, which are so small upon the Map as to be mere dots,
are chiefly little bits of Scotland,--broken off, I dare say, in the
course of a great length of time, by the power of the restless water.

In the old days, a long, long while ago, before Our Saviour was born on
earth and lay asleep in a manger, these Islands were in the same place,
and the stormy sea roa