In [12]:
import pandas as pd
import glob
import re
import xml.etree.ElementTree as ET
import requests

In [13]:
gut_url = 'https://www.gutenberg.org/ebooks/{}'
gut_txt = 'https://www.gutenberg.org/ebooks/{}.txt.utf-8'

data_dir = '/home/rca2t/Public/ETA/data/gutenberg/cache/epub'
epub_path = data_dir + '/{0}/pg{0}.rdf'
TAG = re.compile(r'<[^>]+>')

In [15]:
gids = [int(path.split('/')[-1]) for path in glob.glob(data_dir+'/*')]
gids = sorted(gids)

In [59]:
df = pd.DataFrame(gids,  columns=['gut_id'])
df = df.set_index('gut_id')

# XML Processing

In [63]:
ns = dict(base="http://www.gutenberg.org/",
    rdfs="http://www.w3.org/2000/01/rdf-schema#",
    rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    marcrel="http://id.loc.gov/vocabulary/relators/",
    pgterms="http://www.gutenberg.org/2009/pgterms/",
    cc="http://web.resource.org/cc/",
    dcam="http://purl.org/dc/dcam/",
    dcterms="http://purl.org/dc/terms/")

In [64]:
xpaths = dict(
    subjects = ".//dcterms:subject/rdf:Description/rdf:value",
    bookshelves = ".//pgterms:bookshelf/rdf:Description/rdf:value",
    languages = ".//dcterms:language/rdf:Description/rdf:value",
    agents = ".//marcrel:ill/pgterms:agent/pgterms:name",
    rights = ".//dcterms:rights",
    title = ".//dcterms:title",
    types = ".//dcterms:type/rdf:Description/rdf:value",
    creators = ".//dcterms:creator/pgterms:agent/pgterms:name",
    formats = ".//dcterms:hasFormat/pgterms:file/dcterms:format/rdf:Description/rdf:value"
)

In [64]:
default_format='text/plain; charset=utf-8'

In [65]:
for item in xpaths:
    df[item] = None

In [66]:
def get_rdf(gut_id, data_dir=data_dir):
    path = "{0}/{1}/pg{1}.rdf".format(data_dir, gut_id)
    rdf = open(path, 'r').read()
    return rdf

In [67]:
def get_text_url(gut_id):
    gut_txt = 'https://www.gutenberg.org/ebooks/{}.txt.utf-8'.format(gut_id)
    return gut_txt

In [68]:
def get_text_content(gut_id):
    gut_url = get_text_url(gut_id)
    r = requests.get(gut_url)
    return r.text

In [69]:
def get_items(el, xpath, ns=ns):
    items = [item.text for item in el.findall(xpath, namespaces=ns)]
    return items

In [70]:
def get_rdf_root(gut_id):
    rdf = get_rdf(gut_id)
    root = ET.fromstring(rdf)
    return root 

In [71]:
def is_text(formats, default_format='text/plain; charset=utf-8'):
    return 

In [72]:
def get_metadata(gut_id):
    rdf_root = get_rdf_root(gut_id)
    md = {}
    for item in xpaths:
        md[item] = get_items(rdf_root, xpaths[item])
    return md

In [75]:
md = get_metadata(10036)

In [76]:
md

{'subjects': ['American wit and humor -- Periodicals', 'AP'],
 'bookshelves': ['Punchinello'],
 'languages': ['en'],
 'agents': [],
 'rights': ['Public domain in the USA.'],
 'title': ['Punchinello, Volume 2, No. 28, October 8, 1870'],
 'types': ['Text'],
 'creators': ['Various'],
 'formats': ['text/html; charset=iso-8859-1',
  'application/zip',
  'application/zip',
  'text/plain; charset=iso-8859-1',
  'text/html; charset=iso-8859-1',
  'application/x-mobipocket-ebook',
  'application/rdf+xml',
  'text/plain',
  'text/plain; charset=iso-8859-1',
  'text/plain; charset=us-ascii',
  'application/zip',
  'text/plain; charset=us-ascii',
  'application/epub+zip',
  'application/epub+zip',
  'application/x-mobipocket-ebook']}

In [48]:
def get_data(x):
    gut_id = x.name
    md = get_metadata(gut_id)
    for item in xpaths:
        x[item] = md[item]
    return x

In [93]:
def populate_df():
    df = pd.DataFrame(gids,  columns=['gut_id'])
    df = df.set_index('gut_id')
    for item in xpaths:
        df[item] = None
    df = df.apply(get_data, axis=1)
    df.title = df.title.str.replace('&#13;', '')
    return df

In [None]:
df = populate_df()

In [57]:
df.head()

Unnamed: 0_level_0,subjects,bookshelves,languages,agents,rights,title,types,creators,formats
gut_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,[],[],[],[],[Public domain in the USA.],,[Text],[],[]
1,"[United States. Declaration of Independence, E...","[American Revolutionary War, United States Law...",[en],[],[Public domain in the USA.],,[Text],"[Jefferson, Thomas]","[text/plain; charset=us-ascii, application/zip..."
2,[United States. Constitution. 1st-10th Amendme...,"[United States Law, Politics, American Revolut...",[en],[],[Public domain in the USA.],,[Text],[United States],"[text/plain; charset=us-ascii, application/rdf..."
3,[Presidents -- United States -- Inaugural addr...,[],[en],[],[Public domain in the USA.],,[Text],"[Kennedy, John F. (John Fitzgerald)]","[application/epub+zip, application/x-mobipocke..."
4,"[Lincoln, Abraham, 1809-1865. Gettysburg addre...",[US Civil War],[en],[],[Public domain in the USA.],,[Text],"[Lincoln, Abraham]","[text/plain, text/plain; charset=us-ascii, app..."


In [68]:
data_dir2 = '/home/rca2t/Public/ETA/data/gutenberg'

In [69]:
import sqlite3

In [70]:
with sqlite3.connect(data_dir2 + '/gutenberg.db') as db:
    df.to_sql('title', db, index=True, if_exists='replace')

In [71]:
df

Unnamed: 0_level_0,title,creator,rdf
gut_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,The Declaration of Independence of the United ...,"Jefferson, Thomas","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
2,The United States Bill of Rights&#13;,United States,"<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
3,John F. Kennedy's Inaugural Address,"Kennedy, John F. (John Fitzgerald)","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
4,Lincoln's Gettysburg Address&#13;,"Lincoln, Abraham","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
5,The United States Constitution,United States,"<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
6,Give Me Liberty or Give Me Death,"Henry, Patrick","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
7,The Mayflower Compact,none,"<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
8,Abraham Lincoln's Second Inaugural Address,"Lincoln, Abraham","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
9,Abraham Lincoln's First Inaugural Address,"Lincoln, Abraham","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
10,The King James Version of the Bible,none,"<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."


In [72]:
def get_works_by(name_pat):
    works = df.loc[df.creator.str.contains(name_pat)]
    return works

In [84]:
milton = get_works_by('Milton, John')

In [85]:
milton

Unnamed: 0_level_0,title,creator,rdf
gut_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20,Paradise Lost,"Milton, John","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
26,Paradise Lost,"Milton, John","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
58,Paradise Regained,"Milton, John","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
397,"L'Allegro, Il Penseroso, Comus, and Lycidas","Milton, John","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
608,Areopagitica,"Milton, John","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
6929,"Poemata : Latin, Greek and Italian Poems by Jo...","Milton, John","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
19819,Milton's Comus,"Milton, John","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
31706,Milton: Minor Poems,"Milton, John","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."


In [75]:
austen = get_works_by('Austen, Jane')

In [76]:
austen

Unnamed: 0_level_0,title,creator,rdf
gut_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
105,Persuasion,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
121,Northanger Abbey,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
141,Mansfield Park,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
158,Emma,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
161,Sense and Sensibility,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
946,Lady Susan,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
1212,Love and Freindship [sic],"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
1342,Pride and Prejudice,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
19839,Emma,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
20682,Northanger Abbey,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."


In [77]:
import requests

In [78]:
download_dir = '/home/rca2t/Public/ETA/data/gutenberg/downloads'

In [79]:
def download_works(works, download_dir=download_dir):
    gids = works.index.tolist()
    for gid in gids:
        print(gid)
        url = 'https://www.gutenberg.org/ebooks/{}.txt.utf-8'.format(gid)
        r = requests.get(url)
        with open(download_dir+"/g{}.txt".format(gid), 'w', encoding='utf8') as out:
            out.write(r.text)

In [80]:
works = download_works(austen)

105
121
141
158
161
946
1212
1342
19839
20682
20686
20687
21839
22953
22954
22962
22963
22964
25946
26301
31100
33388
35163
37431
37634
42671
43741


In [90]:
milton.groupby('title').count()

Unnamed: 0_level_0,creator,rdf
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Areopagitica,1,1
"L'Allegro, Il Penseroso, Comus, and Lycidas",1,1
Milton's Comus,1,1
Milton: Minor Poems,1,1
Paradise Lost,2,2
Paradise Regained,1,1
"Poemata : Latin, Greek and Italian Poems by John Milton",1,1


In [94]:
milton.rdf.str.contains('audio')

gut_id
20       False
26       False
58       False
397      False
608      False
6929     False
19819    False
31706    False
Name: rdf, dtype: bool