In [12]:
import pandas as pd
import glob
import re
import xml.etree.ElementTree as ET
import requests

In [13]:
gut_url = 'https://www.gutenberg.org/ebooks/{}'
gut_txt = 'https://www.gutenberg.org/ebooks/{}.txt.utf-8'

data_dir = '/home/rca2t/Public/ETA/data/gutenberg/cache/epub'
epub_path = data_dir + '/{0}/pg{0}.rdf'
TAG = re.compile(r'<[^>]+>')

In [15]:
gids = [int(path.split('/')[-1]) for path in glob.glob(data_dir+'/*')]
gids = sorted(gids)

In [59]:
df = pd.DataFrame(gids,  columns=['gut_id'])
df = df.set_index('gut_id')

# XML Processing

In [63]:
ns = dict(base="http://www.gutenberg.org/",
    rdfs="http://www.w3.org/2000/01/rdf-schema#",
    rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    marcrel="http://id.loc.gov/vocabulary/relators/",
    pgterms="http://www.gutenberg.org/2009/pgterms/",
    cc="http://web.resource.org/cc/",
    dcam="http://purl.org/dc/dcam/",
    dcterms="http://purl.org/dc/terms/")

In [64]:
xpaths = dict(
    subjects = ".//dcterms:subject/rdf:Description/rdf:value",
    bookshelves = ".//pgterms:bookshelf/rdf:Description/rdf:value",
    languages = ".//dcterms:language/rdf:Description/rdf:value",
    agents = ".//marcrel:ill/pgterms:agent/pgterms:name",
    rights = ".//dcterms:rights",
    title = ".//dcterms:title",
    types = ".//dcterms:type/rdf:Description/rdf:value",
    creators = ".//dcterms:creator/pgterms:agent/pgterms:name",
    formats = ".//dcterms:hasFormat/pgterms:file/dcterms:format/rdf:Description/rdf:value"
)

In [289]:
default_format='text/plain; charset=utf-8'
default_formats = ["text/plain; charset={}".format(cs) 
                   for cs in ['utf8', 'ascii', 'iso-8859-1']]
default_formats += ['text/plain']

In [290]:
default_formats

['text/plain; charset=utf8',
 'text/plain; charset=ascii',
 'text/plain; charset=iso-8859-1',
 'text/plain']

In [65]:
for item in xpaths:
    df[item] = None

In [66]:
def get_rdf(gut_id, data_dir=data_dir):
    path = "{0}/{1}/pg{1}.rdf".format(data_dir, gut_id)
    rdf = open(path, 'r').read()
    return rdf

In [67]:
def get_text_url(gut_id):
    gut_txt = 'https://www.gutenberg.org/ebooks/{}.txt.utf-8'.format(gut_id)
    return gut_txt

In [68]:
def get_text_content(gut_id):
    gut_url = get_text_url(gut_id)
    r = requests.get(gut_url)
    return r.text

In [69]:
def get_items(el, xpath, ns=ns):
    items = [item.text for item in el.findall(xpath, namespaces=ns)]
    return items

In [70]:
def get_rdf_root(gut_id):
    rdf = get_rdf(gut_id)
    root = ET.fromstring(rdf)
    return root 

In [71]:
def is_text(formats, default_format='text/plain; charset=utf-8'):
    return 

In [72]:
def get_metadata(gut_id):
    rdf_root = get_rdf_root(gut_id)
    md = {}
    for item in xpaths:
        md[item] = get_items(rdf_root, xpaths[item])
    return md

In [121]:
md = get_metadata(120)

In [122]:
md

{'subjects': ['Treasure Island (Imaginary place) -- Fiction',
  'Pirates -- Fiction',
  'Treasure troves -- Fiction',
  'PZ',
  'Sea stories',
  'PR'],
 'bookshelves': ['Pirates, Buccaneers, Corsairs, etc.',
  'Historical Fiction',
  "Children's Literature",
  'Best Books Ever Listings'],
 'languages': ['en'],
 'agents': ['Rhead, Louis'],
 'rights': ['Public domain in the USA.'],
 'title': ['Treasure Island'],
 'types': ['Text'],
 'creators': ['Stevenson, Robert Louis'],
 'formats': ['text/plain; charset=utf-8',
  'application/zip',
  'application/epub+zip',
  'application/x-mobipocket-ebook',
  'image/jpeg',
  'text/html; charset=utf-8',
  'application/zip',
  'text/html; charset=utf-8',
  'image/jpeg',
  'text/plain; charset=utf-8',
  'application/epub+zip',
  'application/rdf+xml',
  'application/x-mobipocket-ebook']}

In [132]:
# def get_data(x):
#     gut_id = x.name
#     md = get_metadata(gut_id)
#     for item in md.keys():
#         x[item] = md[item]
# #         print(item, md[item])
#     return x

In [137]:
# def populate_df(gid_x=0, gid_y=-1):
#     df = pd.DataFrame(gids[gid_x:gid_y], columns=['gut_id'])
#     df = df.set_index('gut_id')
#     for item in xpaths:
#         df[item] = None
#     df = df.apply(get_data, axis=1)
# #     df.title = df.title.str.replace('&#13;', '')
#     return df

In [147]:
def populate_df2(gids):
    data = []
    for gid in gids:
        md = get_metadata(gid)
        for key in md.keys():
            for val in md[key]:
                data.append((gid, key, val))
    df = pd.DataFrame(data, columns=['gid','key','val'])
    return df

In [156]:
df = populate_df2(gids)

In [291]:
df_wide = df.groupby(['gid', 'key']).val.apply(lambda x: '|'.join(x)).unstack()

In [293]:
df_wide = df_wide.loc[df_wide.languages == 'en']

In [296]:
dfs = '|'.join(default_formats)

'text/plain; charset=utf8|text/plain; charset=ascii|text/plain; charset=iso-8859-1|text/plain'

In [298]:
df_wide = df_wide.loc[df_wide.formats.str.match(r"^\s*({})\s*$".format(dfs))]

ValueError: cannot index with vector containing NA / NaN values

In [192]:
tables = {}
for key in xpaths.keys():
    print(key)
    items = df.loc[df.key==key, ['gid','val']]
    items = items.set_index('gid')
    items = pd.Series(items.val)
    tables[key] = items

subjects
bookshelves
languages
agents
rights
title
types
creators
formats


In [200]:
UTF = tables['formats'].str.contains('text/plain; charset=utf')

In [211]:
tables['title']

gid
1         The Declaration of Independence of the United ...
2         The United States Bill of Rights\r\nThe Ten Or...
3                       John F. Kennedy's Inaugural Address
4         Lincoln's Gettysburg Address\r\nGiven November...
5                            The United States Constitution
6                          Give Me Liberty or Give Me Death
7                                     The Mayflower Compact
8                Abraham Lincoln's Second Inaugural Address
9                 Abraham Lincoln's First Inaugural Address
10                      The King James Version of the Bible
11                         Alice's Adventures in Wonderland
12                                Through the Looking-Glass
13         The Hunting of the Snark: An Agony in Eight Fits
14                              The 1990 CIA World Factbook
15                                                Moby Dick
16                                                Peter Pan
17        The Book of Mormon\r\nAn A

In [212]:
data_dir2 = '/home/rca2t/Public/ETA/data/gutenberg'

In [213]:
import sqlite3

In [216]:
with sqlite3.connect(data_dir2 + '/gutenberg.db') as db:
    for table in tables:
        print(table)
        tables[table].to_sql(table, db, index=True, if_exists='replace')

subjects
bookshelves
languages
agents
rights
title
types
creators
formats


In [236]:
df.loc[(df.key=='formats') &  (df.val.str.match('text/plain'))].val.value_counts()[:4]

text/plain; charset=us-ascii      75474
text/plain; charset=iso-8859-1    70502
text/plain                        43464
text/plain; charset=utf-8         34860
Name: val, dtype: int64

In [248]:
def get_works_by(name_pat):
    creators = pd.DataFrame(tables['creators'])
    titles = pd.DataFrame(tables['title'])
    works = titles.loc[creators.val.str.contains(name_pat)]
    return works

In [249]:
milton = get_works_by('Milton, John')

ValueError: cannot reindex from a duplicate axis

In [247]:
milton

Unnamed: 0_level_0,val
gid,Unnamed: 1_level_1
1,The Declaration of Independence of the United ...
2,The United States Bill of Rights\r\nThe Ten Or...
3,John F. Kennedy's Inaugural Address
4,Lincoln's Gettysburg Address\r\nGiven November...
5,The United States Constitution
6,Give Me Liberty or Give Me Death
8,Abraham Lincoln's Second Inaugural Address
9,Abraham Lincoln's First Inaugural Address
11,Alice's Adventures in Wonderland
12,Through the Looking-Glass


In [75]:
austen = get_works_by('Austen, Jane')

In [76]:
austen

Unnamed: 0_level_0,title,creator,rdf
gut_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
105,Persuasion,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
121,Northanger Abbey,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
141,Mansfield Park,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
158,Emma,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
161,Sense and Sensibility,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
946,Lady Susan,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
1212,Love and Freindship [sic],"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
1342,Pride and Prejudice,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
19839,Emma,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."
20682,Northanger Abbey,"Austen, Jane","<?xml version=""1.0"" encoding=""utf-8""?>\n<rdf:R..."


In [77]:
import requests

In [78]:
download_dir = '/home/rca2t/Public/ETA/data/gutenberg/downloads'

In [79]:
def download_works(works, download_dir=download_dir):
    gids = works.index.tolist()
    for gid in gids:
        print(gid)
        url = 'https://www.gutenberg.org/ebooks/{}.txt.utf-8'.format(gid)
        r = requests.get(url)
        with open(download_dir+"/g{}.txt".format(gid), 'w', encoding='utf8') as out:
            out.write(r.text)

In [80]:
works = download_works(austen)

105
121
141
158
161
946
1212
1342
19839
20682
20686
20687
21839
22953
22954
22962
22963
22964
25946
26301
31100
33388
35163
37431
37634
42671
43741


In [90]:
milton.groupby('title').count()

Unnamed: 0_level_0,creator,rdf
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Areopagitica,1,1
"L'Allegro, Il Penseroso, Comus, and Lycidas",1,1
Milton's Comus,1,1
Milton: Minor Poems,1,1
Paradise Lost,2,2
Paradise Regained,1,1
"Poemata : Latin, Greek and Italian Poems by John Milton",1,1


In [94]:
milton.rdf.str.contains('audio')

gut_id
20       False
26       False
58       False
397      False
608      False
6929     False
19819    False
31706    False
Name: rdf, dtype: bool