In [2]:
import tarfile
# Unpack the Folger archive into the data/ directory. The `with` block closes
# the archive handle once extraction finishes, or if it fails part-way.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a trusted source.
with tarfile.open('data/folger.tar.gz', 'r') as tf:
    tf.extractall('data')

In [3]:
file_path = 'data/folger/txt/1H4.txt'
# Manual open/close. The try/finally guarantees the handle is released even
# when read() raises; a plain close() placed after read() would be skipped.
stream = open(file_path)
try:
    contents = stream.read()
finally:
    stream.close()

# Show only the first 300 characters of the file.
print(contents[:300])

Henry IV, Part I
by William Shakespeare
Edited by Barbara A. Mowat and Paul Werstine
  with Michael Poston and Rebecca Niles
Folger Shakespeare Library
http://www.folgerdigitaltexts.org/?chapter=5&play=1H4
Created on Jul 31, 2015, from FDT version 0.9.2

Characters in the Play


In [4]:
# Read the whole file inside a `with` block: the file is closed automatically
# when the block exits, so no explicit close() call is needed.
with open(file_path) as stream:
    contents = stream.read()

# Show only the first 300 characters of the file.
print(contents[:300])

Henry IV, Part I
by William Shakespeare
Edited by Barbara A. Mowat and Paul Werstine
  with Michael Poston and Rebecca Niles
Folger Shakespeare Library
http://www.folgerdigitaltexts.org/?chapter=5&play=1H4
Created on Jul 31, 2015, from FDT version 0.9.2

Characters in the Play


In [5]:
# This file is not in the default encoding, so the codec (KOI8-R) is passed
# explicitly to open().
with open('data/anna-karenina.txt', encoding='koi8-r') as stream:
    # Use stream.readline() to retrieve the next line from a file,
    # in this case the 1st one:
    line = stream.readline()

# readline() keeps the line's trailing newline, so print() ends up emitting
# an extra blank line after the text.
print(line)

Все счастливые семьи похожи друг на друга, каждая несчастливая семья несчастлива по-своему.



# CSV

In [6]:
# Path of the CSV file; the CSV cells below reuse this name.
csv_file = 'data/folger_shakespeare_collection.csv'
with open(csv_file) as stream:
    # call stream.readlines() to read all lines in the CSV file as a list.
    lines = stream.readlines()

# Each item is one raw, unparsed line that still ends in '\n'.
print(lines[:3])

['fname,author,title,editor,publisher,pubplace,date\n', '1H4,William Shakespeare,"Henry IV, Part I",Barbara A. Mowat,Washington Square Press,New York,1994\n', '1H6,William Shakespeare,"Henry VI, Part 1",Barbara A. Mowat,Washington Square Press,New York,2008\n']


In [7]:
# Naive parsing: split every line on commas. This breaks quoted fields that
# themselves contain a comma (e.g. "Henry IV, Part I" ends up as two fields).
entries = []
with open(csv_file) as stream:  # the `with` block closes the file afterwards
    for line in stream:
        entries.append(line.strip().split(','))

for entry in entries[:3]:
    print(entry)

['fname', 'author', 'title', 'editor', 'publisher', 'pubplace', 'date']
['1H4', 'William Shakespeare', '"Henry IV', ' Part I"', 'Barbara A. Mowat', 'Washington Square Press', 'New York', '1994']
['1H6', 'William Shakespeare', '"Henry VI', ' Part 1"', 'Barbara A. Mowat', 'Washington Square Press', 'New York', '2008']


In [8]:
import csv

# Parse with the csv module, which honours quoting, so a title such as
# "Henry IV, Part I" stays a single field.
entries = []
# newline='' leaves newline handling to the csv module, as its documentation
# requires for file objects handed to csv.reader.
with open(csv_file, newline='') as stream:
    reader = csv.reader(stream, delimiter=',')
    # Tuple unpacking: every row must hold exactly these seven columns.
    for fname, author, title, editor, publisher, pubplace, date in reader:
        entries.append((fname, title))

# csv.reader does not skip the header row, so it is the first entry printed.
for entry in entries[:5]:
    print(entry)

('fname', 'title')
('1H4', 'Henry IV, Part I')
('1H6', 'Henry VI, Part 1')
('2H4', 'Henry IV, Part 2')
('2H6', 'Henry VI, Part 2')


In [9]:
entries = []
# newline='' leaves newline handling to the csv module, as its documentation
# requires for file objects handed to csv.reader.
with open(csv_file, newline='') as stream:
    reader = csv.reader(stream, delimiter=',')
    # Keep the 1st and 3rd columns only: `_` discards the 2nd column and
    # `*_` absorbs however many columns follow the title.
    for fname, _, title, *_ in reader:
        entries.append((fname, title))

# csv.reader does not skip the header row, so it is the first entry printed.
for entry in entries[:5]:
    print(entry)

('fname', 'title')
('1H4', 'Henry IV, Part I')
('1H6', 'Henry VI, Part 1')
('2H4', 'Henry IV, Part 2')
('2H6', 'Henry VI, Part 2')


In [10]:
# `_` serves as a throwaway name for the positions that are not needed.
a, _, c, _, _ = range(5)
print(a, c)

0 2


In [11]:
# A starred target collects all remaining items into a list.
a, *l = range(5)
print(a, l)

0 [1, 2, 3, 4]


In [12]:
# Index/slice counterpart of the starred form. Note the difference: slicing a
# range gives back another range, not a list.
seq = range(5)
a, l = seq[0], seq[1:]
print(a, l)

0 range(1, 5)


In [13]:
# The starred target may sit in the middle: `a` and `b` take the first and
# last items, and `l` becomes the list of everything in between.
a, *l, b = range(5)
print(a, l, b)

0 [1, 2, 3] 4


In [14]:
# DictReader takes the field names from the first row, so each later row is
# returned as a mapping from column name to value and the header itself is
# not part of the data.
# newline='' leaves newline handling to the csv module, as its documentation
# requires for file objects handed to a reader.
with open(csv_file, newline='') as stream:
    reader = csv.DictReader(stream, delimiter=',')
    entries = list(reader)

for entry in entries[:5]:
    print(entry['fname'], entry['title'])

1H4 Henry IV, Part I
1H6 Henry VI, Part 1
2H4 Henry IV, Part 2
2H6 Henry VI, Part 2
3H6 Henry VI, Part 3


# PDF

In [16]:
import PyPDF2 as PDF

In [23]:
# file_path is rebound here to the PDF edition of the play.
file_path = 'data/folger/pdf/1H4.pdf'
# PdfReader replaces the deprecated PDF.PdfFileReader(file_path) call.
pdf = PDF.PdfReader(file_path)

In [25]:
# len(pdf.pages) replaces the deprecated pdf.getNumPages() call.
n_pages = len(pdf.pages)
print(f'PDF has {n_pages} pages.')

PDF has 113 pages.


In [28]:
# pdf.pages[n] replaces the deprecated pdf.getPage(n). Page indexes start at
# 0, so index 1 selects the second page.
page = pdf.pages[1]
# page.extract_text() replaces the deprecated page.extractText().
content = page.extract_text()
# Preview the first 150 characters only.
print(content[:150])

FrontMatterFrom the Director of the Folger ShakespeareLibraryTextual IntroductionSynopsisCharacters in the PlayACT 1Scene 1Scene 2Scene 3ACT 2Scene 1S


In [33]:
def pdf2txt(fname, page_numbers=None, concatenate=False):
    """Extract the text of selected pages from a PDF file.

    Arguments:
        fname: path of the PDF file to read, given as a string.
        page_numbers: the pages to extract, either a single integer or a
            sequence of integers. The values are used as indexes into the
            reader's page list. With None (the default) every page is
            extracted.
        concatenate: when True, the page texts are joined with newlines
            into one string; when False (the default), they are returned
            as a list holding one string per page.

    Returns:
        One string if concatenate is True, otherwise a list of strings
        with one item for each extracted page.

    """
    # PdfReader is the replacement for the deprecated PdfFileReader class.
    reader = PDF.PdfReader(fname)
    if isinstance(page_numbers, int):
        # A lone page number is handled as a one-item selection.
        selection = (page_numbers,)
    elif page_numbers is None:
        selection = range(len(reader.pages))
    else:
        selection = page_numbers
    extracted = []
    for number in selection:
        extracted.append(reader.pages[number].extract_text())
    if concatenate:
        return '\n'.join(extracted)
    return extracted

In [36]:
# Extract every page and join the page texts into a single string.
text = pdf2txt(file_path, concatenate=True)
# Preview only the beginning: printing the text of every page at once floods
# the notebook output.
print(text[:1000])

Folger Shakespeare Libraryhttp://www.folgerdigitaltexts.org
FrontMatterFrom the Director of the Folger ShakespeareLibraryTextual IntroductionSynopsisCharacters in the PlayACT 1Scene 1Scene 2Scene 3ACT 2Scene 1Scene 2Scene 3Scene 4ACT 3Scene 1Scene 2Scene 3ACT 4Scene 1Scene 2Scene 3Scene 4ACT 5Scene 1Scene 2Scene 3Scene 4Scene 5Contents
Michael WitmoreDirector, Folger Shakespeare LibraryIt is hard to imagine a world without Shakespeare. Since theircomposition four hundred years ago, Shakespeare’s plays and poemshave traveled the globe, inviting those who see and read his works tomake them their own.Readers of the New Folger Editions are part of this ongoing processof “taking up Shakespeare,” finding our own thoughts and feelings inlanguage that strikes us as old or unusual and, for that very reason,new. We still struggle to keep up with a writer who could think a milea minute, whose words paint pictures that shift like clouds. Theseexpertly edited texts are presented to the public as a 

In [37]:
# Extract a selection of pages. The numbers are zero-based indexes into the
# PDF's page list; without concatenate=True the result is a list holding one
# string per requested page.
sample = pdf2txt(file_path, page_numbers=[1, 4, 9])
print(sample)

['FrontMatterFrom the Director of the Folger ShakespeareLibraryTextual IntroductionSynopsisCharacters in the PlayACT 1Scene 1Scene 2Scene 3ACT 2Scene 1Scene 2Scene 3Scene 4ACT 3Scene 1Scene 2Scene 3ACT 4Scene 1Scene 2Scene 3Scene 4ACT 5Scene 1Scene 2Scene 3Scene 4Scene 5Contents', 'chains of magic were not bound,”), half-square brackets (forexample, from Henry V: “With blood and sword and fire to win yourright,”), or angle brackets (for example, from Hamlet: “O farewell,honest soldier. Who hath relieved/you?”). At any point in the text,you can hover your cursor over a bracket for more information.Because the Folger Digital Texts are edited in accord with twenty-firstcentury knowledge about Shakespeare’s texts, the Folger hereprovides them to readers, scholars, teachers, actors, directors, andstudents, free of charge, confident of their quality as texts of the playsand pleased to be able to make this contribution to the study andenjoyment of Shakespeare.\n', '11Henry IV , Part IACT 1. S

# JSON