## Loading in data

In [1]:
import pandas as pd
import numpy as np

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

INFO:rdflib:RDFLib Version: 4.2.1


### The code below is what I used to extract the metadata from the Project Gutenberg website. I would like to credit [Andreas van Cranenburgh](https://gist.github.com/andreasvc/b3b4189120d84dec8857) for making this code available on Github.

In [2]:
import os
import re
import gzip
import tarfile
import urllib
import xml.etree.cElementTree as ElementTree
try:
    import cPickle as pickle
except ImportError:
    import pickle


In [66]:
# Cache locations and download source for the Gutenberg RDF catalog.
PICKLEFILE = '/tmp/md.pickle.gz'  # The Python dict produced by this module
RDFFILES = '/tmp/rdf-files.tar.bz2'  # The catalog downloaded from Gutenberg
RDFURL = r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
# Keys of every per-book record; dict.fromkeys(META_FIELDS) pre-fills them
# with None before parsing.
META_FIELDS = ('id', 'author', 'title', 'downloads', 'formats', 'type', 'LCC',
		'subjects', 'authoryearofbirth', 'authoryearofdeath', 'language')
# XML namespace URIs, interpolated into ElementTree paths as
# '{%(prefix)s}tag' % NS.
NS = dict(
		pg='http://www.gutenberg.org/2009/pgterms/',
		dc='http://purl.org/dc/terms/',
		dcam='http://purl.org/dc/dcam/',
		rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#')
# One or more line breaks together with the spaces/tabs around them.
# Written as a plain raw string: the ur'' prefix is Python-2-only syntax and
# the pattern is ASCII-only, so dropping the u does not change what it matches.
LINEBREAKRE = re.compile(r'[ \t]*[\n\r]+[ \t]*')
# 'etext', 'ebook' or 'eook' followed by either '#<digits>' (group
# etextid_front) or '<digits>#' (group etextid_back), case-insensitively.
ETEXTRE = re.compile(r'''
	e(text|b?ook)
	\s*
	(\#\s*(?P<etextid_front>\d+)
	|
	(?P<etextid_back>\d+)\s*\#)
	''', re.IGNORECASE | re.VERBOSE)


def readmetadata():
	"""Read/create cached metadata dump of Gutenberg catalog.

	If PICKLEFILE exists it is unpickled and returned. Otherwise every RDF
	document produced by getrdfdata() is parsed with parsemetadata() and the
	resulting dictionary is pickled (gzip-compressed) to PICKLEFILE.

	Returns:
		A dictionary mapping each Gutenberg id (int) to a dictionary with
		the following fields:
		id (int): Gutenberg identifier of text
		author (str): Last name, First name
		authoryearofbirth (int): year of birth of the author
		authoryearofdeath (int): year of death of the author
		title (str): title of work
		subjects (set of str): descriptive subjects; a subject may be
			hierarchical, e.g:
			'England -- Social life and customs -- 19th century -- Fiction'
		LCC (set of str): Library of Congress Classifications, e.g., 'PS'
		language (list of str): list of two letter language codes.
		type (str): 'Text', 'Sound', ...
		formats (dict of str, str pairs): keys are MIME types, values are URLs.
		downloads (int): the number of times this ebook has been
			downloaded from the Gutenberg site in the last 30 days.
	Fields that are not part of the metadata are set to None.
	http://www.gutenberg.org/wiki/Gutenberg:Help_on_Bibliographic_Record_Page
	"""
	if os.path.exists(PICKLEFILE):
		# NOTE(security): unpickling can execute arbitrary code. Only load a
		# cache file that this code wrote itself.
		# The with-block closes the gzip handle instead of leaking it.
		with gzip.open(PICKLEFILE, 'rb') as cachefile:
			metadata = pickle.load(cachefile)
	else:
		metadata = {}
		for xml in getrdfdata():
			ebook = xml.find(r'{%(pg)s}ebook' % NS)
			if ebook is None:
				# Document without a pg:ebook element: nothing to record.
				continue
			result = parsemetadata(ebook)
			if result is not None:
				metadata[result['id']] = result
		# Closing the handle explicitly guarantees the compressed stream is
		# flushed and finalized before the function returns.
		with gzip.open(PICKLEFILE, 'wb') as cachefile:
			pickle.dump(metadata, cachefile, protocol=-1)
	return metadata

def getrdfdata():
	"""Downloads Project Gutenberg RDF catalog.

	The archive is fetched from RDFURL into RDFFILES only when RDFFILES does
	not exist yet; afterwards every readable member is parsed as XML.

	Yields:
		xml.etree.ElementTree.ElementTree: The parsed document of one
		archive member (an etext meta-data definition).
	"""
	if not os.path.exists(RDFFILES):
		urllib.urlretrieve(RDFURL, RDFFILES)
	with tarfile.open(RDFFILES) as archive:
		for tarinfo in archive:
			fileobj = archive.extractfile(tarinfo)
			if fileobj is None:
				# extractfile() returns None for members that are not
				# files or links (e.g. directories); there is nothing
				# to parse in that case.
				continue
			yield ElementTree.parse(fileobj)

def parsemetadata(ebook):
	"""Parses an etext meta-data definition to extract fields.

	Args:
		ebook (xml.etree.ElementTree.Element): An ebook meta-data definition.

	Returns:
		dict: One entry per name in META_FIELDS. 'subjects' and 'LCC' are
		always sets and 'formats' is always a dict (possibly empty); every
		other field is None when the element does not provide it.
	"""
	result = dict.fromkeys(META_FIELDS)
	# get etext no: the last path component of the rdf:about attribute
	about = ebook.get('{%(rdf)s}about' % NS)
	result['id'] = int(os.path.basename(about))
	# author
	creator = ebook.find('.//{%(dc)s}creator' % NS)
	if creator is not None:
		name = creator.find('.//{%(pg)s}name' % NS)
		if name is not None:
			result['author'] = safeunicode(name.text, encoding='utf-8')
		birth = creator.find('.//{%(pg)s}birthdate' % NS)
		if birth is not None:
			result['authoryearofbirth'] = int(birth.text)
		death = creator.find('.//{%(pg)s}deathdate' % NS)
		if death is not None:
			result['authoryearofdeath'] = int(death.text)
	# title
	title = ebook.find('.//{%(dc)s}title' % NS)
	if title is not None:
		result['title'] = fixsubtitles(
				safeunicode(title.text, encoding='utf-8'))
	# subject lists
	result['subjects'], result['LCC'] = set(), set()
	for subject in ebook.findall('.//{%(dc)s}subject' % NS):
		res = subject.find('.//{%(dcam)s}memberOf' % NS)
		value = subject.find('.//{%(rdf)s}value' % NS)
		# A subject without a vocabulary marker or without a value element
		# cannot be classified; skip it instead of failing on `.text`.
		if res is None or value is None:
			continue
		res = res.get('{%(rdf)s}resource' % NS)
		if res == ('%(dc)sLCSH' % NS):
			result['subjects'].add(value.text)
		elif res == ('%(dc)sLCC' % NS):
			result['LCC'].add(value.text)
	# formats: MIME type -> URL of the file
	result['formats'] = {}
	for fileelem in ebook.findall('.//{%(pg)s}file' % NS):
		mimetype = fileelem.find('{%(dc)s}format//{%(rdf)s}value' % NS)
		if mimetype is None:
			# File entry without a format value: no key to store it under.
			continue
		result['formats'][mimetype.text] = fileelem.get(
				'{%(rdf)s}about' % NS)
	# type
	booktype = ebook.find('.//{%(dc)s}type//{%(rdf)s}value' % NS)
	if booktype is not None:
		result['type'] = booktype.text
	# languages: an empty list is normalized to None
	lang = ebook.findall('.//{%(dc)s}language//{%(rdf)s}value' % NS)
	result['language'] = [a.text for a in lang] or None
	# download count
	downloads = ebook.find('.//{%(pg)s}downloads' % NS)
	if downloads is not None:
		result['downloads'] = int(downloads.text)
	return result

def etextno(lines):
	"""Retrieves the id for an etext.

	The lines are scanned in order and the number from the first line that
	matches ETEXTRE is returned.

	Args:
		lines (iter): The lines of the etext to search.
	Returns:
		int: The id of the etext.
	Raises:
		ValueError: If no etext id was found.
	Examples:
		>>> etextno(['Release Date: March 17, 2004 [EBook #11609]'])
		11609
		>>> etextno(['Release Date: July, 2003 [Etext# 4263]'])
		4263
		>>> etextno(['Release Date: November 29, 2003 [Eook #10335]'])
		10335
		>>> etextno(['December, 1998  [Etext 1576#]'])
		1576
		>>> etextno(['Some lines', 'without', 'Any [Etext] Number'])
		Traceback (most recent call last):
			...
		ValueError: no etext-id found
	"""
	for line in lines:
		match = ETEXTRE.search(line)
		if match is None:
			continue
		# The two named groups are the two branches of one alternation and
		# each requires at least one digit, so whenever the pattern matches
		# exactly one of them holds a non-empty digit string. The former
		# "neither group matched" branch could never run and was removed.
		number = match.group('etextid_front') or match.group('etextid_back')
		return int(number)
	raise ValueError('no etext-id found')


def fixsubtitles(title):
	"""Join a multi-line title into a single line.

	The first line break (with any spaces/tabs around it) becomes ': ' and
	every later one becomes '; '.

	>>> fixsubtitles(u'First Across ...\r\nThe Story of ... \r\n'
	... 'Being an investigation into ...')
	u'First Across ...: The Story of ...; Being an investigation into ...'
	"""
	with_colon = LINEBREAKRE.sub(': ', title, count=1)
	return LINEBREAKRE.sub('; ', with_colon)


def safeunicode(arg, *args, **kwargs):
	"""Return ``arg`` as a unicode object.

	A value that already is unicode is handed back untouched; anything else
	is passed to ``unicode()`` together with the extra positional and
	keyword arguments (e.g. ``encoding='utf-8'``).
	"""
	if isinstance(arg, unicode):
		return arg
	return unicode(arg, *args, **kwargs)

# Names exported by `from <module> import *`; only readmetadata is listed.
__all__ = ['readmetadata']

In [4]:
x = readmetadata()

In [5]:
x

{0: {'LCC': set(),
  'author': None,
  'authoryearofbirth': None,
  'authoryearofdeath': None,
  'downloads': None,
  'formats': {},
  'id': 0,
  'language': None,
  'subjects': set(),
  'title': None,
  'type': 'Text'},
 1: {'LCC': {'E201', 'JK'},
  'author': u'Jefferson, Thomas',
  'authoryearofbirth': 1743,
  'authoryearofdeath': 1826,
  'downloads': 668,
  'formats': {'application/epub+zip': 'http://www.gutenberg.org/ebooks/1.epub.images',
   'application/prs.tex': 'http://www.gutenberg.org/6/5/2/6527/6527-t/6527-t.tex',
   'application/rdf+xml': 'http://www.gutenberg.org/ebooks/1.rdf',
   'application/x-mobipocket-ebook': 'http://www.gutenberg.org/ebooks/1.kindle.noimages',
   'application/zip': 'http://www.gutenberg.org/files/1/1.zip',
   'text/html': 'http://www.gutenberg.org/ebooks/1.html.images',
   'text/plain': 'http://www.gutenberg.org/ebooks/1.txt.utf-8',
   'text/plain; charset=us-ascii': 'http://www.gutenberg.org/files/1/1.txt'},
  'id': 1,
  'language': ['en'],
  'subje

In [67]:
meta_df = pd.DataFrame(x)

In [68]:
meta_df = meta_df.T

## Now I have a dataframe that has metadata about all the books

### My next steps are to clean this up a little to make it useable. I will need to:
- Remove row 0 (there is no book with id 0)
- Remove anything that is not in a text format, such as audiobooks or datasets.
- Remove books that are not written in English
- Remove books to which access has been blocked (there are several books like this, probably because of copyright disputes)

In [69]:
meta_df.head()

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type
0,{},,,,,{},0,,{},,Text
1,"{E201, JK}","Jefferson, Thomas",1743.0,1826.0,668.0,{u'text/html': u'http://www.gutenberg.org/eboo...,1,[en],"{United States. Declaration of Independence, U...",The Declaration of Independence of the United ...,Text
2,"{KF, JK}",United States,,,176.0,{u'text/html': u'http://www.gutenberg.org/file...,2,[en],{United States. Constitution. 1st-10th Amendme...,The United States Bill of Rights: The Ten Orig...,Text
3,{E838},"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,26.0,{u'text/html': u'http://www.gutenberg.org/file...,3,[en],{Presidents -- United States -- Inaugural addr...,John F. Kennedy's Inaugural Address,Text
4,{E456},"Lincoln, Abraham",1809.0,1865.0,59.0,{u'text/html': u'http://www.gutenberg.org/file...,4,[en],"{Lincoln, Abraham, 1809-1865. Gettysburg addre...",Lincoln's Gettysburg Address: Given November 1...,Text


In [70]:
meta_df.shape

(54053, 11)

In [71]:
meta_df = meta_df.sort_values(by = 'id')

In [72]:
meta_df.reset_index(inplace = True, drop=True)
meta_df.head()

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type
0,{},,,,,{},0,,{},,Text
1,"{E201, JK}","Jefferson, Thomas",1743.0,1826.0,668.0,{u'text/html': u'http://www.gutenberg.org/eboo...,1,[en],"{United States. Declaration of Independence, U...",The Declaration of Independence of the United ...,Text
2,"{KF, JK}",United States,,,176.0,{u'text/html': u'http://www.gutenberg.org/file...,2,[en],{United States. Constitution. 1st-10th Amendme...,The United States Bill of Rights: The Ten Orig...,Text
3,{E838},"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,26.0,{u'text/html': u'http://www.gutenberg.org/file...,3,[en],{Presidents -- United States -- Inaugural addr...,John F. Kennedy's Inaugural Address,Text
4,{E456},"Lincoln, Abraham",1809.0,1865.0,59.0,{u'text/html': u'http://www.gutenberg.org/file...,4,[en],"{Lincoln, Abraham, 1809-1865. Gettysburg addre...",Lincoln's Gettysburg Address: Given November 1...,Text


In [73]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54053 entries, 0 to 54052
Data columns (total 11 columns):
LCC                  54053 non-null object
author               52072 non-null object
authoryearofbirth    39253 non-null object
authoryearofdeath    38324 non-null object
downloads            54051 non-null object
formats              54053 non-null object
id                   54053 non-null object
language             54051 non-null object
subjects             54053 non-null object
title                53978 non-null object
type                 54053 non-null object
dtypes: object(11)
memory usage: 4.5+ MB


## I only want to keep books with type = Text

In [74]:
meta_df['type'].unique()

array(['Text', 'Dataset', 'StillImage', 'MovingImage', 'Sound', 'Image',
       'Collection'], dtype=object)

In [75]:
meta_df[meta_df['type'] == 'Collection']

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type
10802,{},,,,24,{u'text/html': u'http://www.gutenberg.org/eboo...,10802,[en],{},"Project Gutenberg ""10K"" DVD",Collection
11220,{},,,,37,{u'text/html': u'http://www.gutenberg.org/eboo...,11220,[en],{},"Project Gutenberg ""Best Of"" CD August 2003",Collection
19159,{},,,,69,{u'application/rdf+xml': u'http://www.gutenber...,19159,[en],{},Project Gutenberg DVD: The July 2006 Special,Collection


In [76]:
meta_df = meta_df[meta_df['type'] == 'Text']

## Removing the weird null row

In [77]:
meta_df.tail(2)

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type
54051,{},"Verrill, A. Hyatt (Alpheus Hyatt)",1871.0,1954.0,0.0,{u'application/zip': u'http://www.gutenberg.or...,54051,[en],{},"The Book of the Sailboat: How to rig, sail and...",Text
54052,{},,,,,{},999999,,{},Piccole anime,Text


In [78]:
meta_df = meta_df[~ meta_df['language'].isnull()]

In [79]:
meta_df.tail(2)

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type
54050,{},"Sale-Barker, Lucy Elizabeth Drummond",,,0,{u'image/jpeg': u'http://www.gutenberg.org/cac...,54050,[en],{},Little Wideawake: A story book for little chil...,Text
54051,{},"Verrill, A. Hyatt (Alpheus Hyatt)",1871.0,1954.0,0,{u'application/zip': u'http://www.gutenberg.or...,54051,[en],{},"The Book of the Sailboat: How to rig, sail and...",Text


In [80]:
# drop=True is not passed here, so the previous row labels are kept as a new
# 'index' column in meta_df.
meta_df.reset_index(inplace = True)

## Cleaning the language column

I only want books that are exclusively written in English

In [81]:
type(meta_df['language'][0])

list

In [82]:
def clean_language(lang_list):
    """Return 1 when 'en' is the only entry in ``lang_list``, otherwise 0."""
    only_english = 'en' in lang_list and len(lang_list) == 1
    return int(only_english)
    
meta_df['english'] = meta_df['language'].map(clean_language)
meta_df.head(2)

Unnamed: 0,index,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english
0,1,"{E201, JK}","Jefferson, Thomas",1743.0,1826.0,668,{u'text/html': u'http://www.gutenberg.org/eboo...,1,[en],"{United States. Declaration of Independence, U...",The Declaration of Independence of the United ...,Text,1
1,2,"{KF, JK}",United States,,,176,{u'text/html': u'http://www.gutenberg.org/file...,2,[en],{United States. Constitution. 1st-10th Amendme...,The United States Bill of Rights: The Ten Orig...,Text,1


In [83]:
meta_df = meta_df[meta_df['english'] == 1]

In [84]:
meta_df.shape

(43010, 13)

## Now I'm going to start dealing with the subjects

In [85]:
meta_df['subjects']

#here's what they look like

0        {United States. Declaration of Independence, U...
1        {United States. Constitution. 1st-10th Amendme...
2        {Presidents -- United States -- Inaugural addr...
3        {Lincoln, Abraham, 1809-1865. Gettysburg addre...
4        {United States. Constitution, United States --...
5        {Virginia -- Politics and government -- 1775-1...
6        {Pilgrims (New Plymouth Colony), Massachusetts...
7        {United States -- Politics and government -- 1...
8        {United States -- Politics and government -- 1...
9                                                  {Bible}
10                                               {Fantasy}
11                                               {Fantasy}
12                              {Nonsense verses, English}
13       {Political science -- Handbooks, manuals, etc....
14       {Ship captains -- Fiction, Whaling ships -- Fi...
15       {Peter Pan (Fictitious character) -- Fiction, ...
16       {Church of Jesus Christ of Latter-day Saints -.

In [86]:
type(meta_df.subjects[0])

set

In [87]:
meta_df.subjects[0]

{'United States -- History -- Revolution, 1775-1783 -- Sources',
 'United States. Declaration of Independence'}

In [88]:
import string

def subject_cleaning(subj_set):
    """Split hierarchical subject headings into their individual terms.

    Each subject is split on '--', surrounding whitespace is removed from
    every part, and the parts of all subjects are pooled together.

    Args:
        subj_set: iterable of subject strings such as
            'United States -- History -- Revolution, 1775-1783 -- Sources'.

    Returns:
        numpy.ndarray: the sorted, de-duplicated terms (the result of
        ``np.unique``); empty when ``subj_set`` is empty.
    """
    subjs = []
    for s in subj_set:
        # The str.strip() method replaces the module-level string.strip()
        # function, which exists only in Python 2.
        subjs.extend(part.strip() for part in s.split('--'))
    return np.unique(subjs)
        
meta_df['subjects2'] = meta_df.subjects.map(subject_cleaning)
meta_df.head()

Unnamed: 0,index,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english,subjects2
0,1,"{E201, JK}","Jefferson, Thomas",1743.0,1826.0,668,{u'text/html': u'http://www.gutenberg.org/eboo...,1,[en],"{United States. Declaration of Independence, U...",The Declaration of Independence of the United ...,Text,1,"[History, Revolution, 1775-1783, Sources, Unit..."
1,2,"{KF, JK}",United States,,,176,{u'text/html': u'http://www.gutenberg.org/file...,2,[en],{United States. Constitution. 1st-10th Amendme...,The United States Bill of Rights: The Ten Orig...,Text,1,"[Civil rights, Sources, United States, United ..."
2,3,{E838},"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,26,{u'text/html': u'http://www.gutenberg.org/file...,3,[en],{Presidents -- United States -- Inaugural addr...,John F. Kennedy's Inaugural Address,Text,1,"[1961-1963, Foreign relations, Inaugural addre..."
3,4,{E456},"Lincoln, Abraham",1809.0,1865.0,59,{u'text/html': u'http://www.gutenberg.org/file...,4,[en],"{Lincoln, Abraham, 1809-1865. Gettysburg addre...",Lincoln's Gettysburg Address: Given November 1...,Text,1,"[Consecration of cemeteries, Gettysburg, Linco..."
4,5,"{KF, JK}",United States,,,429,{u'application/x-mobipocket-ebook': u'http://w...,5,[en],"{United States. Constitution, United States --...",The United States Constitution,Text,1,"[1783-1789, Politics and government, Sources, ..."


In [89]:
meta_df['subjects2'][0]

array(['History', 'Revolution, 1775-1783', 'Sources', 'United States',
       'United States. Declaration of Independence'], 
      dtype='|S42')

In [90]:
meta_df['subjects_there'] = meta_df['subjects2'].map(lambda x: 1 if len(x) > 0 else 0)
meta_df[meta_df['subjects_there'] == 0]

Unnamed: 0,index,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english,subjects2,subjects_there
174,182,{},,,,0,{},182,[en],{},,Text,1,[],0
175,183,{},,,,0,{},183,[en],{},,Text,1,[],0
176,184,{},,,,0,{},184,[en],{},,Text,1,[],0
177,185,{},,,,0,{},185,[en],{},,Text,1,[],0
178,186,{},,,,0,{},186,[en],{},,Text,1,[],0
179,187,{},,,,0,{},187,[en],{},,Text,1,[],0
180,188,{},,,,0,{},188,[en],{},,Text,1,[],0
181,189,{},,,,0,{},189,[en],{},,Text,1,[],0
182,190,{},,,,0,{},190,[en],{},,Text,1,[],0
183,191,{},,,,0,{},191,[en],{},,Text,1,[],0


## I'm going to remove the rows where there is no author or title

There are many books that showed up in my dataframe with a book id but no data about them. This means that the book has been made unavailable on Project Gutenberg for one reason or another.
After removing these books, I will still have some empty subjects, but not as many.

In [91]:
meta_df = meta_df[(meta_df['title'].notnull()) & (meta_df['author'].notnull())]

In [92]:
meta_df[meta_df['subjects_there'] == 0]

Unnamed: 0,index,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english,subjects2,subjects_there
17761,18459,{PQ},"Colonna, Francesco",,1527,99,{u'image/jpeg': u'http://www.gutenberg.org/cac...,18459,[en],{},Hypnerotomachia: The Strife of Loue in a Dreame,Text,1,[],0
22068,23100,{AC},"De Morgan, Augustus",1806,1871,32,{u'text/html; charset=iso-8859-1': u'http://ww...,23100,[en],{},"A Budget of Paradoxes, Volume I",Text,1,[],0
23013,24096,{PQ},"Huysmans, J.-K. (Joris-Karl)",1848,1907,23,{u'application/rdf+xml': u'http://www.gutenber...,24096,[en],{},En Route,Text,1,[],0
24219,25302,{PQ},"Daudet, Alphonse",1840,1897,18,{u'text/html; charset=utf-8': u'http://www.gut...,25302,[en],{},Jack: 1877,Text,1,[],0
24262,25345,{PT},"Wassermann, Jakob",1873,1934,10,{u'text/plain; charset=utf-8': u'http://www.gu...,25345,[en],{},The Goose Man,Text,1,[],0
24322,25405,{PQ},"France, Anatole",1844,1924,28,{u'image/jpeg': u'http://www.gutenberg.org/cac...,25405,[en],{},Honey-Bee: 1911,Text,1,[],0
24323,25406,{PQ},"France, Anatole",1844,1924,12,{u'text/html; charset=utf-8': u'http://www.gut...,25406,[en],{},Marguerite,Text,1,[],0
24324,25407,{PQ},"France, Anatole",1844,1924,10,{u'text/plain; charset=utf-8': u'http://www.gu...,25407,[en],{},The Merrie Tales of Jacques Tournebroche: And ...,Text,1,[],0
24325,25408,{PQ},"France, Anatole",1844,1924,15,{u'text/html; charset=utf-8': u'http://www.gut...,25408,[en],{},Child Life In Town And Country: 1909,Text,1,[],0
24326,25409,{PQ},"France, Anatole",1844,1924,15,{u'image/jpeg': u'http://www.gutenberg.org/cac...,25409,[en],{},The Story Of The Duchess Of Cicogne And Of Mon...,Text,1,[],0


In [93]:
meta_df.shape

(41376, 15)

In [94]:
meta_df.reset_index(inplace = True, drop = True)

In [95]:
meta_df.head()

Unnamed: 0,index,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,english,subjects2,subjects_there
0,1,"{E201, JK}","Jefferson, Thomas",1743.0,1826.0,668,{u'text/html': u'http://www.gutenberg.org/eboo...,1,[en],"{United States. Declaration of Independence, U...",The Declaration of Independence of the United ...,Text,1,"[History, Revolution, 1775-1783, Sources, Unit...",1
1,2,"{KF, JK}",United States,,,176,{u'text/html': u'http://www.gutenberg.org/file...,2,[en],{United States. Constitution. 1st-10th Amendme...,The United States Bill of Rights: The Ten Orig...,Text,1,"[Civil rights, Sources, United States, United ...",1
2,3,{E838},"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,26,{u'text/html': u'http://www.gutenberg.org/file...,3,[en],{Presidents -- United States -- Inaugural addr...,John F. Kennedy's Inaugural Address,Text,1,"[1961-1963, Foreign relations, Inaugural addre...",1
3,4,{E456},"Lincoln, Abraham",1809.0,1865.0,59,{u'text/html': u'http://www.gutenberg.org/file...,4,[en],"{Lincoln, Abraham, 1809-1865. Gettysburg addre...",Lincoln's Gettysburg Address: Given November 1...,Text,1,"[Consecration of cemeteries, Gettysburg, Linco...",1
4,5,"{KF, JK}",United States,,,429,{u'application/x-mobipocket-ebook': u'http://w...,5,[en],"{United States. Constitution, United States --...",The United States Constitution,Text,1,"[1783-1789, Politics and government, Sources, ...",1


In [106]:
# Preview only: the result of drop() is not assigned, so meta_df itself still
# contains these three columns after this cell.
meta_df.drop(['index', 'english', 'subjects_there'], axis = 1)

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,language,subjects,title,type,subjects2
0,"{E201, JK}","Jefferson, Thomas",1743,1826,668,{u'text/html': u'http://www.gutenberg.org/eboo...,1,[en],"{United States. Declaration of Independence, U...",The Declaration of Independence of the United ...,Text,"[History, Revolution, 1775-1783, Sources, Unit..."
1,"{KF, JK}",United States,,,176,{u'text/html': u'http://www.gutenberg.org/file...,2,[en],{United States. Constitution. 1st-10th Amendme...,The United States Bill of Rights: The Ten Orig...,Text,"[Civil rights, Sources, United States, United ..."
2,{E838},"Kennedy, John F. (John Fitzgerald)",1917,1963,26,{u'text/html': u'http://www.gutenberg.org/file...,3,[en],{Presidents -- United States -- Inaugural addr...,John F. Kennedy's Inaugural Address,Text,"[1961-1963, Foreign relations, Inaugural addre..."
3,{E456},"Lincoln, Abraham",1809,1865,59,{u'text/html': u'http://www.gutenberg.org/file...,4,[en],"{Lincoln, Abraham, 1809-1865. Gettysburg addre...",Lincoln's Gettysburg Address: Given November 1...,Text,"[Consecration of cemeteries, Gettysburg, Linco..."
4,"{KF, JK}",United States,,,429,{u'application/x-mobipocket-ebook': u'http://w...,5,[en],"{United States. Constitution, United States --...",The United States Constitution,Text,"[1783-1789, Politics and government, Sources, ..."
5,{E201},"Henry, Patrick",1736,1799,89,{u'text/html': u'http://www.gutenberg.org/file...,6,[en],{Virginia -- Politics and government -- 1775-1...,Give Me Liberty or Give Me Death,Text,"[1775-1783, Politics and government, Sources, ..."
6,{E456},"Lincoln, Abraham",1809,1865,21,{u'text/html; charset=us-ascii': u'http://www....,8,[en],{United States -- Politics and government -- 1...,Abraham Lincoln's Second Inaugural Address,Text,"[1861-1865, Inaugural addresses, Politics and ..."
7,{E456},"Lincoln, Abraham",1809,1865,23,{u'text/html; charset=iso-8859-1': u'http://ww...,9,[en],{United States -- Politics and government -- 1...,Abraham Lincoln's First Inaugural Address,Text,"[1861-1865, Inaugural addresses, Politics and ..."
8,"{PZ, PR}","Carroll, Lewis",1832,1898,13208,{u'text/plain; charset=utf-8': u'http://www.gu...,11,[en],{Fantasy},Alice's Adventures in Wonderland,Text,[Fantasy]
9,"{PZ, PR}","Carroll, Lewis",1832,1898,2176,{u'text/html; charset=utf-8': u'http://www.gut...,12,[en],{Fantasy},Through the Looking-Glass,Text,[Fantasy]


In [111]:
# Keep only the listed columns, in this order; columns not named here
# ('index', 'english', 'subjects_there') are left out of the result.
# reindex(columns=...) replaces the deprecated reindex_axis(..., axis=1).
meta_df = meta_df.reindex(columns=['id', 'title', 'author', 'LCC', 'downloads', 'subjects', 'subjects2', 'formats', 'authoryearofbirth', 'authoryearofdeath', 'type', 'language'])

In [112]:
meta_df.head()

Unnamed: 0,id,title,author,LCC,downloads,subjects,subjects2,formats,authoryearofbirth,authoryearofdeath,type,language
0,1,The Declaration of Independence of the United ...,"Jefferson, Thomas","{E201, JK}",668,"{United States. Declaration of Independence, U...","[History, Revolution, 1775-1783, Sources, Unit...",{u'text/html': u'http://www.gutenberg.org/eboo...,1743.0,1826.0,Text,[en]
1,2,The United States Bill of Rights: The Ten Orig...,United States,"{KF, JK}",176,{United States. Constitution. 1st-10th Amendme...,"[Civil rights, Sources, United States, United ...",{u'text/html': u'http://www.gutenberg.org/file...,,,Text,[en]
2,3,John F. Kennedy's Inaugural Address,"Kennedy, John F. (John Fitzgerald)",{E838},26,{Presidents -- United States -- Inaugural addr...,"[1961-1963, Foreign relations, Inaugural addre...",{u'text/html': u'http://www.gutenberg.org/file...,1917.0,1963.0,Text,[en]
3,4,Lincoln's Gettysburg Address: Given November 1...,"Lincoln, Abraham",{E456},59,"{Lincoln, Abraham, 1809-1865. Gettysburg addre...","[Consecration of cemeteries, Gettysburg, Linco...",{u'text/html': u'http://www.gutenberg.org/file...,1809.0,1865.0,Text,[en]
4,5,The United States Constitution,United States,"{KF, JK}",429,"{United States. Constitution, United States --...","[1783-1789, Politics and government, Sources, ...",{u'application/x-mobipocket-ebook': u'http://w...,,,Text,[en]


## At this point I'm going to save a csv file
I want this dataframe to be available to me in a new notebook so I can do further analysis.

In [None]:
meta_df.to_csv('gutenberg_metadata_en.csv', encoding = 'utf8')