In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

%matplotlib inline
%config InlineBackend.feature_format = 'retina'

INFO:rdflib:RDFLib Version: 4.2.1


## Loading in Meta Data for all books

In [2]:
meta_df = pd.read_csv('gutenberg_metadata_en.csv', index_col = 'Unnamed: 0')

In [3]:
meta_df.head(2)

Unnamed: 0,id,title,author,LCC,downloads,subjects,subjects2,formats,authoryearofbirth,authoryearofdeath,type,language
0,1,The Declaration of Independence of the United ...,"Jefferson, Thomas","{E201, JK}",668,"{United States. Declaration of Independence, U...","[History, Revolution, 1775-1783, Sources, Unit...",{u'text/html': u'http://www.gutenberg.org/eboo...,1743.0,1826.0,Text,[en]
1,2,The United States Bill of Rights: The Ten Orig...,United States,"{KF, JK}",176,{United States. Constitution. 1st-10th Amendme...,"[Civil rights, Sources, United States, United ...",{u'text/html': u'http://www.gutenberg.org/file...,,,Text,[en]


In [4]:
meta_df.shape

(41376, 12)

In [5]:
meta_df['subjects2'] = meta_df['subjects2'].map(lambda s: s.replace('[', '').replace(']', '').split(','))

## Narrowing down data - creating a data frame with only literature or fiction.

In [6]:
meta_df['literature'] = meta_df['subjects2'].map(lambda sub: 1 if 'literature' in str(sub).lower().replace("'", '') else 0)
meta_df['fiction'] = meta_df['subjects2'].map(lambda sub: 1 if 'fiction' in str(sub).lower().replace("'", '') else 0)

In [7]:
meta_df.literature.value_counts()

0    39657
1     1719
Name: literature, dtype: int64

In [8]:
meta_df.fiction.value_counts()

0    26928
1    14448
Name: fiction, dtype: int64

In [9]:
lit_fiction = meta_df[(meta_df.literature == 1) | (meta_df.fiction == 1)].reset_index(drop = True)

In [10]:
lit_fiction.head(2)

Unnamed: 0,id,title,author,LCC,downloads,subjects,subjects2,formats,authoryearofbirth,authoryearofdeath,type,language,literature,fiction
0,15,Moby Dick,"Melville, Herman",{PS},707,"{Ship captains -- Fiction, Whaling ships -- Fi...","[Adventure stories, Ahab, Captain (Fictitiou...",{u'text/html': u'http://www.gutenberg.org/eboo...,1819.0,1891.0,Text,[en],0,1
1,16,Peter Pan,"Barrie, J. M. (James Matthew)","{PZ, PR}",4778,"{Peter Pan (Fictitious character) -- Fiction, ...","[Fairies, Fantasy, Fiction, Juvenile fictio...",{u'text/plain; charset=utf-8': u'http://www.gu...,1860.0,1937.0,Text,[en],0,1


In [11]:
lit_fiction.shape, lit_fiction.shape[0] / 700.

((16049, 14), 22.927142857142858)

In [12]:
test = pd.read_csv('book_data_22', encoding='utf8')
test.tail(2)

Unnamed: 0.1,Unnamed: 0,id,text
698,698,44445,BESSIE AMONG THE MOUNTAINS\n\n\n\n\nTHE BESSIE...
699,699,44448,(http://www.freeliterature.org) from page imag...


In [13]:
del test

In [14]:
lit_fiction[lit_fiction['id'] == 44448]

Unnamed: 0,id,title,author,LCC,downloads,subjects,subjects2,formats,authoryearofbirth,authoryearofdeath,type,language,literature,fiction
15403,44448,The Queen of the Savannah: A Story of the Mexi...,"Aimard, Gustave",{PQ},5,"{Mexico -- History -- Wars of Independence, 18...","[Fiction, History, Mexico, Wars of Independ...",{u'text/html': u'http://www.gutenberg.org/file...,1818.0,1883.0,Text,[en],0,1


## Now I need to get the text of the books!

In [15]:
exceptions = []
book_data = []

for i, book_id in enumerate(lit_fiction.ix[15404:,'id'].tolist()):
    try:
        book_data.append({
            'id':   book_id,
            'text': strip_headers(load_etext(book_id)).strip()
        })
        
    except:
        print "Could not fetch: ", book_id
        exceptions.append({'id':   book_id})
    
    #if (i > 0)  and (i % 700 == 0):
    #    raw_df = pd.DataFrame(book_data)
    #    raw_df.to_csv('book_data_%d' % (i/700 + 16), encoding = 'utf8')
    #    del raw_df
    #    book_data = []

#if i % 700 != 0:
raw_df = pd.DataFrame(book_data)
raw_df.to_csv('book_data_23', encoding = 'utf8')
del raw_df

INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP 

## Count Vectorizing!

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
cvect = CountVectorizer(stop_words='english')

In [4]:
current_books = pd.read_csv('book_data_1', encoding = 'utf8', index_col='Unnamed: 0')
current_books.dropna(inplace = True)

In [5]:
X = cvect.fit_transform(current_books['text'])
count_df = pd.DataFrame(X.toarray(), columns=cvect.get_feature_names())

In [6]:
count_df.insert(0, 'book_id', current_books.id)

In [7]:
count_df.head(2)

Unnamed: 0,book_id,00,000,0001,000436,0009,000_l,000â,001,002,...,égal,émigré,était,étienne,étoiles_,être,îles,îsles,îµî¹,î½
0,15,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
count_df.tail(2)

Unnamed: 0,book_id,00,000,0001,000436,0009,000_l,000â,001,002,...,égal,émigré,était,étienne,étoiles_,être,îles,îsles,îµî¹,î½
697,1433,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
698,1437,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
count_df.shape

(699, 185564)

In [4]:
for i in range(2, 24):
    current_books = pd.read_csv('book_data_%d' %i, encoding = 'utf8', index_col='Unnamed: 0')
    current_books.dropna(inplace = True)
    X = cvect.fit_transform(current_books['text'])
    batch = pd.DataFrame(X.toarray(), columns=cvect.get_feature_names())
    batch.insert(0, 'book_id', current_books.id)
    count_df = count_df.append(batch)
    count_df.reset_index(drop = True, inplace = True)


In [12]:
count_df.shape

(3491, 411111)

In [None]:
count_df.to_csv('count_df_1_5', encoding = 'utf8')

In [10]:
len(count_df)

699

## Investigating the subjects in this new dataframe

In [56]:
subject_list = []
for subs in lit_fiction['subjects2'].values:
    for s in subs:
        subject_list.append(s.replace('"', '').replace("'", '').replace('(', '').replace(')', '').replace('.', '').strip().lower())
        
        
subject_list = np.unique(subject_list)
subject_list[1000:1010], len(subject_list)

(array(['armenia', 'armies', 'arran', 'arranged marriage', 'arson',
        'arsonists', 'ars\xc3\xa8ne fictitious character', 'art',
        'art and literature', 'art and morals'], 
       dtype='|S94'), 5948)

In [57]:
subject_list_all = []
for subs in meta_df['subjects2'].values:
    for s in subs:
        subject_list_all.append(s.replace('"', '').replace("'", '').replace('(', '').replace(')', '').replace('.', '').strip().lower())
        
        
subject_list_all = np.unique(subject_list_all)
subject_list_all[5000:5010], len(subject_list_all)

(array(['clothing workers', 'clouds', 'clover', 'clowns', 'clubs',
        'clusters', 'cnaeus marcius', 'coach horses',
        'coaching transportation', 'coahuila state'], 
       dtype='|S135'), 15458)

In [59]:
subject_counts = {subject:0 for subject in subject_list}
subject_counts_all = {subject:0 for subject in subject_list_all}

for s_list in lit_fiction['subjects2'].values:
    for item in s_list:
        s = item.replace('"', '').replace("'", "").replace('(', '').replace(')', '').replace('.', '').lstrip().lower()
        subject_counts[s] += 1
        
for s_list in meta_df['subjects2'].values:
    for item in s_list:
        s = item.replace('"', '').replace("'", "").replace('(', '').replace(')', '').replace('.', '').lstrip().lower()
        try:
            subject_counts_all[s] += 1
        except:
            print s
        
subject_counts, subject_counts_all

republican party us : 1854- 
ku klux klan 1915- 
republican party us : 1854- 


({'404-362 bc': 2,
  'spiders': 1,
  'cyprus': 1,
  'german east africa': 2,
  'mentally ill': 9,
  'electricity': 9,
  '1550-1588': 3,
  '1517-1648': 1,
  'father and child': 1,
  'meadows': 1,
  'frame-stories': 5,
  'tunis tunisia': 1,
  'saskatchewan': 3,
  '1756-1791': 1,
  'mary i': 4,
  'eugenics': 1,
  'whitman': 3,
  'yachting': 8,
  'upper class families': 11,
  '1810-1856': 1,
  'succession': 2,
  'women journalists': 4,
  'rowing clubs': 1,
  'prayers and devotions': 2,
  'milles': 1,
  'bacon': 1,
  'elections': 2,
  'second': 2,
  'edgar allan': 2,
  'forest animals': 5,
  'whaling': 20,
  'grocery shopping': 1,
  'cooking': 4,
  'of austria': 1,
  '12-41': 1,
  'women plantation owners': 1,
  'mennonites': 2,
  'consort of charles': 1,
  'fossil': 1,
  'the great': 2,
  'atoms': 1,
  'canes': 1,
  'china': 22,
  '1736-1796 ossian': 1,
  'scuba diving': 1,
  'slaveholders': 1,
  'bunny fictitious character': 12,
  'continuity of the church': 1,
  'military': 9,
  'critici

#### Some subjects are very specific and some are quite broad. I'm going to find subjects tagged in more than 50  books to narrow it down.

In [48]:
count = 0
for k, v in subject_counts.items():
    if v >= 100:
        count += 1
        print v, '\t', k
print '-------------'
print count

# There are 183 subjects that are tagged in > 50 books
# There are 100 subjects that are tagged in > 100 books

167 	schools
3531 	juvenile fiction
177 	war stories
1127 	19th century
110 	american fiction
191 	20th century
158 	new england
327 	world war
115 	christian fiction
1102 	england
102 	marriage
460 	juvenile literature
667 	adventure stories
143 	young men
254 	animals
350 	humorous stories
1311 	science fiction
137 	sailors
233 	new york ny
275 	history and criticism
583 	periodicals
115 	india
1420 	social life and customs
126 	childrens literature
161 	civil war
228 	fantasy fiction
192 	families
203 	popular literature
135 	revolution
118 	courtship
155 	british
308 	adventure and adventurers
147 	family
1517 	history
450 	american
167 	scotland
363 	man-woman relationships
259 	domestic fiction
105 	shipwrecks
315 	english fiction
607 	translations into english
117 	political fiction
306 	1914-1918
177 	mystery fiction
418 	young women
113 	california
316 	western stories
147 	americans
305 	girls
199 	modern
339 	christian life
125 	american literature
143 	bildungsromans
174 	p

In [42]:
subjects_over_50 = []
for k, v in subject_counts.items():
    if v >= 50:
        subjects_over_50.append(k)
subjects_over_50

['schools',
 'juvenile fiction',
 'war stories',
 'cousins',
 'oceania',
 'tom fictitious character',
 '19th century',
 'american fiction',
 '20th century',
 'russia',
 'new england',
 'world war',
 'christian fiction',
 'college students',
 'description and travel',
 'science',
 'england',
 'marriage',
 '16th century',
 'juvenile literature',
 'europe',
 'adventure stories',
 'satire',
 'australia',
 'young men',
 'rescues',
 'magic',
 'twins',
 'animals',
 'humorous stories',
 'southern states',
 'science fiction',
 'sailors',
 'ship captains',
 'new york ny',
 'spain',
 'physicians',
 'fairies',
 'controversial literature',
 'history and criticism',
 'periodicals',
 'aeronautics',
 'india',
 'paranormal fiction',
 'social life and customs',
 'childrens literature',
 'dogs',
 'germany',
 'civil war',
 'epistolary fiction',
 'fantasy fiction',
 'families',
 'boarding schools',
 'popular literature',
 'religious fiction',
 'brothers',
 'didactic fiction',
 'married women',
 'revolution

In [43]:
for sub in subject_list:
    if 'child' in sub.lower():
        print sub

abandoned children
adopted children
blind children
body schema in children
boxcar children fictitious characters
catholic children
child abuse
child caregivers
child labor
child rearing
child soldiers
child witnesses
childhood and youth
children
children and adults
children and animals
children and death
children and war
children in literature
children in the bible
children of alcoholics
children of clergy
children of divorced parents
children of military personnel
children of missionaries
children of physicians
children of police
children of presidents
children of prisoners
children of the rich
children of women prisoners
children with disabilities
childrens accidents
childrens literature
childrens periodicals
childrens plays
childrens poetry
childrens sermons
childrens songs
childrens stories
christian literature for children
deaf children
discipline of children
father and child
feral children
gifted children
grandparent and child
homeless children
illegitimate children
immigrant chi

In [44]:
for sub in subject_list:
    if 'juvenile' in sub.lower():
        print sub

juvenile
juvenile and popular literature
juvenile drama
juvenile fction
juvenile fiction
juvenile literature
juvenile poetry
