# See if character lists are available for LitBank texts

In [3]:
# Build LitBank titles
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

fpath = '/data/storyq/litbank_titles.csv'
litbank = pd.read_csv(fpath, sep='\t+')
litbank

  import sys


Unnamed: 0,Gutenberg ID,Date,Author,Title
0,514,1868,"Alcott, Louisa May",Little Women
1,18581,1904,"Alger, Horatio, Jr.",Adrift in New York: Tom and Florence Braving the World
2,5348,1868,"Alger, Horatio, Jr.","Ragged Dick, Or, Street Life in New York with the Boot-Blacks"
3,158,1815,"Austen, Jane",Emma
4,105,1818,"Austen, Jane",Persuasion
5,1342,1813,"Austen, Jane",Pride and Prejudice
6,1206,1914,"Bower, B. M.",The Flying U Ranch
7,969,1848,"Brontë, Anne",The Tenant of Wildfell Hall
8,1260,1847,"Brontë, Charlotte",Jane Eyre: An Autobiography
9,768,1847,"Brontë, Emily",Wuthering Heights


In [65]:
# Try to ping SparkNotes for character lists
import urllib
from bs4 import BeautifulSoup
import time
from tqdm.notebook import tqdm

chars = {}
books_found = []
books_notfound = []
books_identity = []

for title in tqdm(litbank.Title):
    print(title)
    if title in ['Black Beauty', 'The Secret Agent: A Simple Tale', 'Vanity Fair', 'The Legend of Sleepy Hollow', 'A Princess of Mars',
                'The Invisible Man: A Grotesque Romance', 'Night and Day', 'Lady Bridget in the Never-Never Land: a story of Australian life']:
        continue
    alts = []
    if 'Tom Jones' in title:
        alts.append('tomjones')
    if 'Portrait of the Artist' in title:
        alts.append('portraitartist')
    if 'Wuthering Heights' in title:
        alts.append('wuthering')
    if "Alice's Adventures" in title:
        alts.append('alice')
    if "Great Expectations" in title:
        alts.append('greatex')
    alts.append(urllib.parse.quote(title.lower().split(':')[0].replace(' ', '')))
    alts.append(urllib.parse.quote(title.lower().split(';')[0].replace(' ', '')))
    alts.append(urllib.parse.quote(title.replace('A', '').lower().split(':')[0].replace('the', '').replace("'", '').split(' ')[0]))
    alts.append(urllib.parse.quote(title.lower().split(':')[0].replace('the', '').replace("'", '').split(' ')[-1]))
    alts.append(urllib.parse.quote(title.lower().split(';')[0].replace('the', '').replace("'", '').split(' ')[-1]))
    alts.append(urllib.parse.quote(title.lower().split(':')[0].replace('the', '').replace("'s", '').split(' ')[-1]))
    alts.append(urllib.parse.quote(''.join(title.lower().split(':')[0].replace('the', '').replace("'", '').split(' ')[-2:])))
    alts.append(urllib.parse.quote(title.lower().split(':')[0].replace(' ', '').replace('the', '')))
    alts.append(urllib.parse.quote(title.lower().split(':')[0].replace(' ', '-')))
    if 'of' in title.lower().split(':')[0]:
        alts.append(urllib.parse.quote(title.lower().replace('the', '').split(' of ')[0].replace(' ', '')))
    for alt in alts:
        url = f'https://www.sparknotes.com/lit/{alt}/characters'
        try:
            res = str(urllib.request.urlopen(url).read())
            books_found.append((title, alt))
            print('\tfound')
            break
        except urllib.error.HTTPError:
            continue
        time.sleep(.2)
    else:
        books_notfound.append((title, alts))
        continue

    soup = BeautifulSoup(res, 'html.parser')
    el = soup.find('meta', {'name': "keywords"})
    chars[title] = el['content'].split('characters, ')[-1].split(', ')
    
    # Search for identity terms
    terms = ['black', 'slave', 'indian', 'chinese', 'arab']
    all_text = soup.get_text().lower()
    for term in terms:
        if term in all_text:
            books_identity.append((title, term, all_text))

    
print(f'#books found: {len(books_found)}')
print(f'#books not found: {len(books_notfound)}')
print(f'#books with identity terms: {len(books_identity)}')

  0%|          | 0/101 [00:00<?, ?it/s]

Little Women
	found
Adrift in New York: Tom and Florence Braving the World
Ragged Dick, Or, Street Life in New York with the Boot-Blacks
Emma
	found
Persuasion
	found
Pride and Prejudice
	found
The Flying U Ranch
The Tenant of Wildfell Hall
Jane Eyre: An Autobiography
	found
Wuthering Heights
	found
Clotelle: A Tale of the Southern States
The Secret Garden
	found
Evelina, Or, the History of a Young Lady's Entrance into the World
A Princess of Mars
Tarzan of the Apes
The Way of All Flesh
Alice's Adventures in Wonderland
	found
O Pioneers!
	found
The Song of the Lark
The House Behind the Cedars
	found
The Man Who Was Thursday: A Nightmare
The Awakening, and Selected Short Stories
The Secret Adversary
The Moonstone
	found
Heart of Darkness
	found
The Secret Agent: A Simple Tale
The Last of the Mohicans; A narrative of 1757
	found
The Red Badge of Courage: An Episode of the American Civil War
	found
Life in the Iron-Mills; Or, The Korl Woman
The Life and Adventures of Robinson Crusoe
	foun

In [66]:
for el in books_identity:
    print(el[0])
    print(el[1])
    print()

The Moonstone
black

The Moonstone
indian

The Last of the Mohicans; A narrative of 1757
indian

The Last of the Mohicans; A narrative of 1757
arab

The Life and Adventures of Robinson Crusoe
black

The Life and Adventures of Robinson Crusoe
slave

The Life and Adventures of Robinson Crusoe
arab

Great Expectations
black

Middlemarch
black

History of Tom Jones, a Foundling
black

The Good Soldier
black

The Turn of the Screw
black

A Portrait of the Artist as a Young Man
black

Dubliners
arab

Moby Dick; Or, The Whale
black

Moby Dick; Or, The Whale
indian

Moby Dick; Or, The Whale
chinese

Treasure Island
black

The Adventures of Tom Sawyer
slave



In [63]:
len(set(book[0] for book in books_identity))

14

In [64]:
books_found

[('Little Women', 'littlewomen'),
 ('Emma', 'emma'),
 ('Persuasion', 'persuasion'),
 ('Pride and Prejudice', 'pride'),
 ('Jane Eyre: An Autobiography', 'janeeyre'),
 ('Wuthering Heights', 'wuthering'),
 ('The Secret Garden', 'secretgarden'),
 ("Alice's Adventures in Wonderland", 'alice'),
 ('O Pioneers!', 'opioneers%21'),
 ('The House Behind the Cedars', 'cedars'),
 ('The Moonstone', 'moonstone'),
 ('Heart of Darkness', 'heart'),
 ('The Last of the Mohicans; A narrative of 1757', 'mohicans'),
 ('The Red Badge of Courage: An Episode of the American Civil War',
  'redbadge'),
 ('The Life and Adventures of Robinson Crusoe', 'crusoe'),
 ('Bleak House', 'bleakhouse'),
 ('David Copperfield', 'copperfield'),
 ('Great Expectations', 'greatex'),
 ('Oliver Twist', 'oliver'),
 ('The Hound of the Baskervilles', 'hound'),
 ('Sister Carrie: A Novel', 'sistercarrie'),
 ('Middlemarch', 'middlemarch'),
 ('Silas Marner', 'silas'),
 ('History of Tom Jones, a Foundling', 'tomjones'),
 ('This Side of Parad

In [47]:
books_notfound

[('Adrift in New York: Tom and Florence Braving the World',
  ['adriftinnewyork',
   'adriftinnewyork%3Atomandflorencebravingtheworld',
   'drift',
   'york',
   'york',
   'newyork',
   'adriftinnewyork',
   'adrift-in-new-york']),
 ('Ragged Dick, Or, Street Life in New York with the Boot-Blacks',
  ['raggeddick%2Cor%2Cstreetlifeinnewyorkwiththeboot-blacks',
   'raggeddick%2Cor%2Cstreetlifeinnewyorkwiththeboot-blacks',
   'rgged',
   'boot-blacks',
   'boot-blacks',
   'boot-blacks',
   'raggeddick%2Cor%2Cstreetlifeinnewyorkwithboot-blacks',
   'ragged-dick%2C-or%2C-street-life-in-new-york-with-the-boot-blacks']),
 ('The Flying U Ranch',
  ['theflyinguranch',
   'theflyinguranch',
   '',
   'ranch',
   'ranch',
   'uranch',
   'flyinguranch',
   'the-flying-u-ranch']),
 ('The Tenant of Wildfell Hall',
  ['thetenantofwildfellhall',
   'thetenantofwildfellhall',
   '',
   'hall',
   'hall',
   'wildfellhall',
   'tenantofwildfellhall',
   'the-tenant-of-wildfell-hall',
   'tenant']),
 (