In [None]:
# This notebook scrapes gutenberg.org for the top novels listed on the site.
# The scraped text is then used to create the files for a subsequent machine learning notebook.

In [2]:
import pandas as pd
import requests
import bs4 as bs
from bs4 import BeautifulSoup
import urllib.request
import re
import urllib3

In [None]:
# Create the (initially empty) pandas dataframe used for the novels; later cells
# split the scraped text between narration and dialogue.
# `pd.read_csv()` with no path raises a TypeError, so start from an empty frame instead.
df = pd.DataFrame(columns=['book'])

In [3]:
# Request gutenberg.org's "top" page, which lists its most popular free publications.
# Gutenberg was preferred because its novels are available as HTML, which is easier to parse.
# A timeout is set because requests otherwise waits indefinitely on a stalled connection,
# and raise_for_status() stops the notebook here instead of parsing an HTTP error page.
r = requests.get('https://gutenberg.org/browse/scores/top', timeout=30)
r.raise_for_status()

In [4]:
# Parse the HTML returned by gutenberg into a BeautifulSoup tree, using
# Python's built-in 'html.parser' backend.
soup = BeautifulSoup(markup=r.text, features='html.parser')

In [5]:
# Collect (at most) the first 107 <a> tags found inside the first element whose
# CSS class is 'body' on the top-novels page.
bookLinks = list(soup.find(class_='body').find_all('a', limit=107))

In [6]:
# The first 7 anchors in 'bookLinks' were observed to be unwanted links on every run,
# so they are skipped. Slicing into a new name (rather than `del bookLinks[:7]`)
# keeps this cell idempotent: re-running it no longer strips 7 more links each time.
topBookLinks = bookLinks[7:]
# The regex search in the next cell works on text, so render the list of tags as one string.
bookString = str(topBookLinks)

In [7]:
# Compile a pattern that captures the value of every href="..." attribute, then
# pull those values out of the stringified anchors. They are the relative paths
# of the web pages that hold the desired novels.
urlSearch = re.compile(r'href="(.*?)"')
listUrls = [match.group(1) for match in urlSearch.finditer(bookString)]
listUrls

['/ebooks/1342',
 '/ebooks/84',
 '/ebooks/844',
 '/ebooks/376',
 '/ebooks/215',
 '/ebooks/43',
 '/ebooks/2542',
 '/ebooks/11',
 '/ebooks/120',
 '/ebooks/1080',
 '/ebooks/98',
 '/ebooks/1635',
 '/ebooks/1952',
 '/ebooks/514',
 '/ebooks/46',
 '/ebooks/1661',
 '/ebooks/76',
 '/ebooks/2701',
 '/ebooks/1250',
 '/ebooks/61703',
 '/ebooks/74',
 '/ebooks/2591',
 '/ebooks/1064',
 '/ebooks/158',
 '/ebooks/345',
 '/ebooks/174',
 '/ebooks/5200',
 '/ebooks/16',
 '/ebooks/1260',
 '/ebooks/160',
 '/ebooks/219',
 '/ebooks/205',
 '/ebooks/1400',
 '/ebooks/25525',
 '/ebooks/25344',
 '/ebooks/2600',
 '/ebooks/23700',
 '/ebooks/45',
 '/ebooks/2852',
 '/ebooks/28860',
 '/ebooks/1232',
 '/ebooks/2814',
 '/ebooks/55',
 '/ebooks/4300',
 '/ebooks/36',
 '/ebooks/16328',
 '/ebooks/768',
 '/ebooks/42324',
 '/ebooks/113',
 '/ebooks/408',
 '/ebooks/1184',
 '/ebooks/2500',
 '/ebooks/3825',
 '/ebooks/35',
 '/ebooks/730',
 '/ebooks/2554',
 '/ebooks/863',
 '/ebooks/61698',
 '/ebooks/236',
 '/ebooks/203',
 '/ebooks/2805

In [8]:
# Fetch the top-books page again to gather more book locations (intended: the
# "last 30 days" list) so there is a larger pool to choose from. The lists are
# combined and duplicates scrubbed in a later cell.
# NOTE(review): '#books-last30' is a URL fragment. Fragments are resolved on the
# client and are not sent to the server, so the server only sees
# /browse/scores/top; selecting the "last 30 days" section has to happen when the
# HTML is parsed, not via this URL.
# A timeout avoids hanging forever on a stalled connection, and raise_for_status()
# avoids silently parsing an HTTP error page.
s = requests.get('https://www.gutenberg.org/browse/scores/top#books-last30', timeout=30)
s.raise_for_status()
soupS = BeautifulSoup(s.text, 'html.parser')


In [9]:
# Gather up to 100 <a> tags from the first ordered list (<ol>) on the page.
# Searching inside the <ol> (rather than across the whole document) avoids
# picking up other page links that are not wanted here.
bookLinks2 = list(soupS.find('ol').find_all('a', limit=100))
# Stringify the tags so the compiled href regex can be applied to them.
bookString2 = str(bookLinks2)
listUrls2 = [match.group(1) for match in urlSearch.finditer(bookString2)]

In [10]:
# Merge the two scraped URL lists: entries of listUrls2 first, then any entry of
# listUrls not already present, keeping first-seen order.
# The original `merge = listUrls2` only aliased the list, so every append also
# grew listUrls2 itself. Building a new list leaves both inputs untouched, and
# dict.fromkeys drops duplicates without a quadratic `in` scan.
merge = list(dict.fromkeys(listUrls2 + listUrls))


In [11]:
# Turn each relative path captured by the regex into an absolute URL by
# prefixing the site root. These intermediary pages are scraped next to locate
# each novel's own page.
merge2 = ['https://www.gutenberg.org' + path for path in merge]

In [12]:
# Wrap every URL in literal single quotes (added to work around a quoting problem).
# The cell that fetches the book pages iterates over merge2, not merge3.
merge3 = ["'" + url + "'" for url in merge2]
merge3

["'https://www.gutenberg.org/ebooks/1342'",
 "'https://www.gutenberg.org/ebooks/376'",
 "'https://www.gutenberg.org/ebooks/84'",
 "'https://www.gutenberg.org/ebooks/11'",
 "'https://www.gutenberg.org/ebooks/844'",
 "'https://www.gutenberg.org/ebooks/1635'",
 "'https://www.gutenberg.org/ebooks/25525'",
 "'https://www.gutenberg.org/ebooks/98'",
 "'https://www.gutenberg.org/ebooks/23700'",
 "'https://www.gutenberg.org/ebooks/2542'",
 "'https://www.gutenberg.org/ebooks/514'",
 "'https://www.gutenberg.org/ebooks/1661'",
 "'https://www.gutenberg.org/ebooks/1952'",
 "'https://www.gutenberg.org/ebooks/215'",
 "'https://www.gutenberg.org/ebooks/1080'",
 "'https://www.gutenberg.org/ebooks/2701'",
 "'https://www.gutenberg.org/ebooks/1064'",
 "'https://www.gutenberg.org/ebooks/43'",
 "'https://www.gutenberg.org/ebooks/158'",
 "'https://www.gutenberg.org/ebooks/2591'",
 "'https://www.gutenberg.org/ebooks/2600'",
 "'https://www.gutenberg.org/ebooks/28860'",
 "'https://www.gutenberg.org/ebooks/205'",

In [13]:
# Visit each book's landing page and collect the anchor tag(s) whose text is
# exactly 'Read this book online: HTML'; their hrefs give the specific address
# of each book's text.
# A timeout is set because requests waits indefinitely by default, and one
# stalled connection would otherwise hang the whole loop.
textLink = []
for url in merge2:
    t = requests.get(url, timeout=30)
    soupT = BeautifulSoup(t.text, 'html.parser')
    textLink.extend(soupT.find_all('a', string='Read this book online: HTML'))



In [14]:
# Render the collected anchor tags as a single string so the href regex can scan it.
textLinkString = repr(textLink)

In [15]:
# Extract every href value from the stringified "read online" anchors, then
# prefix the site root to turn each one into a full URL.
finalUrls = [match.group(1) for match in urlSearch.finditer(textLinkString)]
finalUrls2 = ['https://www.gutenberg.org' + path for path in finalUrls]

In [94]:
# Empty frame with a single 'book' column.
df = pd.DataFrame(data=None, columns=['book'])


In [86]:
# This cell does not need to be run. It collected page titles, which are no longer needed because
# they can be taken from the previous web scrapes.

# title = []
# body = []
# try:
#     for i in finalUrls2:
#         u = requests.get(i)
#         soupU = BeautifulSoup(u.text, 'html.parser')
#         for j in soupU.find('title'):
#             title.append(j)
# except:
#     print('')

In [164]:
# Build the two corpora used by the follow-up machine-learning notebook:
# paragraphs that contain quoted speech (dialogue) and paragraphs that do not
# (narrative), pooled over every book page that can be fetched and parsed.
# Several filtering passes are needed to strip artifacts left by the differing
# HTML layouts of the book pages.

# Paragraphs matching this pattern are treated as leftover HTML markup and dropped.
# NOTE(review): the last alternative, (.*=), matches any paragraph containing '='.
HTML_ARTIFACT_PATTERN = r'(class)|(href)|(link)|(.*=)'
# A paragraph counts as dialogue when it has two straight double quotes on one
# line, or any curly double quote (U+201C or U+201D).
DIALOGUE_PATTERN = r'(\".*\")+|([\u201C].*)+|([\u201D].*)+'

# Per-book frames are collected in lists and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0 and re-copies the accumulated frame
# on every call.
dialogueFrames = []
narrativeFrames = []
for idx, url in enumerate(finalUrls2):  # one request per book URL from the previous cell
    v = requests.get(url, timeout=30)  # timeout: requests waits forever by default
    soupV = BeautifulSoup(v.text, 'html.parser')
    try:
        # Pages use different HTML structures, so this lookup may fail (for
        # example when find('body') returns None); such books are reported and skipped.
        book = list(soupV.find('body').find_all_next('p'))
        dfBook = pd.DataFrame(book, columns=['words'])
        # First length filter runs BEFORE the cast to str, i.e. on the parsed
        # objects rather than on their text.
        # NOTE(review): presumably this drops paragraphs with many child nodes - confirm.
        dfBook = dfBook[dfBook.words.str.len() < 4]
        dfBook = dfBook.astype(str)
        # Second length filter is on the text: keep entries longer than 40 characters.
        dfBook = dfBook[dfBook.words.str.len() > 40]
        # Masks are kept as standalone Series instead of being written into (and
        # later popped from) filtered slices, which avoids chained-assignment warnings.
        isHtml = dfBook.words.str.contains(HTML_ARTIFACT_PATTERN).astype(bool)
        dfBook = dfBook[~isHtml]
        isDialogue = dfBook.words.str.contains(DIALOGUE_PATTERN).astype(bool)
        dfBookDialogue = dfBook[isDialogue]
        dfBookNarrative = dfBook[~isDialogue]
    except Exception as err:  # not a bare except: Ctrl-C still interrupts the loop
        print(str(idx) + 'no work good')
        print('    ' + url + ': ' + repr(err))  # show why this book was skipped
        continue
    # A book contributes dialogue only if it yielded more than one dialogue
    # paragraph; its narrative paragraphs are always kept.
    if len(dfBookDialogue) > 1:
        dialogueFrames.append(dfBookDialogue)
    narrativeFrames.append(dfBookNarrative)

# pd.concat raises on an empty list, so fall back to an empty frame.
dfDialogue = pd.concat(dialogueFrames) if dialogueFrames else pd.DataFrame()
dfNarrative = pd.concat(narrativeFrames) if narrativeFrames else pd.DataFrame()
# Write the two corpora as separate CSV files to be read by the fast.ai notebook.
dfDialogue.to_csv(r'/home/lonbergercj/books/dfDialogue.csv')
dfNarrative.to_csv(r'/home/lonbergercj/books/dfNarrative.csv')

14no work good
23no work good
25no work good
31no work good


In [19]:
# Reload the saved dialogue CSV to inspect the random artifacts left in the dataframes.
# A regex clean-up was tried here and is currently disabled:
# df.replace(regex=True,inplace=True,to_replace=[r'<.*>', '\\r','\\n', '()', 'â', '\\t'],value=r'')
df = pd.DataFrame(pd.read_csv(r'/home/lonbergercj/books/dfDialogue.csv'))

In [23]:
# Preview the reloaded frame (pandas truncates the display of long frames).
df

Unnamed: 0,words
0,"“My dear Mr Bennet,” said his lady to hi..."
1,"“But it is,” returned she; “for Mrs Long..."
2,“Do you not want to know who has taken i...
3,"“ want to tell me, and I have no objecti..."
4,"“Why, my dear, you must know, Mrs Long s..."
...,...
74904,"""Quick sir Here!"" Adam's staring eyes saw the ..."
74905,"With Paulette at their heels, Adam and Wayland..."
74907,Wayland's eyes lost their wild look A great si...
74908,"For a long moment no one spoke Then Paulette, ..."


In [18]:
# Write the current `df` to dfN.csv.
# NOTE(review): in file order `df` holds the contents of dfDialogue.csv, but the
# execution counts are out of sequence - confirm which frame is meant to be saved.
# The index is written as well (to_csv default), on top of the 'Unnamed: 0'
# column that the earlier read_csv produced.
df.to_csv(path_or_buf=r'/home/lonbergercj/books/dfN.csv')

In [7]:
# dfBook['dialogue'] = dfBook.words.str.contains(r'(\".*\")|(\'.*\')')

In [100]:
dfNarrative # example of the leftover HTML artifacts that needed to be removed from the dataframe -
# - for instance the <p>, \r and \n at the start of each row.

Unnamed: 0,words
66,<p>\r\n It is a truth universally acknowl...
67,<p>\r\n However little known the feelings...
68,"<p>\r\n “My dear Mr. Bennet,” said his la..."
69,<p>\r\n Mr. Bennet replied that he had no...
70,"<p>\r\n “But it is,” returned she; “for M..."
...,...
828,<p>\n If I should say that this is a visi...
829,<p>\n Nor will I deny but there were abun...
830,<p>\n It was a common thing to meet peopl...
831,"<p>\n It was now, as I said before, the p..."


In [97]:
# Preview the dialogue frame.
# NOTE(review): the saved output of this cell shows the same rows as the dfNarrative
# preview, and the execution counts are out of order - confirm it is current.
dfDialogue

Unnamed: 0,words
66,<p>\r\n It is a truth universally acknowl...
67,<p>\r\n However little known the feelings...
68,"<p>\r\n “My dear Mr. Bennet,” said his la..."
69,<p>\r\n Mr. Bennet replied that he had no...
70,"<p>\r\n “But it is,” returned she; “for M..."
...,...
828,<p>\n If I should say that this is a visi...
829,<p>\n Nor will I deny but there were abun...
830,<p>\n It was a common thing to meet peopl...
831,"<p>\n It was now, as I said before, the p..."
