In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import csv

"Book of the Week" was introduced in season 7. Seasons 7-12 each have their own page linked from the sidebar, reachable at URLs like https://writingexcuses.com/category/season/season-07/. This notebook scrapes those six season pages.

In [2]:
#root shared by every season archive URL
base = 'https://writingexcuses.com/category/season/season-'

#two-digit season suffixes, seasons 7 through 12
seasons = ['07/', '08/', '09/', '10/', '11/', '12/']

#join the root with each suffix to get the full list of pages to scrape
urls = [base + suffix for suffix in seasons]

In [3]:
#loop through list of URLs and parse for the audiobook divs indicating a book of the week
#bow_list ends up with one entry per URL: the list of <div class="wx_audiobook"> tags on that page
bow_list = []
for url in urls:
    #a timeout keeps a stalled connection from hanging the notebook indefinitely
    response = requests.get(url, timeout=30)
    #fail loudly on a 4xx/5xx reply instead of quietly parsing an error page into an empty result
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    bow_list.append(soup.find_all("div", class_="wx_audiobook"))

In [4]:
#get idea of list structure: first five audiobook divs collected from the first URL
bow_list[0][0:5]

[<div class="wx_audiobook"><p><em><a href="http://www.audible.com/pd/ref=sr_1_1?asin=B004XMIMHE&amp;qid=1325467550&amp;sr=1-1">Hard Magic</a>,</em> by Larry Correia, narrated by Bronson Pinchot</p>
 </div>,
 <div class="wx_audiobook"><p><em><a href="http://www.audible.com/pd/ref=sr_1_1?asin=B0036N2C7M&amp;qid=1326085485&amp;sr=1-1">A Fire Upon the Deep</a>,</em> by Vernor Vinge, narrated by Peter Larkin</p>
 </div>,
 <div class="wx_audiobook"><p><em><a href="http://www.audible.com/pd/ref=sr_1_1?asin=B002V8N9VG&amp;qid=1326674233&amp;sr=1-1">Speaker for the Dead</a></em>, by Orson Scott Card, narrated by Stefan Rudniki. It’s a fantastic example of well-constructed flora and fauna, and it’s also a good example of how to make a sequel almost completely unlike the book that came before it.</p>
 </div>,
 <div class="wx_audiobook"><p><em><a href="http://www.audible.com/pd/ref=sr_1_11?asin=B002UZL5SY&amp;qid=1327276360&amp;sr=1-11">Farenheit 451</a>,</em> by Ray Bradbury, narrated by the auth

In [5]:
#find the text under the 'a' tag in the first element of the soup
#(first audiobook div of the first page; the link text is the book title)
bow_list[0][0].find('a').text

'Hard Magic'

In [6]:
#create empty list for book titles
books = []

#iterate through nested list (one inner list of audiobook divs per page)
#append book title depending on the tag it falls under, in order of preference:
#  1. text of the first <em> (this can include a trailing comma)
#  2. otherwise text of the first <a>
#  3. otherwise text of the first <p>
for bow_season in bow_list:
    for book_element in bow_season:
        #look each tag up once and compare to None by identity
        title_tag = book_element.find('em')
        if title_tag is None:
            title_tag = book_element.find('a')
        if title_tag is None:
            title_tag = book_element.find('p')
        books.append(title_tag.text)

In [7]:
#check structure of book list (first five titles; some still end in a comma at this point)
books[0:5]

['Hard Magic,',
 'A Fire Upon the Deep,',
 'Speaker for the Dead',
 'Farenheit 451,',
 'Terrorists in Love: The Real Stories of Islamic Radicals']

In [8]:
#look again at the raw divs for the first page only (the slice keeps the outer list nesting)
bow_list[0:1]

[[<div class="wx_audiobook"><p><em><a href="http://www.audible.com/pd/ref=sr_1_1?asin=B004XMIMHE&amp;qid=1325467550&amp;sr=1-1">Hard Magic</a>,</em> by Larry Correia, narrated by Bronson Pinchot</p>
  </div>,
  <div class="wx_audiobook"><p><em><a href="http://www.audible.com/pd/ref=sr_1_1?asin=B0036N2C7M&amp;qid=1326085485&amp;sr=1-1">A Fire Upon the Deep</a>,</em> by Vernor Vinge, narrated by Peter Larkin</p>
  </div>,
  <div class="wx_audiobook"><p><em><a href="http://www.audible.com/pd/ref=sr_1_1?asin=B002V8N9VG&amp;qid=1326674233&amp;sr=1-1">Speaker for the Dead</a></em>, by Orson Scott Card, narrated by Stefan Rudniki. It’s a fantastic example of well-constructed flora and fauna, and it’s also a good example of how to make a sequel almost completely unlike the book that came before it.</p>
  </div>,
  <div class="wx_audiobook"><p><em><a href="http://www.audible.com/pd/ref=sr_1_11?asin=B002UZL5SY&amp;qid=1327276360&amp;sr=1-11">Farenheit 451</a>,</em> by Ray Bradbury, narrated by t

In [9]:
#flatten the nested list into one list of strings:
#the full paragraph text of every audiobook div, in page order

alltext = [
    audiobook_div.find('p').text
    for season_divs in bow_list
    for audiobook_div in season_divs
]

In [10]:
#check alltext structure (first five paragraph strings)
alltext[0:5]

['Hard Magic, by Larry Correia, narrated by Bronson Pinchot',
 'A Fire Upon the Deep, by Vernor Vinge, narrated by Peter Larkin',
 'Speaker for the Dead, by Orson Scott Card, narrated by Stefan Rudniki. It’s a fantastic example of well-constructed flora and fauna, and it’s also a good example of how to make a sequel almost completely unlike the book that came before it.',
 'Farenheit 451, by Ray Bradbury, narrated by the author',
 'Terrorists in Love: The Real Stories of Islamic Radicals, by Ken Ballen, narrated by Peter Ganim']

In [11]:
#create empty list for authors names to be pulled from alltext list
authors = []

#matches "by <author>, narrated by"; the capture is non-greedy so it stops at the
#first ", narrated by" instead of swallowing text up to a later one
author_pattern = re.compile(r"by (.*?), narrated by")

#iterate through alltext list and use regular expressions to try to eliminate unnecessary text
for index, line in enumerate(alltext):
    #remove only the first occurrence of the title, so a title repeated later in the
    #text (or contained in the author's name) is not stripped as well
    line = line.replace(books[index], '', 1)
    found = author_pattern.search(line)
    if found:
        authors.append(found.group(1))
    else:
        #no "by ..., narrated by" phrase: keep the remaining text as-is
        authors.append(line)

In [12]:
#check that lengths of lists are the same (each author entry must line up with a title)
len(authors) == len(books)

True

In [13]:
#drop the single trailing comma that some titles carry; other titles pass through unchanged
books = [title if not title.endswith(',') else title[:-1] for title in books]

In [14]:
#check it worked: the first five titles should no longer end in a comma
books[0:5]

['Hard Magic',
 'A Fire Upon the Deep',
 'Speaker for the Dead',
 'Farenheit 451',
 'Terrorists in Love: The Real Stories of Islamic Radicals']

In [15]:
clean_auth = []

#eliminate commas, spaces, and 'by' from beginning of some author names
#\b requires 'by' to start a word, so a name such as 'Abby Smith' is no longer
#cut down to 'Smith' by a 'by ' found in the middle of a word
by_word = re.compile(r'\bby ')

for author in authors:
    found = by_word.search(author)
    if found:
        #keep everything after the first whole-word 'by '
        clean_auth.append(author[found.end():])
    else:
        clean_auth.append(author)

In [16]:
#check it worked: first five cleaned author strings
clean_auth[0:5]

['Larry Correia',
 'Vernor Vinge',
 'Orson Scott Card',
 'Ray Bradbury',
 'Ken Ballen']

In [17]:
#pair each title with its author text, one row per entry, then show the frame
title_author_rows = list(zip(books, clean_auth))
finaldf = pd.DataFrame(title_author_rows, columns=['Title', 'Author/Info'])
finaldf

Unnamed: 0,Title,Author/Info
0,Hard Magic,Larry Correia
1,A Fire Upon the Deep,Vernor Vinge
2,Speaker for the Dead,Orson Scott Card
3,Farenheit 451,Ray Bradbury
4,Terrorists in Love: The Real Stories of Islami...,Ken Ballen
...,...,...
308,The Butcher of Khardov,Dan Wells
309,Oathbringer,Brandon Sanderson
310,Always Coming Home,Ursula Le Guin
311,The Diabolic,SJ Kincaid


In [18]:
#build a title -> author text dictionary from the two parallel lists
#note: a dict holds one value per key, so a title that occurs more than once keeps only its last author text
final = {title: clean_auth[position] for position, title in enumerate(books)}

In [19]:
#write to CSV: one "title, author text" row per dictionary entry, no header row
#the encoding is set explicitly so non-ASCII characters (e.g. curly apostrophes in the
#scraped text) are written the same way on every platform instead of using the locale default
with open('BOWfromdict.csv', 'w', newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    #each (key, value) pair from the dict becomes one two-column row
    writer.writerows(final.items())

In [20]:
#or, write to CSV from the dataframe
#(index=False is not passed, so the dataframe's row index is written as the first column)
finaldf.to_csv('BOWfromcsv.csv')