## Solution

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

class Downloader():
    '''
    A downloader class that finds links on an index page, parses every linked page with `Book`
    and collects the results in a single pandas DataFrame.
    
    Usage
    -----
    Instantiate the class; the constructor downloads the index page, extracts the links
    and parses every linked page straight away.
    
    Input
    -----
    `link`: link to the index page (defaults to the L-Space wiki list of Pratchett characters)
    
    Attributes
    ----------
    `soup`: BeautifulSoup object holding the parsed index page
    `links`: list with absolute links to the individual pages
    `books`: pd.DataFrame with `character_name` and `book` columns
    
    '''
    # Host prepended to every href found on the index page
    # (the hrefs are presumably site-relative -- confirm against the page markup).
    BASE_URL = 'http://wiki.lspace.org'
    # Seconds to wait for the server; without a timeout `requests.get` may block forever.
    REQUEST_TIMEOUT = 30

    def __init__(self,link='http://wiki.lspace.org/mediawiki/List_of_Pratchett_characters'):
        '''
        Constructor extracting links (`self.links`) from the given page and creating
        the DataFrame (`self.books`) with character names and books.
        
        `link`: the link to the page with list of links
        
        Raises `requests.HTTPError` when the server answers with an error status.
        '''
        resp = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        resp.raise_for_status()
        self.soup = BeautifulSoup(resp.text,'lxml')
        
        self.getLinks()
        self.getBooks()
        
        
    def getLinks(self):
        '''
        Extracts the links to the individual pages and stores them in `self.links`.
        
        List items without an anchor carrying an `href` are skipped.
        '''
        section_header = self.soup.find('span',{'id':'Unseen_University_and_the_Wizards'})
        # NOTE(review): this walk is tied to the exact markup of the index page; it climbs
        # from the section heading to a container whose <li> items hold the wanted links
        # (presumably the list preceding that heading -- confirm against the live page).
        container = section_header.parent.previous.previous.parent.parent.parent
        anchors = (li.find('a', href=True) for li in container.find_all('li'))
        self.links = [self.BASE_URL + anchor['href'] for anchor in anchors if anchor is not None]
    
    def getBooks(self):
        '''
        For each link stored in `links` builds the corresponding `Book` object and
        concatenates their DataFrames into `self.books`.
        
        Nothing is returned; the result is stored on the instance. The rows get a fresh
        0..n-1 index so that index labels are unique across pages.
        '''
        frames = [Book(link).df for link in self.links]
        if not frames:
            # pd.concat raises on an empty list; keep the expected schema instead.
            self.books = pd.DataFrame(columns=['character_name', 'book'])
            return
        self.books = pd.concat(frames, axis = 0, ignore_index = True)

    
class Book():
    '''
    A class for a scraped page containing its parsed HTML as well as a DataFrame
    with the (character name, book) pairs found on it.
    
    Usage
    -----
    Just instantiate the object with a link pointing at the page.
    
    Input
    -----
    `link`: the wiki link to the individual page
    
    Attributes
    ----------
    `link`: the link the page was downloaded from
    `soup`: BeautifulSoup object holding the parsed page
    `df`: pd.DataFrame produced by `parseBook`
    '''
    # Column layout of `df`; also used when a page yields no rows at all.
    COLUMNS = ['character_name', 'book']
    # Seconds to wait for the server; without a timeout `requests.get` may block forever.
    REQUEST_TIMEOUT = 30

    def __init__(self,link):
        '''
        Downloads the HTML of the page, parses all relevant information and stores it
        in the DataFrame called `df`.
        
        Raises `requests.HTTPError` when the server answers with an error status.
        '''
        resp = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        resp.raise_for_status()
        self.link = link
        self.soup = BeautifulSoup(resp.text,'lxml')
        
        self.df = self.parseBook()
        
    def parseBook(self):
        '''
        Returns DataFrame with pairs of character name and the book as 2 columns.
        
        The character name is the text of the page's `firstHeading`; the books are the
        texts of all anchors inside the first table whose href contains "Book:".
        A page without a table, or without such anchors, gives an empty DataFrame
        that still carries both columns.
        '''

        name = self.soup.find('h1',{'id':'firstHeading'}).text
        table = self.soup.find('table')
        if table is None:
            # No table on the page: there is nothing to extract.
            all_book_links = []
        else:
            all_book_links = table.find_all("a", href=lambda href: href and "Book:" in href)
        books = [book_link.text for book_link in all_book_links]
        print(f'Book {name} parsed')
        records = [{'character_name':name,'book':b} for b in books]
        # Passing `columns` keeps the schema stable even when `records` is empty.
        return pd.DataFrame(records, columns=self.COLUMNS)
    

In [2]:
# Scrape the index page and every page it links to; the combined table ends up in `dl.books`.
character_list_url = 'http://wiki.lspace.org/mediawiki/List_of_Pratchett_characters'
dl = Downloader(link=character_list_url)

Book Evadne Cake parsed
Book Fred Colon parsed
Book Marietta Cosmopilite parsed
Book Detritus parsed
Book Cut-Me-Own-Throat Dibbler parsed
Book Dorfl parsed
Book Rufus Drumknott parsed
Book Gaspode parsed
Book Carrot Ironfoundersson parsed
Book Leonard of Quirm parsed
Book Cheery Littlebottom parsed
Book Nobby Nobbs parsed
Book Sybil Ramkin parsed
Book Foul Ole Ron parsed
Book Reg Shoe parsed
Book Slant parsed
Book Angua von Überwald parsed
Book Havelock Vetinari parsed
Book Samuel Vimes parsed
Book Visit-The-Infidel-With-Explanatory-Pamphlets parsed
Book Willikins parsed


### Question 1

In [3]:
# Number of rows whose `book` column equals 'Men at Arms'.
in_men_at_arms = dl.books['book'] == 'Men at Arms'
len(dl.books.loc[in_men_at_arms])

14

### Question 2

In [4]:
# one possibility: collect each character's books and intersect the two sets
is_evadne = dl.books['character_name'] == 'Evadne Cake'
is_pamphlets = dl.books['character_name'] == 'Visit-The-Infidel-With-Explanatory-Pamphlets'
evadne_books = dl.books.loc[is_evadne, 'book']
pamphlets_books = dl.books.loc[is_pamphlets, 'book']
set(evadne_books) & set(pamphlets_books)

{'Jingo'}

In [5]:
# alternative: for every book, check whether both characters are listed for it
def _features_both(group):
    '''Returns True when both characters occur among the group's character names.'''
    names = group.character_name.values
    return ('Visit-The-Infidel-With-Explanatory-Pamphlets' in names) and ('Evadne Cake' in names)

dl.books.groupby('book').apply(_features_both)

book
Feet of Clay                        False
Going Postal                        False
Guards! Guards!                     False
Hogfather                           False
I Shall Wear Midnight               False
Interesting Times                   False
Jingo                                True
Johnny and the Bomb                 False
Making Money                        False
Maskerade                           False
Men at Arms                         False
Monstrous Regiment                  False
Mort                                False
Moving Pictures                     False
Night Watch                         False
Raising Steam                       False
Reaper Man                          False
Small Gods                          False
Snuff                               False
Soul Music                          False
Sourcery                            False
The Celebrated Discworld Almanak    False
The Colour of Magic                 False
The Fifth Elephant           