In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
from time import sleep
import tqdm
import pandas as pd
import numpy as np

### Get-Soup function
#### Downloads a web page and returns its HTML as a parsed BeautifulSoup object

In [2]:
def getSoup(link, timeout=30):
    '''
    Download a web page and return it parsed as a BeautifulSoup object.

    Taken from the lecture. It is kept outside of the 'DownloaderMidterm' class on purpose, because it is in principle
    reusable by several classes, so it does not have to be repeated as a method of each class.

    Parameters
    ----------
    link : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without a timeout 'requests' may wait forever.

    Returns
    -------
    bs4.BeautifulSoup
        The page content parsed with the 'lxml' parser.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, the connection fails or the server answers with an HTTP error status.
    '''
    sleep(0.1) #to be kind to the website
    r = requests.get(link, timeout=timeout)
    r.raise_for_status() #fail loudly instead of parsing an error page as if it were the requested content
    r.encoding = 'UTF-8'
    return BeautifulSoup(r.text,'lxml') #returns BeautifulSoup object in 'lxml' style

### Downloader Class
#### Uses the Get-Soup function to download the main page, collects the links to the character sub-pages and then scrapes each sub-page

In [3]:
class DownloaderMidterm():
    '''
    Scrapes Discworld characters from the L-Space wiki and provides some statistics about them.

    When initialized, the main page (the list of Pratchett characters) is downloaded, the links to the character
    sub-pages of one section are collected ('multi_links') and every sub-page is scraped into one long-format
    dataframe ('df_create'). The results are stored as attributes, so the other methods can use them without the
    user having to call anything in a particular order:

    link       : URL of the main page
    soup       : BeautifulSoup object of the main page
    multi_link : list of sub-page URLs as returned by 'multi_links'
    df         : dataframe as returned by 'df_create' with the columns 'character_name' and 'book'

    'num_char' and 'same_books' are statistics on 'df'. They have to be called by the user explicitly.
    '''

    BASE_URL = 'http://wiki.lspace.org' #the hrefs found on the main page are site-relative, so they get this prefix

    def __init__(self):
        '''
        Downloads the main page and immediately scrapes all sub-pages (needs network access).
        '''
        self.link = 'http://wiki.lspace.org/mediawiki/List_of_Pratchett_characters' #main page holding the links to the sub-pages
        self.soup = getSoup(self.link) #get Soup of the main page
        self.multi_link = self.multi_links() #stored as attribute so that other methods can use it
        self.df = self.df_create() #also built when initialized

    def multi_links(self, category='Ankh-Morpork_and_The_Watch'):
        '''
        Finds all the links which are leading to the sub-pages of one section of the main page.

        Parameters
        ----------
        category : str
            'id' of the <span> that marks the section heading. The first <ul> following the heading is used.

        Returns
        -------
        list of str
            Absolute URLs of the sub-pages, in page order.

        Raises
        ------
        ValueError
            If the section heading or the list following it cannot be found on the main page.
        '''
        heading = self.soup.find('span', id=category)
        if heading is None:
            raise ValueError(f'No section with id {category!r} found on the main page')
        link_list = heading.parent.find_next('ul')
        if link_list is None:
            raise ValueError(f'No list of links found after section {category!r}')

        links = []
        for li in link_list.find_all('li'):
            anchor = li.find('a')
            if anchor is not None and anchor.get('href'): #skip list items which do not contain a link
                links.append(self.BASE_URL + anchor['href'])
        return links

    @staticmethod
    def _character_name(page):
        '''
        Returns the full name from the 'Name' row of the info table; if that row is missing or empty the
        (shorter) name from the heading of the table is used instead.
        '''
        try:
            name = page.find('b', string='Name').parent.next_sibling.next_sibling.text.strip()
        except AttributeError: #some link of the chain is None, so there is no usable 'Name' row on this page
            name = ''
        if name:
            return name
        return page.find('table').find('tr').find('b').text.strip()

    @staticmethod
    def _book_titles(page):
        '''
        Returns the texts of all links in the element following the 'Bibliography' row as a list.
        '''
        book_links = page.find('a', title='Bibliography').parent.parent.next_sibling.next_sibling.find_all('a')
        return [book.text.strip() for book in book_links]

    @staticmethod
    def _long_frame(records):
        '''
        Turns a list of {'character_name': ..., 'book': ...} records into the sorted long-format dataframe.
        '''
        df_long = pd.DataFrame(records, columns=['character_name', 'book'])
        #mergesort is stable: the books of one character keep the order in which they were scraped
        return df_long.sort_values('character_name', kind='mergesort').reset_index(drop=True)

    def df_create(self):
        '''
        Iterates over the links/sub-pages in 'multi_link' and retrieves the name and the books of each character.

        Returns
        -------
        pandas.DataFrame
            Long-format dataframe with one row per (character_name, book) pair, sorted by 'character_name'
            with a fresh index. Characters without any book do not appear.
        '''
        records = []
        for link in self.multi_link:
            page = getSoup(link)
            name = self._character_name(page)
            for title in self._book_titles(page):
                records.append({'character_name': name, 'book': title})
        return self._long_frame(records)

    def num_char(self, book="Men at Arms", print_char= "NO"):
        '''
        Prints and returns the number of rows of 'df' (i.e. characters) which belong to the specified book.

        Parameters
        ----------
        book : str
            Title of the book, default "Men at Arms".
        print_char : str or bool
            If 'yes'/'y' (any case) or True, a dataframe with the characters appearing in the book is returned
            instead of the number.

        Returns
        -------
        int or pandas.DataFrame
            The number of characters, or the matching rows of 'df' if 'print_char' was switched on.
        '''
        in_book = self.df['book'] == book
        count = int(in_book.sum()) #number of rows where the condition is true
        print(f'The number of characters in the book {book} are', count)
        if str(print_char).strip().lower() in ('yes', 'y', 'true'):
            print('Here is a list of the characters:')
            return self.df.loc[in_book]
        return count

    def same_books(self, char1='Visit-The-Infidel-With-Explanatory-Pamphlets' ,char2='Evadne Cake'):
        '''
        Finds the books in which two characters appear together and prints a short notice about the result.

        Parameters
        ----------
        char1, char2 : str
            Character names as they are stored in the 'character_name' column of 'df'.

        Returns
        -------
        list of str
            Alphabetically sorted titles of the common books; an empty list if there are none.
        '''
        books = self.df
        books1 = set(books.loc[books['character_name'] == char1, 'book']) #appearances of each character
        books2 = set(books.loc[books['character_name'] == char2, 'book'])
        common = sorted(books1 & books2) #sorted, because the iteration order of a set is not reproducible
        if not common: #no common appearances
            print(f'There are no common appearances of the characters {char1} and {char2} in any book')
        else:
            print(f'The characters {char1} and {char2} appear together in the book(s):')
        return common


In [4]:
# Instantiating the class runs the whole scrape (main page plus every linked sub-page), so this cell needs network access.
b= DownloaderMidterm()

In [5]:
# Links to the sub-pages, collected by multi_links() when the class was initialized.
b.multi_link

['http://wiki.lspace.org/mediawiki/Mrs._Cake',
 'http://wiki.lspace.org/mediawiki/Fred_Colon',
 'http://wiki.lspace.org/mediawiki/Mrs._Marietta_Cosmopilite',
 'http://wiki.lspace.org/mediawiki/Detritus',
 'http://wiki.lspace.org/mediawiki/Cut-Me-Own-Throat_Dibbler',
 'http://wiki.lspace.org/mediawiki/Dorfl',
 'http://wiki.lspace.org/mediawiki/Rufus_Drumknott',
 'http://wiki.lspace.org/mediawiki/Gaspode',
 'http://wiki.lspace.org/mediawiki/Carrot_Ironfoundersson',
 'http://wiki.lspace.org/mediawiki/Leonard_of_Quirm',
 'http://wiki.lspace.org/mediawiki/Cheery_Littlebottom',
 'http://wiki.lspace.org/mediawiki/Nobby_Nobbs',
 'http://wiki.lspace.org/mediawiki/Lady_Sybil_Ramkin',
 'http://wiki.lspace.org/mediawiki/Foul_Ole_Ron',
 'http://wiki.lspace.org/mediawiki/Reg_Shoe',
 'http://wiki.lspace.org/mediawiki/Mr._Slant',
 'http://wiki.lspace.org/mediawiki/Angua_von_%C3%9Cberwald',
 'http://wiki.lspace.org/mediawiki/Havelock_Vetinari',
 'http://wiki.lspace.org/mediawiki/Samuel_Vimes',
 'http:/

In [6]:
# Long-format dataframe built by df_create(): one row per (character_name, book) pair.
b.df

Unnamed: 0,character_name,book
0,Carrot Ironfoundersson,Snuff
1,Carrot Ironfoundersson,Thud!
2,Carrot Ironfoundersson,Men at Arms
3,Carrot Ironfoundersson,Jingo
4,Carrot Ironfoundersson,Night Watch
...,...,...
124,Willikins,Jingo
125,Willikins,Men at Arms
126,Willikins,Thud!
127,Willikins,Feet of Clay


### Answer to 3a.

In [7]:
# Uses the default book "Men at Arms"; print_char='y' additionally returns the rows of the matching characters.
b.num_char(print_char='y')

The number of characters in the book Men at Arms are 12
Here is a list of the characters:


Unnamed: 0,character_name,book
2,Carrot Ironfoundersson,Men at Arms
23,Delphine Angua von Überwald,Men at Arms
30,Detritus,Men at Arms
38,Evadne Cake,Men at Arms
42,Foul Ole Ron,Men at Arms
45,Fred Colon,Men at Arms
56,Gaspode,Men at Arms
61,Havelock Vetinari,Men at Arms
74,Leonard of Quirm (da Quirm),Men at Arms
107,"Sir Samuel Vimes, Duke of Ankh",Men at Arms


### Answer to 3b.

In [8]:
# Uses the default pair of characters: 'Visit-The-Infidel-With-Explanatory-Pamphlets' and 'Evadne Cake'.
b.same_books()

There are no common appearances of the characters Visit-The-Infidel-With-Explanatory-Pamphlets and Evadne Cake in any book
