In [2]:
import requests
import bs4
from bs4 import BeautifulSoup
from time import sleep
import tqdm
import pandas as pd
import numpy as np

### Get-Soup function
#### Gets the text from the Website

In [3]:
def getSoup(link, timeout=30):
    '''
    Download a page and parse it into a BeautifulSoup tree.

    Kept as a module-level function on purpose (not a method of 'DownloaderMidterm'),
    because it is reusable by any class that needs to fetch a page.

    Parameters
    ----------
    link : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without a
        timeout, requests.get can block indefinitely on an unresponsive server.

    Returns
    -------
    bs4.BeautifulSoup
        The response body, decoded as UTF-8 and parsed with the 'lxml' parser.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an HTTP error status.
    '''
    sleep(0.1)  # short pause before every request, to be kind to the website
    r = requests.get(link, timeout=timeout)
    r.raise_for_status()  # fail here with a clear error instead of parsing an error page
    r.encoding = 'UTF-8'  # force UTF-8 decoding of the body
    return BeautifulSoup(r.text, 'lxml')

In [4]:
# Download and parse the character-list page; the next cell reads `soup` to collect the character links.
soup=getSoup('http://wiki.lspace.org/mediawiki/List_of_Pratchett_characters')

In [15]:
# All <li> entries of the first <ul> that follows the parent of the <span> with this id.
# find_next is the current name of the deprecated BS3 alias findNext.
links = soup.find('span', id='Ankh-Morpork_and_The_Watch').parent.find_next('ul').find_all('li')
# The host is prepended to the href of the first link in every entry.
characters = ['http://wiki.lspace.org' + li.find('a')['href'] for li in links]

In [38]:
# Fetch the first character page as a sample; the following cells try out the selectors on it.
soup2 = getSoup(characters[0])

In [39]:
# First <b> element in the first row of the first table on the page (here: the short display name).
soup2.find('table').find('tr').find('b')

<b>Mrs Cake</b>

In [22]:
# Text of the node two siblings after the parent of the bold 'Name' label.
# `string=` is the current spelling of the deprecated `text=` argument.
Name = soup2.find('b', string='Name').parent.next_sibling.next_sibling.text.strip()

'Evadne Cake'

In [29]:
# Every link inside the node two siblings after the grandparent of the 'Bibliography' link.
Books = (
    soup2.find('a', title='Bibliography')
    .parent.parent
    .next_sibling.next_sibling
    .find_all('a')
)
# Keep only the stripped link texts.
book_list = [anchor.text.strip() for anchor in Books]

In [73]:
def df_create(link_list=characters):
    '''
    Scrape every character page in `link_list` and return a long table of appearances.

    Parameters
    ----------
    link_list : list of str, optional
        URLs of the character pages to scrape. Defaults to the module-level
        `characters` list (bound when this function is defined).

    Returns
    -------
    pandas.DataFrame
        Columns 'character_name' and 'book', one row per (character, book) pair,
        sorted by 'character_name' and re-indexed from 0.
    '''
    names = []
    book_lists = []

    for url in link_list:  # use the argument; the global list is only the default
        page = getSoup(url)

        # Full name: text of the node two siblings after the parent of the bold 'Name' label.
        full_name = page.find('b', string='Name').parent.next_sibling.next_sibling.text.strip()
        if not full_name:
            # Some pages (e.g. Mr. Slant) leave that position empty; fall back to the bold
            # text in the first row of the first table instead of patching a fixed row
            # index afterwards, which only works for one particular link order.
            full_name = page.find('table').find('tr').find('b').text.strip()
        names.append(full_name)

        # Book titles: link texts in the node two siblings after the grandparent of the 'Bibliography' link.
        anchors = page.find('a', title='Bibliography').parent.parent.next_sibling.next_sibling.find_all('a')
        book_lists.append([anchor.text.strip() for anchor in anchors])

    names_df = pd.DataFrame({'character_name': names})
    books_wide = pd.DataFrame(book_lists)  # one row per character, one column per book position

    wide = pd.concat([names_df, books_wide], axis=1)
    # Unpivot to one (character, book) pair per row and drop the padding of shorter book lists.
    long = (
        wide
        .melt(id_vars='character_name', var_name='key', value_name='book')
        .drop(columns=['key'])
        .dropna()
    )

    return long.sort_values('character_name').reset_index(drop=True)

# Run the scraper with its default link list and display the resulting table.
df_create()

Unnamed: 0,character_name,book
0,Carrot Ironfoundersson,Snuff
1,Carrot Ironfoundersson,Thud!
2,Carrot Ironfoundersson,Men at Arms
3,Carrot Ironfoundersson,Jingo
4,Carrot Ironfoundersson,Night Watch
...,...,...
124,Willikins,Jingo
125,Willikins,Men at Arms
126,Willikins,Thud!
127,Willikins,Feet of Clay


In [44]:
# One-column frame of character names.
# NOTE(review): `alle_Namen` is not assigned at top level by any earlier cell shown here (inside df_create it is
# a local); presumably it comes from the name-collecting loop cell further down — confirm the execution order.
df1= pd.DataFrame({'character_name': alle_Namen}) 
# Mr. Slant's name is not in the usual position on his wiki page, so it is written into row 15 by hand.
# NOTE(review): the hard-coded row index relies on the order of `characters` staying the same.
df1.loc[15,'character_name']='Mr. Slant'
# Alternative: soup2.find('table').find('tr').find('b') reads the names from the header of the right-hand
# table, but those are not the full names.

In [58]:
# Wide frame of books; assuming `alle_Bucher` is a list of per-character book lists (as built in df_create),
# this gives one row per character and one column per book position.
# NOTE(review): in the cells shown, `alle_Bucher` is only assigned inside df_create — confirm where the global comes from.
df2= pd.DataFrame(alle_Bucher)

In [62]:
# Put the names next to the per-position book columns ...
df_wide = pd.concat([df1, df2], axis=1)
# ... then unpivot to one (character, book) pair per row, dropping the helper column and the empty slots.
df_long = (
    df_wide
    .melt(id_vars='character_name', var_name='key', value_name='book')
    .drop(columns=['key'])
    .dropna()
    .reset_index(drop=True)
)

In [79]:
# Number of rows listing 'Men at Arms' (computed but not displayed: only the last expression of a cell is shown).
(df_long['book'] == 'Men at Arms').sum()
# The matching rows themselves.
df_long[df_long['book'] == 'Men at Arms']

Unnamed: 0,character_name,book
13,Foul Ole Ron,Men at Arms
16,Delphine Angua von Überwald,Men at Arms
20,Willikins,Men at Arms
21,Evadne Cake,Men at Arms
22,Fred Colon,Men at Arms
26,Gaspode,Men at Arms
27,Carrot Ironfoundersson,Men at Arms
28,Leonard of Quirm (da Quirm),Men at Arms
30,Sybil Deirdre Olgivanna Vimes (née Ramkin),Men at Arms
36,"Sir Samuel Vimes, Duke of Ankh",Men at Arms


In [86]:
# Rows of each of the two characters ...
inf = df_long[df_long['character_name'] == 'Visit-The-Infidel-With-Explanatory-Pamphlets']
eva = df_long[df_long['character_name'] == 'Evadne Cake']
# ... and the books that appear for both of them.
list(
    set(inf['book']).intersection(eva['book'])
)

[]

### Downloader Class
#### Uses the getSoup function to download the character-list page, collects the links to the character sub-pages, and scrapes each sub-page

In [153]:
class DownloaderMidterm():
    '''
    Scrapes the character list on wiki.lspace.org and tabulates which books each character appears in.

    Attributes
    ----------
    link : str
        URL of the main page that lists the characters.
    soup : bs4.BeautifulSoup
        Parsed main page.
    multi_link : list of str
        URLs of the character sub-pages, as returned by multi_links().
    df : pandas.DataFrame
        Long table with columns 'character_name' and 'book', as returned by df_create().
    '''

    def __init__(self):
        '''Download the main page, collect the sub-page links and scrape them (network access).'''
        self.link = 'http://wiki.lspace.org/mediawiki/List_of_Pratchett_characters'  # main page holding the links to the sub-pages
        self.soup = getSoup(self.link)
        # Both results are computed once here and stored so that the query methods can reuse them.
        self.multi_link = self.multi_links()
        self.df = self.df_create()

    def multi_links(self, category='Ankh-Morpork_and_The_Watch'):
        '''
        Find the links that lead to the character sub-pages of one section of the main page.

        Parameters
        ----------
        category : str, optional
            Value of the `id` attribute of the <span> that marks the section heading.

        Returns
        -------
        list of str
            Absolute URLs, one per <li> of the first <ul> that follows the heading.
        '''
        # Look up the requested section (the id used to be hard-coded, so `category` had no effect).
        entries = self.soup.find('span', id=category).parent.find_next('ul').find_all('li')
        return ['http://wiki.lspace.org' + li.find('a')['href'] for li in entries]

    @staticmethod
    def _character_name(page):
        '''Return the full name from the 'Name' row, or the table-header name if that row is missing or empty.'''
        label = page.find('b', string='Name')
        full_name = ''
        if label is not None:
            # The value sits two siblings after the parent of the bold 'Name' label.
            full_name = label.parent.next_sibling.next_sibling.text.strip()
        if full_name:
            return full_name
        # Fallback (e.g. Mr. Slant): bold text in the first row of the first table; not always the full name.
        return page.find('table').find('tr').find('b').text.strip()

    @staticmethod
    def _book_titles(page):
        '''Return the link texts found two siblings after the grandparent of the 'Bibliography' link.'''
        anchors = page.find('a', title='Bibliography').parent.parent.next_sibling.next_sibling.find_all('a')
        return [anchor.text.strip() for anchor in anchors]

    def df_create(self):
        '''
        Scrape every page in self.multi_link and build the long appearance table.

        Returns
        -------
        pandas.DataFrame
            Columns 'character_name' and 'book', one row per (character, book) pair,
            sorted by 'character_name' and re-indexed from 0.
        '''
        names = []
        book_lists = []

        for url in self.multi_link:
            page = getSoup(url)
            names.append(self._character_name(page))
            book_lists.append(self._book_titles(page))

        names_df = pd.DataFrame({'character_name': names})
        books_wide = pd.DataFrame(book_lists)  # one row per character, one column per book position

        wide = pd.concat([names_df, books_wide], axis=1)
        # Unpivot to one (character, book) pair per row and drop the padding of shorter book lists.
        long = (
            wide
            .melt(id_vars='character_name', var_name='key', value_name='book')
            .drop(columns=['key'])
            .dropna()
        )

        return long.sort_values('character_name').reset_index(drop=True)

    def num_char(self, book="Men at Arms", print_char= "NO"):
        '''
        Print how many rows of self.df list `book`, i.e. how many character entries appear in it.

        Parameters
        ----------
        book : str, optional
            Book title to look up in the 'book' column.
        print_char : str, optional
            One of "Yes", "y", "Y", "yes" to also get the matching rows back.

        Returns
        -------
        pandas.DataFrame or None
            The matching rows if `print_char` is affirmative, otherwise None.
        '''
        matches = self.df.loc[self.df['book'] == book]
        print(f'The number of characters in the book {book} are', len(matches))
        if print_char in ["Yes", "y", "Y", "yes"]:
            print('Here is a list of the characters:')
            return matches
        return None

    def same_books(self, char1='Visit-The-Infidel-With-Explanatory-Pamphlets' ,char2='Evadne Cake'):
        '''
        Return the books in which both characters appear.

        Parameters
        ----------
        char1, char2 : str, optional
            Values of 'character_name' to compare.

        Returns
        -------
        list of str
            Books common to both characters, in no particular order; empty if there are none.
        '''
        table = self.df
        books_first = set(table.loc[table['character_name'] == char1, 'book'])
        books_second = table.loc[table['character_name'] == char2, 'book']
        return list(books_first.intersection(books_second))


In [150]:
# Constructing the object downloads the main page and scrapes every character sub-page (see __init__).
b= DownloaderMidterm()

In [154]:
# Print the count for 'Reaper Man'; because print_char='y', the matching rows are returned and displayed too.
b.num_char(book= 'Reaper Man', print_char='y')

The number of characters in the book Reaper Man are 3


Unnamed: 0,character_name,book
17,Claude Maximillian Overton Transpire Dibbler,Reaper Man
37,Evadne Cake,Reaper Man
84,Reginald Shoe,Reaper Man


In [112]:
# Rebuild and display the long table. This calls getSoup again for every character page.
# NOTE(review): the table built at construction time is already stored in b.df.
b.df_create()

Unnamed: 0,character_name,book
0,Carrot Ironfoundersson,Snuff
1,Carrot Ironfoundersson,Thud!
2,Carrot Ironfoundersson,Men at Arms
3,Carrot Ironfoundersson,Jingo
4,Carrot Ironfoundersson,Night Watch
...,...,...
124,Willikins,Jingo
125,Willikins,Men at Arms
126,Willikins,Thud!
127,Willikins,Feet of Clay


In [106]:
# Books shared by the two default characters of same_books.
b.same_books()

[]

In [124]:
alle_Namen = []

for char in characters:
    soup2 = getSoup(char)
    # Short name: bold text in the first row of the first table on the page (fallback only).
    Heading = soup2.find('table').find('tr').find('b').text.strip()
    # Full name: text of the node two siblings after the parent of the bold 'Name' label.
    # It is empty on pages such as Mr. Slant's, where the name is not in that position.
    # `string=` is the current spelling of the deprecated `text=` argument.
    Name = soup2.find('b', string='Name').parent.next_sibling.next_sibling.text.strip()
    # Prefer the full name; use the table heading when the name cell is empty.
    alle_Namen.append(Name or Heading)
alle_Namen

['Evadne Cake',
 'Fred Colon',
 'Marietta Cosmopilite',
 'Detritus',
 'Claude Maximillian Overton Transpire Dibbler',
 'Dorfl',
 'Rufus Drumknott',
 'Gaspode',
 'Carrot Ironfoundersson',
 'Leonard of Quirm (da Quirm)',
 'Cheery Littlebottom',
 'Cecil Wormsborough St John Nobbs',
 'Sybil Deirdre Olgivanna Vimes (née Ramkin)',
 'Foul Ole Ron',
 'Reginald Shoe',
 'Mr. Slant',
 'Delphine Angua von Überwald',
 'Havelock Vetinari',
 'Sir Samuel Vimes, Duke of Ankh',
 'Visit-The-Infidel-With-Explanatory-Pamphlets',
 'Willikins']