# Scraping Wikipedia for dog breed synonyms

In [1]:
import pandas as pd
import re
import pprint
import collections
from bs4 import BeautifulSoup
import requests

## Read list of Wikipedia dog breeds

In [2]:
# Table of dog breeds found at https://en.wikipedia.org/wiki/List_of_dog_breeds
# A timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page being parsed as if it were the table.
phjBreedListResponse = requests.get("https://en.wikipedia.org/wiki/List_of_dog_breeds", timeout = 30)
phjBreedListResponse.raise_for_status()

# The breed list is the first table on the page; its first row holds the headings.
phjDogBreedsDF = pd.read_html(phjBreedListResponse.content, header = 0)[0]

# The original HTML table contained a column of links to images. This is not
# needed and the column is deleted. errors = 'ignore' keeps the cell working
# if the page no longer has that column.
phjDogBreedsDF = phjDogBreedsDF.drop('Image', axis = 1, errors = 'ignore')

# The final row is a repeat of the headings row. Delete it by dropping any
# row whose 'Breed' value is the literal heading text.
phjDogBreedsDF = phjDogBreedsDF.loc[phjDogBreedsDF['Breed'] != 'Breed',:]

print(phjDogBreedsDF.head())

              Breed                    Origin  \
0     Affenpinscher           Germany, France   
1      Afghan Hound               Afghanistan   
2   Afghan Shepherd               Afghanistan   
3              Aidi                   Morocco   
4  Airedale Terrier  United Kingdom (England)   

  Fédération Cynologique Internationale[3] American Kennel Club[4]  \
0                 Group 02 Section 01 #186               Toy Group   
1                 Group 10 Section 01 #228             Hound Group   
2                                      NaN                     NaN   
3                 Group 02 Section 02 #247                     NaN   
4                 Group 03 Section 01 #007           Terrier Group   

  Australian National Kennel Council[5] Canadian Kennel Club[6]  \
0                       Group 01 (Toys)       Group 05 - (Toys)   
1                     Group 04 (Hounds)     Group 02 - (Hounds)   
2                                   NaN                     NaN   
3               

In [3]:
# The headings of the original HTML table contained citation links which
# appeared in the column headings as a number in square brackets (e.g. [3]
# or [10]). Remove those reference numbers.
# The pattern is a raw string (so the backslashes reach the regex engine
# untouched) and uses \d+ so that multi-digit citation numbers are removed too.
colHeadings = phjDogBreedsDF.columns.values
renameDict = {heading: re.sub(r'\[\d+\]', '', heading) for heading in colHeadings}
phjDogBreedsDF = phjDogBreedsDF.rename(columns = renameDict)

print(phjDogBreedsDF.columns.values)

['Breed' 'Origin' 'Fédération Cynologique Internationale'
 'American Kennel Club' 'Australian National Kennel Council'
 'Canadian Kennel Club' 'The Kennel Club' 'New Zealand Kennel Club'
 'United Kennel Club']


In [4]:
# Pull the breed names out of the dataframe as a plain Python list and show it.
phjDogBreedsList = phjDogBreedsDF.loc[:, 'Breed'].tolist()
print(phjDogBreedsList)

['Affenpinscher', 'Afghan Hound', 'Afghan Shepherd', 'Aidi', 'Airedale Terrier', 'Akbash', 'Akita', 'Alano Español', 'Alaskan husky', 'Alaskan Klee Kai', 'Alaskan Malamute', 'Alaunt', 'Alopekis', 'Alpine Dachsbracke', 'Alpine Mastiff', 'Alpine Spaniel', 'American Akita', 'American Bulldog', 'American Cocker Spaniel', 'American English Coonhound', 'American Eskimo Dog', 'American Foxhound', 'American Hairless Terrier', 'American Pit Bull Terrier', 'American Staffordshire Terrier', 'American Water Spaniel', 'Anatolian Shepherd Dog', 'Andalusian Hound', 'Anglo-Français de Petite Vénerie', 'Appenzeller Sennenhund', 'Ariege Pointer', 'Ariegeois', 'Armant', 'Armenian Gampr dog', 'Artois Hound', 'Australian Cattle Dog', 'Australian Kelpie', 'Australian Shepherd', 'Australian Silky Terrier', 'Australian Stumpy Tail Cattle Dog[10]', 'Australian Terrier', 'Austrian Black and Tan Hound', 'Austrian Pinscher', 'Azawakh', 'Bakharwal Dog', 'Barbet', 'Basenji', 'Basque Ratter', 'Basque Shepherd Dog', 

## Get the URL for each breed page

In [5]:
# Fetch the breed list page. The timeout avoids hanging on a stalled
# connection and raise_for_status() avoids scraping an HTTP error page.
phjBreedListResponse = requests.get("https://en.wikipedia.org/wiki/List_of_dog_breeds", timeout = 30)
phjBreedListResponse.raise_for_status()

soup = BeautifulSoup(phjBreedListResponse.text,'lxml')
tables = soup.find_all('table')
table = tables[0]   # the breed list is the first table on the page
rows = table.find_all('tr')

breedURLList = []

for row in rows:
    cells = row.find_all('td')

    # Rows without <td> cells (e.g. heading rows) carry no breed.
    if len(cells) == 0:
        continue

    # The first cell holds the breed name, normally wrapped in a link to the
    # breed's own page. Skip rows with no usable link instead of crashing.
    anchor = cells[0].find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        continue

    # Collapse runs of whitespace and trim the ends, mirroring the way
    # pd.read_html cleans cell text, so that the name matches the 'Breed'
    # values in the table read with read_html when the two are merged.
    breedName = ' '.join(cells[0].text.split())

    link = ''.join(['https://en.wikipedia.org', href])
    breedURLList.append([breedName, link])

phjBreedURLsDF = pd.DataFrame(breedURLList, columns = ['Breed','URL'])

pprint.pprint(phjBreedURLsDF.head())

              Breed                                             URL
0     Affenpinscher     https://en.wikipedia.org/wiki/Affenpinscher
1      Afghan Hound      https://en.wikipedia.org/wiki/Afghan_Hound
2   Afghan Shepherd       https://en.wikipedia.org/wiki/Kuchi_(dog)
3              Aidi              https://en.wikipedia.org/wiki/Aidi
4  Airedale Terrier  https://en.wikipedia.org/wiki/Airedale_Terrier


## Add URLs to main dataframe

In [6]:
# Attach each breed's URL to the main table (left join, so every breed row
# is kept even when no URL was scraped for it), then use the breed name as
# the index.
phjDogBreedsDF = pd.merge(phjDogBreedsDF,
                          phjBreedURLsDF,
                          how = 'left',
                          on = 'Breed').set_index('Breed')

print(phjDogBreedsDF.head())

                                    Origin  \
Breed                                        
Affenpinscher              Germany, France   
Afghan Hound                   Afghanistan   
Afghan Shepherd                Afghanistan   
Aidi                               Morocco   
Airedale Terrier  United Kingdom (England)   

                 Fédération Cynologique Internationale American Kennel Club  \
Breed                                                                         
Affenpinscher                 Group 02 Section 01 #186            Toy Group   
Afghan Hound                  Group 10 Section 01 #228          Hound Group   
Afghan Shepherd                                    NaN                  NaN   
Aidi                          Group 02 Section 02 #247                  NaN   
Airedale Terrier              Group 03 Section 01 #007        Terrier Group   

                 Australian National Kennel Council   Canadian Kennel Club  \
Breed                                        

In [7]:
# To simplify the creation of a dict of dicts, use defaultdict from collections
phjBreedSynonymsDict = collections.defaultdict(dict)

# Separators between individual names inside an infobox cell: a comma, a
# line break or the word 'or'. Written as a raw string so the regex escapes
# are not interpreted (and warned about) as Python string escapes.
phjSplitStr = re.compile(r""",\s*|<br/>\n|\n|\sor\s""",re.I)

# Citation markers such as [3] or [12]; \d+ also covers multi-digit numbers.
phjCitationRegex = re.compile(r'\[\d+\]')


def phjExtractInfoboxNames(phjInfobox, phjHeading):
    """Return the names listed under one heading of a breed infobox.

    Parameters
    ----------
    phjInfobox : BeautifulSoup tag for the infobox table, or None if the page
        has no infobox.
    phjHeading : exact text of the <th> cell to look for (e.g. 'Other names').

    Returns
    -------
    A list of names with citation markers removed and surrounding whitespace
    stripped, or None if the infobox, the heading or its data cell is missing.
    """
    if phjInfobox is None:
        return None

    th = phjInfobox.find('th', text = phjHeading)
    if th is None:
        return None

    # The names are contained in the single <td> that follows the heading.
    td = th.find_next('td')
    if td is None:
        return None

    phjCleanText = phjCitationRegex.sub('', td.text)

    # Split into individual names and discard empty or whitespace-only items.
    return [name.strip() for name in phjSplitStr.split(phjCleanText) if name.strip()]


# One session for all breed pages so the connection to Wikipedia is reused.
with requests.Session() as phjSession:
    for br in phjDogBreedsDF.index.values:
        url = phjDogBreedsDF.at[br,'URL']

        table = None

        # Breeds without a scraped link have a missing (non-string) URL after
        # the left merge; record them as having no synonyms instead of
        # crashing inside requests.
        if isinstance(url, str):
            # Passing the source code to BeautifulSoup to create a BeautifulSoup object for it.
            soup = BeautifulSoup(phjSession.get(url, timeout = 30).text, 'lxml')

            # The div on a wikipedia dog breed page has class = mw-parser-output
            # (this seems to be the only div on the page with that class).
            div = soup.find('div', {'class':'mw-parser-output'})

            # The table containing the info about breed synonyms and nicknames
            # has class = 'infobox biota'. Some pages lack the div or the table.
            if div is not None:
                table = div.find('table', {'class':'infobox biota'})

        phjBreedSynonymsDict[br]['Other Names'] = phjExtractInfoboxNames(table, 'Other names')
        phjBreedSynonymsDict[br]['Nicknames'] = phjExtractInfoboxNames(table, 'Common nicknames')

# NOT CURRENTLY NEEDED (kept for reference as comments rather than as a
# no-op string literal).
# Retrieve the Kennel Club (UK) group for the breed. This value has already
# been retrieved from the original list of breeds but it will be interesting
# to compare results.
# NOTE(review): if re-enabled this belongs inside the loop above, and 'gp' is
# unbound when no 'KC ... UK' heading is found.
#
#     tables = div.findAll('table', {'class':'infobox collapsible'})
#
#     if len(tables) > 0:
#         for table in tables:
#             ths = table.findAll('th')
#             if any(re.compile("classification.*standards",re.I).search(th.text) for th in ths):
#                 kc = table.find('th',text=re.compile("KC.*UK"))
#
#                 if kc is not None:
#                     gp = kc.findNext('td')
#
#                 phjBreedSynonymsDict[br]['KC UK group'] = gp.text
#
#     else:
#         phjBreedSynonymsDict[br]['KC UK group'] = None

pprint.pprint(phjBreedSynonymsDict)

defaultdict(<class 'dict'>,
            {'Affenpinscher': {'Nicknames': ['Affen', 'Affie', 'Monkey Dog'],
                               'Other Names': None},
             'Afghan Hound': {'Nicknames': None,
                              'Other Names': ['Tazi',
                                              'Tazhi Spay',
                                              'Da Kochyano Spay',
                                              'Sage Balochi',
                                              'Ogar Afgan',
                                              'Barakzai Hound',
                                              'Eastern Greyhound/Persian '
                                              'Greyhound']},
             'Afghan Shepherd': {'Nicknames': ['Kuchi Dog', 'Afghan Shepherd'],
                                 'Other Names': ['Sage Kuchi',
                                                 'Sage Jangi',
                                                 'De Kochyano Spai',
               

In [8]:
# Print the same dict of synonyms again using the plain (single-line) repr
# rather than the pretty-printed layout.
print(phjBreedSynonymsDict)

defaultdict(<class 'dict'>, {'Affenpinscher': {'Other Names': None, 'Nicknames': ['Affen', 'Affie', 'Monkey Dog']}, 'Afghan Hound': {'Other Names': ['Tazi', 'Tazhi Spay', 'Da Kochyano Spay', 'Sage Balochi', 'Ogar Afgan', 'Barakzai Hound', 'Eastern Greyhound/Persian Greyhound'], 'Nicknames': None}, 'Afghan Shepherd': {'Other Names': ['Sage Kuchi', 'Sage Jangi', 'De Kochyano Spai', 'Jangi Spai', 'Afghan Shepherd'], 'Nicknames': ['Kuchi Dog', 'Afghan Shepherd']}, 'Aidi': {'Other Names': ['Aïdi', 'Atlas Mountain Dog', 'Atlas Shepherd Dog', 'Berber Dog', "Chien de l'Atlas", "Chien de Montagne de l'Atlas"], 'Nicknames': ['Kabyle Dog']}, 'Airedale Terrier': {'Other Names': ['Waterside Terrier', 'Bingley Terrier'], 'Nicknames': ['Airedale', 'King of Terriers']}, 'Akbash': {'Other Names': ['Akbaş Çoban Köpeği'], 'Nicknames': None}, 'Akita': {'Other Names': ['Akita Inu', 'Japanese Akita', 'Great Japanese Dog'], 'Nicknames': None}, 'Alano Español': {'Other Names': ['Spanish Alano', 'Spanish Bulld

## Convert dict of synonyms to Pandas dataframe

In [9]:
# Build a long-form table with one row per (breed, label) pair that actually
# has a list of names. The rows are assembled explicitly instead of using the
# DataFrame(...).stack() recipe from
# https://stackoverflow.com/questions/45271336/pandas-create-a-df-from-dict-of-dict-of-lists?noredirect=1&lq=1
# because that recipe depends on stack() silently discarding the None
# entries, which the newer stack implementation in pandas no longer does.
phjBreedSynonymRecords = [
    {'Breed': breed, 'Label': label, 'Data': names}
    for breed, labelDict in phjBreedSynonymsDict.items()
    for label, names in labelDict.items()
    if names is not None
]

# Naming the columns explicitly keeps them present even if there are no records.
phjBreedSynonymDF = pd.DataFrame(phjBreedSynonymRecords, columns = ['Breed','Label','Data'])

# Sort on Label as well as Breed so the order of the two rows belonging to
# one breed is deterministic ('Nicknames' before 'Other Names').
phjBreedSynonymDF = phjBreedSynonymDF.sort_values(['Breed','Label']).reset_index(drop = True)
print(phjBreedSynonymDF)

                              Breed        Label  \
0                     Affenpinscher    Nicknames   
1                      Afghan Hound  Other Names   
2                   Afghan Shepherd    Nicknames   
3                   Afghan Shepherd  Other Names   
4                              Aidi    Nicknames   
5                              Aidi  Other Names   
6                  Airedale Terrier    Nicknames   
7                  Airedale Terrier  Other Names   
8                            Akbash  Other Names   
9                             Akita  Other Names   
10                    Alano Español    Nicknames   
11                    Alano Español  Other Names   
12                 Alaskan Klee Kai  Other Names   
13                 Alaskan Malamute    Nicknames   
14                         Alopekis    Nicknames   
15                         Alopekis  Other Names   
16               Alpine Dachsbracke  Other Names   
17                   Alpine Spaniel  Other Names   
18          

In [10]:
# Expand the list held in each 'Data' cell into one row per synonym.
# 'Level' is the position of the synonym within its original list.
# The rows are built directly instead of using the apply(pd.Series).stack()
# recipe from
# https://stackoverflow.com/questions/27263805/pandas-when-cell-contents-are-lists-create-a-row-for-each-element-in-the-list
# because that recipe creates one Series per row (slow) and depends on
# stack() dropping the NaN padding of shorter lists, which the newer stack
# implementation in pandas no longer does.
phjSynonymRows = [
    (breed, label, level, synonym)
    for breed, label, names in phjBreedSynonymDF[['Breed','Label','Data']].itertuples(index = False, name = None)
    if isinstance(names, (list, tuple))   # ignore rows with no list of names
    for level, synonym in enumerate(names)
]

phjBreedSynonymStackedDF = pd.DataFrame(phjSynonymRows,
                                        columns = ['Breed','Label','Level','Synonym'])

with pd.option_context('display.max_rows', 2000, 'display.max_columns', 2, 'display.max_colwidth', 30):
    print(phjBreedSynonymStackedDF[['Breed','Synonym']])

                              Breed                        Synonym
0                     Affenpinscher                          Affen
1                     Affenpinscher                          Affie
2                     Affenpinscher                     Monkey Dog
3                      Afghan Hound                           Tazi
4                      Afghan Hound                     Tazhi Spay
5                      Afghan Hound               Da Kochyano Spay
6                      Afghan Hound                   Sage Balochi
7                      Afghan Hound                     Ogar Afgan
8                      Afghan Hound                 Barakzai Hound
9                      Afghan Hound  Eastern Greyhound/Persian ...
10                  Afghan Shepherd                      Kuchi Dog
11                  Afghan Shepherd                Afghan Shepherd
12                  Afghan Shepherd                     Sage Kuchi
13                  Afghan Shepherd                     Sage J