In [19]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

### Get a List of Breeds

In [20]:
# Collect a URL slug for every breed card on the paginated AKC breed index.
# A card title becomes a slug by replacing spaces with hyphens, dropping the
# typographic apostrophe and lower-casing. The resulting list may contain
# repeated slugs.
breeds = []
for i in range(27):  # index pages 0..26 are requested
    page_url = 'https://www.akc.org/dog-breeds/page/{}/'.format(i)

    # A timeout keeps a stalled connection from hanging the cell forever.
    page = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Card titles are located by their full class string; if the site markup
    # changes, this simply yields no results for the page.
    results = soup.find_all('h3', {'class': 'breed-type-card__title mt0 mb0 f-25 py3 px3'})
    breeds.extend(
        title.text.replace(' ', '-').replace("’", '').lower()
        for title in results
    )

Remove duplicate breeds, then replace the few slugs that do not match their AKC page URLs (accented characters, and "stumpy" vs. "stump") with corrected versions.

In [21]:
# Drop repeated slugs while keeping first-seen order.
_breeds = list(dict.fromkeys(breeds))

# Slugs as derived from the card titles that get swapped for hand-written
# replacements (accents removed, "stumpy" -> "stump").
scraped_slugs = (
    'australian-stumpy-tail-cattle-dog',
    'grand-basset-griffon-vendéen',
    'löwchen',
    'petit-basset-griffon-vendéen',
)
replacement_slugs = (
    'australian-stump-tail-cattle-dog',
    'grand-basset-griffon-vendeen',
    'lowchen',
    'petit-basset-griffon-vendeen',
)

# Take out every scraped form first (list.remove raises ValueError when a
# slug is absent), then add all the replacements.
for slug in scraped_slugs:
    _breeds.remove(slug)
_breeds.extend(replacement_slugs)

# Final list is in alphabetical order.
_breeds = sorted(_breeds)

In [22]:
# Number of breed slugs left after de-duplication and the manual slug fixes.
len(_breeds)

282

We have 282 breeds to look at!

### Check Breed Attributes

In [23]:
# Scrape the detail page of every breed in `_breeds` and collect one record
# (a dict) per breed in `df`. Fields that a page does not provide keep the
# sentinel value -1.

# Label text in the page's attribute list -> key in the breed record.
ATTRIBUTE_KEYS = {
    'AKC Breed Popularity:': 'popularity',
    'Height:': 'height',
    'Weight:': 'weight',
    'Life Expectancy:': 'life_exp',
    'Group:': 'group',
}

# Bar-graph title on the page -> key in the breed record.
GRAPH_KEYS = {
    'Grooming Frequency': 'grooming_frequency',
    'Shedding': 'shedding',
    'Energy Level': 'energy_level',
    'Trainability': 'trainability',
    'Temperament/Demeanor': 'demeanor',
}


def _breed_table_column(soup, table_index):
    """Return the first-cell text of every non-header row of a breed table.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed breed page.
    table_index : int
        Position of the wanted table among the 'breed-table__wrap' divs
        (this notebook reads colors from 0 and masks/markings from 1).

    Returns
    -------
    list of str, or -1
        The cell texts, or the sentinel -1 when the table is missing or a
        row has no <td> cell.
    """
    try:
        table = soup.find_all('div', {'class': 'breed-table__wrap'})[table_index]
        # The first <tr> is skipped as the header row.
        return [row.find('td').text for row in table.find_all('tr')[1:]]
    except (IndexError, AttributeError):
        return -1


df = []
for dog in _breeds:
    print(dog)  # progress indicator: one line per breed page requested
    url = 'https://www.akc.org/dog-breeds/{}/'.format(dog)
    # A timeout keeps a stalled connection from hanging the cell forever.
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Key order here determines the column order of the final DataFrame.
    # attribute_labels, image_url and puppy_link are read unguarded: a page
    # without them raises and stops the loop.
    data = {
            'breed':dog,
            'attribute_labels':soup.find('span', {'class':'attribute-list__description attribute-list__text attribute-list__text--lg mb4 bpm-mb5 pb0 d-block'}).text,
            'popularity':-1,
            'height':-1,
            'weight':-1,
            'life_exp':-1,
            'group':-1,
            'image_url':soup.find('img', {'class':'media-wrap__image lozad'})['data-src'],
            'puppy_link': soup.find('div', {'class':'content-block__button'}).a['href'],
            'grooming_frequency':-1,
            'shedding':-1,
            'energy_level':-1,
            'trainability':-1,
            'demeanor':-1
    }

    # Unreliable attributes (popularity, height, weight, life expectancy,
    # group): only rows whose label is recognised overwrite the -1 default.
    for phrase in soup.find('ul', {'class':'attribute-list'}).find_all('li', {'class':'attribute-list__row'}):
        term = phrase.find('span', {'class':'attribute-list__term attribute-list__text'}).text
        if term in ATTRIBUTE_KEYS:
            data[ATTRIBUTE_KEYS[term]] = phrase.find('span', {'class':'attribute-list__description attribute-list__text'}).text

    # Unreliable color / mask tables. A missing mask table now yields -1,
    # the same sentinel used for colors (previously the mask fallback
    # assigned to `colors` by mistake, leaving `mask` as an empty list).
    data['colors'] = _breed_table_column(soup, 0)
    data['mask'] = _breed_table_column(soup, 1)

    # Unreliable rescue link: -1 when the anchor or its href is absent.
    rescue_anchor = soup.find('a', {'class':'link-list__link f-lato-b'})
    data['rescue_link'] = rescue_anchor.get('href', -1) if rescue_anchor is not None else -1

    # Bar-graph traits: only recognised titles overwrite the -1 default.
    for phrase in soup.find_all('div', {'class':'graph-section__inner'}):
        title = phrase.find('h4', {'class':'bar-graph__title'}).text
        if title in GRAPH_KEYS:
            data[GRAPH_KEYS[title]] = phrase.find('div', {'class':'bar-graph__text'}).text

    df.append(data)

affenpinscher
afghan-hound
airedale-terrier
akita
alaskan-klee-kai
alaskan-malamute
american-bulldog
american-english-coonhound
american-eskimo-dog
american-foxhound
american-hairless-terrier
american-leopard-hound
american-staffordshire-terrier
american-water-spaniel
anatolian-shepherd-dog
appenzeller-sennenhund
australian-cattle-dog
australian-kelpie
australian-shepherd
australian-stump-tail-cattle-dog
australian-terrier
azawakh
barbado-da-terceira
barbet
basenji
basset-fauve-de-bretagne
basset-hound
bavarian-mountain-scent-hound
beagle
bearded-collie
beauceron
bedlington-terrier
belgian-laekenois
belgian-malinois
belgian-sheepdog
belgian-tervuren
bergamasco-sheepdog
berger-picard
bernese-mountain-dog
bichon-frise
biewer-terrier
black-and-tan-coonhound
black-russian-terrier
bloodhound
bluetick-coonhound
boerboel
bohemian-shepherd
bolognese
border-collie
border-terrier
borzoi
boston-terrier
bouvier-des-flandres
boxer
boykin-spaniel
bracco-italiano
braque-du-bourbonnais
braque-francais

In [24]:
# Turn the list of per-breed records into a table: one row per record,
# one column per record key.
res = pd.DataFrame(df)

In [25]:
# Show the assembled table as the cell output.
res

Unnamed: 0,breed,attribute_labels,popularity,height,weight,life_exp,group,image_url,puppy_link,grooming_frequency,shedding,energy_level,trainability,demeanor,colors,mask,rescue_link
0,affenpinscher,"Confident, Famously Funny, Fearless",Ranks 148 of 197,9-11.5 inches,7-10 pounds,12-15 years,Toy Group,https://s3.amazonaws.com/cdn-origin-etr.akc.or...,https://marketplace.akc.org/puppies/affenpinscher,2-3 Times a Week Brushing,Seasonal,Regular Exercise,Easy Training,Outgoing,"[Belge, Black, Black & Silver, Black & Tan, Red]",[Black Mask],https://www.affenpinscher.org/
1,afghan-hound,"Dignified, Profoundly Loyal, Aristocratic",Ranks 113 of 197,25-27 inches,50-60 pounds,12-15 years,Hound Group,https://s3.amazonaws.com/cdn-origin-etr.akc.or...,https://marketplace.akc.org/puppies/afghan-hound,Daily Brushing,Infrequent,Energetic,May be Stubborn,Aloof/Wary,"[Black, Black & Silver, Black & Tan, Blue, Blu...","[Black Mask, Brindle, Brindle Black Mask, Brin...",https://afghanhoundclubofamerica.org
2,airedale-terrier,"Friendly, Clever, Courageous",Ranks 60 of 197,23 inches,50-70 pounds,11-14 years,Terrier Group,https://s3.amazonaws.com/cdn-origin-etr.akc.or...,https://marketplace.akc.org/puppies/airedale-t...,2-3 Times a Week Brushing,Occasional,Regular Exercise,Eager to Please,Friendly,"[Black & Tan, Grizzle & Tan]",[],https://airedale.org
3,akita,"Courageous, Dignified, Profoundly Loyal",Ranks 47 of 197,"26-28 inches (male), 24-26 inches (female)","100-130 pounds (male), 70-100 pounds (female)",10-13 years,Working Group,https://s3.amazonaws.com/cdn-origin-etr.akc.or...,https://marketplace.akc.org/puppies/akita,Daily Brushing,Seasonal,Energetic,Eager to Please,Alert/Responsive,"[Black, Brown Brindle, Brown, Black Overlay, F...","[Black Mask, White Markings, Black & White Mas...",http://akitaclub.org/
4,alaskan-klee-kai,"Loyal, Intelligent, Vigilant",-1,"13 inches and under (Toy), 13-15 inches (Minia...","6-12 lbs (Toy), 10-18 lbs (Miniature), 16-25 l...",13-16 years,Foundation Stock Service,https://s3.amazonaws.com/cdn-origin-etr.akc.or...,https://marketplace.akc.org/puppies/alaskan-kl...,Weekly Brushing,Seasonal,Energetic,Agreeable,Reserved with Strangers,"[Black & White, Gray & White, Red & White]",[],https://www.akkcoa.org/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,wirehaired-vizsla,"Gentle, Loyal, Trainable",Ranks 167 of 197,"23-25 inches (male), 21.5-23 inches (female)","55-65 pounds (male), 45-55 pounds (female)",12-14 years,Sporting Group,https://s3.amazonaws.com/cdn-origin-etr.akc.or...,https://marketplace.akc.org/puppies/wirehaired...,Occasional Bath/Brush,Seasonal,Energetic,Agreeable,Alert/Responsive,"[Golden Rust, Golden, Red, Red Golden, Rust, R...",[White Markings],https://whvca.org
278,working-kelpie,"Alert, Eager, Intelligent",-1,19-25 inches,28-60 pounds,12-15 years,Foundation Stock Service,https://s3.amazonaws.com/cdn-origin-etr.akc.or...,https://marketplace.akc.org/puppies/working-ke...,Occasional Bath/Brush,Seasonal,Energetic,Independent,Alert/Responsive,"[Black, Blue, Fawn, Red]",[Tan Points],http://www.wkc.org.au/About-Kelpies/Charateris...
279,xoloitzcuintli,"Loyal, Alert, Calm",Ranks 140 of 197,"10-14 inches (toy), 14-18 inches (miniature), ...","10-15 pounds (toy), 15-30 pounds (miniature), ...",13-18 years,Non-Sporting Group,https://s3.amazonaws.com/cdn-origin-etr.akc.or...,https://marketplace.akc.org/puppies/xoloitzcui...,Occasional Bath/Brush,Infrequent,Energetic,Agreeable,Alert/Responsive,"[Black, Brindle, Bronze, Dark Brown, Fawn, Gra...","[Black Markings, Spotted, Tan Markings, White ...",http://www.xoloitzcuintliclubofamerica.org/
280,yakutian-laika,"Affectionate, Intelligent, Active",-1,21-23 inches,40-55 pounds,10-12 years,Foundation Stock Service,https://s3.amazonaws.com/cdn-origin-etr.akc.or...,https://marketplace.akc.org/puppies/yakutian-l...,Weekly Brushing,Seasonal,Energetic,May be Stubborn,Reserved with Strangers,"[Black & White, Brown & White, Gray & White, W...","[Black Markings, Buff Markings, Tri Color Mark...",https://yakutianlaikaclubofamerica.org


In [26]:
# Write the table to dog_breeds.csv in the current working directory;
# index=False leaves the row index out of the file.
res.to_csv('./dog_breeds.csv', index = False)