In [18]:
import pandas as pd
import numpy as np 
from bs4 import BeautifulSoup
import requests

In [19]:
def getCatPage(url, timeout=30):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait on the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the scrape
        indefinitely. Defaults to 30.

    Returns
    -------
    BeautifulSoup or None
        The parsed document when the response status is 200. For any other
        status the status code and response body are printed and ``None``
        is returned, so callers must check for ``None``.

    Raises
    ------
    Whatever ``requests.get`` raises (connection errors, timeouts) is not
    caught here and propagates to the caller.
    """
    # Browser-like User-Agent sent with every request.
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}

    page = requests.get(url, headers=headers, timeout=timeout)

    if page.status_code == 200:
        return BeautifulSoup(page.content, 'html.parser')

    print("Error: ", page.status_code)
    print(page.content)
    return None

In [20]:
def getCatSummary(url):
    """Scrape a cat listing page into a list of summary records.

    Parameters
    ----------
    url : str
        Listing page to fetch with ``getCatPage``.

    Returns
    -------
    list of dict
        One dict per cat link with the keys ``id``, ``url``, ``name``,
        ``age`` and ``gender`` (all strings). A field missing from the card
        is returned as an empty string. The list is empty when the page has
        no listing container.

    Raises
    ------
    RuntimeError
        If ``getCatPage`` returned ``None``, i.e. the page could not be
        downloaded. Raising keeps a failed download from being mistaken
        for "no cats match".
    """
    soup = getCatPage(url)
    if soup is None:
        raise RuntimeError(f"Could not download cat listing page: {url}")

    cat_soup = soup.find('div', class_='flex flex-wrap -m-4')
    if cat_soup is None:
        # Page loaded but contains no listing grid: treat it as an empty listing.
        print(f"No cat listing found on page: {url}")
        return []

    results = []
    for cat in cat_soup.find_all('a', href=True):
        href = cat.get('href')

        # Skip the home link and any link that carries no "key=value" part,
        # since the id is read from the text after the first '='.
        if href == '/' or '=' not in href:
            continue

        cat_url = 'https://catcafebk.com' + href
        cat_id = cat_url.split('=')[1]

        # Text fragments of the card: the first is used as the name, the
        # second as an "age|gender" string.
        cat_summary = list(cat.stripped_strings)
        cat_name = cat_summary[0] if cat_summary else ''
        details = cat_summary[1].split('|') if len(cat_summary) > 1 else ['']
        cat_age = details[0].strip()
        cat_gender = details[1].strip() if len(details) > 1 else ''

        results.append(
            {'id': cat_id,
             'url': cat_url,
             'name': cat_name,
             'age': cat_age,
             'gender': cat_gender}
        )
    return results

In [21]:
def addCatFilters(results):
    """Add one boolean flag per site filter to every cat record.

    For each filter the corresponding filtered listing is scraped with
    ``getCatSummary``; a cat gets ``True`` under the filter's name when its
    ``id`` appears in that listing and ``False`` otherwise.

    Parameters
    ----------
    results : list of dict
        Cat records that each contain an ``id`` key. The dicts are updated
        in place.

    Returns
    -------
    list of dict
        The same ``results`` list, with the filter keys added.
    """
    filters = [
        {'filter': 'cafe_cat','url': 'https://catcafebk.com/our-cats/?cat-filters%5Bcafe%5D=1'},
        {'filter': 'kid_approved','url': 'https://catcafebk.com/our-cats/?cat-filters%5Bkids%5D=1'},
        {'filter': 'dog_approved','url': 'https://catcafebk.com/our-cats/?cat-filters%5Bdogs%5D=1'},
        {'filter': 'companion_cat','url': 'https://catcafebk.com/our-cats/?cat-filters%5Bcompanion%5D=1'},
        {'filter': 'bonded_pair','url': 'https://catcafebk.com/our-cats/?cat-filters%5Bbonded%5D=1'},
        {'filter': 'single_cat','url': 'https://catcafebk.com/our-cats/?cat-filters%5Bsinglecat%5D=1'}
    ]

    for filtered in filters:
        print(f"Getting {filtered['filter']} cats... ")

        # Build the id lookup once per filter so each membership test below
        # is a set lookup instead of rebuilding a list for every cat.
        matching_ids = {filtered_cat['id'] for filtered_cat in getCatSummary(filtered['url'])}

        for cat in results:
            cat[filtered['filter']] = cat['id'] in matching_ids

    return results

In [22]:
def addCatPageInfo(cats):
    """Add ``type`` and ``description`` to each cat from its detail page.

    Each cat's ``url`` is fetched with ``getCatPage``. Within the detail
    container, the second text fragment (up to the first ``|``) is stored as
    ``type`` and the fragments from the third up to, but excluding, the last
    are joined with spaces and stored as ``description``.

    A page that cannot be downloaded or has no detail container does not
    abort the run: that cat gets ``None`` for both fields and a warning is
    printed, so the remaining cats are still processed.

    Parameters
    ----------
    cats : list of dict
        Cat records that each contain a ``url`` key. Updated in place.

    Returns
    -------
    list of dict
        The same ``cats`` list, with ``type`` and ``description`` set on
        every record.
    """
    for cat in cats:
        cat_type = None
        cat_description = None

        # getCatPage returns None on a non-200 response, and find() returns
        # None when the container is absent; handle both the same way.
        page = getCatPage(cat['url'])
        page_content = None
        if page is not None:
            page_content = page.find('div', class_='px-6 py-12 md:px-12')

        if page_content is None:
            print(f"Warning: no detail content found for {cat['url']}")
        else:
            page_strings = list(page_content.stripped_strings)
            if len(page_strings) > 1:
                cat_type = page_strings[1].split('|')[0].strip()
            # First two and last fragments are excluded from the description.
            cat_description = ' '.join(page_strings[2:-1])

        cat['type'] = cat_type
        cat['description'] = cat_description

    return cats

In [23]:
def mainRunner(url):
    """Run the three scraping stages in order and return the cat records.

    The stages are: build the summary list from ``url``, add the filter
    flags, then add the per-cat detail-page fields. A banner is printed
    before each stage and ``...done`` after it.
    """
    # Each entry: (banner to print, callable taking the previous stage's result).
    pipeline = (
        ('Getting summary...',
         lambda _previous: getCatSummary(url)),
        ('Getting cat filter data...',
         lambda found: addCatFilters(found)),
        ('Getting individual cat info (descriptions, type)...',
         lambda found: addCatPageInfo(found)),
    )

    cats = None
    for banner, stage in pipeline:
        print(banner)
        cats = stage(cats)
        print('...done')

    return cats

In [24]:
# Base listing URL passed to mainRunner; the filter URLs in addCatFilters use
# this same path with a cat-filters query parameter appended.
url = 'https://catcafebk.com/our-cats/?'

In [25]:
# Run the full scrape: summary listing, filter flags, then per-cat detail pages.
# This issues one HTTP request per filter and one per cat.
cats = mainRunner(url)

Getting summary...
...done
Getting cat filter data...
Getting cafe_cat cats... 
Getting kid_approved cats... 
Getting dog_approved cats... 
Getting companion_cat cats... 
Getting bonded_pair cats... 
Getting single_cat cats... 
...done
Getting individual cat info (descriptions, type)...
...done


In [26]:
# Build the table from the list of cat dicts: one row per cat, one column per key.
cat_df = pd.DataFrame(data=cats)

In [27]:
# Display the assembled table (the rendered output below is truncated in the middle).
cat_df

Unnamed: 0,id,url,name,age,gender,cafe_cat,kid_approved,dog_approved,companion_cat,bonded_pair,single_cat,type,description
0,18576198,https://catcafebk.com/our-cats/?cat=18576198,Aeon,10 Years 4 Months,Female,False,False,False,True,False,True,Domestic Short Hair,"Aeon is a gorgeous, chatty tortie girl with a ..."
1,18049093,https://catcafebk.com/our-cats/?cat=18049093,Apollo & Artemis,1 Year,,False,True,True,False,True,False,Domestic Short Hair / Mixed,Apollo and Artemis are sweet and sassy teens r...
2,18844592,https://catcafebk.com/our-cats/?cat=18844592,Baby Shark,7 Months,Female,False,False,False,False,False,False,Domestic Short Hair,Baby Shark is a sweet tuxedo who likes taking ...
3,17834315,https://catcafebk.com/our-cats/?cat=17834315,Bobsled,2 Years 9 Months,Male,False,False,False,False,False,False,Domestic Short Hair,Bobsled is turning into quite a spunky lovey g...
4,18458507,https://catcafebk.com/our-cats/?cat=18458507,Bonnie & Buddy,4 Months 30 Days,Female,False,False,False,False,True,False,Domestic Short Hair (short coat),Buddy and Bonnie are brother and sister kitten...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,18568223,https://catcafebk.com/our-cats/?cat=18568223,Where & Wal,7 Months,,False,True,False,False,True,False,Domestic Short Hair,"Where and Wal aren’t siblings, but they were c..."
104,18634866,https://catcafebk.com/our-cats/?cat=18634866,Will & Way,6 Months,,False,True,True,False,True,False,Domestic Short Hair,Will and Way are paws down the most affectiona...
105,18895748,https://catcafebk.com/our-cats/?cat=18895748,Winston & Venkman,3 Months 30 Days,,False,True,True,False,True,False,Domestic Short Hair,Winston is the charmer of the duo. She’s more ...
106,18587232,https://catcafebk.com/our-cats/?cat=18587232,Wonky & Wheezy,8 Months,,False,True,True,False,True,False,Domestic Short Hair,Wonky may be the cuddliest kitten in the world...
