In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np

In [2]:
url='https://chipublib.bibliocommons.com/events/rss/all'
program_list={}
my_dict = {"program_name":[],
           "program_description":[],
           "program_url":[],
           "virtual":[],
           "registration_url":[],
           "start_date":[],
           "end_date":[],
           "program":[],
           "category":[],
          "audience":[],
          "language":[],
          "location_name":[],
          "address":[],
          "city":[],
          "state":[],
          "zipcode":[],
          "contact_name":[],
          "contact_phone":[],
          "contact_email":[]}

In [3]:
def build_df():
    r=requests.get(url)
    soup=bs(r.text, features='xml')
    print('CPL scrape successful: ',r.status_code)
    programs=soup.find_all('item')
    print(len(programs))
    
    for p in programs:
            title=p.find('title').text
            description=p.find('description').text.strip()
            link=p.find('link').text
            virtual=p.find('bc:is_virtual').text
            start_date=p.find('bc:start_date_local').text
            end_date=p.find('bc:end_date_local').text
            program=getattr(p.find('category',domain='Program'), 'text', 'null')
            category=getattr(p.find('category',domain='Type'), 'text', 'null')
            audience=p.find_all('category',domain='Audience')
            language=p.find_all('category',domain='Language')
            location=p.find_all('bc:location')
            if location is not None:
                for l in location:
                    location_name=getattr(l.find('bc:name'), 'string', None)
                
            location_number=getattr(p.find('bc:number'), 'string', 'null')
            location_street=getattr(p.find('bc:street'), 'string', 'null')
            if location_number!=location_street:
                location_address=f'{location_number} {location_street}'
            else:
                location_address=location_number
            
            location_city=getattr(p.find('bc:city'), 'string', 'null')
            location_zip=getattr(p.find('bc:zip'), 'string', 'null')
            location_state=getattr(p.find('bc:state'), 'string', 'null')
            contact_name=p.find('bc:contact')
            contact_phone=getattr(p.find('bc:phone'), 'string', 'null')
            contact_email=getattr(p.find('bc:email'), 'string', 'null')
            
            
            my_dict["program_name"].append(title)
            my_dict["program_description"].append(description)
            my_dict["program_url"].append(link)
            my_dict["registration_url"].append(link)
            my_dict["virtual"].append(virtual)
            my_dict["start_date"].append(start_date)
            my_dict["end_date"].append(start_date)
            my_dict["program"].append(program)
            my_dict["category"].append(category)
            my_dict["audience"].append(audience)
            my_dict["language"].append(language)
            my_dict["location_name"].append(location_name)
            my_dict["address"].append(location_address)
            my_dict["city"].append(location_city)
            my_dict["state"].append(location_state)
            my_dict["zipcode"].append(location_zip)
            my_dict["contact_name"].append(contact_name)
            my_dict["contact_phone"].append(contact_phone)
            my_dict["contact_email"].append(contact_email)
            
    df = pd.DataFrame(my_dict)
    
    df=df.astype({"language":'string',
             "audience":'string',
             "contact_name":'string'})
    
    df['location_name'] = np.where(df["virtual"]=='true', 'null', df['location_name'])
    df['meeting_type'] = np.where(df["virtual"]=='true', 'online', 'face_to_face')
    df['price']=0
    df[['start_date','start_time']] = df['start_date'].str.split('T',expand=True)
    df[['end_date','end_time']] = df['end_date'].str.split('T',expand=True) 
    
    df["contact_name"]=df["contact_name"].str.split(']',1,expand=True)
    df['contact_name'] = df['contact_name'].str.replace(r'<[^<>]*>', '', regex=True)
    df['contact_name'] = df['contact_name'].str.replace('[','')
    df['contact_name'] = df['contact_name'].str.replace(']','')
    df["contact_name"]=df["contact_name"].str.split('+',1,expand=True)
    
    df['contact_phone'] = df['contact_phone'].str.replace('+','')
    df['contact_phone'] = df['contact_phone'].str.replace('1-','')
    df['contact_phone'] = df['contact_phone'].str.replace('-','')
    
    df['internal_id']=df['program_url']
    df["internal_id"]=df["internal_id"].str.strip('https://chipublib.bibliocommons.com/events/')
    df["internal_id"]=df["internal_id"]+df["start_date"]
    
    cat_list=df['category'].unique()
    
    print(cat_list)
    print(len(cat_list))
    
    cat_dict={'Crafts, Games and Play':'232',
         'Story Time':'230',
         'Writing and Poetry':'230',
         'Business, Law and Money':'224',
         'Book Clubs':'230',
         'Health and Science':'218',
         'DIY':'210',
         'Art, Movies and Performances':'232',
         'Computers and Technology':'228',
         'Tours':'215',
         'Workshops':'232',
         'Continuing Education and College':'223',
         'History and Genealogy':'221',
         'Jobs and Careers':'222',
         'Celebrations':'232',
         'Lectures':'223',
         'Author Events':'230',
         'Library Closures':'215'}
    
    for old, new in cat_dict.items():
        df['category'] = df['category'].str.replace(old, new, regex=False)
    
    df['language'] = df['language'].str.replace(r'<[^<>]*>', '', regex=True)
    df['language'] = df['language'].str.replace('[','')
    df['language'] = df['language'].str.replace(']','') 
    
    df['audience'] = df['audience'].str.replace(r'<[^<>]*>', '', regex=True)
    df['audience'] = df['audience'].str.replace('[','')
    df['audience'] = df['audience'].str.replace(']','')
    df['audience_max']=df['audience']
    
    df['program_description'] = df['program_description'].str.replace(r'<[^<>]*>', '', regex=True)
    
    aud_list=['Babies','Toddlers','Preschoolers','Kids','Tweens','Teens','Adults','Seniors','All Ages']
    babymin,babymax=0,2
    babyrange=list(range(babymin, babymax+1))
    toddlermin,toddlermax=1,3
    toddlerrange=list(range(toddlermin, toddlermax+1))
    prekmin,prekmax=2,4
    prekrange=list(range(prekmin, prekmax+1))
    kidmin,kidmax=4,10
    kidrange=list(range(kidmin, kidmax+1))
    tweenmin,tweenmax=9,12
    tweenrange=list(range(tweenmin, tweenmax+1))
    teenmin,teenmax=13,18
    teenrange=list(range(teenmin, teenmax+1))
    adultmin,adultmax=18,99
    adultrange=list(range(adultmin, adultmax+1))
    seniormin,seniormax=65,99
    seniorrange=list(range(seniormin, seniormax+1))
    allmin,allmax=0,99
    allrange=list(range(allmin, allmax+1))
    
    aud_min=[babymin,toddlermin,prekmin,kidmin,tweenmin,teenmin,adultmin,seniormin,allmin]
    aud_min=list(map(str, aud_min))
    aud_max=[babymax,toddlermax,prekmax,kidmax,tweenmax,teenmax,adultmax,seniormax,allmax]
    aud_max=list(map(str, aud_max))
    
    audhelpmin = {'aud_list':aud_list,
                  'aud_min':aud_min}

    audhelpmin = dict(zip(audhelpmin['aud_list'], audhelpmin['aud_min']))
    
    for old, new in audhelpmin.items():
        df['audience'] = df['audience'].str.replace(old, new, regex=False) 
    
    df['audience'] = df['audience'].str.replace(' ','') 
    
    minlist=[]
    for x in df['audience'].str.split(','):
        minlist.append(min(x))
    
    df['min_age']=minlist
    
    audhelpmax = {'aud_list':aud_list,
                  'aud_max':aud_max}

    audhelpmax = dict(zip(audhelpmax['aud_list'], audhelpmax['aud_max']))
    
    for old, new in audhelpmax.items():
        df['audience_max'] = df['audience_max'].str.replace(old, new, regex=False)
    
    df['audience_max'] = df['audience_max'].str.replace(' ','')
    maxlist=[]
    for x in df['audience_max'].str.split(','):
        maxlist.append(max(x))
    
    df['max_age']=maxlist
    
    return(df)

In [4]:
build_df()

CPL scrape successful:  200
1302
['Crafts, Games and Play' 'Story Time' 'Writing and Poetry'
 'Business, Law and Money' 'Book Clubs' 'Health and Science' 'DIY'
 'Art, Movies and Performances' 'Computers and Technology' 'Tours'
 'Workshops' 'Continuing Education and College' 'History and Genealogy'
 'Jobs and Careers' 'Celebrations' 'Lectures' 'Author Events'
 'Library Closures']
18


Unnamed: 0,program_name,program_description,program_url,virtual,registration_url,start_date,end_date,program,category,audience,...,contact_phone,contact_email,meeting_type,price,start_time,end_time,internal_id,audience_max,min_age,max_age
0,Grab and Go: Paper Rose Craft,In celebration of Latinx History Month come&nb...,https://chipublib.bibliocommons.com/events/615...,false,https://chipublib.bibliocommons.com/events/615...,2021-10-08,2021-10-08,Latinx History,232,4,...,3127466800,jnewman@chipublib.org,face_to_face,0,09:00,09:00,61571f6f66359037007e6f8f2021-10-08,10,4,10
1,Rise and Shine Story Time: The Alphabet,Start your day the library way!&nbsp;Join Mr. ...,https://chipublib.bibliocommons.com/events/615...,true,https://chipublib.bibliocommons.com/events/615...,2021-10-08,2021-10-08,,230,021,...,3127441965,,online,0,09:30,09:30,615b087bddad1a3000c2254f2021-10-08,243,0,4
2,Family Story Time,Join the Thomas Hughes Children's Library staf...,https://chipublib.bibliocommons.com/events/612...,true,https://chipublib.bibliocommons.com/events/612...,2021-10-08,2021-10-08,,230,021,...,3127474200,,online,0,10:00,10:00,612e35a6577350360031868f2021-10-08,243,0,4
3,Creative Writing Workshop,"Turn over a new leaf, this autumn, and celebra...",https://chipublib.bibliocommons.com/events/612...,true,https://chipublib.bibliocommons.com/events/612...,2021-10-08,2021-10-08,,230,18,...,3127429590,tstark@chipublib.org,online,0,10:00,10:00,612fc678cf94553600290f052021-10-08,99,18,99
4,Story Time with Ms. Katie,Join Ms.&nbsp;Katie from the McKinley Park Bra...,https://chipublib.bibliocommons.com/events/610...,true,https://chipublib.bibliocommons.com/events/610...,2021-10-08,2021-10-08,,230,021,...,3127476082,kboucher@chipublib.org,online,0,10:30,10:30,610d3daf8b28013800490fd82021-10-08,243,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,Books 'n Babes,"Come one, come all, to our fun story time for ...",https://chipublib.bibliocommons.com/events/615...,true,https://chipublib.bibliocommons.com/events/615...,2022-02-28,2022-02-28,,230,0,...,3127472733,jhargrave@chipublib.org,online,0,10:30,10:30,61520689d68b0a2f003c3292022-02-28,2,0,2
1298,Books 'n Babes,"Come one, come all, to our fun story time for ...",https://chipublib.bibliocommons.com/events/615...,true,https://chipublib.bibliocommons.com/events/615...,2022-03-14,2022-03-14,,230,0,...,3127472733,jhargrave@chipublib.org,online,0,10:30,10:30,61520689d68b0a2f003c329d2022-03-14,2,0,2
1299,Books 'n Babes,"Come one, come all, to our fun story time for ...",https://chipublib.bibliocommons.com/events/615...,true,https://chipublib.bibliocommons.com/events/615...,2022-03-28,2022-03-28,,230,0,...,3127472733,jhargrave@chipublib.org,online,0,10:30,10:30,61520689d68b0a2f003c3292022-03-28,2,0,2
1300,"Law at the Library: Wills, Trusts and Estate P...",Each Law at the Library program features a pre...,https://chipublib.bibliocommons.com/events/611...,true,https://chipublib.bibliocommons.com/events/611...,2022-03-29,2022-03-29,Law at the Library,224,18,...,,adultservices@chipublib.org,online,0,18:00,18:00,611ab2eaad0dd42800ed27512022-03-29,99,18,99
