In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import datetime

In [2]:
# Events RSS feed that the scrape reads from.
url = 'https://chipublib.bibliocommons.com/events/rss/all'

program_list = {}

# Column name -> list of values. Every column gets its own (initially empty)
# list, and the key order here is the column order of the resulting DataFrame.
my_dict = {
    column: []
    for column in (
        "program_name",
        "program_description",
        "program_url",
        "virtual",
        "registration_url",
        "start_date",
        "end_date",
        "program",
        "category",
        "audience",
        "language",
        "location_name",
        "address",
        "city",
        "state",
        "zipcode",
        "contact_name",
        "contact_phone",
        "contact_email",
    )
}

In [3]:
def _child_string(node, tag_name, default='null'):
    """Return ``.string`` of the first ``tag_name`` element under ``node``.

    ``default`` is returned when no such element exists.
    """
    return getattr(node.find(tag_name), 'string', default)


def _joined_category_text(item, domain):
    """Join the text of every ``<category domain=...>`` under ``item`` with ', '."""
    return ', '.join(tag.text for tag in item.find_all('category', domain=domain))


def _age_bound(codes, pick):
    """Apply ``pick`` (``min`` or ``max``) to a comma-separated string of ages.

    The ages are compared as integers, not as strings, so '4' sorts below '13'.
    Tokens that are not plain decimal numbers are ignored. The result is
    returned as a string, or '' when no numeric token is present (including
    when ``codes`` is not a string, e.g. a missing value).
    """
    if not isinstance(codes, str):
        return ''
    ages = [int(token) for token in codes.split(',') if token.isdecimal()]
    return str(pick(ages)) if ages else ''


def build_df():
    """Download the events RSS feed at ``url`` and return it as a DataFrame.

    One row is produced per ``<item>`` in the feed. Columns, in order: the keys
    of the module-level ``my_dict``, followed by ``meeting_type``, ``price``,
    ``start_time``, ``end_time``, ``internal_id``, ``audience_max``,
    ``min_age`` and ``max_age``.

    Notes on derived columns:
      * ``start_date`` / ``end_date`` hold the part before the 'T' of the
        feed's local timestamps; ``start_time`` / ``end_time`` hold the rest.
      * ``category`` names are replaced by the numeric codes in ``cat_dict``.
      * ``audience`` / ``audience_max`` hold the comma-separated minimum /
        maximum ages of the listed audience groups; ``min_age`` / ``max_age``
        are the numeric extremes of those lists, stored as strings.
      * ``internal_id`` is the event URL without its site prefix, followed by
        ``start_date``.

    Rows are collected locally, so calling this function repeatedly does not
    accumulate rows in ``my_dict``.

    Returns:
        pandas.DataFrame: the cleaned event table.

    Raises:
        requests.HTTPError: if the feed request returns an error status.
    """
    # Bound the wait and fail loudly instead of parsing an error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = bs(r.text, features='xml')
    print('CPL scrape successful: ',r.status_code)
    programs = soup.find_all('item')
    print(len(programs))

    rows = []
    for item in programs:
        link = item.find('link').text

        # The last <bc:location> wins when several are present. Resetting to
        # None per item prevents a value leaking over from the previous item.
        location_name = None
        for place in item.find_all('bc:location'):
            location_name = getattr(place.find('bc:name'), 'string', None)

        location_number = _child_string(item, 'bc:number')
        location_street = _child_string(item, 'bc:street')
        if location_number != location_street:
            location_address = f'{location_number} {location_street}'
        else:
            # Equal values (e.g. both missing -> 'null') are not repeated.
            location_address = location_number

        # NOTE(review): the contact name is taken as the contact text up to
        # the first '+', presumably the start of the phone number - confirm
        # against the feed's <bc:contact> layout.
        contact = item.find('bc:contact')
        if contact is None:
            contact_name = None
        else:
            contact_name = contact.get_text().split('+', 1)[0]

        rows.append({
            "program_name": item.find('title').text,
            "program_description": item.find('description').text.strip(),
            "program_url": link,
            "virtual": item.find('bc:is_virtual').text,
            "registration_url": link,
            "start_date": item.find('bc:start_date_local').text,
            "end_date": item.find('bc:end_date_local').text,
            "program": getattr(item.find('category', domain='Program'), 'text', 'null'),
            "category": getattr(item.find('category', domain='Type'), 'text', 'null'),
            "audience": _joined_category_text(item, 'Audience'),
            "language": _joined_category_text(item, 'Language'),
            "location_name": location_name,
            "address": location_address,
            "city": _child_string(item, 'bc:city'),
            "state": _child_string(item, 'bc:state'),
            "zipcode": _child_string(item, 'bc:zip'),
            "contact_name": contact_name,
            "contact_phone": _child_string(item, 'bc:phone'),
            "contact_email": _child_string(item, 'bc:email'),
        })

    # my_dict only supplies the column names/order here; it is never mutated.
    df = pd.DataFrame(rows, columns=list(my_dict))

    df = df.astype({"language": 'string',
                    "audience": 'string',
                    "contact_name": 'string'})

    is_virtual = df["virtual"] == 'true'
    df['location_name'] = np.where(is_virtual, 'null', df['location_name'])
    df['meeting_type'] = np.where(is_virtual, 'online', 'face_to_face')
    df['price'] = 0

    # Split "<date>T<time>" without expand=True so a value lacking 'T' yields
    # a missing time instead of a column-count mismatch.
    start_parts = df['start_date'].str.split('T', n=1)
    df['start_date'] = start_parts.str[0]
    df['start_time'] = start_parts.str[1]
    end_parts = df['end_date'].str.split('T', n=1)
    df['end_date'] = end_parts.str[0]
    df['end_time'] = end_parts.str[1]

    # Drop only a leading "+1-"/"1-" country code; removing every "1-" would
    # eat digits from numbers such as 312-741-4300.
    df['contact_phone'] = (
        df['contact_phone']
        .str.replace(r'^\+?1-', '', regex=True)
        .str.replace('+', '', regex=False)
        .str.replace('-', '', regex=False)
    )

    # Remove the URL prefix as a substring; str.strip() would instead strip a
    # set of characters from both ends and corrupt the event id.
    df['internal_id'] = (
        df['program_url'].str.replace('https://chipublib.bibliocommons.com/events/', '', regex=False)
        + df['start_date']
    )

    cat_list = df['category'].unique()

    print(cat_list)
    print(len(cat_list))

    cat_dict = {'Crafts, Games and Play':'232',
                'Story Time':'230',
                'Writing and Poetry':'230',
                'Business, Law and Money':'224',
                'Book Clubs':'230',
                'Health and Science':'218',
                'DIY':'210',
                'Art, Movies and Performances':'232',
                'Computers and Technology':'228',
                'Tours':'215',
                'Workshops':'232',
                'Continuing Education and College':'223',
                'History and Genealogy':'221',
                'Jobs and Careers':'222',
                'Celebrations':'232',
                'Lectures':'223',
                'Author Events':'230',
                'Library Closures':'215'}

    for old, new in cat_dict.items():
        df['category'] = df['category'].str.replace(old, new, regex=False)

    df['audience_max'] = df['audience']

    df['program_description'] = df['program_description'].str.replace(r'<[^<>]*>', '', regex=True)

    # Audience label -> (minimum age, maximum age).
    age_bounds = {
        'Babies': (0, 2),
        'Toddlers': (1, 3),
        'Preschoolers': (2, 4),
        'Kids': (4, 10),
        'Tweens': (9, 12),
        'Teens': (13, 18),
        'Adults': (18, 99),
        'Seniors': (65, 99),
        'All Ages': (0, 99),
    }

    for label, (low, high) in age_bounds.items():
        df['audience'] = df['audience'].str.replace(label, str(low), regex=False)
        df['audience_max'] = df['audience_max'].str.replace(label, str(high), regex=False)

    df['audience'] = df['audience'].str.replace(' ', '', regex=False)
    df['audience_max'] = df['audience_max'].str.replace(' ', '', regex=False)

    # Compare ages as integers; comparing the strings would rank '13' below '4'.
    df['min_age'] = [_age_bound(codes, min) for codes in df['audience']]
    df['max_age'] = [_age_bound(codes, max) for codes in df['audience_max']]

    return df

In [4]:
# Write the scrape to CPL_RSS_<DDMMYYYY>.csv. Day and month are zero-padded so
# that different dates cannot produce the same stamp (unpadded, 1 Nov and
# 11 Jan would both become "1112021").
current_date = datetime.datetime.now()
filename = "CPL_RSS_" + current_date.strftime('%d%m%Y')
build_df().to_csv(filename + '.csv')

CPL scrape successful:  200
1238
['Crafts, Games and Play' 'Story Time' 'Computers and Technology'
 'History and Genealogy' 'Book Clubs' 'Jobs and Careers'
 'Health and Science' 'Art, Movies and Performances' 'Celebrations'
 'Workshops' 'DIY' 'Continuing Education and College'
 'Business, Law and Money' 'Lectures' 'Author Events' 'Writing and Poetry'
 'Library Closures']
17
