In [147]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
from urllib import request, parse

In [129]:
# Load the raw charter export. Later cells read its CATEGORY_NAME, CONTENT_NAME,
# LAST_UPDATED_DATE, URL and TEXT columns.
charter = pd.read_csv('../.data/city_charter_raw.csv')
# Heading patterns used throughout the notebook. Raw strings keep `\d` / `\s`
# as regex escapes instead of (invalid) Python string escapes.
# chapter_regex / article_regex capture (number, name).
chapter_regex = re.compile(r'Chapter (\d+) (.*)')
article_regex = re.compile(r'Article (\d+) (.*)')
# section_regex captures (chapter, section number, name) from headings such as
# 'Section 2-101. Name' or '2-101 Name'. The optional prefix is the literal word
# 'Section'; the previous character class `[Section\s]*` accepted any run of the
# letters S, e, c, t, i, o, n (e.g. 'notice 12-3 ...').
section_regex = re.compile(r'\s*(?:Section\s*)?(\d+)-(\d+)[.]? (.*)')

In [136]:
# Attach the regex match objects (None where the heading does not match) and
# parse the last-updated timestamp string into a datetime.
charter['article_match'] = charter['CATEGORY_NAME'].map(article_regex.match)
charter['section_match'] = charter['CONTENT_NAME'].map(section_regex.match)
charter['updated_date'] = charter['LAST_UPDATED_DATE'].map(
    lambda stamp: datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S %Z')
)

In [137]:
# Keep only the rows whose category matched the article pattern and whose
# content name matched the section pattern.
charter = charter.loc[
    charter['article_match'].notna() & charter['section_match'].notna()
].copy()

In [171]:
# Unpack the captured groups: article number from the category heading;
# chapter, section number and section name from the content heading.
charter['article'] = charter.article_match.map(lambda m: m.group(1))
charter['chapter'] = charter.section_match.map(lambda m: m.group(1))
charter['number'] = charter.section_match.map(lambda m: m.group(2))
charter['name'] = charter.section_match.map(lambda m: m.group(3))
# Build the display title and the ids with column-wise string concatenation
# rather than a row-by-row apply; the resulting strings are the same:
#   title      -> 'Section <chapter>-<number> <name>'
#   article_id -> '<chapter>-<article>'
#   id         -> '<chapter>-<number>'
charter['title'] = 'Section ' + charter['chapter'] + '-' + charter['number'] + ' ' + charter['name']
charter['article_id'] = charter['chapter'] + '-' + charter['article']
charter['id'] = charter['chapter'] + '-' + charter['number']
# Reduce each URL to its path component (scheme and host are dropped).
charter['URL'] = charter.URL.map(lambda x: parse.urlparse(x).path)

In [172]:
# Write the section table, restricted to the listed columns, without the index.
charter.to_csv(
    '../.data/city_charter_sections.csv',
    columns=['id', 'article_id', 'number', 'title', 'TEXT', 'URL'],
    index=False,
)

In [169]:
# Scrape the charter index page into one row per chapter: heading text, chapter
# name, relative URL, and the text of the chapter's note page when it has one.
with request.urlopen('https://www.portlandoregon.gov/citycode/28149') as response:
    page = response.read()
# Name the parser explicitly so the parse tree does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(page, 'html.parser')
chapters = {
    'title': [],
    'name': [],
    'url': [],
    'note': []
}
for heading in soup.find_all('h2')[1:]: # skip first h2 as it is 'City of Portland'
    chapters['title'].append(heading.get_text())
    chapters['name'].append(heading.next_sibling.next_sibling.get_text())
    chapters['url'].append('/citycode/' + heading.find('a').get('href'))
    # Append exactly one note per heading so all four lists stay the same length.
    # A chapter has a note when the link in its first list item starts with '-'.
    note = np.nan
    first_item = heading.parent.find('li')
    link = first_item.find('a') if first_item is not None else None
    if link is not None and link.get_text().startswith('-'):
        href = link.get('href')
        # Use separate names for the note page so the index page's `page` and
        # `soup` are not overwritten while the index is still being walked.
        with request.urlopen('https://www.portlandoregon.gov/citycode/' + href) as response:
            note_soup = BeautifulSoup(response.read(), 'html.parser')
        note = note_soup.find('article').find('section').get_text().strip()
    chapters['note'].append(note)
chapters = pd.DataFrame(chapters)
# Extract the chapter number from the heading, then rebuild a normalized title
# 'Chapter <number> <name>'.
chapters['number'] = chapters.title.map(lambda x: chapter_regex.match(x).group(1))
chapters['title'] = 'Chapter ' + chapters['number'].astype(str) + ' ' + chapters['name'].astype(str)
chapters[['number', 'title', 'url', 'note']].to_csv('../.data/city_charter_chapters.csv', index=False)

In [170]:
# Scrape the charter index page into one row per article: the chapter it sits
# under, its title, and its relative URL.
with request.urlopen('https://www.portlandoregon.gov/citycode/28149') as response:
    page = response.read()
# Name the parser explicitly so the parse tree does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(page, 'html.parser')
articles = {
    'chapter': [],
    'title': [],
    'url': []
}
for heading in soup.find_all('h2')[1:]: # skip first h2 as it is 'City of Portland'
    for item in heading.parent.find_all('li'):
        if(item.get_text().startswith('Article')):
            # Take the URL from this list item's own link. The previous code
            # read a variable `href` that this cell never assigned, so it used
            # whatever value an earlier cell had left behind.
            link = item.find('a')
            has_href = link is not None and link.get('href') is not None
            articles['chapter'].append(heading.get_text())
            articles['title'].append(item.get_text())
            articles['url'].append('/citycode/' + link.get('href') if has_href else np.nan)
articles = pd.DataFrame(articles)
# Reduce the chapter heading to its number, split the article title into
# number and name, then rebuild a normalized title and a '<chapter>-<number>' id.
articles['chapter'] = articles.chapter.map(lambda x: chapter_regex.match(x).group(1))
article_matches = articles.title.map(article_regex.match)
articles['number'] = article_matches.map(lambda m: m.group(1))
articles['name'] = article_matches.map(lambda m: m.group(2))
articles['title'] = 'Article ' + articles['number'].astype(str) + ' ' + articles['name'].astype(str)
articles['id'] = articles['chapter'] + '-' + articles['number']
articles[['id', 'chapter', 'title', 'number', 'url']].to_csv('../.data/city_charter_articles.csv', index=False)

In [164]:
# Scratch check of the ' '.join([...]) title format; evaluates to 'Article 1 Name'.
' '.join(['Article', str(1), 'Name'])

'Article 1 Name'