In [1]:
# Address of the HTML page whose first table is scraped below.
url = (
    "https://nspires.nasaprs.com/external/viewrepositorydocument"
    "/cmdocumentid=492458"
    "/solicitationId=%7B68C12087-132D-3814-9A87-5323BCE6CAB6%7D"
    "/viewSolicitationDocument=1"
    "/Table%202%202016_amend8_clarify.html"
)

In [2]:
import requests

In [3]:
from bs4 import BeautifulSoup

In [4]:
# A timeout keeps this cell from waiting forever on an unresponsive server,
# and raise_for_status() stops here on an HTTP error instead of letting the
# following cells parse an error page as if it were the deadline table.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [5]:
# Parse the downloaded markup with the lxml backend.
soup = BeautifulSoup(markup=r.text, features='lxml')

In [6]:
# Look up a <table> element in the parsed page.
table = soup.find(name='table')

In [7]:
# Collect every <tr> of that table; rows[0] is later used as the header row.
rows = table.find_all(name='tr')

In [8]:
def parse_header(row):
    """Return the column titles found in a table header row.

    Parameters
    ----------
    row : object
        Must expose ``find_all('th')`` returning items that expose
        ``get_text()``.

    Returns
    -------
    list of str
        For each ``th`` item: the first line of its stripped text, itself
        stripped. An item whose text is empty or whitespace-only yields ``''``.
    """
    titles = []
    for th in row.find_all('th'):
        text = th.get_text().strip()
        # An empty string has no first line: ''.splitlines() is [], so
        # indexing [0] would raise IndexError for a blank header cell.
        titles.append(text.splitlines()[0].strip() if text else '')
    return titles

In [9]:
# Header titles from the first table row, plus one extra column for the link.
columns = parse_header(rows[0]) + ['url']

In [10]:
def parse_out_url(row):
    """Return the link element of a row's second cell, with an https href.

    Parameters
    ----------
    row : object
        Must expose ``find_all('td')``; the second returned cell must expose
        ``find('a')`` returning an item that supports ``['href']`` get/set.

    Returns
    -------
    The same link element, after its ``href`` has been stripped of
    surrounding whitespace and a leading ``http://`` replaced by ``https://``.

    Raises
    ------
    IndexError
        If the row has fewer than two ``td`` cells.
    TypeError
        If the second cell contains no link (``find`` returned ``None``).
    """
    cells = row.find_all('td')
    a = cells[1].find('a')
    href = a['href'].strip()
    # Upgrade only a leading plain-http scheme. A blanket
    # str.replace('http', 'https') turns 'https://' into 'httpss://' and also
    # rewrites any later 'http' inside the path or query string.
    if href.startswith('http://'):
        href = 'https://' + href[len('http://'):]
    a['href'] = href
    return a

def parse_table(rows):
    """Turn table body rows into one list of values per row.

    Each result list holds the stripped text of every ``td`` cell of the
    row. A row with exactly three cells is padded with the string ``'None'``.
    The value returned by ``parse_out_url(row)`` is always appended last.
    """
    def _row_values(row):
        texts = [td.get_text().strip() for td in row.find_all('td')]
        if len(texts) == 3:
            # Pad with a placeholder string so the row gains one more entry.
            texts.append('None')
        texts.append(parse_out_url(row))
        return texts

    return [_row_values(row) for row in rows]

In [11]:
# pandas is used from this cell onward but no earlier cell imports it, so a
# fresh "Restart & Run All" would stop here with NameError: name 'pd'.
import pandas as pd

# One record per row after the first; rows[0] supplied the column titles.
df = pd.DataFrame(parse_table(rows[1:]), columns=columns)

In [12]:
from IPython.display import HTML

In [13]:
# Allow up to 1000 characters per cell before pandas truncates the display.
pd.options.display.max_colwidth = 1000

In [14]:
# escape=False keeps markup in the cells unescaped in the generated HTML.
HTML(data=df.to_html(escape=False))

Unnamed: 0,APPENDIX,PROGRAM,NOI/Step-1,PROPOSAL,url
0,B.4,Heliophysics Guest Investigators,03/18/2016 (Step-1),04/22/2016(Step-2),Heliophysics Guest Investigators
1,A.29,NASA Data for Operation and Assessment,03/15/2016,05/20/2016,NASA Data for Operation and Assessment
2,D.2,Astrophysics Data Analysis,03/25/2016,05/13/2016,Astrophysics Data Analysis
3,A.46,Earth Science Applications: Ecological Forecasting,03/16/2016,05/26/2016,Earth Science Applications: Ecological Forecasting
4,E.3,Exoplanets Research Program [3],03/29/2016 (Step-1),05/26/2016(Step-2),Exoplanets Research Program [3]
5,A.11,Ocean Surface Topography Science Team,04/29/2016,05/27/2016,Ocean Surface Topography Science Team
6,C.2,Emerging Worlds [3] [4],03/31/2016(Step-1),06/03/2016(Step-2),Emerging Worlds [3] [4]
7,A.31,Utilization of Airborne Visible/Infrared Imaging Spectrometer - Next Generation Data from an Airborne Campaign in India,04/07/2016,06/10/2016,Utilization of Airborne Visible/Infrared Imaging Spectrometer - Next Generation Data from an Airborne Campaign in India
8,C.6,Solar System Observations [3] [4],04/08/2016(Step-1),06/10/2016(Step-2),Solar System Observations [3] [4]
9,A.24,Earth Surface and Interior,04/15/2016,06/15/2016,Earth Surface and Interior


In [15]:
def parse_date_cell(cell):
    """Split a deadline cell into a date value and a step flag.

    Parameters
    ----------
    cell : str or other
        Cell text such as ``'03/18/2016 (Step-1)'``. Objects without a
        ``split`` method are passed through unchanged.

    Returns
    -------
    (value, step) : tuple
        ``step`` is True when splitting the text on ``'('`` gives exactly two
        parts. ``value`` is

        * ``cell`` itself (with ``step`` False) when ``cell`` has no ``split``;
        * ``pd.NaT`` when the stripped text before ``'('`` is ``'N/A'``;
        * a ``datetime.date`` when that stripped text starts with a digit;
        * otherwise the text before ``'('`` exactly as found, including ``''``.
    """
    try:
        tokens = cell.split('(')
    except AttributeError:
        return cell, False
    raw = tokens[0]
    step = len(tokens) == 2
    # Compare and parse on stripped text: 'N/A ' (e.g. followed by a
    # parenthesis) must still count as N/A, and a leading space must not hide
    # the leading digit of a date.
    date = raw.strip()
    if date == 'N/A':
        return pd.NaT, step
    # Slice instead of index: '' has no [0], so date[0] raised IndexError
    # for an empty cell.
    if date[:1].isdigit():
        return pd.to_datetime(date).date(), step
    return raw, step
    
def parse_date_columns(row):
    """Parse both deadline cells of a row into a three-entry Series.

    Reads ``row['NOI/Step-1']`` and ``row['PROPOSAL']`` through
    ``parse_date_cell`` and returns a Series indexed ``'date1'``, ``'step'``,
    ``'date2'``.

    Raises
    ------
    ValueError
        If the two cells disagree on the step flag.
    """
    noi_date, noi_step = parse_date_cell(row['NOI/Step-1'])
    proposal_date, proposal_step = parse_date_cell(row['PROPOSAL'])
    if noi_step != proposal_step:
        raise ValueError('steps not the same')
    labels = ['date1', 'step', 'date2']
    values = [noi_date, noi_step, proposal_date]
    return pd.Series(values, index=labels)

In [16]:
# Put the parsed date1 / step / date2 columns next to the scraped columns.
# Re-running this cell appends the three columns again.
df = pd.concat(objs=[df, df.apply(func=parse_date_columns, axis='columns')], axis='columns')

In [18]:
from icalendar import Calendar, Event

In [19]:
cal = Calendar()

# Calendar-level properties, added in the listed order.
for prop_name, prop_value in (
    ('prodid', '-//NASA ROSES deadline calendar//mxm.dk//'),
    ('version', '2.0'),
    ('name', 'NASA ROSES Deadlines'),
    ('x-wr-calname', 'NASA ROSES Deadlines'),
    ('x-wr-caldesc', 'NASA ROSES Deadlines for 2016'),
):
    cal.add(prop_name, prop_value)

In [20]:
def create_cal_events_from_row(row, i):
    """Build the two calendar events (first and second deadline) of one row.

    Parameters
    ----------
    row : mapping
        Must provide ``'date1'``, ``'date2'``, ``'step'``, ``'APPENDIX'``,
        ``'PROGRAM'`` and ``'url'``; ``row['url']`` must expose
        ``get('href')``.
    i : object
        Row identifier; ``str(i)`` becomes part of each event UID.

    Returns
    -------
    list
        Two ``Event`` objects, for ``'date1'`` then ``'date2'``. Each summary
        is ``'ROSES D/L '`` + a tag + the program name, where the tag is
        ``'Step_1 '`` / ``'Step_2 '`` when ``row['step']`` is truthy and
        ``'NOI '`` / ``'Final '`` otherwise.
    """
    # Summary tag per date column. The first-deadline tag of a non-step row
    # used to be selected with `col == 'date'`, which never matches 'date1',
    # so those events were all tagged 'Final ' instead of 'NOI '.
    if row['step']:
        tags = {'date1': 'Step_1 ', 'date2': 'Step_2 '}
    else:
        tags = {'date1': 'NOI ', 'date2': 'Final '}

    events = []
    for j, col in enumerate(['date1', 'date2']):
        event = Event()
        # NOTE(review): the value is passed on unchecked; confirm callers
        # only supply rows whose date2 is a real date as well.
        event.add('dtstart', row[col])

        event.add('description', 'APPENDIX ' + row['APPENDIX'] + '\n')
        event.add('summary', 'ROSES D/L ' + tags[col] + row['PROGRAM'])

        href = row['url'].get('href')
        # LOCATION carries the link as-is; URL gets a percent-escaped copy.
        event.add('location', href)
        escaped = href
        for char, code in (('&', '%26'), ('?', '%3F'), ('{', '%7B'), ('}', '%7D')):
            escaped = escaped.replace(char, code)
        event.add('url', escaped)

        # Row identifier followed by 0 or 1 for the first or second deadline.
        event.add('uid', 'nasa_roses_deadlines_2016_' + str(i) + str(j))
        events.append(event)
    return events

In [21]:
import datetime
# Only rows whose first deadline is exactly a datetime.date produce events.
for row_label, record in df.iterrows():
    if type(record['date1']) is not datetime.date:
        continue
    for deadline_event in create_cal_events_from_row(record, row_label):
        cal.add_component(deadline_event)

In [22]:
import os

with open(os.path.join(os.environ['HOME'], 'NASA_ROSES_deadlines.ics'), 'wb') as f:
    f.write(cal.to_ical())

!open {f.name}