In [1]:
import html
import json
import os

import requests           # for calling test API
from wget import download # for downloading data files
# NOTE(review): newer IPython releases expose these names via `IPython.display`;
# confirm and switch the module path if this import warns or fails.
from IPython.core.display import display, HTML

# Customize this value

In [2]:
# Identifier sent as 'team_id' with every submission to the test API.
TEAM_ID = "SuperAwesomeTeamId"

# Helper Functions

In [3]:
# URL of the hackathon's test-submission API (target of the POST in canihazmyscore).
API_ENDPOINT = 'https://v0g71fyb06.execute-api.us-east-1.amazonaws.com/testSubmission'


def show_box(message, css='warning'):
    """Show ``message`` in a pretty alert box in the notebook output.

    Parameters
    ----------
    message : object
        Content of the box. It is converted with ``str()`` and HTML-escaped,
        so markup characters coming back from the API are displayed literally
        instead of being interpreted by the browser.
    css : str
        Suffix of the ``alert-`` class. Possible values: ``success`` (green),
        ``danger`` (red), ``info`` (blue), ``warning`` (yellow).
    """
    template = '''
    <div class="alert alert-block alert-{css}">
        <b>API says:</b>
        <pre>{message}</pre>
    </div>
    '''
    box = template.format(
        # element content: escaping <, > and & is enough
        message=html.escape(str(message), quote=False),
        # attribute value: quotes must be escaped as well
        css=html.escape(str(css), quote=True),
    )
    display(HTML(box))


def canihazmyscore(data, timeout=60):
    """Call the testing API to check results and display its answer.

    Parameters
    ----------
    data : dict
        Submission data (sent as the 'data' property next to TEAM_ID).
    timeout : float
        Seconds to wait for the API before ``requests`` gives up, so a
        non-responding endpoint cannot hang the cell forever.

    Returns
    -------
    The full decoded JSON response (useful if you need to confirm data
    integrity).

    Raises
    ------
    ValueError
        If the response is not valid JSON; the raw response is shown in a
        box first so the failure is not silent.
    """
    # json body payload for post request
    payload = json.dumps({
        'team_id': TEAM_ID,
        'data': data
    })

    r = requests.post(url=API_ENDPOINT, data=payload, timeout=timeout)

    try:
        result = r.json()
    except ValueError:
        # not JSON at all: show what came back, then let the decoding
        # error propagate exactly as it did before
        show_box('HTTP %s\n%s' % (r.status_code, r.text), css='danger')
        raise

    body = result.get('body') if isinstance(result, dict) else None
    if isinstance(body, dict):
        # trim both echoed properties so only the message is displayed
        msg = {key: value for key, value in body.items()
               if key not in ('team_id', 'data')}
        # pick the box colour from what the API appended to the body
        if 'score' in body:
            css = 'success'
        elif 'error' in body:
            css = 'danger'
        else:
            css = 'warning'
    else:
        # an unhandled error occurred (or 'body' is not an object):
        #  show everything
        msg = result
        css = 'danger'

    # convert to string for display
    show_box(str(msg), css)

    # return full response (useful if you need to confirm data integrity)
    return result

In [4]:
# Registry of potentially useful files; later cells add entries to it.
#   key   -> local filename
#   value -> url to download the file from
DOWNLOAD_LIST = dict()

# Business Master File (BMF)
* published by IRS, publicly available
* already comes [separated by state](https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf) (we only want Missouri)
* **Data dictionary** from the [IRS (PDF)](https://www.irs.gov/pub/irs-soi/eo_info.pdf)
* From the instructions:
> The BMF dataset will be the primary data source but teams will need to find corresponding phone numbers and website URLs within the Form 990 data, using the EIN to match records across the two datasets. In this process, teams will reduce the dataset to only the records for which there is a corresponding URL to use for website text analysis. The BMF data also must be filtered down by service categories using the category whitelists provided below. 


In [5]:
# Missouri extract of the Business Master File
DOWNLOAD_LIST.update({
    'MoBMF.csv': 'https://slalom-hackathon.s3.us-east-2.amazonaws.com/MO+BMF+6.10.2019.csv',
})

> The names of the columns you are required to capture from the BMF data are:

In [6]:
# Names of the BMF columns that must be captured.
REQUIRED_BMF_COLUMNS = ['EIN', 'NAME', 'STREET', 'CITY', 'ZIP', 'ACTIVITY', 'NTEE_CD']

#### whitelist filters

> There are two types of service classification codes used in the BMF data that describe the types of services provided by an organization: Activity codes and NTEE codes. Some rows of the BMF file include only one type of code, some have both, and some have neither. Furthermore, there can be multiple codes of each type. For activity codes, there can be up to three assigned so you’ll find either three, six, or nine digits in the CSV field. If you wish to learn more about these service classification codes, the category names and codes are listed in the aforementioned PDF about the BMF data, but NTEE categories have descriptions available at a [dedicated website](https://nccs.urban.org/publication/irs-activity-codes). As previously mentioned, organizations that have been assigned categories which are not relevant to Johego users can be filtered out of the dataset. Johego has provided a [whitelist of NTEE codes](https://slalom-hackathon.s3.us-east-2.amazonaws.com/flat_NTEE_whitelist) and a [whitelist of Activity codes](https://slalom-hackathon.s3.us-east-2.amazonaws.com/flat_ActivityCode_whitelist), both in CSV format hosted on S3. The whitelist of NTEE codes uses a shorthand such that the entry “E2” represents inclusion of the NTEE category “E20” as well as all of its sub-categories. If any of the Activity codes or NTEE codes assigned to an organization are found in the respective whitelist, the organization should be included in the final dataset.




In [7]:
# NTEE and Activity code whitelists provided by Johego
DOWNLOAD_LIST.update({
    'whitelist-ntee.txt': 'https://slalom-hackathon.s3.us-east-2.amazonaws.com/flat_NTEE_whitelist',
    'whitelist-activity.txt': 'https://slalom-hackathon.s3.us-east-2.amazonaws.com/flat_ActivityCode_whitelist',
})

# IRS Form 990

> The IRS publishes certain electronic Form 990 filings (in XML format) with an index file (in JSON format) hosted on Amazon S3. Teams will use the [2018 990 filings index](https://s3.amazonaws.com/irs-form-990/index_2018.json) to locate and then parse individual 990 data files. ([Here](https://s3.amazonaws.com/irs-form-990/201713189349307146_public.xml) is an example 990 filing.). Some of the files you will encounter are a different version of form 990, 990EZ and 990PF. Using a bullet list structure to mimic the XML structure of the 990 files, the fields you are required to capture are listed below. The ReturnHeader section remains the same for all form types and the ReturnData section will contain a type-specific XML element which is either IRS990, IRS990EZ, or IRS990PF. The IRS990PF element does not contain information relevant to Johego, so it is not listed below.

In [8]:
# 2018 Form 990 filings index plus one example filing
DOWNLOAD_LIST.update({
    '990-index-2018.json': 'https://s3.amazonaws.com/irs-form-990/index_2018.json',
    'example-990.xml': 'https://s3.amazonaws.com/irs-form-990/201713189349307146_public.xml',
})

In [9]:
# Element paths of the required Form 990 fields: one path per line of the
# text block, each split on whitespace into a list of element names.
# NOTE(review): the last two paths start at 'ReturnData' while the first two
# start at the 'Return' root - confirm whether that difference is intended.
REQUIRED_990_PATHS = [
    path_line.split()
    for path_line in '''
Return ReturnHeader Filer EIN
Return ReturnHeader Filer PhoneNum
ReturnData IRS990 WebsiteAddressTxt
ReturnData IRS990EZ WebsiteAddressTxt
'''.strip().splitlines()
]
REQUIRED_990_PATHS

[['Return', 'ReturnHeader', 'Filer', 'EIN'],
 ['Return', 'ReturnHeader', 'Filer', 'PhoneNum'],
 ['ReturnData', 'IRS990', 'WebsiteAddressTxt'],
 ['ReturnData', 'IRS990EZ', 'WebsiteAddressTxt']]

# Amazon Comprehend

> The IRS data can provide a basic directory of social and medical service providers in Missouri, but it lacks information about their hours of operation and may list only one service category despite the fact that many organizations’ services span multiple categories. Use any of the powerful features offered by Amazon Comprehend to analyze the organizations’ websites (found in Form 990 data) to find their hours of operation and additional service categories. The category names, descriptions, keywords, and NTEE codes can be found in a [JSON file](https://slalom-hackathon.s3.us-east-2.amazonaws.com/categories.json) provided by Johego. Here are some suggestions of how Amazon Comprehend may be used to accomplish that goal:
* Hours of operation (for a typical week)
    * Use key phrase extraction and/or entity recognition to detect hours of operation listed on the websites.
* Services offered
    * Use key phrase extraction and/or topic modeling to detect distinct services offered by the organization.


In [10]:
# Johego's category names, descriptions, keywords and NTEE codes
DOWNLOAD_LIST.update({
    'categories.json': 'https://slalom-hackathon.s3.us-east-2.amazonaws.com/categories.json',
})

# Final Data Structure

Submissions are in this JSON schema:

* team_id
* data (type: object with EINs as keys)
    * EIN ("883832...")
        * name
        * url
        * phone (format: “3145550673”)
        * services (type: array of NTEE code strings)
        * hours (type: object with days of the week as keys)
            * sunday|monday|tuesday|wednesday|thursday|friday|saturday
                * opens_at (format: twenty-four hour clock, such as “09:00”)
                * closes_at (format: twenty-four hour clock, such as “16:30”)

Note: If the organization is closed all day, that should be represented with the values opens_at=“00:00” and closes_at=“00:00”. If the organization is open for twenty-four hours, that should be represented as opens_at=“00:00” and closes_at=“24:00”.

In [11]:
# Sample submission payload for the API: one entry per organization, keyed by EIN
example_data = {
    '462123095': {
        'name': 'JOHEGO',
        'url': 'https://www.johego.org/',
        'phone': '3140001234',
        'services': ['P20'],
        'hours': {
            # Monday: open 09:00-17:00 (8 hours)
            'monday':    {'opens_at': '09:00', 'closes_at': '17:00'},
            # Tuesday: open around the clock
            'tuesday':   {'opens_at': '00:00', 'closes_at': '24:00'},
            # Wednesday through Sunday: closed all day
            'wednesday': {'opens_at': '00:00', 'closes_at': '00:00'},
            'thursday':  {'opens_at': '00:00', 'closes_at': '00:00'},
            'friday':    {'opens_at': '00:00', 'closes_at': '00:00'},
            'saturday':  {'opens_at': '00:00', 'closes_at': '00:00'},
            'sunday':    {'opens_at': '00:00', 'closes_at': '00:00'},
        },
    },
    '431195240': {
        'name': 'INDEPENDENCE CENTER',
        # remaining fields omitted in this example...
    },
}

# Testing Throughout the Hackathon
Teams can test their submission JSON throughout the competition by sending their JSON in the body of a POST request to this URL using either a cURL command in a terminal or a REST client application such as Postman or Insomnia:

https://v0g71fyb06.execute-api.us-east-1.amazonaws.com/testSubmission

This testing API will return a JSON object with a “status” property and a “body” property. The “status” property contains the HTTP response code for the API call. The “body” property will be the JSON object that was sent in the POST request with a new property appended, either a “score” or an “error”. If the submission JSON was not properly structured, the “error” property will contain a string error message describing what is incorrect in the JSON structure. If the submission JSON was properly structured, the “score” property will contain a JSON object with a breakdown of the score calculation (values shown are for an example score):
* number_of_rows_submitted: 450
* number_of_invalid_rows: 125
* number_of_valid_rows: 325
* points_for_phone: 276
* points_for_url: 325
* points_for_hours: 65
* points_for_services: 937
* initial_score: 1603
* accuracy_multiplier: 0.21951219512195122
* accuracy_adjusted_score: 351.8780487804878
* final_score: 351.8780487804878


In [12]:
# Send the fake example data from above to the test API;
# keep the full response and display it as the cell output
result = canihazmyscore(data=example_data)
result

{'errorMessage': 'division by zero',
 'errorType': 'ZeroDivisionError',
 'stackTrace': ['  File "/var/task/lambda_function.py", line 11, in lambda_handler\n    \'body\': scoreSubmission(event)\n',
  '  File "/var/task/lambda_function.py", line 169, in scoreSubmission\n    accuracy = matched / compared\n']}

In [13]:
# ... Hmm, guess you shouldn't feed fake data to cats.

In [None]:
# according to the description above,
# legit calls should have a score object.
# Error responses (like the ZeroDivisionError returned for the fake example
# data) carry no 'body'/'score', so look the value up defensively instead of
# failing the cell with a KeyError.
body = result.get('body') if isinstance(result, dict) else None
score = body.get('score') if isinstance(body, dict) else None
if isinstance(score, dict) and 'final_score' in score:
    print(score['final_score'])
else:
    print('No score in the API response - see the message box above for details')

# Before you go!!!

#### do something with these variables

In [14]:
# display the list of required Form 990 element paths defined earlier
REQUIRED_990_PATHS

[['Return', 'ReturnHeader', 'Filer', 'EIN'],
 ['Return', 'ReturnHeader', 'Filer', 'PhoneNum'],
 ['ReturnData', 'IRS990', 'WebsiteAddressTxt'],
 ['ReturnData', 'IRS990EZ', 'WebsiteAddressTxt']]

In [15]:
# display the list of BMF column names that must be captured
REQUIRED_BMF_COLUMNS

['EIN', 'NAME', 'STREET', 'CITY', 'ZIP', 'ACTIVITY', 'NTEE_CD']

#### Download those data files

In [16]:
# let's look at what's in our bag:
# every filename -> url pair registered in DOWNLOAD_LIST so far
DOWNLOAD_LIST

{'MoBMF.csv': 'https://slalom-hackathon.s3.us-east-2.amazonaws.com/MO+BMF+6.10.2019.csv',
 'whitelist-ntee.txt': 'https://slalom-hackathon.s3.us-east-2.amazonaws.com/flat_NTEE_whitelist',
 'whitelist-activity.txt': 'https://slalom-hackathon.s3.us-east-2.amazonaws.com/flat_ActivityCode_whitelist',
 '990-index-2018.json': 'https://s3.amazonaws.com/irs-form-990/index_2018.json',
 'example-990.xml': 'https://s3.amazonaws.com/irs-form-990/201713189349307146_public.xml',
 'categories.json': 'https://slalom-hackathon.s3.us-east-2.amazonaws.com/categories.json'}

In [None]:
# download 'em
# Files already on disk are skipped so re-running this cell does not fetch
# everything again (python-wget appears to save a repeat download under a
# new " (1)"-style name rather than overwriting - TODO confirm).
for filename, url in DOWNLOAD_LIST.items():
    target = './%s' % filename
    if os.path.exists(target):
        print('%s already exists, skipping' % target)
        continue
    download(url, target)

# Have fun!