In [34]:
# Fundamental
import sys, re, random, operator
import datetime, json, pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#ML
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif

# Query
import urllib3 as url
import xml.etree.ElementTree as ET

# Visualize
import seaborn as sns

### Parameters + Instancing

In [2]:
http = url.PoolManager()
pp = pprint.PrettyPrinter(indent=4)
pd.options.display.max_seq_items = 100
%matplotlib inline

In [44]:
apipath = "https://projects.propublica.org/nonprofits/api/v2"
search = "/search.json?q="
s3path = "https://s3.amazonaws.com/irs-form-990"
s3index = lambda x: "/index_"+str(x)+".csv"
s3xml = lambda x:"/"+str(x)+"_public.xml"

### Core Functions

In [4]:
def organization(ein):
    return "/organizations/"+str(ein)+".json"

In [5]:
def get_org_by_ein(ein):
    response = http.request('GET',apipath+organization(ein))
    parsed = json.loads(response.data.decode('utf-8'))
    print(json.dumps(parsed, indent=4, sort_keys=True))
    return pd.read_json(response.data.decode('utf-8'))

In [6]:
def search_by_name(name, zipcode, returnlist=False):
    return ein

In [30]:
def get_unique_from_ein(year,ein):
    df = pd.read_csv(s3path+s3index(str(year)))
    objid = df[df["EIN"]==ein].OBJECT_ID
    return objid.values[0]

In [54]:
def get_990_to_df(objid):
    response = http.request('GET',s3path+s3xml(objid))
    root = ET.fromstring(response.data.decode('utf-8')) # element tree
    print(root)
    all_records = []
    for i, child in enumerate(root):
        record = {}
        for subchild in child:
            record[subchild.tag] = subchild.text
            all_records.append(record)
    return pd.DataFrame(all_records)

### Explore
[Dictionary](https://github.com/CharityNavigator/irs990/blob/master/docs/explore-database.md)

In [42]:
r = get_unique_from_ein(2015,382912028)

In [55]:
df=get_990_to_df(r)

<Element '{http://www.irs.gov/efile}Return' at 0x7fa8e8f94d18>




In [53]:
df

Unnamed: 0,{http://www.irs.gov/efile}BuildTS,{http://www.irs.gov/efile}BusinessOfficerGrp,{http://www.irs.gov/efile}Filer,{http://www.irs.gov/efile}IRS990,{http://www.irs.gov/efile}IRS990ScheduleA,{http://www.irs.gov/efile}IRS990ScheduleB,{http://www.irs.gov/efile}IRS990ScheduleD,{http://www.irs.gov/efile}IRS990ScheduleL,{http://www.irs.gov/efile}IRS990ScheduleO,{http://www.irs.gov/efile}PreparerFirmGrp,{http://www.irs.gov/efile}PreparerPersonGrp,{http://www.irs.gov/efile}ReturnTs,{http://www.irs.gov/efile}ReturnTypeCd,{http://www.irs.gov/efile}TaxPeriodBeginDt,{http://www.irs.gov/efile}TaxPeriodEndDt,{http://www.irs.gov/efile}TaxYr
0,2016-02-25 16:41:14Z,\n,\n,,,,,,,\n,\n,2015-08-13T12:41:51-05:00,990.0,2014-01-01,2014-12-31,2014.0
1,2016-02-25 16:41:14Z,\n,\n,,,,,,,\n,\n,2015-08-13T12:41:51-05:00,990.0,2014-01-01,2014-12-31,2014.0
2,2016-02-25 16:41:14Z,\n,\n,,,,,,,\n,\n,2015-08-13T12:41:51-05:00,990.0,2014-01-01,2014-12-31,2014.0
3,2016-02-25 16:41:14Z,\n,\n,,,,,,,\n,\n,2015-08-13T12:41:51-05:00,990.0,2014-01-01,2014-12-31,2014.0
4,2016-02-25 16:41:14Z,\n,\n,,,,,,,\n,\n,2015-08-13T12:41:51-05:00,990.0,2014-01-01,2014-12-31,2014.0
5,2016-02-25 16:41:14Z,\n,\n,,,,,,,\n,\n,2015-08-13T12:41:51-05:00,990.0,2014-01-01,2014-12-31,2014.0
6,2016-02-25 16:41:14Z,\n,\n,,,,,,,\n,\n,2015-08-13T12:41:51-05:00,990.0,2014-01-01,2014-12-31,2014.0
7,2016-02-25 16:41:14Z,\n,\n,,,,,,,\n,\n,2015-08-13T12:41:51-05:00,990.0,2014-01-01,2014-12-31,2014.0
8,2016-02-25 16:41:14Z,\n,\n,,,,,,,\n,\n,2015-08-13T12:41:51-05:00,990.0,2014-01-01,2014-12-31,2014.0
9,2016-02-25 16:41:14Z,\n,\n,,,,,,,\n,\n,2015-08-13T12:41:51-05:00,990.0,2014-01-01,2014-12-31,2014.0
