In [1]:
import os
from pymongo import MongoClient
import pandas as pd
import numpy as np
%matplotlib inline

# Open MongoDBB

In [2]:
year = 2017

In [3]:
uri = "mongodb://mongo/tweets"
client = MongoClient(uri)

print( 'List of databases in MongoDB:' )
print(client.list_database_names())

# database
db = client['irs990']

# collection
dbYear = db['%s' % year]

List of databases in MongoDB:
['admin', 'config', 'irs990', 'local']


# Filter IRS Forms w/no Website

In [None]:
# dbYear.delete_many({'$or':[{ 'WebsiteAddressTxt' : 'N/A' },{ 'WebsiteAddressTxt' : 'NONE' },{'WebsiteAddressTxt':{'$exists':bool(0)}}]})

# Feature Analysis

In [4]:
from bson.code import Code

In [None]:
map = Code("function() {"
    "for (var key in this) { emit(key, 1); }"
  "}");

reduce = Code("function(key, stuff) { var sum=0;for(var i in stuff) sum += stuff[i];"
     "return sum;}");

result = dbYear.map_reduce(map, reduce, "fieldResults")

In [None]:
d = {}
for doc in result.find():
    d[doc['_id']]= doc['value']
    
v_sorted = [d[key] for key in sorted(d, key=d.get, reverse=True)]
k_sorted = [key for key in sorted(d, key=d.get, reverse=True)]

The top aprox 150 Fields are rather consistent across the Collection

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
figure(figsize=(16, 12))
plt.plot(v_sorted)
plt.ylabel("Feature Count")
plt.show()

In [None]:
top50 = k_sorted[:50]

In [None]:
figure(figsize=(15, 20))
pos = np.arange(len(top50))
ax = plt.axes()
ax.invert_yaxis()  # labels read top-to-bottom
plt.barh(top50, v_sorted[:50], color='g',align='center')
plt.show()

# Filtering of Fields

In [None]:
# original
cursor = dbYear.find({},{'OrganizationNAME':1
                         ,'WebsiteAddressTxt':1
                         ,'Desc':1
                         ,'ActivityOrMissionDesc':1
                         ,'USAddress.ZIPCd':1
                         ,'TaxExemptBondsInd':1
                         ,'EmployeeCnt':1
                         ,'CYTotalRevenueAmt':1
                         ,'CYTotalExpensesAmt':1
                         ,'NetAssetsOrFundBalancesEOYAmt':1
                         ,'URL':1
                         ,'TaxPeriod':1
                         ,'LastUpDated':1
                         ,'FormType':1
                         ,'EIN':1
                         ,'GrossReceiptsAmt':1
                         ,'_id':0})

In [24]:
# new
cursor = dbYear.find({},{'OrganizationName':1
                         ,'URL':1
                         ,'TaxPeriod':1
                         ,'FormType':1
                         ,'EIN':1
                         #text
                         ,'/IRS990/WebsiteAddressTxt':1
                         ,'/IRS990EZ/WebsiteAddressTxt':1
                         ,'/IRS990/Desc':1
                         ,'/IRS990/ActivityOrMissionDesc':1
                         ,'/IRS990/MissionDesc':1
                         ,'/IRS990EZ/PrimaryExemptPurposeTxt':1
                         #geographic
                         ,'/IRS990/BooksInCareOfDetail/USAddress/ZIPCode':1
                         ,'/IRS990EZ/BooksInCareOfDetail/USAddress/ZIPCode':1
                         #business
                         ,'/IRS990/TaxExemptBondsInd':1
                         ,'/IRS990/EmployeeCnt':1
                         ,'/IRS990/TotalEmployeeCnt':1
                         ,'/IRS990/TotalVolunteersCnt':1
                         
                         ,'/IRS990/CYTotalRevenueAmt':1                 #CY = calendar year
                         ,'/IRS990/CYTotalExpensesAmt':1
                         ,'/IRS990/CYTotalProfFndrsngExpnsAmt':1
                         ,'/IRS990/CYSalariesCompEmpBnftPaidAmt':1
                         
                         ,'/IRS990/PYTotalRevenueAmt':1                 #PY = previous year
                         ,'/IRS990/PYTotalExpensesAmt':1
                         ,'/IRS990/PYTotalProfFndrsngExpnsAmt':1
                         ,'/IRS990/PYSalariesCompEmpBnftPaidAmt':1
                         
                         ,'/IRS990/GovernmentGrantsAmt':1
                         ,'/IRS990/NetAssetsOrFundBalancesEOYAmt':1
                         ,'/IRS990EZ/NetAssetsOrFundBalancesEOYAmt':1
                         ,'/IRS990/GrossReceiptsAmt':1
                         ,'/IRS990EZ/GrossReceiptsAmt':1
                         ,'/IRS990/OtherSalariesAndWagesGrp/TotalAmt':1
                         ,'/IRS990/RelatedOrganizationsAmt':1
                         ,'/IRS990/RevenueAmt':1
                         
                         ,'/IRS990/TotLiabNetAssetsFundBalanceGroup/EOYAmt':1
                         ,'/IRS990/TotalAssetsGrp/EOYAmt':1
                         ,'/IRS990EZ/Form990TotalAssetsGrp/EOYAmt':1
                         ,'/IRS990/TotalContributionsAmt':1
                         ,'/IRS990/TotalFunctionalExpenseGrp/TotalAmt':1
                         ,'/IRS990/TotalLiabilitiesGrp/EOYAmt':1
                         ,'/IRS990/TotalNetAssetsFundBalanceGrp/EOYAmt':1
                         ,'/IRS990EZ/NetAssetsOrFundBalancesGrp/EOYAmt':1
                         ,'/IRS990EZ/TotalExpenseAmt':1
                         ,'/IRS990EZ/TotalRevenueAmt':1
                         
                         })

In [25]:
df = pd.DataFrame(list(cursor))

In [26]:
len(df)

122082

In [27]:
list(df.FormType.unique())

['990', '990PF', '990EZ']

In [78]:
#Split by Form Type
df_990 = df[df['FormType']=='990']
df_990EZ = df[df['FormType']=='990EZ']
# df = df[df['FormType']!='990PF']
len(df)

107672

In [79]:
print(len(df_990) + len(df_990EZ))

107672


In [80]:
names = []
for name in df_990.columns:
    if name[0:8] == '/IRS990/':
        names.append(name[8:])
    else:
        names.append(name)
df_990.columns = names

names = []
for name in df_990EZ.columns:
    if name[0:10] == '/IRS990EZ/':
        names.append(name[10:])
    elif name[0:8] == '/IRS990/':
        names.append(name[8:])
    else:
        names.append(name)
df_990EZ.columns = names

In [81]:
df_990EZ.dropna(axis=1, thresh=100, inplace=True)
df_990EZ

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,BooksInCareOfDetail/USAddress/ZIPCode,Form990TotalAssetsGrp/EOYAmt,GrossReceiptsAmt,NetAssetsOrFundBalancesEOYAmt,NetAssetsOrFundBalancesGrp/EOYAmt,PrimaryExemptPurposeTxt,TotalRevenueAmt,WebsiteAddressTxt,EIN,FormType,OrganizationName,TaxPeriod,URL,_id
11,32935,32904,40833,32904,32904,"TO PLAN, PROMOTE AND IMPLEMENT RALLIES, TRIPS,...",40833,CARRIAGETRAVELCLUB.COM,510183497,990EZ,CARRIAGE TRAVEL CLUB INC,201512,https://s3.amazonaws.com/irs-form-990/20161244...,510183497_201512
12,18064,39865,43235,39865,39865,CREATE INTEREST IN THE NAZARETH FOOTBALL PROGRAM.,16984,,272212300,990EZ,NAZARETH FOOTBALL BOOSTER CLUB,201512,https://s3.amazonaws.com/irs-form-990/20161244...,272212300_201512
13,939063100,135402,61366,135402,135402,THE ORGANIZATION IS AN UNICORPORATED ASSOCIATI...,61366,,274033935,990EZ,NATIVIDAD MEDICAL STAFF CO AMISH A SHAH,201512,https://s3.amazonaws.com/irs-form-990/20161244...,274033935_201512
14,30224,94057,86876,93492,93492,FOOD BANK FOR NEEDY FAMILIES,86876,GRIFFINFOODPANTRY.COM,273276308,990EZ,FIVE LOAVES AND TWO FISH FOOD PANTRY,201512,https://s3.amazonaws.com/irs-form-990/20161244...,273276308_201512
15,97039,3232,30476,2158,2158,CARE AND EDUCATION OF PRESCHOOL AGE CHILDREN,30441,,930717774,990EZ,SHERMAN PRESCHOOL,201606,https://s3.amazonaws.com/irs-form-990/20161244...,930717774_201606
16,96793,1506,5430,1506,1506,RECREATION ORIENTED CRIME PREVENTION PGM,5430,,472143570,990EZ,MAUI POLICE ACTIVITIES LEAGUE,201512,https://s3.amazonaws.com/irs-form-990/20161244...,472143570_201512
17,782701175,15625,40947,15625,15625,ENCOURAGE EDUCATION THROUGH SCIENTIFIC RESEARCH.,40947,,746068016,990EZ,ALAMO REGIONAL ACADEMY OF SCIENCE AND ENGINEERING,201606,https://s3.amazonaws.com/irs-form-990/20161244...,746068016_201606
18,67203,72407,103590,72407,72407,COMMUNITY SERVICE TO VETERANS & FAMILIES,74960,,480119830,990EZ,AMERICAN LEGION 004,201606,https://s3.amazonaws.com/irs-form-990/20161244...,480119830_201606
30,57369,490424,151474,490424,490424,TO PROVIDE AMBULANCE SERVICE TO THE RESIDENTS ...,151474,,460305857,990EZ,PLATTE AMBULANCE SERVICE,201512,https://s3.amazonaws.com/irs-form-990/20161245...,460305857_201512
31,23185,20583,115877,8103,8103,THE ORGANIZATION'S PRIMARY FUNCTION IS TO PROV...,90166,WWW.WILLIAMSBURGROTARY.ORG,546053668,990EZ,ROTARY CLUB OF WILLIAMSBURG INC,201606,https://s3.amazonaws.com/irs-form-990/20161245...,546053668_201606


In [87]:
df2 = df_990.append(df_990EZ)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [88]:
df2.columns

Index(['/IRS990EZ/BooksInCareOfDetail/USAddress/ZIPCode',
       '/IRS990EZ/Form990TotalAssetsGrp/EOYAmt', '/IRS990EZ/GrossReceiptsAmt',
       '/IRS990EZ/NetAssetsOrFundBalancesEOYAmt',
       '/IRS990EZ/NetAssetsOrFundBalancesGrp/EOYAmt',
       '/IRS990EZ/PrimaryExemptPurposeTxt', '/IRS990EZ/TotalRevenueAmt',
       '/IRS990EZ/WebsiteAddressTxt', 'ActivityOrMissionDesc',
       'BooksInCareOfDetail/USAddress/ZIPCode', 'CYSalariesCompEmpBnftPaidAmt',
       'CYTotalExpensesAmt', 'CYTotalProfFndrsngExpnsAmt', 'CYTotalRevenueAmt',
       'Desc', 'EIN', 'EmployeeCnt', 'Form990TotalAssetsGrp/EOYAmt',
       'FormType', 'GovernmentGrantsAmt', 'GrossReceiptsAmt', 'MissionDesc',
       'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesGrp/EOYAmt',
       'OrganizationName', 'OtherSalariesAndWagesGrp/TotalAmt',
       'PYSalariesCompEmpBnftPaidAmt', 'PYTotalExpensesAmt',
       'PYTotalProfFndrsngExpnsAmt', 'PYTotalRevenueAmt',
       'PrimaryExemptPurposeTxt', 'RelatedOrganizationsAm

In [89]:
df2.dropna(axis=1, thresh=100, inplace = True)

In [90]:
df2.columns

Index(['ActivityOrMissionDesc', 'BooksInCareOfDetail/USAddress/ZIPCode',
       'CYSalariesCompEmpBnftPaidAmt', 'CYTotalExpensesAmt',
       'CYTotalProfFndrsngExpnsAmt', 'CYTotalRevenueAmt', 'Desc', 'EIN',
       'EmployeeCnt', 'Form990TotalAssetsGrp/EOYAmt', 'FormType',
       'GovernmentGrantsAmt', 'GrossReceiptsAmt', 'MissionDesc',
       'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesGrp/EOYAmt',
       'OrganizationName', 'OtherSalariesAndWagesGrp/TotalAmt',
       'PYSalariesCompEmpBnftPaidAmt', 'PYTotalExpensesAmt',
       'PYTotalProfFndrsngExpnsAmt', 'PYTotalRevenueAmt',
       'PrimaryExemptPurposeTxt', 'RelatedOrganizationsAmt', 'RevenueAmt',
       'TaxExemptBondsInd', 'TaxPeriod', 'TotalAssetsGrp/EOYAmt',
       'TotalContributionsAmt', 'TotalEmployeeCnt',
       'TotalLiabilitiesGrp/EOYAmt', 'TotalNetAssetsFundBalanceGrp/EOYAmt',
       'TotalRevenueAmt', 'TotalVolunteersCnt', 'URL', 'WebsiteAddressTxt',
       '_id'],
      dtype='object')

In [91]:
# manually combine fields in excel such as TotalRevenue and RevenueAmt
df2.to_csv('./sample_data_2017_M4.csv', index = False)

## EDA

In [None]:
df_990['GrossReceiptsAmt'].mean()

In [None]:
import warnings
warnings.filterwarnings("ignore")

from math import log

for field in ('GrossReceiptsAmt', 'CYTotalRevenueAmt', 'CYTotalExpensesAmt', 'EmployeeCnt'):
    print(field)
    
    df_990["log" + field] = [log(int(x)) if x != 0 else 0 for x in df_990[field]]
    plt.hist(df_990["log" + field])
#     plt.hist(df_990[field])
    plt.show()
    print(df_990[field].describe())
    print('\n')

In [None]:
plt.scatter(df_990['EmployeeCnt'], df_990['GrossReceiptsAmt'])

In [None]:
for field in ('GrossReceiptsAmt', 'CYTotalRevenueAmt', 'CYTotalExpensesAmt'):
    print(field)
    print(df_990['EmployeeCnt'].corr(df_990[field]))
    print('\n')

In [None]:
df_990EZ['GrossReceiptsAmt'] = [int(x) for x in df_990EZ['GrossReceiptsAmt']]

In [None]:
for field in ('GrossReceiptsAmt', 'NetAssetsOrFundBalancesEOYAmt'):
    print(field)
    
    df_990EZ[field] = [int(x) for x in df_990EZ[field]]
    plt.hist(df_990EZ[field])
    plt.show()
    print(df_990EZ[field].describe())
    print('\n')

In [None]:
df_990[df_990['EmployeeCnt'].isna()]