### Script Purpose
- Get the BMF files for 2014-16 from NCCS.
- Keep only records with NTEE confidence level A.
- Drop organizations that changed their NTEE codes between 2014 and 2016.

In [1]:
import pandas as pd
import requests
from tqdm import tqdm
import math
from irsx.xmlrunner import XMLRunner
xml_runner = XMLRunner()
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Download each NCCS BMF snapshot and stamp it with its year-month tag
# (YR_MO) so rows stay traceable to their source file once stacked.
df_bmf_1608 = pd.read_csv('https://nccs-data.urban.org/data/bmf/2016/bmf.bm1608.csv')
df_bmf_1608['YR_MO'] = 1608

df_bmf_1512 = pd.read_csv('https://nccs-data.urban.org/data/bmf/2015/bmf.bm1512.csv')
df_bmf_1512['YR_MO'] = 1512

df_bmf_1412 = pd.read_csv('https://nccs-data.urban.org/data/bmf/2014/bmf.bm1412.csv')
df_bmf_1412['YR_MO'] = 1412

In [3]:
# Stack the three snapshots into one frame with a fresh 0..n-1 index.
df_bmf_14_16 = pd.concat(
    [df_bmf_1608, df_bmf_1412, df_bmf_1512],
    ignore_index=True,
)
# Correct erroneous records: a lowercase 'a' confidence flag means 'A'.
df_bmf_14_16.loc[df_bmf_14_16['nteeConf'] == 'a', 'nteeConf'] = 'A'
# Select A confidence level records.
df_bmf_14_16_confA = df_bmf_14_16.loc[df_bmf_14_16['nteeConf'] == 'A']

In [6]:
# Persist the stacked (unfiltered) frame as a gzip-compressed pickle.
df_bmf_14_16.to_pickle(
    '../../dataset/intermediary/df_bmf_14_16.pkl.gz',
    compression='gzip',
)

# Share of all rows (with a non-null EIN) falling into each confidence level.
df_bmf_14_16.groupby('nteeConf')['EIN'].count() / len(df_bmf_14_16)

# Eyeball a few random confidence-A rows.
df_bmf_14_16_confA.sample(5)

**Drop organizations that changed their NTEE codes between 14-16.**

# Connect to an ipyparallel cluster and take a view over all of its engines.
# NOTE(review): this requires a cluster to be running already — confirm how it is started.
import ipyparallel as ipp
c = ipp.Client()
print(c.ids)  # Show the ids of the engines the client can see.
dview = c[:]

# Prepare every engine's namespace: import pandas there, send the
# confidence-A frame, and create an empty accumulator frame that
# `ntee_chg` appends to.
dview.execute('import pandas as pd')
dview['df_bmf_14_16_confA']=df_bmf_14_16_confA
dview['df_bmf_14_16_confA_ntee_chg']=pd.DataFrame()

# Collect the rows of organizations that changed their NTEE codes.
@dview.parallel(block=True)
def ntee_chg(ein):
    """Append all rows of `ein` to the engine-local accumulator if its NTEE1 varies.

    Reads the global `df_bmf_14_16_confA` and rebinds the global
    `df_bmf_14_16_confA_ntee_chg`; returns nothing. An EIN is treated as
    changed when its rows do not carry exactly one distinct NTEE1 value.
    """
    global df_bmf_14_16_confA, df_bmf_14_16_confA_ntee_chg
    org_rows = df_bmf_14_16_confA[df_bmf_14_16_confA.EIN == ein]
    distinct_codes = set(org_rows['NTEE1'])
    if len(distinct_codes) == 1:
        # Exactly one code across all snapshots: nothing to record.
        return
    df_bmf_14_16_confA_ntee_chg = pd.concat(
        [df_bmf_14_16_confA_ntee_chg, org_rows]
    )

# Run `ntee_chg` over every distinct EIN; the function works through side
# effects on the engines, so the returned value `t` is not used afterwards.
t=ntee_chg.map(df_bmf_14_16_confA.EIN.unique())

# Pull the per-engine accumulator frames back and stack them, keeping the
# original row labels (ignore_index=False).
# NOTE(review): assumes `dview.gather` yields one DataFrame per engine — confirm.
df_bmf_14_16_confA_ntee_chg=pd.concat(dview.gather('df_bmf_14_16_confA_ntee_chg'), ignore_index=False)
df_bmf_14_16_confA_ntee_chg[['EIN', 'YR_MO', 'NTEE1']].head(5)

# Send the EINs to drop to every engine and create the empty accumulator
# that `func_ntee_chg_drop` appends to.
dview['ein_drop_list']=df_bmf_14_16_confA_ntee_chg.EIN.unique()
dview['df_bmf_14_16_confA_chg_drop']=pd.DataFrame()

@dview.parallel(block=True)
def func_ntee_chg_drop(ein):
    """Append all rows of `ein` to the engine-local result unless it is on the drop list.

    Reads the globals `ein_drop_list` and `df_bmf_14_16_confA` and rebinds
    the global `df_bmf_14_16_confA_chg_drop`; returns nothing.
    """
    global df_bmf_14_16_confA_chg_drop, df_bmf_14_16_confA
    if ein in ein_drop_list:
        # This organization changed its NTEE code: leave its rows out.
        return
    kept_rows = df_bmf_14_16_confA[df_bmf_14_16_confA.EIN == ein]
    df_bmf_14_16_confA_chg_drop = pd.concat(
        [df_bmf_14_16_confA_chg_drop, kept_rows]
    )

# Run `func_ntee_chg_drop` over every distinct EIN; it works through side
# effects on the engines, so the returned value `t` is not used afterwards.
t=func_ntee_chg_drop.map(df_bmf_14_16_confA.EIN.unique())

# Pull the per-engine result frames back, stack them and restore the
# original row order by sorting on the index labels.
# NOTE(review): assumes `dview.gather` yields one DataFrame per engine — confirm.
df_bmf_14_16_confA_chg_drop=pd.concat(dview.gather('df_bmf_14_16_confA_chg_drop')).sort_index()

# Number of rows remaining after dropping organizations that changed codes.
len(df_bmf_14_16_confA_chg_drop)

# Write the filtered frame as at most `num_file` gzip-compressed pickle
# chunks. Chunks are windows of index labels (the frame is sorted by its
# integer index), and each file name ends with the first and last label it
# actually contains.
df_to_write=df_bmf_14_16_confA_chg_drop
num_file=4
file_path_name='../../dataset/df_bmf_14_16_confA_chg_drop.pkl.gz/df_bmf_14_16_confA_chg_drop.pkl.gz'
# `to_pickle` does not create missing directories.
os.makedirs(os.path.dirname(file_path_name), exist_ok=True)
if len(df_to_write)>0:
    last_label=df_to_write.index[-1]
    # Labels run from 0 to last_label inclusive, so last_label+1 labels must
    # be spread over num_file windows; max(1, ...) keeps the range step
    # positive for a single-row frame.
    chunk_size=max(1, math.ceil((last_label+1)/num_file))
    for index in range(0, last_label+1, chunk_size):
        df_temp=df_to_write.loc[index:index+chunk_size-1]
        if df_temp.empty:
            # The index has gaps (rows were filtered out), so a window can
            # hold no rows at all; there is nothing to write for it.
            continue
        df_temp.to_pickle(file_path_name+'-'+str(df_temp.index[0])+'-'+str(df_temp.index[-1]), compression='gzip')

# Test reading file: load every chunk in the output directory and stack them.
import os
file_list=os.listdir('../../dataset/df_bmf_14_16_confA_chg_drop.pkl.gz/')
# Read all chunks first and concatenate once; growing a DataFrame with
# pd.concat inside a loop re-copies the accumulated rows on every pass.
chunk_frames=[pd.read_pickle('../../dataset/df_bmf_14_16_confA_chg_drop.pkl.gz/'+file, compression='gzip')
              for file in file_list]
# pd.concat raises on an empty list, so fall back to an empty frame.
df_test=pd.concat(chunk_frames) if chunk_frames else pd.DataFrame()
len(df_test)

**Done with data acquisition, see how many changed.**

# Reload the filtered frame from its pickle chunks. All chunks are read
# first and concatenated once; growing a DataFrame with pd.concat inside a
# loop re-copies the accumulated rows on every pass.
chunk_frames=[pd.read_pickle('../../dataset/df_bmf_14_16_confA_chg_drop.pkl.gz/'+file, compression='gzip')
              for file in os.listdir('../../dataset/df_bmf_14_16_confA_chg_drop.pkl.gz/')]
# pd.concat raises on an empty list, so fall back to an empty frame.
df_bmf_14_16_confA_chg_drop=pd.concat(chunk_frames) if chunk_frames else pd.DataFrame()

# Summary: [distinct EINs kept after the drop, rows in the confidence-A
# frame, their ratio, 1 - ratio].
# NOTE(review): the denominator is a row count while the numerator is a
# distinct-EIN count — confirm whether len(df_bmf_14_16_confA.EIN.unique())
# was intended as the denominator.
[len(df_bmf_14_16_confA_chg_drop.EIN.unique()),
 len(df_bmf_14_16_confA),
 len(df_bmf_14_16_confA_chg_drop.EIN.unique())/len(df_bmf_14_16_confA),  # this comma was missing (SyntaxError)
 1-len(df_bmf_14_16_confA_chg_drop.EIN.unique())/len(df_bmf_14_16_confA)
]

# Number of distinct organizations in the confidence-A frame.
len(df_bmf_14_16_confA.EIN.unique())

# Distinct (EIN, NTEE1) pairs vs. distinct EINs: any surplus of pairs comes
# from organizations that carry more than one NTEE1 code.
len(df_bmf_14_16_confA[['EIN', 'NTEE1']].drop_duplicates()), len(df_bmf_14_16_confA.EIN.unique())
# ~1.76% Changed their NTEE codes between 2014-2016. Drop these records since we have no idea on exactly when these codes were changed.

# Remove every organization that changed its NTEE1 code. After
# de-duplicating (EIN, NTEE1) pairs, an EIN that still occurs more than once
# has several codes. (drop_duplicates(keep=False) on the pairs would instead
# discard every organization whose code stayed the same across years.)
ein_ntee_pairs=df_bmf_14_16_confA[['EIN', 'NTEE1']].drop_duplicates()
ein_ntee_changed=ein_ntee_pairs.loc[ein_ntee_pairs.EIN.duplicated(keep=False), 'EIN'].unique()
df_bmf_14_16_confA_rm_chg=df_bmf_14_16_confA[~df_bmf_14_16_confA.EIN.isin(ein_ntee_changed)]

# Sanity check: the first two numbers must be equal (exactly one NTEE1 code
# per remaining EIN); the third is the remaining row count.
(len(df_bmf_14_16_confA_rm_chg[['EIN', 'NTEE1']].drop_duplicates()),
 len(df_bmf_14_16_confA_rm_chg.EIN.unique()),
 len(df_bmf_14_16_confA_rm_chg))

# Distinct (EIN, NTEE1, nteeConf) combinations with and without the snapshot tag.
len(df_bmf_14_16[['EIN', 'NTEE1', 'nteeConf']].drop_duplicates()), len(df_bmf_14_16[['EIN', 'NTEE1', 'nteeConf', 'YR_MO']].drop_duplicates())