### Script Purpose
- Concatenate text and NTEE codes.
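- Prepare the classifier datasets: the labelled training frame (`df_train.pkl.gz`), the universal train/test split, and the `nteeConf` random-sample and `nteeConf==B` test sets.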

In [8]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import os
import math
from spellchecker import SpellChecker
import nltk
nltk.download('punkt')
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from joblib import Parallel, delayed

import stanza
stanza.download('en', processors='tokenize')
nlp_en = stanza.Pipeline(lang='en', processors='tokenize', tokenize_no_ssplit=True, use_gpu=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 116kB [00:00, 11.4MB/s]                    
2020-05-08 23:49:49 INFO: Downloading these customized packages for language: en (English)...
| Processor | Package |
-----------------------
| tokenize  | ewt     |

2020-05-08 23:49:49 INFO: File exists: /root/stanza_resources/en/tokenize/ewt.pt.
2020-05-08 23:49:49 INFO: Finished downloading models and saved to /root/stanza_resources.
2020-05-08 23:49:49 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |

2020-05-08 23:49:49 INFO: Use device: gpu
2020-05-08 23:49:49 INFO: Loading: tokenize
2020-05-08 23:49:49 INFO: Done loading processors!


## `nteeConf==A` dataset

In [2]:
# Load every gzip-pickled chunk of the EIN text table and stack them into one frame.
# The listing is sorted so the row order is reproducible (os.listdir order is arbitrary),
# and the chunks are concatenated once instead of re-copying a growing frame per file.
# NOTE(review): read_pickle can execute code on load; only read trusted files.
file_list_txt=sorted(os.listdir('../../dataset/EIN_TXT_2014_18.pkl.gz/'))
df_txt_chunks=[pd.read_pickle('../../dataset/EIN_TXT_2014_18.pkl.gz/'+file, compression='gzip')
               for file in file_list_txt]
# An empty directory still yields an empty frame rather than a concat error.
df_txt=pd.concat(df_txt_chunks) if df_txt_chunks else pd.DataFrame()
df_txt.sample(3)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before
1862482,,341277321,,,,,,,,,,,,,,,,,1.0
1180547,93493270000000.0,350413700,EFILE,,,,,,"TO PROVIDE A PROFESSIONAL, SUPPORTIVE ENVIRONM...",PROMOTE IMPORTANCE AND VALUE OF UTILIZING A RE...,THE METROPOLITAN INDIANAPOLIS BOARD OF REALTOR...,2.014327e+17,11861855.0,990O,10/14/2014 12:05:50 PM,METROPOLITAN INDIANAPOLIS BOARD OF REALTORS INC,201312.0,2014.0,1.0
670087,93493130000000.0,364472394,EFILE,,,,,N/A##N/A##N/A##OUTREACH/OUTPATIENT COMMUNITY S...,TO PROVIDE SAFE LIVING CONDITIONS FOR PERSONS ...,,TO PROVIDE SAFE LIVING CONDITIONS,2.015213e+17,12757539.0,990,8/6/2015,BRIDGING THE TYS TO JORDAN INC,201412.0,2015.0,0.0


In [5]:
# Load the BMF chunks (sorted for a reproducible row order) and concatenate them once.
# NOTE(review): read_pickle can execute code on load; only read trusted files.
file_list_ntee=sorted(os.listdir('../../dataset/df_bmf_14_16_confA_chg_drop.pkl.gz/'))
df_ntee_chunks=[pd.read_pickle('../../dataset/df_bmf_14_16_confA_chg_drop.pkl.gz/'+file, compression='gzip')
                for file in file_list_ntee]
df_ntee=pd.concat(df_ntee_chunks) if df_ntee_chunks else pd.DataFrame()
# Unique (EIN, NTEE1) pairs. drop_duplicates() returns a new frame, so nothing is
# modified in place on a column slice of df_ntee.
df_ein_ntee=df_ntee[['EIN', 'NTEE1']].drop_duplicates()
df_ein_ntee.sample(3)

Unnamed: 0,EIN,NTEE1
1314385,746062406,B
3972230,463431966,M
1432881,850127993,B


In [6]:
# Attach NTEE1 to each text record by EIN; the inner join keeps only EINs present in both frames.
# 2020-05-06: Use 'inner', save computation resources. Not tested after change.
df_txt_ntee_confA_no_chg=pd.merge(df_txt, df_ein_ntee, how='inner', on='EIN')
df_txt_ntee_confA_no_chg.sample(10)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before,NTEE1
2422094,,770687533,,,,,,,,,,,,,,,,,,J
2304428,,226591093,,,,,,,,,,,,,,,,,1.0,
2514636,,201567591,,,,,,,,,,,,,,,,,,D
295301,93491320000000.0,816290155,EFILE,,,,,,,,,2.017232e+17,15051355.0,990PF,12/27/2017 6:49:49 AM,STAR PEAK FOUNDATION,201612.0,2017.0,0.0,
1179545,93491290000000.0,341504501,EFILE,,,,,,,,,2.015029e+17,13148705.0,990PF,01/27/2016,ROBERT AND PATRICIA SWITZER FOUNDATION,201506.0,2016.0,1.0,
1829942,93493130000000.0,911214158,EFILE,,,,,,Washington Toxics Coalition works to protect p...,The Toxic-Free Legacy campaign works to phase ...,WTC works to protect public health and the env...,2.016013e+17,13765580.0,990,09/02/2016,WASHINGTON TOXICS COALITION,201512.0,2016.0,1.0,C
507025,,752140748,,,,,,,,,,,,,,,,,1.0,P
1911049,93492330000000.0,205200059,EFILE,VERSION_NOT_SUPPORTED,VERSION_NOT_SUPPORTED,,,VERSION_NOT_SUPPORTED,,,,2.013033e+17,11126883.0,990EZ,1/9/2014,SUNDANCE STUDIO GYMNASTICS PARENTS ASSOCIATION,201212.0,2014.0,0.0,B
103110,,582052870,,,,,,,,,,,,,,,,,1.0,
1485289,93493130000000.0,943476912,EFILE,,,,,,SEE SCHEDULE OTHE PURPOSE AND MISSION OF ALLIA...,ALLIANCE HEALTH SERVICES ACADEMY HIGH SCHOOL I...,THE PURPOSE AND MISSION OF ALLIANCE HEALTH SER...,2.015013e+17,12673430.0,990,7/14/2015,ALLIANCE HEALTH SERVICES ACADEMY HIGH SCHOOL,201406.0,2015.0,0.0,B


In [7]:
# 'mission': first non-null value among the three mission/purpose columns, in priority order.
df_txt_ntee_confA_no_chg['mission']=(
    df_txt_ntee_confA_no_chg['IRS990_p1_ActvtyOrMssnDsc']
    .combine_first(df_txt_ntee_confA_no_chg['IRS990_p3_MssnDsc'])
    .combine_first(df_txt_ntee_confA_no_chg['IRS990EZ_p3_PrmryExmptPrpsTxt'])
)
# 'prgrm_dsc': the description columns joined with '##' (missing values become '').
# IRS990ScheduleO_ExplntnTxt appears twice in the sequence; the repeat is kept as-is.
prgrm_dsc_fields=['IRS990_p3_DscS', 'IRS990ScheduleO_ExplntnTxt',
                  'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt', 'IRS990ScheduleO_ExplntnTxt',
                  'IRS990PF_p9a_DscrptnTxt', 'IRS990PF_p16b_RltnshpSttmntTxt']
prgrm_dsc_joined=df_txt_ntee_confA_no_chg[prgrm_dsc_fields[0]].fillna('')
for field in prgrm_dsc_fields[1:]:
    prgrm_dsc_joined=prgrm_dsc_joined+'##'+df_txt_ntee_confA_no_chg[field].fillna('')
df_txt_ntee_confA_no_chg['prgrm_dsc']=prgrm_dsc_joined

In [8]:
def func_clean_str(string):
    """Collapse a '##'-delimited text field into a '; '-joined string of unique segments.

    Args:
        string: Raw field value. Missing values (None, or anything whose ``str()``
            is 'nan', e.g. float NaN) are treated as empty.

    Returns:
        str: The distinct non-empty segments joined by '; ', in order of first
        appearance; '' when the value is missing or has no non-empty segment.
    """
    if string is None or str(string)=='nan':
        return ''
    # dict.fromkeys removes duplicates while keeping first-seen order; a set would
    # emit the segments in hash order, which differs between interpreter runs.
    unique_segments=dict.fromkeys(s for s in string.split('##') if s!='')
    return '; '.join(unique_segments)

# Run func_clean_str over both text columns (missing -> '', '##' segments de-duplicated).
for text_col in ('mission', 'prgrm_dsc'):
    df_txt_ntee_confA_no_chg[text_col]=df_txt_ntee_confA_no_chg[text_col].map(func_clean_str)

In [10]:
# Keep the first row for each distinct mission text, then for each distinct program description.
# NOTE(review): rows without NTEE1 are removed only after de-duplication, so an unlabelled
# row can displace a labelled row that has the same text - confirm this is intended.
df_train=df_txt_ntee_confA_no_chg[(~df_txt_ntee_confA_no_chg['mission'].duplicated())] # Drop duplicated mission descriptions.
df_train=df_train[(~df_train['prgrm_dsc'].duplicated())] # Drop duplicated program descriptions.
# Drop NTEE==NaN. The .copy() makes df_train an independent frame, so the column
# assignments below do not write into a filtered view of the merged frame.
df_train=df_train[(~df_train['NTEE1'].isna())].copy()
# Change to upper case.
df_train['mission']=df_train['mission'].map(str.upper)
df_train['prgrm_dsc']=df_train['prgrm_dsc'].map(str.upper)
df_train['NTEE1']=df_train['NTEE1'].map(str.upper)
len(df_train)

234027

In [11]:
# Spell check function. Return corrected word if unknown; return original word if known.
def spellcheck(doc):
    """Tokenize ``doc`` and return its tokens spell-corrected and upper-cased.

    Args:
        doc (str): Text to tokenize with ``nltk.word_tokenize``.

    Returns:
        list[str]: One upper-cased entry per token: the checker's correction, or
        the token itself when the checker returns no correction.
    """
    # One checker per document; constructing it per token repeats its set-up cost.
    checker=SpellChecker()
    corrected=[]
    for token in nltk.word_tokenize(doc):
        suggestion=checker.correction(word=token)
        # Fall back to the token when no correction is returned (None or '').
        corrected.append((suggestion or token).upper())
    return corrected

# Worker pool for the spell-check cells; mapping over a pool is used instead of
# df.apply because it is much faster. The pool is left open for reuse by later cells.
p=Pool(processes=48)

In [12]:
# Correct mission: one token list per row, computed by the worker pool.
df_train['mission_spellchk']=p.map(func=spellcheck, iterable=df_train['mission'])

In [17]:
# Correct prgrm dsc: one token list per row, computed by the worker pool.
df_train['prgrm_dsc_spellchk']=p.map(func=spellcheck, iterable=df_train['prgrm_dsc'])

**Multi-processing vs. `DataFrame.apply`: elapsed time in minutes for the first 20 mission statements.**
```Python
>>> from time import time
>>> p=Pool(48)
>>> t1=time()
>>> t=p.map(spellcheck, df_train['mission'][0:20])
>>> print((time()-t1)/60)
0.20710660219192506

>>> from time import time
>>> t1=time()
>>> t=df_train['mission'][0:20].apply(spellcheck)
>>> print((time()-t1)/60)
0.9466491937637329
```

In [23]:
# Write df_train as roughly num_file gzip-pickled chunks.
df_to_write=df_train
num_file=10
file_path_name='../../dataset/df_train.pkl.gz/df_train.pkl.gz'
# Chunk by row position rather than by index label: label ranges can be empty
# (which breaks iloc[0] below) and can spill into an extra file. At least one row per
# chunk keeps the range step valid; an empty frame simply writes nothing.
rows_per_file=max(1, math.ceil(len(df_to_write)/num_file))
for index in range(0, len(df_to_write), rows_per_file):
    df_temp=df_to_write.iloc[index:index+rows_per_file]
    # File name suffix is '-<first index label>-<last index label>' of the chunk.
    df_temp.to_pickle(file_path_name+'-'+str(df_temp.iloc[0].name)+'-'+str(df_temp.iloc[-1].name), compression='gzip')

In [144]:
# Test reading file: re-load every chunk written above and count the rows.
import os
# Sorted listing gives a reproducible row order; chunks are concatenated once.
file_list=sorted(os.listdir('../../dataset/df_train.pkl.gz/'))
df_test_chunks=[pd.read_pickle('../../dataset/df_train.pkl.gz/'+file, compression='gzip')
                for file in file_list]
# An empty directory still yields an empty frame rather than a concat error.
df_test=pd.concat(df_test_chunks) if df_test_chunks else pd.DataFrame()
len(df_test)

229472

In [25]:
# Test reading file: re-load every chunk in the output directory and count the rows.
import os
# Sorted listing gives a reproducible row order; chunks are concatenated once.
file_list=sorted(os.listdir('../../dataset/df_train.pkl.gz/'))
df_test_chunks=[pd.read_pickle('../../dataset/df_train.pkl.gz/'+file, compression='gzip')
                for file in file_list]
# An empty directory still yields an empty frame rather than a concat error.
df_test=pd.concat(df_test_chunks) if df_test_chunks else pd.DataFrame()
len(df_test)

234027

### Create Universal Train and Test datasets.

In [2]:
# Re-load the saved training chunks. The listing is sorted so the row order (and
# everything derived from it, such as keep-first de-duplication) is reproducible.
file_list=sorted(os.listdir('../../dataset/intermediary/df_train.pkl.gz/'))
df_train_chunks=[pd.read_pickle('../../dataset/intermediary/df_train.pkl.gz/'+file, compression='gzip')
                 for file in file_list]
# Concatenate once; an empty directory yields an empty frame.
df_train=pd.concat(df_train_chunks) if df_train_chunks else pd.DataFrame()
len(df_train)

234027

In [3]:
# Remove records with the same EIN#. UCF-Train and UCF-Test will not overlap.
df_ntee_universal=df_train.loc[df_train['EIN'].drop_duplicates().index]
# Join lists to strings.
df_ntee_universal['mission_spellchk']=[' '.join(s) for s in df_ntee_universal['mission_spellchk']]
df_ntee_universal['prgrm_dsc_spellchk']=[' '.join(s) for s in df_ntee_universal['prgrm_dsc_spellchk']]

In [4]:
# Check dataset size.
df_ntee_universal_train, df_ntee_universal_test = train_test_split(df_ntee_universal, test_size=0.2, random_state=520)
len(df_ntee_universal_train), len(df_ntee_universal_test)

(154424, 38607)

In [5]:
# Save universal test.
universal_test_path='../../dataset/df_ntee_universal/test/df_ntee_universal_test.pkl.gz'
df_ntee_universal_test.to_pickle(universal_test_path, compression='gzip')

In [6]:
# Save universal train.
# Write the training split as num_file positional chunks named '<path>_<k>of<num_file-1>'.
df_to_write=df_ntee_universal_train
num_file=5
file_path_name='../../dataset/df_ntee_universal/train/df_ntee_universal_train.pkl.gz'
rows_per_file=math.ceil(len(df_to_write)/num_file)
file_count=0
for index in range(0, len(df_to_write), rows_per_file):
    chunk_path='{}_{}of{}'.format(file_path_name, file_count, num_file-1)
    df_temp=df_to_write.iloc[index:index+rows_per_file]
    df_temp.to_pickle(chunk_path, compression='gzip')
    file_count+=1

In [9]:
# Check saved files.
# Check saved files: re-load the training chunks and the test file, compare row counts.
# Sorted listing gives a reproducible row order; the chunks are concatenated once.
file_list=sorted(os.listdir('../../dataset/df_ntee_universal/train/'))
train_chunks=[pd.read_pickle('../../dataset/df_ntee_universal/train/'+file, compression='gzip')
              for file in file_list]
# An empty directory still yields an empty frame rather than a concat error.
t1=pd.concat(train_chunks) if train_chunks else pd.DataFrame()
t2=pd.read_pickle('../../dataset/df_ntee_universal/test/df_ntee_universal_test.pkl.gz', compression='gzip')
len(t1), len(t2)

(154424, 38607)

#### Describe universal datasets.

In [15]:
# Per-class record counts in the test split, followed by each class's share of the split.
test_counts_by_ntee=df_ntee_universal_test.groupby('NTEE1')['EIN'].count()
print(test_counts_by_ntee, '\n'*2, test_counts_by_ntee/len(df_ntee_universal_test))

NTEE1
A    4291
B    6419
C     827
D    1034
E    2307
F     543
G    1353
H     126
I     740
J    1132
K     522
L    1537
M    1140
N    3925
O     409
P    2318
Q     436
R     257
S    3603
T     541
U     225
V      85
W    2038
X    1098
Y    1701
Name: EIN, dtype: int64 

 NTEE1
A    0.111146
B    0.166265
C    0.021421
D    0.026783
E    0.059756
F    0.014065
G    0.035045
H    0.003264
I    0.019168
J    0.029321
K    0.013521
L    0.039811
M    0.029528
N    0.101666
O    0.010594
P    0.060041
Q    0.011293
R    0.006657
S    0.093325
T    0.014013
U    0.005828
V    0.002202
W    0.052788
X    0.028440
Y    0.044059
Name: EIN, dtype: float64


In [16]:
# Per-class record counts in the training split, followed by each class's share of the split.
train_counts_by_ntee=df_ntee_universal_train.groupby('NTEE1')['EIN'].count()
print(train_counts_by_ntee, '\n'*2, train_counts_by_ntee/len(df_ntee_universal_train))

NTEE1
A    17010
B    25827
C     3323
D     4239
E     9015
F     2301
G     5053
H      467
I     2947
J     4772
K     2009
L     5942
M     4693
N    15460
O     1731
P     9180
Q     1987
R     1064
S    14459
T     2032
U     1000
V      350
W     8357
X     4566
Y     6640
Name: EIN, dtype: int64 

 NTEE1
A    0.110151
B    0.167247
C    0.021519
D    0.027450
E    0.058378
F    0.014901
G    0.032722
H    0.003024
I    0.019084
J    0.030902
K    0.013010
L    0.038478
M    0.030390
N    0.100114
O    0.011209
P    0.059447
Q    0.012867
R    0.006890
S    0.093632
T    0.013159
U    0.006476
V    0.002266
W    0.054117
X    0.029568
Y    0.042998
Name: EIN, dtype: float64


## `nteeConf` random sample

In [2]:
# Load every gzip-pickled chunk of the EIN text table and stack them into one frame.
# The listing is sorted so the row order is reproducible (os.listdir order is arbitrary),
# and the chunks are concatenated once instead of re-copying a growing frame per file.
# NOTE(review): read_pickle can execute code on load; only read trusted files.
file_list_txt=sorted(os.listdir('../../dataset/intermediary/EIN_TXT_2014_18.pkl.gz/'))
df_txt_chunks=[pd.read_pickle('../../dataset/intermediary/EIN_TXT_2014_18.pkl.gz/'+file, compression='gzip')
               for file in file_list_txt]
# An empty directory still yields an empty frame rather than a concat error.
df_txt=pd.concat(df_txt_chunks) if df_txt_chunks else pd.DataFrame()
df_txt.sample(3)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before
1195311,93493220000000.0,43091048,EFILE,,,,,FIRST CONCERN EXISTS TO EMPOWER INDIVIDUALS TO...,A PREGNANCY RESOURCE CENTER PROVIDING FREE PRE...,SEXUAL INTEGRITY: PRESENTED INTERACTIVE CRITIC...,A PREGNANCY RESOURCE CENTER,2.017022e+17,14820759.0,990,10/10/2017 11:47:28 PM,FIRST CONCERN PREGNANCY RESOURCE CENTER,201703.0,2017.0,1.0
289704,93493120000000.0,542143612,EFILE,,,,,THE BOARD OF DIRECTORS FOR HOMES FOR OUR TROOP...,SEE SCHEDULE O,"NEW HOME PROGRAM - UNDER THIS PROGRAM, HOMES F...",TO BUILD SPECIALLY ADAPTED HOMES FOR SEVERELY ...,2.015012e+17,12630637.0,990,7/2/2015,HOMES FOR OUR TROOPS INC,201409.0,2015.0,0.0
847781,93493300000000.0,310960498,EFILE,,,,,,AMVETS is dedicated to supporting veterans and...,Providing support to war veterans.##Supporting...,AMVETS is dedicated to supporting veterans and...,2.01423e+17,12023501.0,990O,12/1/2014 10:12:49 PM,AMERICAN VETERANS OF WORLD WAR II MEMORIAL AMV...,201406.0,2014.0,1.0


In [64]:
# Load the 20% BMF sample and keep the unique (EIN, NTEE1, nteeConf) combinations.
df_ntee=pd.read_pickle('../../dataset/intermediary/df_bmf_14_16_sample20pt.pkl.bz2')
# drop_duplicates() returns a new frame, so nothing is modified in place on a
# column slice of df_ntee.
df_ein_ntee=df_ntee[['EIN', 'NTEE1', 'nteeConf']].drop_duplicates()
df_ein_ntee.sample(3)

Unnamed: 0,EIN,NTEE1,nteeConf
4486371,860201200,X,B
1158881,582643290,C,A
3959847,462677889,S,B


In [65]:
# Attach NTEE1 and nteeConf to each text record by EIN; the inner join keeps only matching EINs.
df_txt_sample20pt=pd.merge(df_txt, df_ein_ntee, how='inner', on='EIN')
df_txt_sample20pt.sample(5)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before,NTEE1,nteeConf
258039,93491220000000.0,264116050,EFILE,,,,SEE STATEMENT 03,,,,,2.014222e+17,11979815.0,990PF,11/14/2014 11:25:24 PM,500 CAPP STREET FOUNDATION,201312.0,2014.0,0.0,A,B
279925,93493310000000.0,260434271,EFILE,,,,,,ORGANIZATION'S MISSION IS TO ENHANCE ACCESS TO...,THE ORGANIZATION HAS BEEN HELPING PATIENTS SIN...,ORGANIZATION'S MISSION IS TO ENHANCE ACCESS TO...,2.014231e+17,12005079.0,990,11/24/2014 1:10:12 PM,METROCARE OF GREATER KANSAS CITY,201312.0,2014.0,0.0,E,B
768454,93492080000000.0,710959300,EFILE,Skateboarding outreach demonstrations are one ...,To be a light in the industry and culture of s...,,,,,,,2.017408e+17,14634962.0,990EZ,8/11/2017 9:42:36 AM,UNTITLED SKATEBOARDS,201612.0,2017.0,0.0,X,B
746876,,310551828,,,,,,,,,,,,,,,,,1.0,B,A
307783,93493080000000.0,636056333,EFILE,,,,,VERSION_NOT_SUPPORTED,VERSION_NOT_SUPPORTED,VERSION_NOT_SUPPORTED,VERSION_NOT_SUPPORTED,2.014308e+17,11293109.0,990,4/7/2014,PTAA JONES VALLEY ELEMENTARY PTA,201307.0,2014.0,1.0,B,A


In [66]:
# df_txt_sample20pt has EIN duplicates, but no need to remove them. Duplicated texts in this dataframe should be removed later.
# Row counts of the merged text frame and of the EIN lookup, for comparison.
(df_txt_sample20pt.shape[0], df_ein_ntee.shape[0])

(906373, 771772)

In [67]:
# 'mission': first non-null value among the three mission/purpose columns, in priority order.
df_txt_sample20pt['mission']=(
    df_txt_sample20pt['IRS990_p1_ActvtyOrMssnDsc']
    .combine_first(df_txt_sample20pt['IRS990_p3_MssnDsc'])
    .combine_first(df_txt_sample20pt['IRS990EZ_p3_PrmryExmptPrpsTxt'])
)
# 'prgrm_dsc': the description columns joined with '##' (missing values become '').
# IRS990ScheduleO_ExplntnTxt appears twice in the sequence; the repeat is kept as-is.
prgrm_dsc_fields=['IRS990_p3_DscS', 'IRS990ScheduleO_ExplntnTxt',
                  'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt', 'IRS990ScheduleO_ExplntnTxt',
                  'IRS990PF_p9a_DscrptnTxt', 'IRS990PF_p16b_RltnshpSttmntTxt']
prgrm_dsc_joined=df_txt_sample20pt[prgrm_dsc_fields[0]].fillna('')
for field in prgrm_dsc_fields[1:]:
    prgrm_dsc_joined=prgrm_dsc_joined+'##'+df_txt_sample20pt[field].fillna('')
df_txt_sample20pt['prgrm_dsc']=prgrm_dsc_joined

In [77]:
def func_clean_str(string):
    """Collapse a '##'-delimited text field into a '; '-joined string of unique segments.

    Args:
        string: Raw field value. Missing values (None, or anything whose ``str()``
            is 'nan', e.g. float NaN) are treated as empty.

    Returns:
        str: The distinct non-empty segments joined by '; ', in order of first
        appearance; '' when the value is missing or has no non-empty segment.
    """
    if string is None or str(string)=='nan':
        return ''
    # dict.fromkeys removes duplicates while keeping first-seen order; a set would
    # emit the segments in hash order, which differs between interpreter runs.
    unique_segments=dict.fromkeys(s for s in string.split('##') if s!='')
    return '; '.join(unique_segments)

# Normalise both text columns (missing -> '', '##' segments de-duplicated).
df_txt_sample20pt['mission']=df_txt_sample20pt['mission'].map(func_clean_str)
df_txt_sample20pt['prgrm_dsc']=df_txt_sample20pt['prgrm_dsc'].map(func_clean_str)

# Keep the first row for each distinct mission text, then for each distinct program description.
# NOTE(review): rows without NTEE1 are removed only after de-duplication, so an unlabelled
# row can displace a labelled row that has the same text - confirm this is intended.
df_train=df_txt_sample20pt[(~df_txt_sample20pt['mission'].duplicated())] # Drop duplicated mission descriptions.
df_train=df_train[(~df_train['prgrm_dsc'].duplicated())] # Drop duplicated program descriptions.
# Drop NTEE==NaN. The .copy() makes df_train an independent frame, so the column
# assignments below do not write into a filtered view of the merged frame.
df_train=df_train[(~df_train['NTEE1'].isna())].copy()
# Change to upper case.
df_train['mission']=df_train['mission'].map(str.upper)
df_train['prgrm_dsc']=df_train['prgrm_dsc'].map(str.upper)
df_train['NTEE1']=df_train['NTEE1'].map(str.upper)
len(df_train)

In [10]:
# Spell check function. Return corrected word if unknown; return original word if known.
def spellcheck(doc):
    """Tokenize ``doc`` and return its tokens spell-corrected and upper-cased.

    Args:
        doc (str): Text to tokenize with ``nltk.word_tokenize``.

    Returns:
        list[str]: One upper-cased entry per token: the checker's correction, or
        the token itself when the checker returns no correction.
    """
    # One checker per document; constructing it per token repeats its set-up cost.
    checker=SpellChecker()
    corrected=[]
    for token in nltk.word_tokenize(doc):
        suggestion=checker.correction(word=token)
        # Fall back to the token when no correction is returned (None or '').
        corrected.append((suggestion or token).upper())
    return corrected

In [32]:
from dask.distributed import Client
import joblib
from joblib import Parallel, delayed
# Connect to the Dask scheduler. The address can be overridden through the
# DASK_SCHEDULER_ADDRESS environment variable so the notebook is not tied to one
# cluster; the previously hard-coded address remains the default.
client = Client(os.environ.get('DASK_SCHEDULER_ADDRESS', "10.140.82.220:8786"))
client

0,1
Client  Scheduler: tcp://10.140.82.220:8786  Dashboard: http://10.140.82.220:8787/status,Cluster  Workers: 192  Cores: 192  Memory: 0 B


In [12]:
# Spell-check every mission statement through joblib's 'dask' backend, one delayed
# spellcheck call per row; the resulting list of token lists becomes a new column.
# %time reports the cost of the whole assignment.
with joblib.parallel_backend('dask'):
    %time df_train['mission_spellchk']=Parallel(n_jobs=-1)(delayed(spellcheck)(doc=doc) for doc in df_train['mission'])

CPU times: user 33min 19s, sys: 2min 34s, total: 35min 53s
Wall time: 3h 48min 54s


In [35]:
# Shrink the size, computer cannot handle because
# program descriptions are much longer than mission statements.
# A fixed seed keeps the 20% subsample (and the test set built from it) reproducible.
df_train_20pt=df_train.sample(round(len(df_train)*.2), random_state=520)

In [36]:
# Sort order by string length. Faster processing.
# prgrm_dsc_len holds the character count of each program description.
df_train_20pt['prgrm_dsc_len']=df_train_20pt['prgrm_dsc'].map(len)
df_train_20pt.sort_values(by='prgrm_dsc_len', inplace=True)

# Spell-check every program description in the subsample through joblib's 'dask'
# backend, one delayed spellcheck call per row; the resulting list of token lists
# becomes a new column. %time reports the cost of the whole assignment.
with joblib.parallel_backend('dask'):
    %time df_train_20pt['prgrm_dsc_spellchk']=Parallel(n_jobs=-1)(delayed(spellcheck)(doc=doc) for doc in df_train_20pt['prgrm_dsc'])

CPU times: user 7min 25s, sys: 28 s, total: 7min 53s
Wall time: 2h 40min 23s


In [55]:
# Flatten the token lists into space-separated strings (kept on df_train_20pt as *_str columns).
df_train_20pt['mission_spellchk_str']=df_train_20pt['mission_spellchk'].map(' '.join)
df_train_20pt['prgrm_dsc_spellchk_str']=df_train_20pt['prgrm_dsc_spellchk'].map(' '.join)
# Frame to save: drop the list/length helper columns and give the string versions
# the plain *_spellchk names.
df_train_20pt_save=(
    df_train_20pt
    .drop(columns=['mission_spellchk', 'prgrm_dsc_len', 'prgrm_dsc_spellchk'])
    .rename(columns={'mission_spellchk_str':'mission_spellchk', 'prgrm_dsc_spellchk_str':'prgrm_dsc_spellchk'})
)
# Exclude records labelled NTEE1 == 'Z'.
df_train_20pt_save=df_train_20pt_save[df_train_20pt_save['NTEE1']!='Z']
df_train_20pt_save.sample(3)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before,NTEE1,mission,prgrm_dsc,mission_spellchk,prgrm_dsc_spellchk
771667,93493110000000.0,330632256,EFILE,,,,,,ASSIST PUBLIC SCHOOLS IN PREPARING STUDENTS FO...,A. PROVIDE TRAINING FOR STUDENTS TO IMPROVE TH...,VITAL LINK'S MISSION IS TO CREATE THE LINK BET...,2.017011e+17,14668274.0,990,8/23/2017 8:27:25 AM,VITAL LINK EDUCATION-BUSINESS CONSORTIUM,201609.0,2017.0,0.0,S,ASSIST PUBLIC SCHOOLS IN PREPARING STUDENTS FO...,A. PROVIDE TRAINING FOR STUDENTS TO IMPROVE TH...,ASSIST PUBLIC SCHOOLS IN PREPARING STUDENTS FO...,A . PROVIDE TRAINING FOR STUDENTS TO IMPROVE T...
468928,93492270000000.0,461019333,EFILE,BRING AWARENESS TO AND EDUCATE THE PUBLIC ABOU...,BRING AWARENESS AND EDUCATE ABOUT PRESCRIPTION...,,,,,,,2.015127e+17,12938341.0,990EZ,10/8/2015,JUSTIN ROWLAND FOUNDATION INC,201412.0,2015.0,0.0,F,BRING AWARENESS AND EDUCATE ABOUT PRESCRIPTION...,BRING AWARENESS TO AND EDUCATE THE PUBLIC ABOU...,BRING AWARENESS AND EDUCATE ABOUT PRESCRIPTION...,BRING AWARENESS TO AND EDUCATE THE PUBLIC ABOU...
709101,93493270000000.0,221942376,EFILE,,,,,,PROVISION OF DAY CARE SERVICES FOR PRE-SCHOOL ...,PROVISION OF DAY CARE SERVICES TO OVER 250 CHI...,PROVISION OF DAY CARE SERVICES TO LOW INCOME F...,2.014227e+17,11871139.0,990,10/15/2014 5:03:08 PM,MI CASITA DAY CARE INC,201312.0,2014.0,1.0,P,PROVISION OF DAY CARE SERVICES FOR PRE-SCHOOL ...,PROVISION OF DAY CARE SERVICES TO OVER 250 CHI...,PROVISION OF DAY CARE SERVICES FOR PRE-SCHOOL ...,PROVISION OF DAY CARE SERVICES TO OVER 250 CHI...


In [56]:
# Number of rows that will be written to the random-sample test set.
df_train_20pt_save.shape[0]

43676

In [None]:
# Persist the random-sample test set as a bz2-compressed pickle.
ucf_test_random_path='../../dataset/UCF/test/df_ucf_test_nteeConf_random.pkl.bz2'
df_train_20pt_save.to_pickle(ucf_test_random_path, compression='bz2')

## `nteeConf==B`

In [2]:
# Load every gzip-pickled chunk of the EIN text table and stack them into one frame.
# The listing is sorted so the row order is reproducible (os.listdir order is arbitrary),
# and the chunks are concatenated once instead of re-copying a growing frame per file.
# NOTE(review): read_pickle can execute code on load; only read trusted files.
file_list_txt=sorted(os.listdir('../../dataset/intermediary/EIN_TXT_2014_18.pkl.gz/'))
df_txt_chunks=[pd.read_pickle('../../dataset/intermediary/EIN_TXT_2014_18.pkl.gz/'+file, compression='gzip')
               for file in file_list_txt]
# An empty directory still yields an empty frame rather than a concat error.
df_txt=pd.concat(df_txt_chunks) if df_txt_chunks else pd.DataFrame()
df_txt.sample(3)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before
2054,93493310000000.0,264536470,EFILE,,,,,,THE OSAC IS AN ALLIANCE OF ORGANIZATIONS DEDIC...,OSAC HAS TWO PROGRAMS IN SUPPORT OF ITS MISSIO...,THE OCEAN STATE ANIMAL COALITION (OSAC) IS A N...,2.014331e+17,12053790.0,990,12/8/2014 5:09:34 PM,OCEAN STATE ANIMAL COALITION,201312.0,2014.0,0.0
675931,93492210000000.0,30555281,EFILE,"TO FUND EDUCATION TO PREVENT CHILD OBESITY, CH...",TO FUND HEALTH RESEARCH AND EDUCATION TO ELIMI...,,,,,,,2.015021e+17,12765164.0,990EZ,8/10/2015,PHAT FREE NATION INC DBA OPTIMUM HEALTH SOLUTION,201412.0,2015.0,0.0
443119,93493230000000.0,251580487,EFILE,,,,,,TO PROVIDE NEEDED EMERGENCY SERVICES,TO PROVIDE NEEDED EMERGENCY SERVICES ON A REGI...,TO PROVIDE NEEDED EMERGENCY SERVICES,2.014023e+17,11778260.0,990,9/18/2014 6:04:46 PM,CLAIRTON VOLUNTEER FIRE DEPARTMENT,201312.0,2014.0,0.0


In [3]:
# Load the nteeConf==B 20% BMF sample and keep the unique (EIN, NTEE1, nteeConf) combinations.
df_ntee=pd.read_pickle('../../dataset/intermediary/df_bmf_14_16_nteeConfB_sample20pt.pkl.bz2')
# drop_duplicates() returns a new frame, so nothing is modified in place on a
# column slice of df_ntee.
df_ein_ntee=df_ntee[['EIN', 'NTEE1', 'nteeConf']].drop_duplicates()
df_ein_ntee.sample(3)

Unnamed: 0,EIN,NTEE1,nteeConf
573619,341458797,M,B
1268117,720937048,A,B
3087565,10697474,S,B


In [5]:
# Attach NTEE1 and nteeConf to each text record by EIN; the inner join keeps only matching EINs.
df_txt_nteeConfB=pd.merge(df_txt, df_ein_ntee, how='inner', on='EIN')
# 'mission': first non-null value among the three mission/purpose columns, in priority order.
df_txt_nteeConfB['mission']=(
    df_txt_nteeConfB['IRS990_p1_ActvtyOrMssnDsc']
    .combine_first(df_txt_nteeConfB['IRS990_p3_MssnDsc'])
    .combine_first(df_txt_nteeConfB['IRS990EZ_p3_PrmryExmptPrpsTxt'])
)
# 'prgrm_dsc': the description columns joined with '##' (missing values become '').
# IRS990ScheduleO_ExplntnTxt appears twice in the sequence; the repeat is kept as-is.
prgrm_dsc_fields=['IRS990_p3_DscS', 'IRS990ScheduleO_ExplntnTxt',
                  'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt', 'IRS990ScheduleO_ExplntnTxt',
                  'IRS990PF_p9a_DscrptnTxt', 'IRS990PF_p16b_RltnshpSttmntTxt']
prgrm_dsc_joined=df_txt_nteeConfB[prgrm_dsc_fields[0]].fillna('')
for field in prgrm_dsc_fields[1:]:
    prgrm_dsc_joined=prgrm_dsc_joined+'##'+df_txt_nteeConfB[field].fillna('')
df_txt_nteeConfB['prgrm_dsc']=prgrm_dsc_joined

def func_clean_str(string):
    """Collapse a '##'-delimited text field into a '; '-joined string of unique segments.

    Args:
        string: Raw field value. Missing values (None, or anything whose ``str()``
            is 'nan', e.g. float NaN) are treated as empty.

    Returns:
        str: The distinct non-empty segments joined by '; ', in order of first
        appearance; '' when the value is missing or has no non-empty segment.
    """
    if string is None or str(string)=='nan':
        return ''
    # dict.fromkeys removes duplicates while keeping first-seen order; a set would
    # emit the segments in hash order, which differs between interpreter runs.
    unique_segments=dict.fromkeys(s for s in string.split('##') if s!='')
    return '; '.join(unique_segments)

# Normalise both text columns (missing -> '', '##' segments de-duplicated).
df_txt_nteeConfB['mission']=df_txt_nteeConfB['mission'].map(func_clean_str)
df_txt_nteeConfB['prgrm_dsc']=df_txt_nteeConfB['prgrm_dsc'].map(func_clean_str)

# Keep the first row for each distinct mission text, then for each distinct program description.
# NOTE(review): rows without NTEE1 are removed only after de-duplication, so an unlabelled
# row can displace a labelled row that has the same text - confirm this is intended.
df_train=df_txt_nteeConfB[(~df_txt_nteeConfB['mission'].duplicated())] # Drop duplicated mission descriptions.
df_train=df_train[(~df_train['prgrm_dsc'].duplicated())] # Drop duplicated program descriptions.
# Drop NTEE==NaN. The .copy() makes df_train an independent frame, so the column
# assignments below do not write into a filtered view of the merged frame.
df_train=df_train[(~df_train['NTEE1'].isna())].copy()
# Change to upper case.
df_train['mission']=df_train['mission'].map(str.upper)
df_train['prgrm_dsc']=df_train['prgrm_dsc'].map(str.upper)
df_train['NTEE1']=df_train['NTEE1'].map(str.upper)
len(df_train)

99992

In [12]:
# Spell check function. Return corrected word if unknown; return original word if known.
def spellcheck(doc):
    """Tokenize ``doc`` and return its tokens spell-corrected and upper-cased.

    Args:
        doc (str): Text to tokenize with ``nltk.word_tokenize``.

    Returns:
        list[str]: One upper-cased entry per token: the checker's correction, or
        the token itself when the checker returns no correction.
    """
    # One checker per document; constructing it per token repeats its set-up cost.
    checker=SpellChecker()
    corrected=[]
    for token in nltk.word_tokenize(doc):
        suggestion=checker.correction(word=token)
        # Fall back to the token when no correction is returned (None or '').
        corrected.append((suggestion or token).upper())
    return corrected

In [21]:
from dask.distributed import Client
import joblib
from joblib import Parallel, delayed
# Connect to the Dask scheduler. The address can be overridden through the
# DASK_SCHEDULER_ADDRESS environment variable so the notebook is not tied to one
# cluster; the previously hard-coded address remains the default.
client = Client(os.environ.get('DASK_SCHEDULER_ADDRESS', "10.140.82.220:8786"))
client

0,1
Client  Scheduler: tcp://10.140.82.220:8786  Dashboard: http://10.140.82.220:8787/status,Cluster  Workers: 192  Cores: 192  Memory: 0 B


In [14]:
# Shrink the size, computer cannot handle because
# program descriptions are much longer than mission statements.
# A fixed seed keeps the subsample reproducible; the size is capped at the number of
# available rows so sampling cannot fail on a frame with fewer than 5000 rows.
df_train_5k=df_train.sample(min(5000, len(df_train)), random_state=520)

# Sort order by string length. Faster processing.
df_train_5k['prgrm_dsc_len']=df_train_5k.prgrm_dsc.apply(len)
df_train_5k.sort_values('prgrm_dsc_len', inplace=True)

In [22]:
# Spell-check every mission statement in the 5k subsample through joblib's 'dask'
# backend, one delayed spellcheck call per row; the resulting list of token lists
# becomes a new column. %time reports the cost of the whole assignment.
with joblib.parallel_backend('dask'):
    %time df_train_5k['mission_spellchk']=Parallel(n_jobs=-1)(delayed(spellcheck)(doc=doc) for doc in df_train_5k['mission'])

CPU times: user 49.2 s, sys: 3.97 s, total: 53.1 s
Wall time: 5min 26s


In [17]:
# Spell-check every program description in the 5k subsample through joblib's 'dask'
# backend, one delayed spellcheck call per row; the resulting list of token lists
# becomes a new column. %time reports the cost of the whole assignment.
with joblib.parallel_backend('dask'):
    %time df_train_5k['prgrm_dsc_spellchk']=Parallel(n_jobs=-1)(delayed(spellcheck)(doc=doc) for doc in df_train_5k['prgrm_dsc'])

CPU times: user 51.3 s, sys: 3.33 s, total: 54.6 s
Wall time: 19min 21s


In [23]:
# Flatten the token lists into space-separated strings (kept on df_train_5k as *_str columns).
df_train_5k['mission_spellchk_str']=df_train_5k['mission_spellchk'].map(' '.join)
df_train_5k['prgrm_dsc_spellchk_str']=df_train_5k['prgrm_dsc_spellchk'].map(' '.join)
# Frame to save: drop the list/length helper columns and give the string versions
# the plain *_spellchk names.
df_train_5k_save=(
    df_train_5k
    .drop(columns=['mission_spellchk', 'prgrm_dsc_len', 'prgrm_dsc_spellchk'])
    .rename(columns={'mission_spellchk_str':'mission_spellchk', 'prgrm_dsc_spellchk_str':'prgrm_dsc_spellchk'})
)
# Exclude records labelled NTEE1 == 'Z'.
df_train_5k_save=df_train_5k_save[df_train_5k_save['NTEE1']!='Z']
df_train_5k_save.sample(3)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before,NTEE1,nteeConf,mission,prgrm_dsc,mission_spellchk,prgrm_dsc_spellchk
247768,93493130000000.0,954823489,EFILE,,,,,,TO SERVE AS A MODEL FOR EDUCATION IN AN ETHNIC...,NEW ROAD SCHOOLS STRIVES TO PROVIDE A PROGRESS...,NEW ROADS SCHOOL IS AN INDEPENDENT SCHOOL IN L...,2.015413e+17,12707797.0,990,7/23/2015,NEW ROADS SCHOOL,201406.0,2015.0,0.0,B,B,TO SERVE AS A MODEL FOR EDUCATION IN AN ETHNIC...,NEW ROAD SCHOOLS STRIVES TO PROVIDE A PROGRESS...,TO SERVE AS A MODEL FOR EDUCATION IN AN ETHNIC...,NEW ROAD SCHOOLS STRIVES TO PROVIDE A PROGRESS...
120670,93492240000000.0,134204741,EFILE,"CAMPBELL HALL IN NO HOLLYWOOD, CA##EPISCOPAL D...",TO PROMOTE AND APPLY THE PRINCIPLES OF NONVIOL...,,,,,,,2.014224e+17,11793237.0,990EZ,9/23/2014 10:06:34 PM,HANDS IN HEALING,201406.0,2014.0,0.0,P,B,TO PROMOTE AND APPLY THE PRINCIPLES OF NONVIOL...,BISHOP OF THE PROTESTANT EPISCOPAL CHURCH IN L...,TO PROMOTE AND APPLY THE PRINCIPLES OF NON-VIO...,BISHOP OF THE PROTESTANT EPISCOPAL CHURCH IN L...
310159,93493350000000.0,61328903,EFILE,,,,,,"PROJECT GENESIS, INC. IS DEDICATED TO PROVIDIN...","FUNDED BY THE DEPARTMENT OF Social SERVICES, B...","PROJECT GENESIS, INC. IS DEDICATED TO PROVIDIN...",2.016135e+17,14302659.0,990,4/19/2017 4:40:18 PM,PROJECT GENESIS,201606.0,2017.0,1.0,J,B,"PROJECT GENESIS, INC. IS DEDICATED TO PROVIDIN...","FUNDED BY THE DEPARTMENT OF SOCIAL SERVICES, B...","PROJECT GENESIS , INC. IS DEDICATED TO PROVIDI...","FUNDED BY THE DEPARTMENT OF SOCIAL SERVICES , ..."


In [24]:
# Persist the nteeConf==B test set as a bz2-compressed pickle.
ucf_test_confB_path='../../dataset/UCF/test/df_ucf_test_nteeConf_B.pkl.bz2'
df_train_5k_save.to_pickle(ucf_test_confB_path, compression='bz2')

## Count text field length by confidence level

In [100]:
# Row count of the merged sample frame used for the length statistics below.
df_txt_sample20pt.shape[0]

355626

In [96]:
# List the columns currently present on df_txt_sample20pt.
df_txt_sample20pt.columns

Index(['DLN', 'EIN', 'FILING_TYPE', 'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt',
       'IRS990EZ_p3_PrmryExmptPrpsTxt', 'IRS990PF_p16b_RltnshpSttmntTxt',
       'IRS990PF_p9a_DscrptnTxt', 'IRS990ScheduleO_ExplntnTxt',
       'IRS990_p1_ActvtyOrMssnDsc', 'IRS990_p3_DscS', 'IRS990_p3_MssnDsc',
       'OBJECT_ID', 'RETURN_ID', 'RETURN_TYPE', 'SUB_DATE', 'TAXPAYER_NAME',
       'TAX_PERIOD', 'YEAR', '95_and_before', 'NTEE1', 'nteeConf', 'mission',
       'prgrm_dsc'],
      dtype='object')

In [None]:
# Draw 10,000 rows at random (unseeded) and overwrite df_txt_sample20pt with the
# subsample, so re-running this cell samples from the already reduced frame.
# NOTE(review): nlp_en is called once on the whole list of texts and its result is
# assigned directly to a column - confirm it yields one entry per row.
df_txt_sample20pt=df_txt_sample20pt.sample(10000)
df_txt_sample20pt['mission_toks']=nlp_en(df_txt_sample20pt['mission'].values.tolist())
df_txt_sample20pt['prgrm_dsc_toks']=nlp_en(df_txt_sample20pt['prgrm_dsc'].values.tolist())

In [None]:
# Length columns: character count of the string form of each *_toks entry.
df_txt_sample20pt['mission_len']=[len(str(s)) for s in df_txt_sample20pt.mission_toks]
df_txt_sample20pt['prgrm_dsc_len']=[len(str(s)) for s in df_txt_sample20pt.prgrm_dsc_toks]
# Normalise the confidence code 'a' to 'A'. The result is assigned back to the column;
# an in-place replace on the attribute-accessed Series is a chained write that may
# not reach the frame.
df_txt_sample20pt['nteeConf']=df_txt_sample20pt['nteeConf'].replace('a', 'A')

In [None]:
# Distinct confidence codes present after normalisation.
df_txt_sample20pt['nteeConf'].unique()

In [81]:
# Mean mission length per confidence level. The column is selected before
# aggregating so the non-numeric text columns are never passed to mean().
df_txt_sample20pt.groupby('nteeConf')['mission_len'].mean()

nteeConf
A     88.228373
B    108.188221
C     46.507251
Name: mission_len, dtype: float64

In [83]:
# Standard deviation of mission length per confidence level. The column is selected
# before aggregating so the non-numeric text columns are never passed to std().
df_txt_sample20pt.groupby('nteeConf')['mission_len'].std()

nteeConf
A    132.048508
B    146.078629
C    103.678769
Name: mission_len, dtype: float64

In [82]:
# Mean program-description length per confidence level. The column is selected
# before aggregating so the non-numeric text columns are never passed to mean().
df_txt_sample20pt.groupby('nteeConf')['prgrm_dsc_len'].mean()

nteeConf
A    324.274401
B    351.144025
C    127.979055
Name: prgrm_dsc_len, dtype: float64

In [84]:
# Standard deviation of program-description length per confidence level. The column
# is selected before aggregating so the non-numeric text columns are never passed to std().
df_txt_sample20pt.groupby('nteeConf')['prgrm_dsc_len'].std()

nteeConf
A    1711.473759
B    1329.498709
C     688.376164
Name: prgrm_dsc_len, dtype: float64

In [91]:
# Display df_train_20pt (the subsample sorted by prgrm_dsc_len, with the spell-check columns).
df_train_20pt

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before,NTEE1,mission,prgrm_dsc,mission_spellchk,prgrm_dsc_len,prgrm_dsc_spellchk,mission_spellchk_str,prgrm_dsc_spellchk_str
280792,9.349331e+13,453774366,EFILE,,,,,,THE ORGANIZATION SERVES AS A FOCUS FOR THE EDU...,T,THE ORGANIZATION PROVIDES RHEUMATOLOGY MANAGER...,2.014031e+17,12005404.0,990O,11/24/2014 1:49:29 PM,NATIONAL ORGANIZATION OF RHEUMATOLOGY MANAGERS,201312.0,2014.0,0.0,S,THE ORGANIZATION SERVES AS A FOCUS FOR THE EDU...,T,"[THE, ORGANIZATION, SERVES, AS, A, FOCUS, FOR,...",1,[T],THE ORGANIZATION SERVES AS A FOCUS FOR THE EDU...,T
176112,9.349316e+13,956220540,EFILE,,,,,,TEACHERS COLLECTIVE BARGAINING,1,TEACHERS COLLECTIVE BARGAINING,2.014116e+17,11907394.0,990O,10/23/2014 10:44:49 PM,AMERICAN FEDERATION OF TEACHERS OCEANSIDE,201312.0,2014.0,1.0,B,TEACHERS COLLECTIVE BARGAINING,1,"[TEACHERS, COLLECTIVE, BARGAINING]",1,[1],TEACHERS COLLECTIVE BARGAINING,1
780142,9.349329e+13,521918702,EFILE,,,,,,"PROVIDE HOUSING, JOB TRAINING AND COUNSELING T...",PR,"PROVIDE HOUSING, JOB TRAINING AND COUNSELING T...",2.017229e+17,14983358.0,990,11/28/2017 8:48:17 PM,ACCESS HOUSING INC DC,201611.0,2017.0,0.0,L,"PROVIDE HOUSING, JOB TRAINING AND COUNSELING T...",PR,"[PROVIDE, HOUSING, ,, JOB, TRAINING, AND, COUN...",2,[PR],"PROVIDE HOUSING , JOB TRAINING AND COUNSELLING...",PR
773733,9.349314e+13,550761069,EFILE,,,,,,Operation of free after school educational pro...,fgfd,Operation of free after school educational pro...,2.017314e+17,14745022.0,990,9/18/2017 6:22:53 AM,BOB BURDETTE CENTER INC,201606.0,2017.0,0.0,P,OPERATION OF FREE AFTER SCHOOL EDUCATIONAL PRO...,FGFD,"[OPERATION, OF, FREE, AFTER, SCHOOL, EDUCATION...",4,[FGD],OPERATION OF FREE AFTER SCHOOL EDUCATIONAL PRO...,FGD
661795,9.349213e+13,743142902,EFILE,LOTS,PROVIDE THE FOLSOM POLICE OFFICERS WITH LEGAL ...,,,,,,,2.014013e+17,11808154.0,990EO,9/26/2014 10:59:10 AM,FOLSOM POLICE OFFICERS ASSOCIATION,201312.0,2014.0,0.0,I,PROVIDE THE FOLSOM POLICE OFFICERS WITH LEGAL ...,LOTS,"[PROVIDE, THE, FOLLOW, POLICE, OFFICERS, WITH,...",4,[LOTS],PROVIDE THE FOLLOW POLICE OFFICERS WITH LEGAL ...,LOTS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297885,9.349332e+13,221494442,EFILE,,,,,BACKGROUND ========== RARITAN BAY MEDICAL CENT...,"RBMC IS COMMITTED TO PROVIDING PROFESSIONAL, C...",EXPENSES INCURRED IN PROVIDING VARIOUS MEDICAL...,"RARITAN BAY MEDICAL CENTER, A NON-PROFIT HEALT...",2.014032e+17,12116066.0,990,12/31/2014,RARITAN BAY MEDICAL CENTER,201312.0,2014.0,1.0,E,"RBMC IS COMMITTED TO PROVIDING PROFESSIONAL, C...",BREAST CANCER SUPPORT GROUPS -----------------...,"[BMC, IS, COMMITTED, TO, PROVIDING, PROFESSION...",39026,"[BREAST, CANCER, SUPPORT, GROUPS, I-, I-, I-, ...","BMC IS COMMITTED TO PROVIDING PROFESSIONAL , C...",BREAST CANCER SUPPORT GROUPS I- I- I- I- I- I-...
58888,9.349332e+13,541217184,EFILE,,,,,PROGRAM SERVICE ACCOMPLISHMENTS: SENTARA HEALT...,AS PART OF SENTARA HEALTHCARE'S INTEGRATED HEA...,SENTARA MEDICAL GROUP (SMG) IS A 448 PROVIDER ...,SEE SCHEDULE O.AS PART OF SENTARA HEALTHCARE'S...,2.014232e+17,12056032.0,990,12/9/2014 7:36:49 AM,SENTARA MEDICAL GROUP,201312.0,2014.0,1.0,E,AS PART OF SENTARA HEALTHCARE'S INTEGRATED HEA...,"IN AUGUST 2012, SENTARA AND MDLIVE ANNOUNCED A...","[AS, PART, OF, SINATRA, HEALTHCARE, IS, INTEGR...",42634,"[IN, AUGUST, 2012, ,, SINATRA, AND, LIVE, ANNO...",AS PART OF SINATRA HEALTHCARE IS INTEGRATED HE...,"IN AUGUST 2012 , SINATRA AND LIVE ANNOUNCED AN..."
588403,9.349332e+13,250969492,EFILE,,,,,AGH McCandless gives patients in the North Hil...,"See Form 990, Page 2, Part III","West Penn Allegheny Health System, Inc. is a t...","WEST PENN ALLEGHENY HEALTH SYSTEM, INC. IS A T...",2.015032e+17,13128350.0,990,01/21/2016,WEST PENN ALLEGHENY HEALTH SYSTEM INC,201412.0,2016.0,1.0,E,"SEE FORM 990, PAGE 2, PART III",ADAGIO HEALTH DIETETIC INTERNSHIP CLINICAL ROT...,"[SEE, FORM, 990, ,, PAGE, 2, ,, PART, III]",51957,"[ADAGIO, HEALTH, DIETETIC, INTERNSHIP, CLINICA...","SEE FORM 990 , PAGE 2 , PART III",ADAGIO HEALTH DIETETIC INTERNSHIP CLINICAL ROT...
297647,9.349332e+13,223452412,EFILE,,,,,BACKGROUND ========== MONMOUTH MEDICAL CENTER ...,TO MEET THE HEALTHCARE NEEDS OF OUR COMMUNITY ...,EXPENSES INCURRED IN PROVIDING MEDICALLY NECES...,TO MEET THE HEALTHCARE NEEDS OF OUR COMMUNITY ...,2.017332e+17,15059363.0,990,12/28/2017 8:52:13 AM,MONMOUTH MEDICAL CENTER,201612.0,2017.0,0.0,E,TO MEET THE HEALTHCARE NEEDS OF OUR COMMUNITY ...,EXPENSES INCURRED IN PROVIDING MEDICALLY NECES...,"[TO, MEET, THE, HEALTHCARE, NEEDS, OF, OUR, CO...",56968,"[EXPENSES, INCURRED, IN, PROVIDING, MEDICALLY,...",TO MEET THE HEALTHCARE NEEDS OF OUR COMMUNITY ...,EXPENSES INCURRED IN PROVIDING MEDICALLY NECES...
