### Script Purpose
- Concatenate text and NTEE codes.
- Prepare the training dataset (`df_train.pkl.gz`) and the universal train/test splits.

In [21]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import os
import math
from spellchecker import SpellChecker
import nltk
nltk.download('punkt')
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## `nteeConf==A` dataset

In [2]:
# Load every gzip-pickled chunk found in the EIN_TXT_2014_18.pkl.gz/ directory
# and stack them into one frame. The chunks are collected in a list and
# concatenated once: calling pd.concat inside the loop re-copies all rows
# loaded so far on every iteration.
txt_dir='../../dataset/EIN_TXT_2014_18.pkl.gz/'
file_list_txt=os.listdir(txt_dir)
txt_frames=[pd.read_pickle(txt_dir+file, compression='gzip') for file in file_list_txt]
# pd.concat raises on an empty list; fall back to an empty frame, as the loop version did.
df_txt=pd.concat(txt_frames) if txt_frames else pd.DataFrame()
df_txt.sample(3)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before
1862482,,341277321,,,,,,,,,,,,,,,,,1.0
1180547,93493270000000.0,350413700,EFILE,,,,,,"TO PROVIDE A PROFESSIONAL, SUPPORTIVE ENVIRONM...",PROMOTE IMPORTANCE AND VALUE OF UTILIZING A RE...,THE METROPOLITAN INDIANAPOLIS BOARD OF REALTOR...,2.014327e+17,11861855.0,990O,10/14/2014 12:05:50 PM,METROPOLITAN INDIANAPOLIS BOARD OF REALTORS INC,201312.0,2014.0,1.0
670087,93493130000000.0,364472394,EFILE,,,,,N/A##N/A##N/A##OUTREACH/OUTPATIENT COMMUNITY S...,TO PROVIDE SAFE LIVING CONDITIONS FOR PERSONS ...,,TO PROVIDE SAFE LIVING CONDITIONS,2.015213e+17,12757539.0,990,8/6/2015,BRIDGING THE TYS TO JORDAN INC,201412.0,2015.0,0.0


In [5]:
# Load every gzip-pickled chunk of the BMF table and stack them with a single
# pd.concat (concatenating inside the loop re-copies all earlier rows each time).
ntee_dir='../../dataset/df_bmf_14_16_confA_chg_drop.pkl.gz/'
file_list_ntee=os.listdir(ntee_dir)
ntee_frames=[pd.read_pickle(ntee_dir+file, compression='gzip') for file in file_list_ntee]
# pd.concat raises on an empty list; fall back to an empty frame, as the loop version did.
df_ntee=pd.concat(ntee_frames) if ntee_frames else pd.DataFrame()
# Keep one row per distinct (EIN, NTEE1) pair. Chaining drop_duplicates() returns a
# new frame instead of mutating a column slice of df_ntee in place.
df_ein_ntee=df_ntee[['EIN', 'NTEE1']].drop_duplicates()
df_ein_ntee.sample(3)

Unnamed: 0,EIN,NTEE1
1314385,746062406,B
3972230,463431966,M
1432881,850127993,B


In [6]:
# Attach the NTEE1 code to each text record by EIN.
# 2020-05-06: switched to an inner join to save computation resources; not tested after the change.
df_txt_ntee_confA_no_chg=pd.merge(df_txt, df_ein_ntee, how='inner', on='EIN')
df_txt_ntee_confA_no_chg.sample(10)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before,NTEE1
2422094,,770687533,,,,,,,,,,,,,,,,,,J
2304428,,226591093,,,,,,,,,,,,,,,,,1.0,
2514636,,201567591,,,,,,,,,,,,,,,,,,D
295301,93491320000000.0,816290155,EFILE,,,,,,,,,2.017232e+17,15051355.0,990PF,12/27/2017 6:49:49 AM,STAR PEAK FOUNDATION,201612.0,2017.0,0.0,
1179545,93491290000000.0,341504501,EFILE,,,,,,,,,2.015029e+17,13148705.0,990PF,01/27/2016,ROBERT AND PATRICIA SWITZER FOUNDATION,201506.0,2016.0,1.0,
1829942,93493130000000.0,911214158,EFILE,,,,,,Washington Toxics Coalition works to protect p...,The Toxic-Free Legacy campaign works to phase ...,WTC works to protect public health and the env...,2.016013e+17,13765580.0,990,09/02/2016,WASHINGTON TOXICS COALITION,201512.0,2016.0,1.0,C
507025,,752140748,,,,,,,,,,,,,,,,,1.0,P
1911049,93492330000000.0,205200059,EFILE,VERSION_NOT_SUPPORTED,VERSION_NOT_SUPPORTED,,,VERSION_NOT_SUPPORTED,,,,2.013033e+17,11126883.0,990EZ,1/9/2014,SUNDANCE STUDIO GYMNASTICS PARENTS ASSOCIATION,201212.0,2014.0,0.0,B
103110,,582052870,,,,,,,,,,,,,,,,,1.0,
1485289,93493130000000.0,943476912,EFILE,,,,,,SEE SCHEDULE OTHE PURPOSE AND MISSION OF ALLIA...,ALLIANCE HEALTH SERVICES ACADEMY HIGH SCHOOL I...,THE PURPOSE AND MISSION OF ALLIANCE HEALTH SER...,2.015013e+17,12673430.0,990,7/14/2015,ALLIANCE HEALTH SERVICES ACADEMY HIGH SCHOOL,201406.0,2015.0,0.0,B


In [7]:
# Mission text: first non-missing value among the three mission/purpose columns,
# in this priority order.
df_txt_ntee_confA_no_chg['mission']=(
    df_txt_ntee_confA_no_chg['IRS990_p1_ActvtyOrMssnDsc']
    .combine_first(df_txt_ntee_confA_no_chg['IRS990_p3_MssnDsc'])
    .combine_first(df_txt_ntee_confA_no_chg['IRS990EZ_p3_PrmryExmptPrpsTxt'])
)
# Program description: all program-text columns joined with '##' (missing -> '').
# IRS990ScheduleO_ExplntnTxt used to be appended twice; the repeat only produced
# a duplicate fragment, so it is listed once here.
prgrm_dsc_cols=[
    'IRS990_p3_DscS',
    'IRS990ScheduleO_ExplntnTxt',
    'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt',
    'IRS990PF_p9a_DscrptnTxt',
    'IRS990PF_p16b_RltnshpSttmntTxt',
]
prgrm_dsc=df_txt_ntee_confA_no_chg[prgrm_dsc_cols[0]].fillna('')
for col in prgrm_dsc_cols[1:]:
    prgrm_dsc=prgrm_dsc+'##'+df_txt_ntee_confA_no_chg[col].fillna('')
df_txt_ntee_confA_no_chg['prgrm_dsc']=prgrm_dsc

In [8]:
def func_clean_str(string):
    """Collapse a '##'-delimited text field into '; '-joined unique fragments.

    Parameters
    ----------
    string : str or missing value
        Text whose fragments are separated by '##'. Anything that is not a
        string (NaN, None, ...) and the literal text 'nan' count as missing.

    Returns
    -------
    str
        The non-empty fragments, de-duplicated and joined with '; ' in
        first-seen order; '' when the input is missing or has no fragments.
    """
    if not isinstance(string, str) or string=='nan':
        return ''
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible across runs (set iteration order is not).
    fragments=dict.fromkeys(s for s in string.split('##') if s!='')
    return '; '.join(fragments)

# Run func_clean_str over both free-text columns, replacing each column in place.
for text_col in ('mission', 'prgrm_dsc'):
    df_txt_ntee_confA_no_chg[text_col]=df_txt_ntee_confA_no_chg[text_col].map(func_clean_str)

In [10]:
# Keep the first row for each distinct mission text, then the first row for each
# distinct program description among what is left.
# NOTE(review): '' is treated like any other value, so only the first row with an
# empty text survives each step -- confirm this is intended.
deduped=df_txt_ntee_confA_no_chg[~df_txt_ntee_confA_no_chg['mission'].duplicated()]
deduped=deduped[~deduped['prgrm_dsc'].duplicated()]
# Drop NTEE==NaN. The explicit copy makes df_train an independent frame, so the
# column assignments below are not chained assignments on a slice.
df_train=deduped[deduped['NTEE1'].notna()].copy()
# Change to upper case.
for col in ('mission', 'prgrm_dsc', 'NTEE1'):
    df_train[col]=df_train[col].str.upper()
len(df_train)

234027

In [11]:
# Spell check function. Return corrected word if unknown; return original word if known.
_SPELLCHECKER=None  # Built on first use and then reused by every call in this process.

def spellcheck(doc):
    """Tokenize `doc` and return its spell-corrected tokens in upper case.

    Parameters
    ----------
    doc : str
        Text to tokenize with nltk.word_tokenize.

    Returns
    -------
    list of str
        One upper-cased entry per token: the checker's correction, or the
        original token when the checker offers none.
    """
    global _SPELLCHECKER
    if _SPELLCHECKER is None:
        # Constructing a SpellChecker per token (as before) rebuilds the checker
        # for every word; one instance per process is enough.
        _SPELLCHECKER=SpellChecker()
    checker=_SPELLCHECKER
    # correction() can return None when it has no candidate; keep the token then
    # instead of failing on None.upper().
    return [(checker.correction(word=s) or s).upper() for s in nltk.word_tokenize(doc)]

# Worker pool for the spell-check cells; Pool.map was much faster than df.apply here.
p=Pool(processes=48)

In [12]:
# Correct mission.
mission_docs=df_train['mission']
df_train['mission_spellchk']=p.map(func=spellcheck, iterable=mission_docs)

In [17]:
# Correct prgrm dsc.
prgrm_docs=df_train['prgrm_dsc']
df_train['prgrm_dsc_spellchk']=p.map(func=spellcheck, iterable=prgrm_docs)

**Multi-processing vs. `DataFrame.apply`** (elapsed minutes on the first 20 mission texts).
```Python
>>> from time import time
>>> p=Pool(48)
>>> t1=time()
>>> t=p.map(spellcheck, df_train['mission'][0:20])
>>> print((time()-t1)/60)
0.20710660219192506

>>> from time import time
>>> t1=time()
>>> t=df_train['mission'][0:20].apply(spellcheck)
>>> print((time()-t1)/60)
0.9466491937637329
```

In [23]:
df_to_write=df_train
num_file=10
file_path_name='../../dataset/df_train.pkl.gz/df_train.pkl.gz'
# Split the frame by ranges of index *labels* (not row positions) and write one
# gzip pickle per range, named '<path>-<first label>-<last label>'.
# NOTE(review): label slicing assumes the index is sorted ascending -- confirm.
if len(df_to_write)>0:
    last_label=df_to_write.index[-1]
    # Width of each label range, computed once; at least 1 so range() gets a valid step.
    label_step=max(1, math.ceil(last_label/num_file))
    for index in range(0, last_label+1, label_step):
        df_temp=df_to_write.loc[index:index+label_step-1]
        if df_temp.empty:
            # Filtering left no rows in this label range; there is nothing to name or write.
            continue
        df_temp.to_pickle(file_path_name+'-'+str(df_temp.index[0])+'-'+str(df_temp.index[-1]), compression='gzip')

In [144]:
# Test reading file.
import os
train_chunk_dir='../../dataset/df_train.pkl.gz/'
file_list=os.listdir(train_chunk_dir)
# Read every chunk, then concatenate once (concat inside the loop re-copies earlier rows).
test_parts=[pd.read_pickle(train_chunk_dir+file, compression='gzip') for file in file_list]
# pd.concat raises on an empty list; fall back to an empty frame, as the loop version did.
df_test=pd.concat(test_parts) if test_parts else pd.DataFrame()
len(df_test)

229472

In [25]:
# Test reading file.
import os
train_chunk_dir='../../dataset/df_train.pkl.gz/'
file_list=os.listdir(train_chunk_dir)
# Read every chunk, then concatenate once (concat inside the loop re-copies earlier rows).
test_parts=[pd.read_pickle(train_chunk_dir+file, compression='gzip') for file in file_list]
# pd.concat raises on an empty list; fall back to an empty frame, as the loop version did.
df_test=pd.concat(test_parts) if test_parts else pd.DataFrame()
len(df_test)

234027

### Create Universal Train and Test datasets.

In [2]:
# Reload the spell-checked training chunks from the intermediary directory.
train_dir='../../dataset/intermediary/df_train.pkl.gz/'
file_list=os.listdir(train_dir)
# Read every chunk, then concatenate once (concat inside the loop re-copies earlier rows).
train_parts=[pd.read_pickle(train_dir+file, compression='gzip') for file in file_list]
# pd.concat raises on an empty list; fall back to an empty frame, as the loop version did.
df_train=pd.concat(train_parts) if train_parts else pd.DataFrame()
len(df_train)

234027

In [3]:
# Remove records with the same EIN# (the first occurrence is kept).
# drop_duplicates(subset=...) filters rows directly; selecting by
# .loc[<deduplicated index labels>] would bring duplicates back if any index
# label occurred more than once in the concatenated chunks.
df_ntee_universal=df_train.drop_duplicates(subset='EIN')
# Join lists to strings.
for col in ('mission_spellchk', 'prgrm_dsc_spellchk'):
    df_ntee_universal[col]=df_ntee_universal[col].map(' '.join)

In [4]:
# Check dataset size.
# 80/20 split; the fixed random_state keeps the partition the same on every run.
universal_split=train_test_split(df_ntee_universal, random_state=520, test_size=0.2)
df_ntee_universal_train, df_ntee_universal_test=universal_split
len(df_ntee_universal_train), len(df_ntee_universal_test)

(154424, 38607)

In [5]:
# Save universal test.
universal_test_path='../../dataset/df_ntee_universal/test/df_ntee_universal_test.pkl.gz'
df_ntee_universal_test.to_pickle(universal_test_path, compression='gzip')

In [6]:
# Save universal train.
df_to_write=df_ntee_universal_train
num_file=5
file_path_name='../../dataset/df_ntee_universal/train/df_ntee_universal_train.pkl.gz'
file_count=0
# Rows per output file, computed once; at least 1 so range() gets a valid step
# even when the frame is empty (in which case nothing is written).
rows_per_file=max(1, math.ceil(len(df_to_write)/num_file))
for index in range(0, len(df_to_write), rows_per_file):
    df_temp=df_to_write.iloc[index:index+rows_per_file]
    # Files are numbered from zero, hence the '<i>of<num_file-1>' suffix.
    df_temp.to_pickle(file_path_name+'_'+str(file_count)+'of'+str(num_file-1), compression='gzip')
    file_count+=1

In [9]:
# Check saved files.
universal_train_dir='../../dataset/df_ntee_universal/train/'
file_list=os.listdir(universal_train_dir)
# Read every train chunk, then concatenate once (concat inside the loop re-copies earlier rows).
train_parts=[pd.read_pickle(universal_train_dir+file, compression='gzip') for file in file_list]
# pd.concat raises on an empty list; fall back to an empty frame, as the loop version did.
t1=pd.concat(train_parts) if train_parts else pd.DataFrame()
t2=pd.read_pickle('../../dataset/df_ntee_universal/test/df_ntee_universal_test.pkl.gz', compression='gzip')
len(t1), len(t2)

(154424, 38607)

#### Describe universal datasets.

In [15]:
# Records per NTEE1 class in the test split, followed by each class's share of the split.
test_class_counts=df_ntee_universal_test.groupby('NTEE1')['EIN'].count()
print(test_class_counts, '\n'*2, test_class_counts/len(df_ntee_universal_test))

NTEE1
A    4291
B    6419
C     827
D    1034
E    2307
F     543
G    1353
H     126
I     740
J    1132
K     522
L    1537
M    1140
N    3925
O     409
P    2318
Q     436
R     257
S    3603
T     541
U     225
V      85
W    2038
X    1098
Y    1701
Name: EIN, dtype: int64 

 NTEE1
A    0.111146
B    0.166265
C    0.021421
D    0.026783
E    0.059756
F    0.014065
G    0.035045
H    0.003264
I    0.019168
J    0.029321
K    0.013521
L    0.039811
M    0.029528
N    0.101666
O    0.010594
P    0.060041
Q    0.011293
R    0.006657
S    0.093325
T    0.014013
U    0.005828
V    0.002202
W    0.052788
X    0.028440
Y    0.044059
Name: EIN, dtype: float64


In [16]:
# Records per NTEE1 class in the train split, followed by each class's share of the split.
train_class_counts=df_ntee_universal_train.groupby('NTEE1')['EIN'].count()
print(train_class_counts, '\n'*2, train_class_counts/len(df_ntee_universal_train))

NTEE1
A    17010
B    25827
C     3323
D     4239
E     9015
F     2301
G     5053
H      467
I     2947
J     4772
K     2009
L     5942
M     4693
N    15460
O     1731
P     9180
Q     1987
R     1064
S    14459
T     2032
U     1000
V      350
W     8357
X     4566
Y     6640
Name: EIN, dtype: int64 

 NTEE1
A    0.110151
B    0.167247
C    0.021519
D    0.027450
E    0.058378
F    0.014901
G    0.032722
H    0.003024
I    0.019084
J    0.030902
K    0.013010
L    0.038478
M    0.030390
N    0.100114
O    0.011209
P    0.059447
Q    0.012867
R    0.006890
S    0.093632
T    0.013159
U    0.006476
V    0.002266
W    0.054117
X    0.029568
Y    0.042998
Name: EIN, dtype: float64


## `nteeConf` random sample

In [3]:
# Load every gzip-pickled chunk found in the intermediary EIN_TXT_2014_18.pkl.gz/
# directory and stack them with a single pd.concat (concatenating inside the loop
# re-copies all rows loaded so far on every iteration).
txt_dir='../../dataset/intermediary/EIN_TXT_2014_18.pkl.gz/'
file_list_txt=os.listdir(txt_dir)
txt_frames=[pd.read_pickle(txt_dir+file, compression='gzip') for file in file_list_txt]
# pd.concat raises on an empty list; fall back to an empty frame, as the loop version did.
df_txt=pd.concat(txt_frames) if txt_frames else pd.DataFrame()
df_txt.sample(3)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before
534051,93493320000000.0,204423661,EFILE,,,,,,Provide Islamic and cultural services to local...,"Provide services on weekly, monthly and yearly...","Provide Islamic educational, social and cultur...",2.017032e+17,15063289.0,990,12/28/2017 9:46:58 PM,MUSLIM ASSOCIATION OF PUGET SOUND,201612.0,2017.0,0.0
1980956,,942929175,,,,,,,,,,,,,,,,,1.0
117750,93493290000000.0,522253225,EFILE,,,,,,TO PROVIDE EDUCATION AND INFORMATION TO MEMBER...,PUBLIC EDUCATION REGARDING AMERICA'S HEALTH CA...,THE CORPORATION IS ORGANIZED AND SHALL BE OPER...,2.016229e+17,14102111.0,990O,1/23/2017 8:44:15 AM,COALITION TO PROTECT AMERICAS HEALTH CARE,201605.0,2017.0,0.0


In [4]:
# Read the bz2-compressed BMF sample pickle; the codec is spelled out rather than
# inferred from the '.bz2' extension.
df_ntee=pd.read_pickle('../../dataset/intermediary/df_bmf_14_16_sample20pt.pkl.bz2', compression='bz2')

In [7]:
# Keep one row per distinct (EIN, NTEE1) pair. Chaining drop_duplicates() returns a
# new frame instead of mutating a column slice of df_ntee in place.
df_ein_ntee=df_ntee[['EIN', 'NTEE1']].drop_duplicates()
df_ein_ntee.sample(3)

Unnamed: 0,EIN,NTEE1
3956835,462470627,N
2973535,910226963,Y
14444,20526488,A


In [11]:
# Attach the NTEE1 code to each text record by EIN; only EINs present in both frames are kept.
df_txt_sample20pt=pd.merge(df_txt, df_ein_ntee, how='inner', on='EIN')
df_txt_sample20pt.sample(5)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before,NTEE1
837947,93492070000000.0,56011981,EFILE,ENHANCE PERSONAL AND PROFESSIONAL GROWTH THROU...,OUR MISSION IS TO PROVIDE A FORUM FOR OPEN EXC...,,,,,,,2.017307e+17,14628905.0,990EO,8/10/2017 6:31:53 AM,PROFESSIONAL PHOTOGRAPHERS ASSOCIATION OF NEW ...,201610.0,2017.0,1.0,A
598998,93493330000000.0,742212674,EFILE,,,,,,TO PLACE PHYSICALLY CHALLENGED INDIVIDUALS IN ...,SUPPORTED EMPLOYMENT IN PUBLIC SECTOR AND SHEL...,TO PLACE PHYSICALLY CHALLENGED INDIVIDUALS IN ...,2.015233e+17,13222198.0,990,02/22/2016,JUNCTION FIVE-O-FIVE,201508.0,2016.0,1.0,J
445579,93493160000000.0,390808503,EFILE,,,,,,To improve the health and well-being of all pe...,"Good Samaritan Health Center of Merrill, Wisco...","Good Samaritan Health Center of Merrill, Wisco...",2.015016e+17,12667691.0,990,7/13/2015,GOOD SAMARITAN HEALTH CENTER OF MERRILL WISCON...,201406.0,2015.0,1.0,E
867970,93492130000000.0,370631056,EFILE,THE MONTICELLO CHAMBER OF COMMERCE SPONSORED A...,SEE ATTACHED,,,,,,,2.015313e+17,12770410.0,990EO,8/11/2015,MONTICELLO CHAMBER OF COMMERCE,201412.0,2015.0,1.0,S
392584,93493010000000.0,611205613,EFILE,,,,,,HOUSING MANAGER AND EMPLOYEE PROVIDER TO FOUR ...,THE ORGANIZATION PROVIDES HOUSING AND SOCIAL S...,HOUSING MANAGER AND EMPLOYEE PROVIDER TO FOUR ...,2.017001e+17,14554521.0,990,7/6/2017 8:31:50 PM,DAY SPRING INC,201606.0,2017.0,1.0,P


In [13]:
# Row counts: merged text table vs. distinct (EIN, NTEE1) pairs.
# df_txt_sample20pt contains repeated EINs; they are left in place here, and
# duplicated texts in this dataframe should be removed later.
df_txt_sample20pt.shape[0], df_ein_ntee.shape[0]

(900379, 768148)

In [14]:
# Mission text: first non-missing value among the three mission/purpose columns,
# in this priority order.
df_txt_sample20pt['mission']=(
    df_txt_sample20pt['IRS990_p1_ActvtyOrMssnDsc']
    .combine_first(df_txt_sample20pt['IRS990_p3_MssnDsc'])
    .combine_first(df_txt_sample20pt['IRS990EZ_p3_PrmryExmptPrpsTxt'])
)
# Program description: all program-text columns joined with '##' (missing -> '').
# IRS990ScheduleO_ExplntnTxt used to be appended twice; the repeat only produced
# a duplicate fragment, so it is listed once here.
prgrm_dsc_cols=[
    'IRS990_p3_DscS',
    'IRS990ScheduleO_ExplntnTxt',
    'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt',
    'IRS990PF_p9a_DscrptnTxt',
    'IRS990PF_p16b_RltnshpSttmntTxt',
]
prgrm_dsc=df_txt_sample20pt[prgrm_dsc_cols[0]].fillna('')
for col in prgrm_dsc_cols[1:]:
    prgrm_dsc=prgrm_dsc+'##'+df_txt_sample20pt[col].fillna('')
df_txt_sample20pt['prgrm_dsc']=prgrm_dsc

In [15]:
def func_clean_str(string):
    """Collapse a '##'-delimited text field into '; '-joined unique fragments.

    Parameters
    ----------
    string : str or missing value
        Text whose fragments are separated by '##'. Anything that is not a
        string (NaN, None, ...) and the literal text 'nan' count as missing.

    Returns
    -------
    str
        The non-empty fragments, de-duplicated and joined with '; ' in
        first-seen order; '' when the input is missing or has no fragments.
    """
    if not isinstance(string, str) or string=='nan':
        return ''
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible across runs (set iteration order is not).
    fragments=dict.fromkeys(s for s in string.split('##') if s!='')
    return '; '.join(fragments)

# Run func_clean_str over both free-text columns, replacing each column in place.
for text_col in ('mission', 'prgrm_dsc'):
    df_txt_sample20pt[text_col]=df_txt_sample20pt[text_col].map(func_clean_str)

In [17]:
# Keep the first row for each distinct mission text, then the first row for each
# distinct program description among what is left.
# NOTE(review): '' is treated like any other value, so only the first row with an
# empty text survives each step -- confirm this is intended.
deduped=df_txt_sample20pt[~df_txt_sample20pt['mission'].duplicated()]
deduped=deduped[~deduped['prgrm_dsc'].duplicated()]
# Drop NTEE==NaN. The explicit copy makes df_train an independent frame, so the
# column assignments below are not chained assignments on a slice.
df_train=deduped[deduped['NTEE1'].notna()].copy()
# Change to upper case.
for col in ('mission', 'prgrm_dsc', 'NTEE1'):
    df_train[col]=df_train[col].str.upper()
len(df_train)

218640

In [22]:
# Spell check function. Return corrected word if unknown; return original word if known.
_SPELLCHECKER=None  # Built on first use and then reused by every call in this process.

def spellcheck(doc):
    """Tokenize `doc` and return its spell-corrected tokens in upper case.

    Parameters
    ----------
    doc : str
        Text to tokenize with nltk.word_tokenize.

    Returns
    -------
    list of str
        One upper-cased entry per token: the checker's correction, or the
        original token when the checker offers none.
    """
    global _SPELLCHECKER
    if _SPELLCHECKER is None:
        # Constructing a SpellChecker per token (as before) rebuilds the checker
        # for every word; one instance per process is enough.
        _SPELLCHECKER=SpellChecker()
    checker=_SPELLCHECKER
    # correction() can return None when it has no candidate; keep the token then
    # instead of failing on None.upper().
    return [(checker.correction(word=s) or s).upper() for s in nltk.word_tokenize(doc)]

# Worker pool for the spell-check cells; Pool.map was much faster than df.apply here.
p=Pool(processes=48)

In [None]:
# Correct mission.
mission_docs=df_train['mission']
df_train['mission_spellchk']=p.map(func=spellcheck, iterable=mission_docs)

In [None]:
# Correct prgrm dsc.
prgrm_docs=df_train['prgrm_dsc']
df_train['prgrm_dsc_spellchk']=p.map(func=spellcheck, iterable=prgrm_docs)