# Part I: Data Clean & Vectorize

## A. Compile & Clean DF
- Join 16 csv files downloaded from Candid site
- Search Refinements:
    - Education
    - Malawi (Country), Nepal (Country), Senegal (Country), Burkina Faso (Country), Mali (Country), Guatemala (Country), Nicaragua (Country), Haiti (Country)
    - Chicago (Illinois, United States), Boston (Massachusetts, United States), Detroit (Michigan, United States), New York (United States), Bridgeport (Connecticut, United States), Oakland (California, United States)
    - Adults, Adolescents, Children, Preteens
    - At-risk youth, Economically disadvantaged people
    - Recipients located in United States (Country)
    - Years from 2021 to 2022

In [1]:
import pandas as pd
import numpy as np
import random
import re

In [2]:
# Trial read of the first export: row index 1 is used as the header row and the
# last two rows of the file are sliced off.
# NOTE(review): df_test is not referenced again in the visible notebook -- confirm it is still needed.
df_test = pd.read_csv('csv_files/candid_1.csv', header = [1])[:-2]

In [3]:
#create list of all 16 candid .csv downloads
# File names are numbered candid_1.csv .. candid_16.csv; the count is named so
# it only has to change in one place if more exports are added.
N_CANDID_FILES = 16
candid_lst = [f'candid_{n}.csv' for n in range(1, N_CANDID_FILES + 1)]

In [4]:
#create list of candid dfs and concat
# Every export is read with row index 1 as the header row and its last two
# rows sliced off; the pieces are then stacked under a fresh 0..n-1 index.
df_lst = [
    pd.read_csv('csv_files/' + fname, header = [1])[:-2]
    for fname in candid_lst
]
df = pd.concat(df_lst, ignore_index = True)

In [5]:
#rename cols
# Mapping from the raw export headers to the short names used in the rest of the notebook.
col_renames = {
    'Grantmaker Name': 'grantmaker',
    'Grantmaker State': 'grantmaker_state',
    'Recipient Name': 'recipient',
    'Recipient City': 'recip_city',
    'Recipient State/Country': 'recip_state_cntry',
    'Year Authorized': 'year',
    'Grant Amount': 'amount',
    'Primary Subject': 'subject',
    'Support Strategies': 'strategy',
    'Description': 'description',
}
df = df.rename(columns = col_renames)

In [6]:
#drop dupes, grants < 5k, grants without descriptions
# Same three steps as one chain: exact duplicate rows go first, then rows with
# amount below 5000, then rows whose description is missing.
df = (
    df.drop_duplicates()
      .loc[lambda d: d['amount'] >= 5000]
      .loc[lambda d: d['description'].notna()]
)

In [7]:
#remove scientific notation
# Render floats with four fixed decimals whenever pandas displays them.
pd.set_option('display.float_format', '{:.4f}'.format)

In [8]:
#format yr
# Cast the year column to integers; all other columns keep their dtype.
df = df.astype({'year': int})

In [9]:
#clean recip_city
# na = False keeps rows with a missing city untouched: without it str.contains
# yields NaN for those rows, np.where treats NaN as truthy, and the missing
# city would be overwritten with 'New York City'.
# NOTE(review): 'New Yorik' presumably targets a misspelling in the raw export -- confirm.
is_misspelled_nyc = df.recip_city.str.contains('New Yorik', na = False)
df['recip_city'] = np.where(is_misspelled_nyc, 'New York City', df.recip_city)

In [10]:
#clean subject 
# Normalise subjects to lower snake_case tokens. Every step uses the .str
# accessor so a missing subject stays NaN instead of raising TypeError (the
# previous per-row lambda slice failed on NaN).
# .str[:-1] drops the final character of each value.
# NOTE(review): presumably that character is a trailing ';' -- confirm against the raw export.
df['subject'] = (
    df['subject']
    .str.replace(' ', '_', regex = False)
    .str.replace("'", "", regex = False)
    .str.replace("-", "_", regex = False)
    .str[:-1]
    .str.lower()
)

In [11]:
#df.subject.unique()

In [12]:
#clean strategy
# Spaces, hyphens and commas become underscores, apostrophes are removed, and
# the result is lower-cased -- applied in the same order as before.
df['strategy'] = (
    df['strategy']
    .str.replace(' ','_')
    .str.replace("'","")
    .str.replace("-","_")
    .str.replace(",","_")
    .str.lower()
)

In [13]:
# Turn each ';'-delimited strategy string into a list of strategy tokens.
# A value that ends in ';' keeps a trailing empty string in its list.
df['strategy'] = df['strategy'].str.split(';')

In [14]:
# Spot-check a few rows of the split strategy lists.
df['strategy'][2:7]

2    [research_and_evaluation, capacity_building_an...
3    [faculty_and_staff_development, capacity_build...
4    [capacity_building_and_technical_assistance, e...
6                    [program_support, equal_access, ]
7                                  [program_support, ]
Name: strategy, dtype: object

In [15]:
#clean description of html tags
# Matches either an HTML tag (shortest '<...>' run) or a character entity:
# named (&amp;), decimal (&#39;) or hexadecimal (&#x1f;).
clnr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

def cleanhtml(doc):
    """Return `doc` with every HTML tag and character entity removed.

    Matches are deleted outright (replaced by the empty string), so text on
    either side of a removed tag or entity is joined without extra spacing.
    """
    return clnr.sub('', doc)

# Strip tags and entities from every grant description.
df['description'] = df['description'].map(cleanhtml)


In [16]:
#label grants as major or transformative
# Amounts below 100000 are labelled 'major'; every other row is 'transformative'.
is_major = df['amount'].lt(100000)
df['gift_type'] = np.where(is_major, 'major', 'transformative')

In [17]:
# (rows, columns) of the cleaned frame.
df.shape

(1036, 11)

In [18]:
# Renumber rows 0..n-1 and discard the old index, which has gaps after the filtering above.
df = df.reset_index(drop = True)

In [19]:
# Show the first cleaned row as a sanity check.
df.head(1)

Unnamed: 0,grantmaker,grantmaker_state,recipient,recip_city,recip_state_cntry,subject,year,amount,strategy,description,gift_type
0,"Bloomberg Philanthropies, Inc.",New York,Success Academy Charter Schools,New York City,New York,elementary_and_secondary_education,2022,100000000.0,,Bloomberg Philanthropies announced $100 millio...,transformative


In [20]:
#pickle df
import pickle
# Persist the cleaned frame so it can be reloaded without redoing the steps above.
with open('grant_df.pickle', 'wb') as out_file:
    pickle.dump(df, out_file)

In [21]:
#df['doc'] = df['doc'].apply(lambda row: row.lower())

In [22]:
#corpus = df['doc'].tolist()
#sum([len(d.split(' ')) for d in corpus])

## B. NLP Text Processing

*Coding support from the spaCy processing chapter of 'Blueprints for Text Analytics Using Python: Machine Learning-Based Solutions for Common Real World (NLP) Applications', published by O'Reilly.*

In [23]:
#spacy lemmatization
from spacy.lang.en import STOP_WORDS
import textacy

In [24]:
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

In [25]:
# Load the 'en_core_web_sm' spaCy pipeline; every later nlp(...) / nlp.pipe(...) call uses it.
nlp = spacy.load('en_core_web_sm')

In [26]:
def custom_tokenizer(nlp):
    """Build a spaCy Tokenizer that keeps hyphenated/underscored/hashed tokens whole.

    Starts from the pipeline's default prefix, suffix and infix patterns and
    removes the ones that would split on '-', '_' or '#':

    * prefixes / suffixes: the literal patterns '-', '_' and '#' are dropped;
    * infixes: any default infix pattern that matches inside 'xx-xx' is dropped,
      so a hyphen between word characters no longer splits the token.

    Parameters
    ----------
    nlp : loaded spaCy Language object supplying the vocab and default rules.

    Returns
    -------
    spacy.tokenizer.Tokenizer configured with the filtered patterns.
    """
    excluded = ['-','_','#']
    prefixes = [pattern for pattern in nlp.Defaults.prefixes
                if pattern not in excluded]
    suffixes = [pattern for pattern in nlp.Defaults.suffixes
                if pattern not in excluded]
    # Fix: the infix list must be derived from the default *infix* patterns.
    # It was previously built from nlp.Defaults.prefixes, which discarded the
    # real infix rules and applied prefix rules in the middle of tokens.
    infixes = [pattern for pattern in nlp.Defaults.infixes
               if not re.search(pattern, 'xx-xx')]

    return Tokenizer(vocab = nlp.vocab,
                     rules = nlp.Defaults.tokenizer_exceptions,
                     prefix_search = compile_prefix_regex(prefixes).search,
                     suffix_search = compile_suffix_regex(suffixes).search,
                     infix_finditer = compile_infix_regex(infixes).finditer,
                     token_match = nlp.Defaults.token_match)

# Swap the pipeline's tokenizer for the customised one.
nlp.tokenizer = custom_tokenizer(nlp)

In [27]:
def extract_lemmas(doc, **kwargs):
    """Return the lower-cased, whitespace-stripped lemma of each extracted word.

    `kwargs` are passed straight through to `textacy.extract.words`, which
    decides which tokens of `doc` are kept.
    """
    lemmas = []
    for token in textacy.extract.words(doc, **kwargs):
        lemmas.append(token.lemma_.lower().strip())
    return lemmas

In [28]:
def extract_noun_phrases(doc, preceding_pos=['NOUN'], sep='_'):
    """Return noun phrases of `doc` as `sep`-joined, lower-cased lemma strings.

    One pattern 'POS:<pos> POS:NOUN:+' is built per entry of `preceding_pos`
    and handed to textacy's token matcher; each matched span becomes one string.
    """
    patterns = [f'POS:{pos} POS:NOUN:+' for pos in preceding_pos]
    phrases = []
    for span in textacy.extract.matches.token_matches(doc, patterns = patterns):
        phrases.append(sep.join(tok.lemma_.lower().strip() for tok in span))
    return phrases

In [59]:
def extract_entities(doc, include_types=None, sep='_'):
    """Return the named entities of `doc` as `sep`-joined lemma strings.

    `include_types` is forwarded to `textacy.extract.entities`; no types are
    excluded, determiners are dropped and every entity (min_freq=1) is kept.
    Lemmas are joined as-is, without lower-casing.
    """
    found = textacy.extract.entities(
        doc,
        include_types=include_types,
        exclude_types=None,
        drop_determiners=True,
        min_freq=1,
    )
    joined = []
    for entity in found:
        joined.append(sep.join(tok.lemma_ for tok in entity))
    return joined

In [30]:
def extract_nlp(doc):
    """Return the three feature lists for one spaCy doc.

    Keys, in order: 'lemma' (word lemmas with the listed POS tags and stop
    words filtered out), 'n_gram' (noun phrases led by an adjective or noun)
    and 'ent' (entities of type GPE or LOC).
    """
    skipped_pos = ['PART','PUNCT','DET','PRON','SYM','SPACE','NUM']
    lemmas = extract_lemmas(doc, exclude_pos = skipped_pos, filter_stops = True)
    noun_phrases = extract_noun_phrases(doc, ['ADJ','NOUN'])
    places = extract_entities(doc, ['GPE','LOC'])
    return {'lemma': lemmas, 'n_gram': noun_phrases, 'ent': places}

In [62]:
#lemma function tester

# Pick one description at random by position. Using len(df) instead of a
# hard-coded 1000 lets every row be sampled and cannot run past the end of a
# shorter frame; .iloc makes the lookup independent of the index labels.
te_doc = df['description'].iloc[random.randrange(len(df))]
te_sp_doc = nlp(te_doc)

print(te_sp_doc)
extract_nlp(te_sp_doc)

Grant in support of research to assess the economic, health and environmental impacts on food and to develop a more transparent public procurement process in New York State


{'lemma': ['grant',
  'support',
  'research',
  'assess',
  'economic',
  'health',
  'environmental',
  'impact',
  'food',
  'develop',
  'transparent',
  'public',
  'procurement',
  'process',
  'new',
  'york',
  'state'],
 'n_gram': ['environmental_impact',
  'public_procurement',
  'public_procurement_process',
  'procurement_process'],
 'ent': ['New_York_State']}

In [32]:
#lemma function tester pt 2
# Render the sampled doc with displaCy's 'ent' style to inspect its named entities.

displacy.render(te_sp_doc, style='ent')

In [33]:
# Run the extractor on an empty doc just to learn the names of the keys it returns.
empty_doc = nlp.make_doc('')
nlp_columns = list(extract_nlp(empty_doc))
print(nlp_columns)

['lemma', 'n_gram', 'ent']


In [34]:
# Add one empty (None-filled) column per NLP feature.
df = df.assign(**{name: None for name in nlp_columns})

In [63]:
batch_size = 50

# Run the spaCy pipeline over every description (nlp.pipe does the batching)
# and keep one feature dict per row, in row order.
nlp_records = [extract_nlp(doc)
               for doc in nlp.pipe(df['description'], batch_size = batch_size)]

# Write each feature as a whole column. The previous per-cell chained
# assignment (df[col].iloc[k] = ...) raises SettingWithCopyWarning and does
# not modify df when pandas Copy-on-Write is active.
for col in nlp_columns:
    df[col] = pd.Series([record[col] for record in nlp_records],
                        index = df.index, dtype = object)

In [64]:
# Replace each token list with its str() form so the vectorizer below receives plain strings.
for text_col in ('lemma', 'n_gram', 'ent'):
    df[text_col] = df[text_col].map(str)

In [65]:
#pickle lemmatized df
# Saved under a separate file name from the earlier pickle of the cleaned frame.
with open('grant_df_lemma.pickle', 'wb') as out_file:
    pickle.dump(df, out_file)

## C. Vectorization

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Shared TF-IDF vectorizer; min_df = 2 ignores terms that appear in fewer than two documents.
vec = TfidfVectorizer(min_df = 2)

In [73]:
def get_vecs(token_col_lst, vec=vec, data=None):
    """Fit the vectorizer on each listed column and collect the results.

    Parameters
    ----------
    token_col_lst : iterable of column names to vectorize.
    vec : vectorizer exposing `fit_transform` and `get_feature_names_out`;
        it is re-fitted for every column, so it ends up fitted on the last one.
    data : optional frame to read the columns from; defaults to the
        notebook-level `df` when omitted.

    Returns
    -------
    dict mapping each column name to `[matrix, terms]`, where `matrix` is the
    `fit_transform` output and `terms` the matching feature names.
    """
    frame = df if data is None else data
    # Local is named vec_terms so it does not shadow the builtin `dict`.
    vec_terms = {}
    for col in token_col_lst:
        col_vec = vec.fit_transform(frame[col])
        col_terms = vec.get_feature_names_out()
        vec_terms[col] = [col_vec, col_terms]
    return vec_terms

In [74]:
# Columns holding the stringified lemma, n-gram and entity tokens.
cols = ['lemma','n_gram','ent']

# One [matrix, terms] pair per column, keyed by column name.
mtx_terms_dict = get_vecs(cols)

In [75]:
#pickle vec_terms_dict
# Store the per-column [matrix, terms] pairs on disk.
with open('mtx_terms_dict.pickle', 'wb') as out_file:
    pickle.dump(mtx_terms_dict, out_file)