In [1]:
import gspread
import json
import pandas as pd
import spacy
import time
from google.oauth2 import service_account
from google.cloud import storage, bigquery

In [2]:
# Deployment-specific settings; replace every "INSERT ..." placeholder before running.
location = "INSERT LOCATION"  # passed as `location=` to bigquery.Client
project_id = "INSERT PROJECT ID"  # passed as `project=` to storage.Client
bucket_name = "INSERT BUCKET NAME"  # bucket that holds the credentials file
credentials_file = "INSERT CREDENTIALS FILE"  # blob name of the service-account JSON inside that bucket
# OAuth scopes requested when building the service-account credentials.
scopes = [
    "https://www.googleapis.com/auth/drive",
    "https://www.googleapis.com/auth/bigquery",
]

Define below the parameters for this run

In [3]:
# The version parameter used to create different versions of the data in BigQuery tables.
# It is interpolated into the output table names (cut_2_v{version}...) and the CSV file name.
version = "INSERT VERSION NAME"

# Bag of Words spreadsheet details.
# The *_id values are passed to gspread's open_by_key(), so they must be the
# spreadsheet key (the ID segment of the sheet URL), not the full link.
bow_spreadsheet_id = "INSERT BAG OF WORDS SPREADSHEET KEY"
bow_sheet_name = "INSERT SHEET NAME"

# Excluded bag of words details (same rule: spreadsheet key, not link).
bow_exclude_sheet_id = "INSERT BAG OF WORDS REMOVE SPREADSHEET KEY"
bow_exclude_sheet_name = "INSERT SHEET NAME"

In [4]:
# BigQuery client used for the queries below. No project is passed here, so the
# client library's default project resolution applies.
bigquery_client = bigquery.Client(location=location)

In [5]:
# Locate the service-account credentials file inside the configured bucket.
storage_client = storage.Client(project=project_id)
bucket = storage_client.get_bucket(bucket_name)
# Handle to the blob only; its contents are downloaded in the next cell.
credentials_blob = bucket.blob(credentials_file)

In [6]:
# Download the credentials JSON and turn it into scoped service-account credentials.
credentials_text = credentials_blob.download_as_text()
credentials_dict = json.loads(credentials_text)
credentials = service_account.Credentials.from_service_account_info(credentials_dict, scopes=scopes)

In [7]:
# gspread client authorized with the service-account credentials built above.
gc = gspread.authorize(credentials)

In [8]:
# Open the two worksheets by spreadsheet key and worksheet (tab) name.
bag_of_words_sheet = gc.open_by_key(bow_spreadsheet_id).worksheet(bow_sheet_name)
bow_exclude_sheet = gc.open_by_key(bow_exclude_sheet_id).worksheet(bow_exclude_sheet_name)

In [9]:
# Load each worksheet's records into a DataFrame.
bow_df = pd.DataFrame(bag_of_words_sheet.get_all_records())
bwe_df = pd.DataFrame(bow_exclude_sheet.get_all_records())
# NOTE(review): cct_sheet and cmr_sheet are not assigned in any cell visible in
# this notebook, so the two lines below raise NameError on a fresh kernel unless
# they are defined elsewhere. TODO: restore the cell that opens these worksheets,
# or remove cct_df / cmr_df (they are only lemmatized and de-duplicated below).
cct_df = pd.DataFrame(cct_sheet.get_all_records())
cmr_df = pd.DataFrame(cmr_sheet.get_all_records())

In [10]:
# Load English language model.
# The same pipeline is used by find_lemma (token lemmas) and by the enrichment
# loop (named entities in the mission text).
nlp = spacy.load("en_core_web_sm")

In [11]:
def find_lemma(text):
    """Return the lemmas of ``text`` as one lower-case, space-separated string.

    Args:
        text: Value to lemmatize. Normally a string; missing values
            (``None``, float NaN, ``pd.NA``) are accepted, and any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The spaCy lemma of every token joined by single spaces, with
        surrounding whitespace removed. An empty string for missing input.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        # Missing cells may arrive as NaN / pd.NA rather than None, and
        # spreadsheet cells may arrive as numbers; neither has .lower().
        if pd.isna(text):
            return ""
        text = str(text)
    doc = nlp(text.lower())
    return " ".join(token.lemma_ for token in doc).strip()

In [12]:
# Add a "lemma" column (lemmatized "Word") to every word-list frame.
for frame in (bow_df, bwe_df, cct_df, cmr_df):
    frame["lemma"] = frame["Word"].apply(find_lemma)


In [13]:
# Lemmas searched for in each organization's mission and business name to set
# the *_has_excluded_word flags.
# De-duplicated here because this cell runs before bwe_df itself is
# de-duplicated. Blank lemmas (from empty "Word" cells) are dropped because an
# empty string is a substring of every text and would flag every organization.
bow_exclude_list = [lemma for lemma in bwe_df["lemma"].drop_duplicates().tolist() if lemma]

In [14]:
# Show every bag-of-words row whose lemma occurs more than once, ordered by lemma
bow_df.loc[bow_df.duplicated(subset="lemma", keep=False)].sort_values("lemma")

Unnamed: 0,Index,Word,Priority,Program 1,Program 2,Program 3,lemma


In [15]:
# Keep only the first row for each lemma so a word cannot be counted more than once
bow_df = bow_df.loc[~bow_df.duplicated(subset="lemma", keep="first")]

In [16]:
# Show every excluded-word row whose lemma occurs more than once, ordered by lemma
bwe_df.loc[bwe_df.duplicated(subset="lemma", keep=False)].sort_values("lemma")

Unnamed: 0,Index,Word,Notes,lemma
48,49,Ministries,,ministry
97,98,MINISTRIES,,ministry
59,60,Recreational,,recreational
98,99,Recreational,,recreational
93,94,Supported Living,,support live
119,120,Supported Living,,support live


In [17]:
# Keep only the first row for each excluded lemma.
# bow_exclude_list was already built from bwe_df in an earlier cell, so this
# step does not change that list.
bwe_df = bwe_df.loc[~bwe_df.duplicated(subset="lemma", keep="first")]

In [18]:
# Show every cct_df row whose lemma occurs more than once, ordered by lemma
cct_df.loc[cct_df.duplicated(subset="lemma", keep=False)].sort_values("lemma")

Unnamed: 0,Word,lemma
9,Families,family
36,Families,family


In [19]:
# Keep only the first cct_df row for each lemma
cct_df = cct_df.loc[~cct_df.duplicated(subset="lemma", keep="first")]

In [20]:
# Show every cmr_df row whose lemma occurs more than once, ordered by lemma
cmr_df.loc[cmr_df.duplicated(subset="lemma", keep=False)].sort_values("lemma")

Unnamed: 0,Word,lemma


In [21]:
# Keep only the first cmr_df row for each lemma
cmr_df = cmr_df.loc[~cmr_df.duplicated(subset="lemma", keep="first")]

In [22]:
# Helper function to find whether a word exists in a text
def word_exists_in_text(text, word):
    """Return True when ``word`` occurs in ``text``, ignoring case.

    This is a plain substring test, not a whole-word match: "work" is found
    in "network".
    NOTE(review): confirm that substring matching is the intended rule.

    Args:
        text: Text to search; anything that is not a string yields False.
        word: Term to look for; anything that is not a string, or a string
            that is empty or whitespace-only, yields False.

    Returns:
        bool: Whether the lower-cased word is a substring of the lower-cased text.
    """
    if not isinstance(text, str) or not isinstance(word, str):
        return False
    # An empty string is a substring of every text, so a blank spreadsheet row
    # would otherwise match every organization.
    if not word.strip():
        return False
    return word.lower() in text.lower()

In [23]:
# Lower-case place names that the enrichment loop accepts as US locations:
# any spaCy GPE entity found in the mission text that is NOT in this list marks
# the organization as having a non-US location.
# Contents: the 50 state names, country-level aliases, and District of Columbia
# variants (the bare "dc" alias is deliberately left disabled below).
us_states = [
    "alabama",
    "alaska",
    "arizona",
    "arkansas",
    "california",
    "colorado",
    "connecticut",
    "delaware",
    "florida",
    "georgia",
    "hawaii",
    "idaho",
    "illinois",
    "indiana",
    "iowa",
    "kansas",
    "kentucky",
    "louisiana",
    "maine",
    "maryland",
    "massachusetts",
    "michigan",
    "minnesota",
    "mississippi",
    "missouri",
    "montana",
    "nebraska",
    "nevada",
    "new hampshire",
    "new jersey",
    "new mexico",
    "new york",
    "north carolina",
    "north dakota",
    "ohio",
    "oklahoma",
    "oregon",
    "pennsylvania",
    "rhode island",
    "south carolina",
    "south dakota",
    "tennessee",
    "texas",
    "utah",
    "vermont",
    "virginia",
    "washington",
    "west virginia",
    "wisconsin",
    "wyoming",
    "usa",
    "u.s.",
    "u.s.a.",
    "united states",
    "america",
    "american",
    # "dc",
    "d.c.",
    "district of columbia"
]

In [24]:
# Select every column of the Cut 1 output table.
# NAME_OF_TABLE_CREATED_IN_CUT_1 is a placeholder: replace it with the real table name.
query = """
SELECT *
FROM analysis.NAME_OF_TABLE_CREATED_IN_CUT_1
"""

In [25]:
# Submit the query; its rows are fetched in the next cell.
query_job = bigquery_client.query(query)

In [26]:
# Download the query result as a pandas DataFrame (one row per organization).
orgs = query_job.to_dataframe()

In [27]:
# (rows, columns) of the downloaded table
orgs.shape

(110547, 49)

In [28]:
# Eyeball a few random rows (no seed is set, so the sample differs on every run)
orgs.sample(5)

Unnamed: 0,ein,return_timestamp,tax_period_end_date,business_name,address,city,state,zip,phone,business_officer_name,...,unemployment_rate_percent,labor_force_participation_rate_percent,less_than_hs_percent,total_population,median_household_income,gini,gini_percentile,gini_top_twenty,latitude,longitude
93042,340718413,2022-11-15T08:04:19-06:00,2021-12-31,WOMENS WELSH CLUBS OF AMERICA,22199 CENTER RIDGE ROAD,ROCKY RIVER,OH,44116,4403310420,SIAN PETZ,...,0.075,0.635,0.080929,662300,55109,0.5086,0.937267,True,41.457542,-81.863371
61423,205078107,2022-11-11T12:00:33-08:00,2021-12-31,TOTAL FAMILY CARE COALITION,3406 N STREET SE,WASHINGTON,DC,20019,2022491000,GAIL AVENT,...,0.071,0.714,0.065993,404527,93547,0.5199,0.96118,True,38.8748,-76.957127
17152,591087090,2022-05-16T14:00:37-05:00,2021-06-30,UNITED WAY OF INDIAN RIVER,PO BOX 1960,VERO BEACH,FL,32961,7725678900,MEREDITH EGAN,...,0.06,0.494,0.102304,70515,61594,0.5061,0.932298,True,27.69006,-80.41268
4187,830211970,2022-05-16T08:52:13-05:00,2021-06-30,NATIONAL ASSOCIATION OF STUDENT,1801 PENNSYLVANIA AVE NW NO 850,WASHINGTON,DC,20006,2027850453,JUSTIN DRAEGER,...,0.071,0.714,0.065993,404527,93547,0.5199,0.96118,True,38.90031,-77.042017
84322,460667855,2022-01-20T16:16:58-08:00,2021-06-30,The Savila Collaborative,1317 Isleta Blvd SW,Albuquerque,NM,87105,5053127296,Toni Martorelli,...,0.056,0.626,0.093249,357173,56920,0.4754,0.790683,False,35.054424,-106.67479


### The code below is long-running: the run logged beneath it took roughly 2 hours 40 minutes for about 110,000 organizations

In [29]:
# Enrich every organization with bag-of-words match statistics.
#
# Each of these text columns is lemmatized with find_lemma and searched:
#   mission, description, business_name,
#   prog_srvc_accom_acty_2_grp_desc, prog_srvc_accom_acty_3_grp_desc
#
# Columns added per text column <col>:
#   <col>_count, <col>_matched_words
#       number / sorted comma-separated list of bag-of-words lemmas found
#   <col>_count_priority_<P>, <col>_matched_words_priority_<P>
#       the same, restricted to lemmas whose Priority is P (1A, 1B or 1C)
#   <prefix>_key_words_list
#       de-duplicated, sorted union of the 1A, 1B and 1C matches
# Columns added once per organization:
#   mission_has_non_us_location
#       True when spaCy NER finds a GPE entity in the mission that is not in us_states
#   mission_has_excluded_word, mission_excluded_word_list,
#   business_name_has_excluded_word, business_name_excluded_word_list
#       matches against bow_exclude_list
#
# Results are collected as one record per organization and attached to `orgs`
# in a single step instead of being written cell by cell with orgs.loc, and each
# word/text pair is tested once. The *_count columns are therefore integers
# (the cell-by-cell writes produced floats).

enrich_text_columns = [
    "mission",
    "description",
    "business_name",
    "prog_srvc_accom_acty_2_grp_desc",
    "prog_srvc_accom_acty_3_grp_desc",
]
enrich_priorities = ["1A", "1B", "1C"]
# Prefix used for each column's *_key_words_list output column.
key_words_prefix = {
    "mission": "mission",
    "description": "description",
    "business_name": "business_name",
    "prog_srvc_accom_acty_2_grp_desc": "program_2",
    "prog_srvc_accom_acty_3_grp_desc": "program_3",
}

# (lemma, priority) pairs extracted once rather than re-iterating bow_df for every organization.
bow_terms = list(zip(bow_df["lemma"], bow_df["Priority"]))

enriched_rows = []
for index, row in orgs.iterrows():
    # Progress log; assumes the default integer index.
    if index % 10000 == 0:
        print(index, time.strftime("%H:%M:%S"))

    lemmas = {column: find_lemma(row[column]) for column in enrich_text_columns}
    mission = lemmas["mission"]
    business_name = lemmas["business_name"]

    # NOTE(review): NER runs on the lemmatized, lower-cased mission text, and any
    # GPE outside us_states (which holds no city names) counts as non-US.
    # Confirm both are intended.
    mission_has_non_us_location = any(
        ent.label_ == "GPE" and ent.text.lower() not in us_states
        for ent in nlp(mission).ents
    )

    mission_excluded_word_list = sorted(
        {word for word in bow_exclude_list if word_exists_in_text(mission, word)}
    )
    business_name_excluded_word_list = sorted(
        {word for word in bow_exclude_list if word_exists_in_text(business_name, word)}
    )

    # matched[col] holds every matching lemma; matched_by_priority[P][col] only those with Priority P.
    matched = {column: [] for column in enrich_text_columns}
    matched_by_priority = {
        level: {column: [] for column in enrich_text_columns}
        for level in enrich_priorities
    }
    for word, priority in bow_terms:
        for column in enrich_text_columns:
            if word_exists_in_text(lemmas[column], word):
                matched[column].append(word)
                if priority in matched_by_priority:
                    matched_by_priority[priority][column].append(word)

    # Keys are inserted in the output column order.
    record = {}
    for column in enrich_text_columns:
        record[f"{column}_count"] = len(matched[column])
    for level in enrich_priorities:
        for column in enrich_text_columns:
            record[f"{column}_count_priority_{level}"] = len(matched_by_priority[level][column])

    record["mission_has_non_us_location"] = mission_has_non_us_location
    record["mission_has_excluded_word"] = bool(mission_excluded_word_list)
    record["mission_excluded_word_list"] = ",".join(mission_excluded_word_list)
    record["business_name_has_excluded_word"] = bool(business_name_excluded_word_list)
    record["business_name_excluded_word_list"] = ",".join(business_name_excluded_word_list)

    for column in enrich_text_columns:
        record[f"{column}_matched_words"] = ",".join(sorted(matched[column]))
    for level in enrich_priorities:
        for column in enrich_text_columns:
            record[f"{column}_matched_words_priority_{level}"] = ",".join(
                sorted(matched_by_priority[level][column])
            )

    # *_key_words_list: union of the 1A, 1B and 1C matches, de-duplicated and sorted.
    for column in enrich_text_columns:
        key_words = sorted(
            {
                word
                for level in enrich_priorities
                for word in matched_by_priority[level][column]
            }
        )
        record[f"{key_words_prefix[column]}_key_words_list"] = ",".join(key_words)

    enriched_rows.append(record)

# Attach all new columns at once. Dropping them first keeps the cell re-runnable.
enriched = pd.DataFrame(enriched_rows, index=orgs.index)
orgs = pd.concat(
    [orgs.drop(columns=enriched.columns, errors="ignore"), enriched],
    axis=1,
)


0 20:21:39
10000 20:32:59
20000 20:47:19
30000 21:01:11
40000 21:14:40
50000 21:28:54
60000 21:45:08
70000 22:00:19
80000 22:15:04
90000 22:30:09
100000 22:45:12
110000 23:00:38


Enrichment is complete. Let's check some sample orgs

In [30]:
# Spot-check the new columns on a few random rows (unseeded, so the sample differs on every run)
orgs.sample(10)

Unnamed: 0,ein,return_timestamp,tax_period_end_date,business_name,address,city,state,zip,phone,business_officer_name,...,mission_matched_words_priority_1C,description_matched_words_priority_1C,business_name_matched_words_priority_1C,prog_srvc_accom_acty_2_grp_desc_matched_words_priority_1C,prog_srvc_accom_acty_3_grp_desc_matched_words_priority_1C,mission_key_words_list,description_key_words_list,business_name_key_words_list,program_2_key_words_list,program_3_key_words_list
6362,862116517,2022-11-14T16:29:41-06:00,2021-12-31,PEWIN Foundation Inc,511 Avenue of the Americas 7336,New York,NY,10011,6468834897,Meghna Desai,...,,,,,,,,,,
623,521601960,2022-03-26T11:26:26-05:00,2021-06-30,EARTHSHARE,1717 K STREET NW NO 900,WASHINGTON,DC,20006,2403330300,BRAD LEIBOV,...,work,work,,,,work,work,,,
82213,20243160,2022-01-20T08:44:29-06:00,2021-09-30,GIRL SCOUTS OF THE GREEN AND WHITE MTNS,ONE COMMERCE DRIVE,BEDFORD,NH,3110,6036274158,PATRICIA MELLOR,...,,,,,,,"entrepreneur,entrepreneurship,skill",,"skill,train",skill
23856,474433502,2022-05-12T18:29:08-05:00,2021-12-31,JOY OVERFLOW INTERNATIONAL MINISTRIES INC,417 1/2 EASTERN BLVD,ESSEX,MD,21221,4436006820,VICTOR AKINYEMI,...,,,,,,train,,,,
107752,392025582,2022-10-31T13:39:41-07:00,2021-12-31,ARTWORKS FOR MILWAUKEE INC,207 E BUFFALO STREET,MILWAUKEE,WI,53202,4147089996,TERRY MURPHY,...,work,work,work,,,"career,career readiness,skill,work","job,skill,work,workforce",work,,
74497,42491918,2022-05-16T11:59:09-05:00,2021-06-30,GREATER FALL RIVER RE-CREATION,45 ROCK ST,FALL RIVER,MA,2722,5086790922,GRACE GERLING,...,,,,,,,,,,
66542,370843671,2022-09-01T19:06:59-07:00,2021-10-31,GOLDEN GOOD SHEPHERD HOME,101 PRAIRIE MILLS ROAD,GOLDEN,IL,62339,2176964421,PAM FLESNER,...,,,,,,,,,,
41083,752564380,2022-06-30T13:17:34-05:00,2021-06-30,CASA OF TRINITY,PO BOX 2259,ATHENS,TX,75751,9036757070,KEITH LOPER,...,,work,,,,,"train,work",,,
72856,10268926,2022-05-03T07:24:52-05:00,2021-06-30,TOPSHAM PUBLIC LIBRARY,25 FORESIDE ROAD,TOPSHAM,ME,4086,2077251727,SUSAN PREECE,...,,,,,,,,,,
26921,430652650,2022-11-04T13:38:13-05:00,2021-12-31,LUTHERAN FAMILY AND CHILDREN'S SERVICES,9666 OLIVE BLVD,ST LOUIS,MO,63132,3147875100,JEFF COOK,...,,,,,,,train,,,


In [31]:
# Helper column for analysis: True where total_expenses is at least 2,000,000
orgs["build_candidate"] = orgs["total_expenses"].ge(2_000_000)

In [32]:
# Parse "tax_period_end_date" as datetimes, store it back, and derive the
# calendar year of that date as "year"
tax_period_end = pd.to_datetime(orgs["tax_period_end_date"])
orgs["tax_period_end_date"] = tax_period_end
orgs["year"] = tax_period_end.dt.year

In [33]:
# Write the enriched frame to a local CSV; the BigQuery load cell below reads this same file.
orgs.to_csv(f"orgs_v{version}.csv", index=False)

In [34]:
# Persist this table to BigQuery by loading the CSV written in the previous cell
# into analysis.cut_2_v{version}, replacing any existing table of that name.
client = bigquery.Client(location=location)
dataset_id = "analysis"
table_id = f"cut_2_v{version}"

# Client.dataset() is deprecated; build the dataset reference directly.
dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
table_ref = dataset_ref.table(table_id)

job_config = bigquery.LoadJobConfig()
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
job_config.source_format = bigquery.SourceFormat.CSV
# Schema (and the header row) are inferred from the file.
job_config.autodetect = True

with open(f"orgs_v{version}.csv", "rb") as source_file:
    job = client.load_table_from_file(source_file, table_ref, job_config=job_config)

# Block until the load job finishes: the filter query in the next cell reads this
# table, and result() raises if the load failed instead of failing silently.
job.result()
print(f"Loaded {job.output_rows} rows into {dataset_id}.{table_id}")

Apply the filters to the enriched Cut 2 table

In [39]:
# SQL that materializes analysis.cut_2_v{version}_filtered from the enriched table.
# An organization is kept when ALL of the following hold:
#   - its mission has no non-US location and no excluded word;
#   - at least one of the keyword thresholds (conditions 3-6) is met;
#   - it is not a "single 1A word and nothing else" match;
#   - total_revenue and total_expenses fall inside the stated ranges.
# `version` is interpolated into the table names.
filter_query = f"""
CREATE OR REPLACE TABLE analysis.cut_2_v{version}_filtered AS
SELECT *
FROM analysis.cut_2_v{version}
WHERE
-- Below has to be True for ALL orgs
NOT mission_has_non_us_location -- Condition 2
AND NOT mission_has_excluded_word
AND
-- Below has to be True for ANY org
( mission_count_priority_1A >= 1  -- Condition 3
  OR mission_count_priority_1B >= 2 -- Condition 4
  OR (mission_count_priority_1B + mission_count_priority_1C >= 3) -- Condition 5
  OR mission_count >= 6 -- Condition 6
)
-- The new condition for removing orgs that has a single 1A word but nothing else
AND NOT (
  mission_count = 1
  AND mission_count_priority_1A = 1
  AND description_count_priority_1A = 0
  AND prog_srvc_accom_acty_2_grp_desc_count_priority_1A = 0
  AND prog_srvc_accom_acty_3_grp_desc_count_priority_1A = 0
)
-- Reapply the revenue conditions (because additional orgs didn't go through that check)
AND total_revenue BETWEEN 250000 AND 24000000
AND total_expenses BETWEEN 250000 AND 18000000
;
"""

In [40]:
# Run the filter query and wait for it to finish: the historical-table query in
# a later cell reads the table created here, and result() raises if the query failed.
filter_query_job = bigquery_client.query(filter_query)
filter_query_job.result()

Create the historical table

In [41]:
# SQL that materializes analysis.cut_2_v{version}_filtered_historical.
# Each filtered organization (c) is LEFT JOINed to original.irs_990_latest (i) on
# EIN, with c.ein cast to STRING for the comparison. The columns listed in EXCEPT
# are dropped from the cut-2 side and every column of i is selected instead
# (presumably to avoid duplicate column names; confirm). return_year is the year
# of i.tax_period_end_date.
history_query = f"""
CREATE OR REPLACE TABLE analysis.cut_2_v{version}_filtered_historical AS
SELECT c.* EXCEPT
  (
    ein,
    return_timestamp,
    tax_period_end_date,
    business_name,
    address,
    city,
    state,
    zip,
    phone,
    business_officer_name,
    business_officer_title,
    website,
    formation_year,
    mission,
    voting_members_count,
    total_employee_count,
    total_volunteer_count,
    revenue_from_grants,
    revenue_from_program_services,
    revenue_from_investment,
    revenue_from_other,
    total_revenue,
    expenses_for_grants,
    expenses_for_members,
    expenses_for_employees,
    expenses_for_professional_fundraising,
    expenses_for_other,
    total_expenses,
    principal_office_salary,
    states_where_form_990_filed,
    states_where_form_990_filed_count,
    description,
    prog_srvc_accom_acty_2_grp_desc,
    prog_srvc_accom_acty_3_grp_desc,
    prog_srvc_accom_acty_other_grp_desc,
    mission_desc,
    other_expenses_group_desc
  ),
  i.*,
  EXTRACT(YEAR FROM i.tax_period_end_date) AS return_year
FROM analysis.cut_2_v{version}_filtered AS c
LEFT JOIN original.irs_990_latest AS i ON CAST(c.ein AS STRING) = i.ein
;
"""

In [42]:
# Run the historical-table query and wait for it, so that a failure is raised
# here instead of being left unnoticed on the job object.
history_query_job = bigquery_client.query(history_query)
history_query_job.result()