<a href="https://colab.research.google.com/github/mtsu-data-science/mtsu-hackmt-2022/blob/main/project_maslow_starter_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import shutil
import urllib.request

import pandas as pd

In [2]:
def download_project_maslow_files():
    """Make sure the Project Maslow data files are available under ``data/``.

    Each step is skipped when its result already exists:

    1. Create the ``data`` directory.
    2. Write ``data/.gitignore`` containing the pattern ``*.txt`` so source
       control does not pick up the downloaded txt files.
    3. Download ``nonprofit.txt`` and ``nonprofit_text.txt`` into ``data/``.

    A data file that exists but is empty is treated as missing and is
    downloaded again.

    Raises:
        urllib.error.URLError: if a download fails (``HTTPError`` is a
            subclass); no partial file is left at the final path.
        OSError: if the directory or one of the files cannot be written.
    """
    data_dir = "data"
    base_url = "https://mtsu-dsi-hackathon-2022.s3.amazonaws.com"

    if not os.path.isdir(data_dir):
        print("Data folder does not exist, recreating now")
        os.makedirs(data_dir, exist_ok=True)

    gitignore_path = os.path.join(data_dir, ".gitignore")
    if not os.path.isfile(gitignore_path):
        # Written from Python instead of `echo *.txt > ...`: a shell expands
        # the unquoted glob to the names of any .txt files in the working
        # directory, which would store file names rather than the pattern.
        with open(gitignore_path, "w") as gitignore:
            gitignore.write("*.txt\n")

    for file_name in ("nonprofit.txt", "nonprofit_text.txt"):
        target_path = os.path.join(data_dir, file_name)

        # A zero-byte file is what an interrupted/failed download leaves
        # behind, so it must not count as "already downloaded".
        if os.path.isfile(target_path) and os.path.getsize(target_path) > 0:
            continue

        print(f"Downloading {file_name}...")

        # Stream into a temporary sibling and rename only on success, so the
        # final path never holds a truncated file.
        partial_path = target_path + ".part"
        try:
            with urllib.request.urlopen(f"{base_url}/{file_name}") as response:
                with open(partial_path, "wb") as partial_file:
                    shutil.copyfileobj(response, partial_file)
            os.replace(partial_path, target_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

In [3]:
def get_non_profit_df():
    """Load the nonprofit table into a dataframe.

    download_project_maslow_files() is called first so that the data files
    are fetched into the `data/` folder when they are not there yet.  The
    pipe-delimited file `data/nonprofit.txt` is then parsed and returned.

    Column dtypes are pinned explicitly: nonprofit_id, reporting_year and ein
    are read as "Int64"; every other declared column is read as "str".
    """
    download_project_maslow_files()

    integer_columns = ("nonprofit_id", "reporting_year", "ein")
    text_columns = (
        "businessname",
        "phone",
        "address1",
        "address2",
        "city",
        "stabbrv",
        "zip",
    )
    dtype_by_column = {
        **dict.fromkeys(integer_columns, "Int64"),
        **dict.fromkeys(text_columns, "str"),
    }

    return pd.read_csv("data/nonprofit.txt", sep="|", dtype=dtype_by_column)
# Load the nonprofit table once and keep it in `df_np`; the bare
# `df_np.head()` as the cell's last expression displays its first rows.
df_np = get_non_profit_df()
df_np.head()

Data folder does not exist, recreating now
Downloading nonprofit.txt...
Downloading nonprofit_text.txt...


Unnamed: 0,nonprofit_id,reporting_year,ein,businessname,phone,address1,address2,city,stabbrv,zip
0,10,2021,10274998,MOUNT ST JOSEPH,2078730705,7 HIGHWOOD STREET,7 HIGHWOOD STREET,WATERVILLE,ME,4901
1,11,2020,10275026,BELFAST CURLING CLUB,2073389851,PO BOX 281 BELMONT AVE,PO BOX 281 BELMONT AVE,BELFAST,ME,4915
2,12,2021,10275130,Unity College,2075097100,90 Quaker Hill Road,90 Quaker Hill Road,Unity,ME,4988
3,13,2020,10275156,Western Maine Community Action Inc,2076453764,20 Church Street,20 Church Street,East Wilton,ME,4324
4,14,2020,10275159,JUNIOR ACHIEVEMENT OF MAINE INC,2073474333,565 CONGRESS STREET NO 306,565 CONGRESS STREET NO 306,PORTLAND,ME,4101


In [4]:
def get_non_profit_text_df():
    """Load the nonprofit_text table into a dataframe.

    download_project_maslow_files() is called first so that the data files
    are fetched into the `data/` folder when they are not there yet.  The
    pipe-delimited file `data/nonprofit_text.txt` is then decoded as cp1252,
    parsed and returned.

    Column dtypes are pinned explicitly: nonprofit_text_id, reporting_year and
    nonprofit_id are read as "Int64"; grouptype and description as "str".
    """
    download_project_maslow_files()

    integer_columns = ("nonprofit_text_id", "reporting_year", "nonprofit_id")
    text_columns = ("grouptype", "description")
    dtype_by_column = {
        **dict.fromkeys(integer_columns, "Int64"),
        **dict.fromkeys(text_columns, "str"),
    }

    return pd.read_csv(
        "data/nonprofit_text.txt",
        sep="|",
        encoding="cp1252",
        dtype=dtype_by_column,
    )

# Load the nonprofit_text table once and keep it in `df_text`; the bare
# `df_text.head()` as the cell's last expression displays its first rows.
df_text = get_non_profit_text_df()
df_text.head()

Unnamed: 0,nonprofit_text_id,reporting_year,nonprofit_id,grouptype,description
0,10,2020,4553,charitablegroup,MAINTAIN AND BEAUTIFY THE DEGREGORIE PARK MAIN...
1,11,2019,4978,charitablegroup,PROVIDING HOUSING AND RESIDENTIAL SERVICES FOR...
2,12,2017,37,charitablegroup,PROVIDING SCHOLARSHIPS AND EDUCATIONS ASSISTAN...
3,13,2020,78,charitablegroup,SUBSISTENCE GRANTS TO INDIGENT SEAMEN BENEFICI...
4,14,2020,81,charitablegroup,The organization's resources are devoted entir...
