In [23]:
import requests  # To get the pages
from bs4 import BeautifulSoup # and to process them
from bs4.element import Comment

from time import sleep      # Allowing us to pause between pulls
from random import random   # And allowing that pause to be random

import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk import FreqDist

In [24]:
##functions used for scraping code

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True



##Creating a file name for each URL from the nonprofits
def generate_filename_from_url(url) :
    
    if not url :
        return None
    
    # drop the http or https
    name = url.replace("https","").replace("http","")

    # Replace useless chareacters with UNDERSCORE
    name = name.replace("://","").replace(".","_").replace("/","_")
    
    # remove last underscore
    last_underscore_spot = name.rfind("_")
    
    name = name[:last_underscore_spot] + name[(last_underscore_spot+1):]

    # tack on .txt
    name = name + ".txt"
    
    return(name)



## Small Nonprofits

In [25]:
##Creating a dictionary to store the links to each nonprofit's mission/vision/values statements

small_nonprofit_pages = dict()

small_nonprofit_pages["about_us"] = """
https://www.sonoraninstitute.org/our-story/mission-vision/
https://www.internationalconservation.org/about
https://www.cecsb.org/about/our-mission/
https://www.friends.org/about-us/our-story
https://www.earthshare.org/about-earthshare/
https://www.chattahoochee.org/about/
https://www.conservationnw.org/about-us/
https://www.qlf.org/about-qlf/mission-vision/
https://www.globalgreen.org/
https://www.sustainablenorthwest.org/focus
""".split()


In [29]:
for page in small_nonprofit_pages :  
    for link in small_nonprofit_pages[page] : 
        output_file_name = generate_filename_from_url(link) #Using the function created above, create a file name
        
        # pull the page 
        try:
            r = requests.get(link)
        except :
            pass
        
        # process the page if r status code is 200 (successful pull)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            texts=soup.findAll(text=True)
            visible_texts = filter(tag_visible, texts)
         # write out the page to a file with the appropriate name
        with open(output_file_name,'w',encoding = "UTF-8") as outfile :
            outfile.write(" ".join(t.strip() for t in visible_texts))
    
        # Pause for a bit
    wait_time = 5 + random()*10
    print(f"Waiting for {wait_time:.02f} seconds.")
        
    sleep(wait_time)
            

Waiting for 9.80 seconds.


In [30]:
small_MVVs = [
             "www_cecsb_org_about_our-mission.txt",
             "www_chattahoochee_org_about.txt",
             "www_conservationnw_org_about-us.txt",
             "www_earthshare_org_about-earthshare.txt",
             "www_friends_org_about-usour-story.txt",
             "www_globalgreen_org.txt",
             "www_internationalconservation_orgabout.txt",
             "www_qlf_org_about-qlf_mission-vision.txt",
             "www_sonoraninstitute_org_our-story_mission-vision.txt",
             "www_sustainablenorthwest_orgfocus.txt"]

## Medium Nonprofits

In [31]:
##Creating a dictionary to store the links to each nonprofit's mission/vision/values statements

medium_nonprofit_pages = dict()

medium_nonprofit_pages["about_us"] = """
https://www.wildearthguardians.org/about-us/mission-vision-history/
https://www.wta.org/our-work/about
https://www.ncascades.org/discover/north-cascades-institute
https://www.pcta.org/about-us/our-mission-vision-and-values/
https://www.pachamama.org/about/mission
https://www.treepeople.org/our-work/
https://www.mohonkpreserve.org/what-we-do/
https://www.pecpa.org/about/
https://www.stand.earth/about/mission-and-principles
https://www.earthisland.org/index.php/aboutUs/about-earth-island
""".split()

In [32]:
for page in medium_nonprofit_pages :  
    for link in medium_nonprofit_pages[page] : 
        output_file_name = generate_filename_from_url(link) #Using the function created above, create a file name
        
        # pull the page 
        try:
            r = requests.get(link)
        except :
            pass
        
        # process the page if r status code is 200 (successful pull)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            texts=soup.findAll(text=True)
            visible_texts = filter(tag_visible, texts)
         # write out the page to a file with the appropriate name
        with open(output_file_name,'w',encoding = "UTF-8") as outfile :
            outfile.write(" ".join(t.strip() for t in visible_texts))
    
        # Pause for a bit
    wait_time = 5 + random()*10
    print(f"Waiting for {wait_time:.02f} seconds.")
        
    sleep(wait_time)
            



Waiting for 13.03 seconds.


In [33]:
medium_nonprofits = ["www_earthisland_org_index_php_aboutUsabout-earth-island.txt",
                    "www_mohonkpreserve_org_what-we-do.txt",
                    "www_ncascades_org_discovernorth-cascades-institute.txt",
                    "www_pachamama_org_aboutmission.txt",
                    "www_pcta_org_about-us_our-mission-vision-and-values.txt",
                    "www_pecpa_org_about.txt",
                    "www_stand_earth_aboutmission-and-principles.txt",
                    "www_treepeople_org_our-work.txt",
                    "www_wildearthguardians_org_about-us_mission-vision-history.txt",
                    "www_wta_org_our-workabout.txt"]

## Large Nonprofits

In [35]:
##Creating a dictionary to store the links to each nonprofit's mission/vision/values statements

large_nonprofit_pages = dict()

large_nonprofit_pages["about_us"] = """
https://www.ceres.org/about-us
https://www.cbf.org/about-cbf/our-mission/
https://www.grownyc.org/about
https://www.rmi.org/about/
https://www.earthjustice.org/about
https://www.rff.org/about/
https://www.waterkeeper.org/who-we-are/
https://www.tpwf.org/our-story/
https://www.thesca.org/about
https://www.climaterealityproject.org/our-mission
""".split()

In [36]:
for page in large_nonprofit_pages :  
    for link in large_nonprofit_pages[page] : 
        output_file_name = generate_filename_from_url(link) #Using the function created above, create a file name
        
        # pull the page 
        try:
            r = requests.get(link)
        except :
            pass
        
        # process the page if r status code is 200 (successful pull)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            texts=soup.findAll(text=True)
            visible_texts = filter(tag_visible, texts)
         # write out the page to a file with the appropriate name
        with open(output_file_name,'w',encoding = "UTF-8") as outfile :
            outfile.write(" ".join(t.strip() for t in visible_texts))
    
        # Pause for a bit
    wait_time = 5 + random()*10
    print(f"Waiting for {wait_time:.02f} seconds.")
        
    sleep(wait_time)

Waiting for 11.38 seconds.


In [37]:
large_nonprofits = ["www_cbf_org_about-cbf_our-mission.txt",
                    "www_ceres_orgabout-us.txt",
                    "www_climaterealityproject_orgour-mission.txt",
                    "www_earthjustice_orgabout.txt",
                    "www_grownyc_orgabout.txt",
                    "www_rff_org_about.txt",
                    "www_rmi_org_about.txt",
                    "www_thesca_orgabout.txt",
                    "www_tpwf_org_our-story.txt",
                    "www_waterkeeper_org_who-we-are.txt"]

In [40]:
### Fixing Error files that came up blank

error_links= dict()

error_links["about_us"] = """
https://earthshare.org/about-earthshare/
https://internationalconservation.org/about
https://rmi.org/about/
https://tpwf.org/our-story/
https://pcta.org/about-us/our-mission-vision-and-values/
""".split()

In [46]:
for page in error_links :  
    for link in error_links[page] : 
        output_file_name = generate_filename_from_url(link) #Using the function created above, create a file name
        
        # pull the page 
        try:
            r = requests.get(link)
        except :
            pass
        
        # process the page if r status code is 200 (successful pull)
#         if r.status_code == 200:
#             soup = BeautifulSoup(r.text, 'html.parser')
#             texts=soup.findAll(text=True)
#             visible_texts = filter(tag_visible, texts)

            soup = BeautifulSoup(r.text, 'html.parser')
            texts=soup.findAll(text=True)
            visible_texts = filter(tag_visible, texts)


         # write out the page to a file with the appropriate name
        with open(output_file_name,'w',encoding = "Latin-1") as outfile :
            outfile.write(" ".join(t.strip() for t in visible_texts))
    
        # Pause for a bit
    wait_time = 5 + random()*10
    print(f"Waiting for {wait_time:.02f} seconds.")
        
    sleep(wait_time)

Waiting for 12.47 seconds.


## Security Issue

The five above links had some sort of security barrier: so I just copied and pasted MVVs from websites into new files