# NPO Crawler v1.1

Updated on 2024/08/07, by Xiang-Yi Huang

### Step 0: Install Required Packages

In [1]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel. requests and pandas are imported below, and openpyxl
# is the engine pandas uses to write the .xlsx file in save_to_excel.
%pip install beautifulsoup4 requests pandas openpyxl




[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





### Step 1: Let's Crawl!
In this version, only NPOs in Taipei and New Taipei are crawled.

In [46]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
import threading

# Lock shared by all worker threads: every progress/error message is printed
# inside `with print_lock:` so lines from concurrent fetches do not interleave.
print_lock = threading.Lock()

# Substrings that mark an address as being in Taipei (both spellings) or New Taipei
_TARGET_CITIES = ("臺北", "台北", "新北")


def _text_after_label(container, label):
    """Return the stripped text following `label` in the first matching string of `container`.

    Best-effort: returns "" when the container is missing, no string contains
    the label, or anything else goes wrong while parsing.
    """
    try:
        node = container.find(string=lambda t: label in t)
        return node.split(label)[1].strip() if node else ""
    except Exception:
        return ""


def _link_text_after_label(container, label):
    """Return the text of the first <a> found under the parent of the string containing `label`.

    Best-effort: returns "" when the label or the link is missing.
    """
    try:
        node = container.find(string=lambda t: label in t)
        return node.parent.find("a").get_text(strip=True) or ""
    except Exception:
        return ""


def _phone_after_label(container, label="電話："):
    """Return the phone number: the <a> text plus any text that follows it.

    The number is read from the full text of the link's parent element (so an
    extension written after the link is kept), taking what follows `label`.
    Best-effort: returns "" when the label or the link is missing.
    """
    try:
        node = container.find(string=lambda t: label in t)
        link = node.parent.find("a")
        if not link:
            return ""
        combined = link.parent.get_text(strip=True)
        return combined.split(label)[1].strip() if combined else ""
    except Exception:
        return ""


def _organization_attribute(soup):
    """Return the text between "機構屬性：" and "服務項目" in the profile block, or ""."""
    try:
        profile_text = soup.find("div", class_="profile off").get_text(strip=True)
        if "機構屬性：" in profile_text and "服務項目" in profile_text:
            return profile_text.split("機構屬性：")[1].split("服務項目")[0].strip()
        return ""
    except Exception:
        return ""


# Function to fetch data for a specific orgid
def fetch_data(orgid, timeout=10):
    """Download and parse the introduction page of one organization.

    Args:
        orgid: Value substituted into the `orgid` query parameter of the page URL.
        timeout: Seconds passed to `requests.get` as its `timeout`, so a stalled
            server cannot block a worker thread indefinitely.

    Returns:
        A dict with the keys 機構代碼, 機構名稱, 機構屬性, 執行長, 聯絡人, 電話, 傳真,
        網址, 電子郵件 and 地址 (missing fields are ""), or None when the address
        does not contain any of the target city names or when the request or
        parsing fails.
    """
    url = f"https://www.npo.org.tw/orgnpointroduction.aspx?tid=200&orgid={orgid}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the address first to decide whether to continue parsing.
        # A page without the intro_list block yields an empty address and is skipped.
        intro_list = soup.find("div", class_="intro_list")
        address = _text_after_label(intro_list, "地址：")

        # Filter by address containing "臺北", "台北", or "新北"
        if not any(city in address for city in _TARGET_CITIES):
            with print_lock:
                print(f"Skipping orgid {orgid} (Address does not match)", flush=True)
            return None  # Skip if the address does not meet the condition

        record = {
            "機構代碼": _text_after_label(intro_list, "機構代碼："),
            "機構名稱": _text_after_label(intro_list, "機構名稱："),
            "機構屬性": _organization_attribute(soup),
            "執行長": _text_after_label(intro_list, "執行長："),
            "聯絡人": _text_after_label(intro_list, "聯絡人："),
            "電話": _phone_after_label(intro_list),
            "傳真": _text_after_label(intro_list, "傳真："),
            "網址": _link_text_after_label(intro_list, "網址："),
            "電子郵件": _link_text_after_label(intro_list, "電子郵件："),
            "地址": address
        }

        # Output progress
        with print_lock:
            print(f"Processed orgid {orgid}", flush=True)

        return record

    except Exception as e:
        # Network errors, HTTP error statuses and timeouts all end up here
        with print_lock:
            print(f"Error fetching orgid {orgid}: {e}", flush=True)
        return None

# Function to fetch data for all orgids using multithreading
def fetch_all_data(start_id, end_id, max_workers=20):
    """Fetch every orgid in the inclusive range [start_id, end_id] concurrently.

    Args:
        start_id: First orgid to fetch.
        end_id: Last orgid to fetch (inclusive). An empty range yields [].
        max_workers: Size of the thread pool, i.e. the maximum number of
            fetches running at the same time.

    Returns:
        A list of the truthy results of `fetch_data`, in completion order
        (not orgid order); None results (skipped or failed ids) are dropped.
    """
    data = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_data, orgid) for orgid in range(start_id, end_id + 1)]
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result:  # Only keep records; None means skipped or failed
                data.append(result)
    return data

# Function to save the data into an Excel file
def save_to_excel(data, filename):
    """Write the crawled records to an Excel file, sorted by organization code.

    Args:
        data: List of record dicts as returned by `fetch_all_data`; may be empty.
        filename: Path of the Excel file to write.

    The "機構代碼" column is converted to numbers before sorting; values that
    cannot be converted become NaN and sort last.
    NOTE(review): the numeric conversion would drop leading zeros from codes, if
    the site uses any — confirm this is acceptable.
    """
    df = pd.DataFrame(data)

    # With no records the frame has no columns at all, so only coerce and sort
    # when the code column is actually present (avoids a KeyError on empty input).
    if "機構代碼" in df.columns:
        df["機構代碼"] = pd.to_numeric(df["機構代碼"], errors='coerce')
        df = df.sort_values(by="機構代碼")

    df.to_excel(filename, index=False)

In [47]:
# Driver cell (top-level statements, not a function): crawl the inclusive
# orgid range [start_id, end_id] and export the matching records to Excel.
start_id = 7773
end_id = 7785
data = fetch_all_data(start_id, end_id)
save_to_excel(data, "npo_data1.xlsx")

Processed orgid 7774
Processed orgid 7773
Skipping orgid 7782 (Address does not match)
Processed orgid 7785
Processed orgid 7778
Skipping orgid 7781 (Address does not match)
Processed orgid 7777
Skipping orgid 7779 (Address does not match)
Processed orgid 7775
Processed orgid 7780
Processed orgid 7784
Processed orgid 7776
Skipping orgid 7783 (Address does not match)
