In [4]:
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import csv
from urllib.parse import urlencode
import pandas as pd
import time

class Node:
    def __init__(self, name, state, zipcode, students, income):
        self.name = name
        self.state = state
        self.zipcode = zipcode
        self.students = students
        self.income = income
        self.left = None
        self.right = None

def extract_source(url):
    params = {'api_key': API_KEY, 'url': url}
    for _ in range(NUM_RETRIES):
        try:
            response = requests.get('http://api.scraperapi.com/', params=urlencode(params))
            if response.status_code in [200, 404]:
                break
        except requests.exceptions.ConnectionError:
            response = ''
    if response.status_code == 200:
        source = response.text
    return source

def extract_link(source):
    links = []
    soup=BeautifulSoup(source, features="html.parser")
    school=soup.find_all('li', class_ = "search-results__list__item")
    for x in school:
        school = None
        for y in x.find_all('a', class_ = "MuiTypography-root MuiTypography-inherit MuiLink-root MuiLink-underlineNone search-result__link nss-cyqlwo"):
            school = y['href']
            links.append(school)
    return links

def scrape_url(url):
    soup = BeautifulSoup(extract_source(url), features="html.parser")
    name = soup.find('h1', class_ = "postcard__title").text
    address = soup.find('address', class_ = "profile__address--compact").text
    state = address[-8:-6]
    zipcode = address[-5:]
    students = soup.find('section', class_ = "block--two-poll")
    if students is None:
        students = soup.find('section', class_ = "block--two-poll--no-poll block--two-poll--expansion")
        students = students.find('div', class_ = "scalar__value").text
    else:
        students = students.find('div', class_ = "scalar__value").text
    income = soup.find_all('div', class_ = "profile__bucket--5")
    if len(income) == 1:
        income = 'Online'
    else:
        income = income[1].text
        start = income.find('$')
        end = income.find(',')+4
        income = income[start:end]
    data.append([name, state, zipcode, students, income])

API_KEY = '006192aace270d7f21e27e69f9bc1c1f'
NUM_RETRIES = 5
NUM_THREADS = 5
page_beg = 'https://www.niche.com/k12/search/largest-public-high-schools/?page='    
fields = ['Name', 'State', 'Zip Code', 'Students', 'Income']
data = []
schools = []

def buildTree(data):
    if len(data) == 0:
        return
    middle, students = len(data) // 2, data[len(data) // 2][3]
    students = int(students.replace(',', ''))
    new_node = Node(data[middle][0], data[middle][1], data[middle][2], students, data[middle][4])
    new_node.left = buildTree(data[:middle])
    new_node.right = buildTree(data[middle+1:])
    return new_node

def findSchool(root, limit):
    if not root:
        return
    if root.students < limit:
        schools.append([root.name, root.state, root.zipcode, root.students, root.income])
    findSchool(root.left, limit)
    findSchool(root.right, limit)

with open('test.csv', 'w', newline = '') as f:
    write = csv.writer(f)
    write.writerow(fields)
    for i in range(1,3):
        page = page_beg + str(i)
        start = time.time()
        print("Link " + str(i))
        links = extract_link(extract_source(page))
        with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
            executor.map(scrape_url, links)
        data.sort(key= lambda x: x[3])
        write.writerows(data)
        print(str(len(data)) + " schools pulled")
        tree = buildTree(data)
        findSchool(tree, 10000)
        print(schools)
        end = time.time()
        print(end - start)
        data = []

Link 1
25 schools pulled
[['Indiana Connections Academy', 'IN', '46278', 6687, 'Online'], ['Granada Hills Charter', 'CA', '91344', 5462, '$65,290'], ['North Star Academy Charter School of Newark', 'NJ', '07102', 6298, '$37,675'], ['Brooklyn Technical High School', 'NY', '11217', 5943, '$97,055'], ['Ohio Connections Academy', 'OH', '44114', 5709, 'Online'], ['Hendry Virtual Instruction Program', 'FL', '33935', 6490, 'Online'], ['South Carolina Connections Academy', 'SC', '29210', 6298, 'Online'], ['Volusia Virtual Instruction Program District Provided', 'FL', '32129', 7573, 'Online'], ['Visions in Education', 'CA', '95608', 7189, '$70,684'], ['Idaho Home Learning Academy', 'ID', '83252', 7003, 'Online'], ['Blue Ridge Academy', 'CA', '93252', 6967, 'Online'], ['Primavera - Online', 'AZ', '85225', 7046, 'Online'], ['River Springs Charter School', 'CA', '92590', 7363, '$98,631'], ['Agora Cyber Charter School', 'PA', '19406', 7337, 'Online'], ['Reach Cyber Charter School', 'PA', '17111', 81