In [1]:
import re
import time
import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

In [4]:
chromedriver = f'/Users/nailiding/Downloads/downloads_from_chrome/chromedriver'
browser = webdriver.Chrome(executable_path=chromedriver)

In [505]:
browser.get('https://tools.nycenet.edu/snapshot/2019/')

In [33]:
text_box = browser.find_element_by_tag_name('input')
text_box.click()

lst = browser.find_element_by_tag_name('ul')
schools = lst.text.split('\n')

text_box.send_keys(schools[0])
text_box.send_keys(Keys.ENTER)

In [26]:
lst = browser.find_element_by_tag_name('ul')

In [27]:
schools = lst.text.split('\n')

In [28]:
schools[:10]

['P.S. 015 Roberto Clemente - [ES] 01M015',
 'P.S. 015 Roberto Clemente - [Pre-K] 01M015',
 'P.S. 019 Asher Levy - [ES] 01M019',
 'P.S. 019 Asher Levy - [Pre-K] 01M019',
 'P.S. 020 Anna Silver - [ES] 01M020',
 'P.S. 020 Anna Silver - [Pre-K] 01M020',
 'P.S. 034 Franklin D. Roosevelt - [K-8] 01M034',
 'P.S. 034 Franklin D. Roosevelt - [Pre-K] 01M034',
 'The STAR Academy - P.S.63 - [ES] 01M063',
 'The STAR Academy - P.S.63 - [Pre-K] 01M063']

In [29]:
text_box.send_keys(schools[0])

In [30]:
text_box.send_keys(Keys.ENTER)

In [506]:
def get_browser(url, chromedriver):
    browser = webdriver.Chrome(executable_path=chromedriver)
    browser.get(url)
    wait = WebDriverWait(browser, 20) # maximum wait time is 20 seconds 
    return browser, wait

def get_schools(wait):
    input_box = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'input')))
    input_box.click()
    elem_schools = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'ul')))
    schools = list(filter(lambda x: 'Pre-K' not in x, elem_schools.text.split('\n')))
    return schools

def extract_num(text):
    try:
        # pattern to find any number (int or float)
        pattern = r'[-+]?\d*\.\d+|\d+'
        result = re.findall(pattern, text)[0]
        return float(result)
    except:
        return np.nan
    
def get_school_si_gen_info(wait):
    
    enroll, asian, black, hispanic, white = np.nan, np.nan, np.nan, np.nan, np.nan
    try:
        elem_gen_info = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='metric-group gen']")))
        gen_info = elem_gen_info.text.split('\n')

        for item in gen_info:
            if 'enrollment' in item.lower():
                item = item.replace(',', '')
                enroll = extract_num(item)

            if 'asian' in item.lower():
                asian = extract_num(item)/100

            if 'black' in item.lower():
                black = extract_num(item)/100

            if 'hispanic' in item.lower():
                hispanic = extract_num(item)/100

            if 'white' in item.lower():
                white = extract_num(item)/100
        return enroll, asian, black, hispanic, white   
    except:
        return enroll, asian, black, hispanic, white
    
def get_school_si_loc(wait):
    address, borough, city, state, zipcode = None, None, 'New York', None, None
    try:
        elem_loc = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='metric-group location']")))
        location = elem_loc.text.split('\n')
        address = location[1]
        regions = location[2].split(', ')
        borough = regions[0]
        state = regions[1].split(' ')[0]
        zipcode = regions[1].split(' ')[1]
        return address, borough, city, state, zipcode
    except:
        return address, borough, city, state, zipcode
        
def get_school_si(wait):
    tab_info = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='tab-button-info']")))
    tab_info.click()
    
    elem_gen_info = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='metric-group gen']")))
    gen_info = elem_gen_info.text.split('\n')
    
    enroll, asian, black, hispanic, white = get_school_si_gen_info(wait)
    address, borough, city, state, zipcode = get_school_si_loc(wait)
    
    si = [
        address, 
        borough, 
        city, 
        state, 
        zipcode,
        enroll, 
        asian, 
        black, 
        hispanic, 
        white,
    ]
    
    print(si)
    
    return si

def get_school_sa(wait):
    tab_sa = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='tab-button-sa']")))
    tab_sa.click()
    
    xpath_sa = "//div[@class='tab-content print-always']"
    elem_overall = wait.until(EC.presence_of_element_located((By.XPATH, xpath_sa)))
    overall_rating = elem_overall.find_element_by_class_name('rating-description').text
    
    value_path = "div[@class='metric-bignum']/div[@class='school-value']"
    xpath_engl = f"//div[@class='metric-group perf'][h3='English']/{value_path}"
    elem_engl = wait.until(EC.presence_of_element_located((By.XPATH, xpath_engl)))
    xpath_math = f"//div[@class='metric-group perf'][h3='Math']/{value_path}"
    elem_math = wait.until(EC.presence_of_element_located((By.XPATH, xpath_math)))
    
    math_score = extract_num(elem_math.text)/100
    engl_score = extract_num(elem_engl.text)/100
    
    sa = [
        overall_rating, 
        engl_score, 
        math_score,
    ]

    return sa

def get_school_ct(wait):
    tab_ct = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='tab-button-ct']")))
    tab_ct.click()
    
    xpath_ct = "//div[@class='fr-tab-content page-bottom']/div[@class='element-overall-rating']"
    elem_ct = wait.until(EC.presence_of_element_located((By.XPATH, xpath_ct)))
    ct_rating = elem_ct.find_element_by_class_name('rating-description').text
    
    return [ct_rating]

def get_school_se(wait):
    tab_se = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='tab-button-se']")))
    tab_se.click()
    
    xpath_se = "//div[@class='tab-content print-always']"
    elem_se = wait.until(EC.presence_of_element_located((By.XPATH, xpath_se)))
    se_rating = elem_se.find_element_by_class_name('rating-description').text
    
    return [se_rating]

def get_school_sf(wait):
    tab_sf = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='tab-button-sf']")))
    tab_sf.click()
    
    xpath_sf = "//div[@class='tab-content print-always']"
    elem_sf = wait.until(EC.presence_of_element_located((By.XPATH, xpath_sf)))
    sf_rating = elem_sf.find_element_by_class_name('rating-description').text
    
    return [sf_rating]

def get_school_tr(wait):
    tab_tr = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='tab-button-tr']")))
    tab_tr.click()
    
    xpath_tr = "//div[@class='tab-content print-always']"
    elem_tr = wait.until(EC.presence_of_element_located((By.XPATH, xpath_tr)))
    tr_rating = elem_tr.find_element_by_class_name('rating-description').text
    
    return [tr_rating]

def get_school_es(wait):
    tab_es = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='tab-button-es']")))
    tab_es.click()
    
    xpath_es = "//div[@class='tab-content print-always']"
    elem_es = wait.until(EC.presence_of_element_located((By.XPATH, xpath_es)))
    es_rating = elem_es.find_element_by_class_name('rating-description').text
    
    return [es_rating]

def get_school_data(browser, url, school_name):
    # reset browser to the search box 
    browser.get(url)
    wait = WebDriverWait(browser, 20)
    input_box = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'input')))
    input_box.send_keys(school_name)
    input_clickable = wait.until(EC.element_to_be_clickable((By.XPATH, "//li[@id='result-item-0']")))
    input_clickable.click()
    
    si = get_school_si(wait)
    sa = get_school_sa(wait)
    ct = get_school_ct(wait)
    se = get_school_se(wait)
    es = get_school_es(wait)
    sf = get_school_sf(wait)
    tr = get_school_tr(wait)
    
    data = si+sa+ct+se+es+sf+tr 
    
    return data

def get_all_schools_data(browser, url, schools):
    schools_data = []
    for school_name in schools:
        data = get_school_data(browser, url, school_name)
        time.sleep(3)
        schools_data.append(data)
    return schools_data

In [473]:
url = 'https://tools.nycenet.edu/snapshot/2019/'
browser, wait = get_browser(url, chromedriver)

In [474]:
schools = get_schools(wait)

In [87]:
schools

['P.S. 015 Roberto Clemente - [ES] 01M015',
 'P.S. 019 Asher Levy - [ES] 01M019',
 'P.S. 020 Anna Silver - [ES] 01M020',
 'P.S. 034 Franklin D. Roosevelt - [K-8] 01M034',
 'The STAR Academy - P.S.63 - [ES] 01M063',
 'P.S. 064 Robert Simon - [ES] 01M064',
 'P.S. 110 Florence Nightingale - [ES] 01M110',
 'P.S. 134 Henrietta Szold - [ES] 01M134',
 'P.S. 140 Nathan Straus - [K-8] 01M140',
 'P.S. 142 Amalia Castro - [ES] 01M142',
 'P.S. 184m Shuang Wen - [K-8] 01M184',
 'P.S. 188 The Island School - [K-8] 01M188',
 'Orchard Collegiate Academy - [HS] 01M292',
 'The East Village Community School - [ES] 01M315',
 'University Neighborhood Middle School - [MS] 01M332',
 "The Children's Workshop School - [ES] 01M361",
 'Neighborhood School - [ES] 01M363',
 'Earth School - [ES] 01M364',
 'School for Global Leaders - [MS] 01M378',
 'University Neighborhood High School - [HS] 01M448',
 'East Side Community School - [HS] 01M450',
 'East Side Community School - [MS] 01M450',
 'Forsyth Satellite Academ

In [507]:
get_school_data(browser, url, 'P.S. 212 Midtown West - [ES] 02M212')

ElementNotInteractableException: Message: element not interactable
  (Session info: chrome=78.0.3904.108)


In [475]:
get_all_schools_data(browser, url, schools[:5])

['333 East 4 Street', 'Manhattan', 'New York', 'NY', '10009', 161.0, 0.12, 0.28, 0.55, 0.04, 'Excellent', 0.62, 0.65, 'Excellent', 'Excellent', 'Excellent', 'Excellent', 'Good']
['185 1 Avenue', 'Manhattan', 'New York', 'NY', '10003', 239.0, 0.06, 0.22, 0.62, 0.07, 'Excellent', 0.59, 0.48, 'Excellent', 'Good', 'Excellent', 'Excellent', 'Good']
['166 Essex Street', 'Manhattan', 'New York', 'NY', '10002', 439.0, 0.29, 0.13, 0.51, 0.04, 'Good', 0.36, 0.37, 'Fair', 'Fair', 'Fair', 'Good', 'Good']
['730 East 12 Street', 'Manhattan', 'New York', 'NY', '10009', 288.0, 0.03, 0.33, 0.6, 0.03, 'Needs Improvement', 0.2, 0.11, 'Fair', 'Fair', 'Fair', 'Good', 'Fair']
['121 East 3 Street', 'Manhattan', 'New York', 'NY', '10009', 207.0, 0.02, 0.21, 0.65, 0.08, 'Good', 0.44, 0.34, 'Excellent', 'Good', 'Good', 'Excellent', 'Good']


[['333 East 4 Street',
  'Manhattan',
  'New York',
  'NY',
  '10009',
  161.0,
  0.12,
  0.28,
  0.55,
  0.04,
  'Excellent',
  0.62,
  0.65,
  'Excellent',
  'Excellent',
  'Excellent',
  'Excellent',
  'Good'],
 ['185 1 Avenue',
  'Manhattan',
  'New York',
  'NY',
  '10003',
  239.0,
  0.06,
  0.22,
  0.62,
  0.07,
  'Excellent',
  0.59,
  0.48,
  'Excellent',
  'Good',
  'Excellent',
  'Excellent',
  'Good'],
 ['166 Essex Street',
  'Manhattan',
  'New York',
  'NY',
  '10002',
  439.0,
  0.29,
  0.13,
  0.51,
  0.04,
  'Good',
  0.36,
  0.37,
  'Fair',
  'Fair',
  'Fair',
  'Good',
  'Good'],
 ['730 East 12 Street',
  'Manhattan',
  'New York',
  'NY',
  '10009',
  288.0,
  0.03,
  0.33,
  0.6,
  0.03,
  'Needs Improvement',
  0.2,
  0.11,
  'Fair',
  'Fair',
  'Fair',
  'Good',
  'Fair'],
 ['121 East 3 Street',
  'Manhattan',
  'New York',
  'NY',
  '10009',
  207.0,
  0.02,
  0.21,
  0.65,
  0.08,
  'Good',
  0.44,
  0.34,
  'Excellent',
  'Good',
  'Good',
  'Excellent',
  'Go