In [2]:
#https://www.niche.com/k12/search/best-public-high-schools/m/san-diego-metro-area/
#The 2019 Best Public High Schools ranking is based on rigorous analysis of key statistics and millions of reviews from 
#students and parents using data from the U.S. Department of Education. Ranking factors include state test scores, college 
#readiness, graduation rates, SAT/ACT scores, teacher quality, and high school ratings.
#How ranking was calculated - https://www.niche.com/about/methodology/best-public-high-schools/

In [3]:
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [4]:
# Start a visible (non-headless) Chrome window via the chromedriver binary at this relative path
executable_path = dict(executable_path='webScraping/chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [5]:
# Scrape the Niche website: load the "best public high schools" search page
# for the San Diego metro area in the browser session.
niche_url = 'https://www.niche.com/k12/search/best-public-high-schools/m/san-diego-metro-area/'
browser.visit(niche_url)

In [6]:
# Take the page source currently held by the browser and parse it with the
# standard-library backed 'html.parser'.
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')

In [7]:
# Collect every <li> element carrying the search-results list-item class
results = soup.find_all(name='li', attrs={'class': 'search-results__list__item'})

In [8]:
# Create a function to get the school address from its profile link
def getAddress(link):
    """Return the address text shown on a school's profile page.

    A dedicated Chrome session is started for the lookup and is always shut
    down before the function returns, so repeated calls do not leave browser
    windows / chromedriver processes behind.

    Parameters
    ----------
    link : str
        URL of the school profile page to visit.

    Returns
    -------
    str
        Text of the second ``div`` found inside the ``profile__address``
        container.

    Raises
    ------
    AttributeError
        If the page has no ``div`` with class ``profile__address``.
    IndexError
        If that container holds fewer than two ``div`` elements.
    """
    executable_path = {'executable_path': 'webScraping/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        # Scrape the school page; only the raw HTML is needed from the browser.
        browser.visit(link)
        html = browser.html
    finally:
        # Always release the browser, even when the visit itself fails.
        browser.quit()

    soup = BeautifulSoup(html, 'html.parser')
    address_block = soup.find('div', class_='profile__address')
    if address_block is None:
        # Same exception type the implicit ``None.find_all`` used to raise,
        # but with a message that says what was actually missing.
        raise AttributeError(
            "no 'profile__address' element found on " + str(link)
        )

    address_divs = address_block.find_all('div')
    # NOTE(review): ``.text`` joins nested text nodes without any separator,
    # which is why results read like 'Rd.San Diego'. Consider
    # ``get_text(separator=' ')`` once the page markup has been confirmed.
    school_address = address_divs[1].text
    return school_address

In [9]:
# Accumulator for the per-school records gathered while scraping
high_schools = list()

In [10]:
# Build one record (Name, Address, Ranking, Link) per usable search result.
# Start from an empty list so re-running this cell does not append duplicates.
high_schools = []

for result in results:
    div = result.find('div', class_='search-result')
    if div is None:
        # List item without a search-result card: nothing to extract.
        continue

    # Look up everything that is available on the listing page first, so the
    # expensive per-school browser visit only happens for complete entries.
    link_tag = div.a
    title_tag = div.find('h2', class_='search-result__title')
    ranking_tag = div.find('span', class_='search-result-badge-ordinal')
    if link_tag is None or title_tag is None or ranking_tag is None:
        continue

    # Link to the school's own page, which holds the address.
    school_link = link_tag['href']
    try:
        school_address = getAddress(school_link)
    except AttributeError:
        # getAddress raises AttributeError when the profile page has no
        # address block; skip that school rather than abort the whole run.
        continue

    high_schools.append({
        "Name": title_tag.text,
        "Address": school_address,
        "Ranking": ranking_tag.text,
        "Link": school_link,
    })

print(high_schools)

[{'Name': 'Canyon Crest Academy', 'Address': '5951 Village Center Loop Rd.San Diego, CA 92130', 'Ranking': '1', 'Link': 'https://www.niche.com/k12/canyon-crest-academy-san-diego-ca/'}, {'Name': 'Torrey Pines High School', 'Address': '3710 Del Mar Heights Rd.San Diego, CA 92130', 'Ranking': '2', 'Link': 'https://www.niche.com/k12/torrey-pines-high-school-san-diego-ca/'}, {'Name': 'Westview High School', 'Address': '13500 Camino Del SurSan Diego, CA 92129', 'Ranking': '3', 'Link': 'https://www.niche.com/k12/westview-high-school-san-diego-ca/'}, {'Name': 'Del Norte High School', 'Address': '16601 Nighthawk Ln.San Diego, CA 92127', 'Ranking': '4', 'Link': 'https://www.niche.com/k12/del-norte-high-school-san-diego-ca/'}, {'Name': 'Coronado High School', 'Address': '650 D Ave.Coronado, CA 92118', 'Ranking': '5', 'Link': 'https://www.niche.com/k12/coronado-high-school-coronado-ca/'}, {'Name': 'San Dieguito High Academy', 'Address': '800 Santa Fe Dr.Encinitas, CA 92024', 'Ranking': '6', 'Link'

In [11]:
# Tabulate the scraped records: cast Ranking to integers and put the columns
# in presentation order (Name, Ranking, Address, Link).
df = (
    pd.DataFrame(high_schools, index=None)
    .astype({'Ranking': 'int'})
    .loc[:, ['Name', 'Ranking', 'Address', 'Link']]
)
df

Unnamed: 0,Name,Ranking,Address,Link
0,Canyon Crest Academy,1,"5951 Village Center Loop Rd.San Diego, CA 92130",https://www.niche.com/k12/canyon-crest-academy...
1,Torrey Pines High School,2,"3710 Del Mar Heights Rd.San Diego, CA 92130",https://www.niche.com/k12/torrey-pines-high-sc...
2,Westview High School,3,"13500 Camino Del SurSan Diego, CA 92129",https://www.niche.com/k12/westview-high-school...
3,Del Norte High School,4,"16601 Nighthawk Ln.San Diego, CA 92127",https://www.niche.com/k12/del-norte-high-schoo...
4,Coronado High School,5,"650 D Ave.Coronado, CA 92118",https://www.niche.com/k12/coronado-high-school...
5,San Dieguito High Academy,6,"800 Santa Fe Dr.Encinitas, CA 92024",https://www.niche.com/k12/san-dieguito-high-ac...
6,Grossmont Middle College High School,7,"8800 Grossmont College Dr.El Cajon, CA 92020",https://www.niche.com/k12/grossmont-middle-col...
7,Sage Creek High School,8,"3900 Cannon Rd.Carlsbad, CA 92010",https://www.niche.com/k12/sage-creek-high-scho...
8,Rancho Bernardo High School,9,"13010 Paseo LucidoSan Diego, CA 92128",https://www.niche.com/k12/rancho-bernardo-high...
9,Preuss School UCSD,10,"9500 Gilman Dr.La Jolla, CA 92093",https://www.niche.com/k12/preuss-school-ucsd-l...


In [12]:
# Persist the table as CSV with a header row and without the index column
df.to_csv("datasets/school_ranking.csv", index=False, header=True)

In [15]:
# Persist the same table as a JSON array with one object per row
df.to_json(path_or_buf="datasets/school_ranking.json", orient="records")