In [38]:
import requests
from bs4 import BeautifulSoup
import json

# Scraping U.S. News for the top universities in each engineering major
* We scrape three fields for every university: Name, Location and Score. A fourth field, Rank, is derived from the Score.
* Combine these four fields into one entry per university.
* Store the results in a JSON file with the following structure:
{major_name1: {university_name1: {location:, score:, rank:}}}

In [2]:
# Specialty slugs, one per engineering major. Each value is placed verbatim in
# the `specialty` query parameter of the search URL, so it must not contain
# spaces (the last entry previously read 'petroleum engineering').
majors = ['aerospace', 'biological-agricultural', 'biomedical', 'chemical-engineering',
          'civil-engineering', 'computer-engineering', 'electrical-engineering',
          'environmental-engineering', 'industrial-engineering', 'material-engineering',
          'mechanical-engineering', 'nuclear-engineering', 'petroleum-engineering']

In [18]:
url = "https://www.usnews.com/best-graduate-schools/search?program=top-engineering-schools&name=&specialty=petroleum-engineering"
# Send a browser-like User-Agent. The value can be copied from the request
# headers shown in the Network tab of Chrome developer tools. It is defined in
# this cell so the cell also runs on a fresh kernel.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [47]:
# University names: text of every anchor tag matched by the selector
universities = [anchor.get_text() for anchor in soup.select("a.Anchor-s8bzdzo-0.fwxkXI")]

# Locations: text of every matching paragraph tag
locations = [para.get_text() for para in soup.select("p.Paragraph-fqygwe-0.cPvbgl")]

# Scores: entries whose text is exactly 'N/A' are skipped, the rest become floats
scores = [
    float(para.get_text())
    for para in soup.select("p.fqygwe-0-Paragraph-hHEPzZ.kkoztb")
    if para.get_text() != 'N/A'
]

# Ranks are rebuilt from the scores.
# NOTE(review): create_ranks is defined in a cell further down the notebook;
# that cell has to be run first on a fresh kernel.
ranks = create_ranks(scores)

In [48]:
# Sanity check: print the length of each of the four scraped lists side by side.
print(*map(len, (universities, locations, scores, ranks)))

26 14 12 12


In [51]:
# Preview the first 12 scraped names; the length check printed 26 names but
# only 12 scores, so only this leading slice is paired with a score.
universities[:12]

['University of Texas--Austin (Cockrell)',
 'Stanford University',
 'Texas A&M University--College Station',
 'University of Tulsa',
 'Colorado School of Mines',
 'Pennsylvania State University--University Park',
 'University of Oklahoma',
 'University of Southern California (Viterbi)',
 'Louisiana State University--Baton Rouge',
 'Texas Tech University (Whitacre)',
 'University of Kansas',
 'University of Wyoming']

In [54]:
def create_ranks(scores):
    '''
    Return a list of 1-based ranks, one per entry of `scores`.

    An entry normally gets its own position as rank. When it equals the entry
    immediately before it, it shares that entry's rank instead, so a run of
    equal scores shares the rank of the run's first entry and the next distinct
    score resumes at its own position (e.g. 1, 1, 3).
    '''
    ranks = []
    for idx in range(len(scores)):
        # Only adjacent entries are compared; the first entry can never be tied.
        tied_with_previous = idx > 0 and scores[idx] == scores[idx - 1]
        ranks.append(ranks[-1] if tied_with_previous else idx + 1)
    return ranks

In [53]:
# final_code
# Scrape every major in `majors` and collect the results in one nested dict:
# {major: {university: {'location': ..., 'score': ..., 'rank': ...}}}

# Browser-like User-Agent (can be copied from the request headers shown in the
# Network tab of Chrome developer tools).
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}

# Search endpoint; the query string is built per major inside the loop.
search_url = "https://www.usnews.com/best-graduate-schools/search"

results = {}

for major in majors:
    results[major] = {}

    # Let requests assemble and percent-encode the query string. The slug is
    # hyphen-normalised so a stray space in a major cannot produce a bad URL.
    query = {
        'program': 'top-engineering-schools',
        'name': '',
        'specialty': major.replace(' ', '-'),
    }
    response = requests.get(search_url, params=query, headers=headers, timeout=30)
    # Stop on an HTTP error instead of silently storing an empty major.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Names of universities
    universities = [a_tag.get_text() for a_tag in soup.select("a.Anchor-s8bzdzo-0.fwxkXI")]

    # Locations
    locations = [p_tag.get_text() for p_tag in soup.select("p.Paragraph-fqygwe-0.cPvbgl")]

    # Scores: skip entries displayed as 'N/A'. The text is extracted once and
    # stripped so surrounding whitespace cannot defeat the 'N/A' comparison.
    scores = []
    for p_tag in soup.select("p.fqygwe-0-Paragraph-hHEPzZ.kkoztb"):
        score_text = p_tag.get_text(strip=True)
        if score_text != 'N/A':
            scores.append(float(score_text))

    # Ranks (recreated from scores)
    ranks = create_ranks(scores)

    # Dropping universities with empty scores.
    # NOTE(review): this positional truncation assumes every unscored school is
    # listed after all scored ones - TODO confirm; otherwise names, locations
    # and scores would be mis-paired.
    universities = universities[:len(scores)]
    locations = locations[:len(scores)]

    # Storing results
    for university, location, score, rank in zip(universities, locations, scores, ranks):
        results[major][university] = {'location': location, 'score': score, 'rank': rank}

{'aerospace': {'California Institute of Technology': {'location': 'Pasadena, CA',
   'score': 4.7,
   'rank': 1},
  'Massachusetts Institute of Technology': {'location': 'Cambridge, MA',
   'score': 4.7,
   'rank': 1},
  'Stanford University': {'location': 'Stanford, CA', 'score': 4.6, 'rank': 3},
  'Georgia Institute of Technology': {'location': 'Atlanta, GA',
   'score': 4.5,
   'rank': 4},
  'University of Michigan--Ann Arbor': {'location': 'Ann Arbor, MI',
   'score': 4.4,
   'rank': 5},
  'Purdue University--West Lafayette': {'location': 'West Lafayette, IN',
   'score': 4.2,
   'rank': 6},
  'Texas A&M University--College Station': {'location': 'College Station, TX',
   'score': 4.0,
   'rank': 7},
  'Princeton University': {'location': 'Princeton, NJ',
   'score': 3.9,
   'rank': 8},
  'University of Illinois--Urbana-Champaign': {'location': 'Urbana, IL',
   'score': 3.9,
   'rank': 8},
  'University of Texas--Austin (Cockrell)': {'location': 'Austin, TX',
   'score': 3.9,
   'r

In [55]:
# Persist the nested results dict as JSON text in usnews.json.
with open('usnews.json', 'w') as out_file:
    out_file.write(json.dumps(results))