In [1]:
%matplotlib inline

In [2]:
from IPython.display import HTML
import bs4
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Get List of Masters in Science Education Programs from GradSchools.com to Excel

In [3]:
HTML(r'<iFrame src="http://www.gradschools.com/masters/teacher-education/teaching-science" width=800 height=400></iFrame>')

## Get all Listings from each page from the site's prgram list

In [4]:
sitelist = [r"http://www.gradschools.com/masters/teacher-education/teaching-science?page={}".format(el) for el in range(8)]
sitelist

['http://www.gradschools.com/masters/teacher-education/teaching-science?page=0',
 'http://www.gradschools.com/masters/teacher-education/teaching-science?page=1',
 'http://www.gradschools.com/masters/teacher-education/teaching-science?page=2',
 'http://www.gradschools.com/masters/teacher-education/teaching-science?page=3',
 'http://www.gradschools.com/masters/teacher-education/teaching-science?page=4',
 'http://www.gradschools.com/masters/teacher-education/teaching-science?page=5',
 'http://www.gradschools.com/masters/teacher-education/teaching-science?page=6',
 'http://www.gradschools.com/masters/teacher-education/teaching-science?page=7']

In [5]:
records = bs4.element.ResultSet([])
for site in sitelist:
    r = requests.get(site)
    soup = BeautifulSoup(r.text, 'lxml')
    names = soup.find_all("div", class_="content-item")
    records.extend(names)
    print(len(records))


25
50
75
100
125
150
175
199


Example Listing:

In [6]:
print(records[0].prettify())

<div class="content-item">
 <address class="online">
  Online
 </address>
 <h4>
  University of Southern California
 </h4>
 <h3>
  <a href="/graduate-schools-in-united-states/online/university-southern-california/master-of-arts-in-teaching-secondary-social-science-online-177714">
   Master of Arts in Teaching - Secondary Social Science (Online)
  </a>
 </h3>
 <p>
 </p>
 <p>
  Address the needs of today's classrooms with a Master of Arts in Teaching degree delivered online at the USC Rossier School of...
 </p>
 <div class="content-campus-program-level-info">
  <p class="item-type">
   <strong>
    Program:
   </strong>
   Online
  </p>
  <p class="item-level">
   <strong>
    Degree:
   </strong>
   Master
  </p>
 </div>
</div>



## Extract information from Each Listing

In [22]:
def extract_record_info(record):
    """Returns dictionary of program data from an html record"""
    data = {}
    
    data['city'] = record('address')[0].text
    data['program'] = record('h4')[0].text
    
    aa = record('h3')[0]('a')[0]
    data['major'] = aa.text
    data['weblink'] = aa.attrs['href']
    
    pp = record('p')
    data['description'] = pp[1].text
    data['online'] = pp[2].text.lstrip().splitlines()[1]
    data['degree'] = pp[3].text.lstrip().splitlines()[1]
    
    for key in data:
        data[key] = data[key].lstrip().rstrip()
        
    return data

programs = [extract_record_info(record) for record in records]
programs[15]

{'city': 'Montevallo, AL',
 'degree': 'Master',
 'description': 'Class A Cert allows student to teach secondary education in the area of chemistry; offered as traditional and alternative.',
 'major': '6-12 Secondary Education (Chemistry)',
 'online': 'Campus',
 'program': 'University of Montevallo',
 'weblink': '/graduate-schools-in-united-states/alabama/university-montevallo-montevallo/6-12-secondary-education-chemistry-244726'}

## Reformat data to a Pandas DataFrame

In [62]:
df = pd.DataFrame(programs)

# Clean up a bit...
df.loc[(df['city'] == 'Online') | (df['city'] == ','), 'city'] = ''


df.head(10)

Unnamed: 0,city,degree,description,major,online,program,weblink
0,,Master,Address the needs of today's classrooms with a...,Master of Arts in Teaching - Secondary Social ...,Online,University of Southern California,/graduate-schools-in-united-states/online/univ...
1,,Master,Increase your knowledge in teaching science at...,Master of Natural Sciences Education,Online,Colorado State University,/graduate-schools-in-united-states/online/colo...
2,"Keene, NH",Master,This concentration is the only science teacher...,Environmental Studies (MS) with a concentratio...,Campus,Antioch University New England,/graduate-schools-in-united-states/new-hampshi...
3,"Chicago, IL",Master,The M.Ed. in Science Education program has thr...,Science Education,Campus,Loyola University Chicago,/graduate-schools-in-united-states/illinois/lo...
4,"Garden City, NY",Master,Our program leads to New York State certificat...,Science Education,Campus,Adelphi University,/graduate-schools-in-united-states/new-york/ad...
5,"Galloway, OH",Master,The Master of Arts in Education Program helps ...,Education,Campus,Stockton University,/graduate-schools-in-united-states/new-jersey/...
6,"New Britain, CT",Master,,STEM Education for Certified Teachers M.S.,Campus,Central Connecticut State University,/graduate-schools-in-united-states/connecticut...
7,"Teaneck, NJ",Master,"Designed to prepare ""Highly-Qualified"" Element...",MA Science; Science Teaching Specialist Concen...,Campus,Fairleigh Dickinson University,/graduate-schools-in-united-states/new-jersey/...
8,,Master,Master of Science in Physical Sciences (concen...,Physical Science,Online,Emporia State University,/graduate-schools-in-united-states/online/empo...
9,"San Juan, PR",Master,,Physics Education,Campus,University of Puerto Rico - Rio Piedras Campus,/graduate-schools-in-united-states/puerto-rico...


## Export Data to Excel

In [63]:
df.to_excel('Masters Ed Program List.xlsx')

## Extra Credit: Get Lattitude and Longitude of Each Location, for Mapping

In [20]:
from geopy.geocoders import Nominatim
geolocator = Nominatim()
location = geolocator.geocode("175 5th Avenue NYC")

In [21]:
# df['city'].apply(geolocator.geocode)

In [25]:
city = df['city'][3]
geolocator.geocode(city)

Location(Chicago, Cook County, Illinois, United States of America, (41.8755546, -87.6244211, 0.0))

In [67]:
cities = []
for city in df['city']:
    cities.append(geolocator.geocode(city, timeout=4))
    print("Complete: {}".format(city))
    
cities[:10]    

Complete: 
Complete: 
Complete: Keene, NH
Complete: Chicago, IL
Complete: Garden City, NY
Complete: Galloway, OH
Complete: New Britain, CT
Complete: Teaneck, NJ
Complete: 
Complete: San Juan, PR
Complete: Allendale, MI
Complete: Buffalo, NY
Complete: Bellingham, WA
Complete: East Stroudsburg, PA
Complete: Malvern, Australia
Complete: Montevallo, AL
Complete: Newport News, VA
Complete: Pretoria, South Africa
Complete: Colorado Springs, CO
Complete: Hamilton, NY
Complete: Greenville, NC
Complete: 
Complete: Madison, IL
Complete: 
Complete: Bangkok, Thailand
Complete: Grahamstown, South Africa
Complete: Ishikawa, Japan
Complete: Lyndonville, VT
Complete: Ankara, Turkey
Complete: Oxford, OH
Complete: Akron, OH
Complete: Corvallis, OR
Complete: Nagoya, Japan
Complete: Irvine, CA
Complete: Detroit, MI
Complete: Vancouver, BC
Complete: 
Complete: Bronx, NY
Complete: Bozeman, MT
Complete: Houston, TX
Complete: Victoria, BC
Complete: Bowling Green, OH
Complete: Fitchburg, MA
Complete: Shippensb

[None,
 None,
 Location(Keene, Cheshire County, New Hampshire, United States of America, (42.933597, -72.2784263, 0.0)),
 Location(Chicago, Cook County, Illinois, United States of America, (41.8755546, -87.6244211, 0.0)),
 Location(Garden City, New York, United States of America, (40.7266477, -73.6343051, 0.0)),
 Location(Galloway, Franklin County, Ohio, United States of America, (39.9139207, -83.1633335, 0.0)),
 Location(Town of New Britain, Hartford County, Connecticut, United States of America, (41.68174355, -72.7881464300835, 0.0)),
 Location(Teaneck, Bergen County, New Jersey, United States of America, (40.8975992, -74.0159726, 0.0)),
 None,
 Location(San Juan Royal, 468, Avenida Cândido de Abreu, São Francisco, Curitiba, Microrregião de Curitiba, Mesorregião Metropolitana de Curitiba, PR, Região Sul, 80530-000, Brasil, (-25.4207039, -49.2688138757722, 0.0))]

In [68]:
city

''

In [82]:
city_full, lattitudes, longitudes, importances = [], [], [], []
for city in cities:
    aa = city.address if city else ''
    city_full.append(aa)
    
    aa = city.latitude if city else None
    lattitudes.append(aa)
    
    aa = city.longitude if city else None
    longitudes.append(aa)
    
    aa = city.raw['importance'] if city else None
    importances.append(aa)
        
city_full[:10]

['',
 '',
 'Keene, Cheshire County, New Hampshire, United States of America',
 'Chicago, Cook County, Illinois, United States of America',
 'Garden City, New York, United States of America',
 'Galloway, Franklin County, Ohio, United States of America',
 'Town of New Britain, Hartford County, Connecticut, United States of America',
 'Teaneck, Bergen County, New Jersey, United States of America',
 '',
 'San Juan Royal, 468, Avenida Cândido de Abreu, São Francisco, Curitiba, Microrregião de Curitiba, Mesorregião Metropolitana de Curitiba, PR, Região Sul, 80530-000, Brasil']

In [83]:
df['lattitude'] = lattitudes
df['longitude'] = longitudes
df['city detailed'] = city_full
df['city importance'] = importances

df.head()

Unnamed: 0,city,degree,description,major,online,program,weblink,lattitude,longitude,city detailed,city importance
0,,Master,Address the needs of today's classrooms with a...,Master of Arts in Teaching - Secondary Social ...,Online,University of Southern California,/graduate-schools-in-united-states/online/univ...,,,,
1,,Master,Increase your knowledge in teaching science at...,Master of Natural Sciences Education,Online,Colorado State University,/graduate-schools-in-united-states/online/colo...,,,,
2,"Keene, NH",Master,This concentration is the only science teacher...,Environmental Studies (MS) with a concentratio...,Campus,Antioch University New England,/graduate-schools-in-united-states/new-hampshi...,42.933597,-72.278426,"Keene, Cheshire County, New Hampshire, United ...",0.528618
3,"Chicago, IL",Master,The M.Ed. in Science Education program has thr...,Science Education,Campus,Loyola University Chicago,/graduate-schools-in-united-states/illinois/lo...,41.875555,-87.624421,"Chicago, Cook County, Illinois, United States ...",0.992648
4,"Garden City, NY",Master,Our program leads to New York State certificat...,Science Education,Campus,Adelphi University,/graduate-schools-in-united-states/new-york/ad...,40.726648,-73.634305,"Garden City, New York, United States of America",0.630725


In [84]:
city

## Export New Data to Excel

In [89]:
df.to_excel('Masters Ed Program List (with mapdata).xlsx')
df.to_hdf('Masters Ed Program List (with mapdata).h5', 'data')

ImportError: HDFStore requires PyTables, "No module named 'tables'" problem importing