In [72]:
import time
from urlparse import urljoin
from bs4 import BeautifulSoup
import requests
from openpyxl import Workbook
from IPython.display import Image

In [78]:
def get_faculty_info_by_netid(netid, timeout=30):
    """Scrape one faculty member's profile fields from the Kellogg site.

    Requests the faculty search-results page for ``netid`` and reads the
    profile elements by their HTML ids.

    Parameters
    ----------
    netid : str
        Faculty netid; sent as the ``netid`` query parameter (URL-encoded
        by ``requests``).
    timeout : float, optional
        Seconds ``requests`` waits on the server before giving up, so a
        stalled connection cannot hang a long scraping loop. Defaults to 30.

    Returns
    -------
    dict
        Keys ``headshot_image_url``, ``name``, ``url``, ``office``,
        ``department`` and ``title``. ``url`` is ``response.url``. Any
        element that is absent from the page yields ``''``; text values are
        plain unicode strings.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    BASE_URL = "http://www.kellogg.northwestern.edu/"
    response = requests.get(
        urljoin(BASE_URL, "faculty/faculty_search_results.aspx"),
        params={'netid': netid},
        timeout=timeout)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    def text_of(element_id):
        # Look the element up once. get_text() returns a plain unicode
        # string (not a NavigableString tied to the parse tree) and also
        # copes with labels that contain nested markup.
        tag = soup.find(id=element_id)
        return tag.get_text() if tag is not None else ''

    headshot_image_tag = soup.find(id="imgFacultyImage")
    headshot_src = headshot_image_tag.get('src') if headshot_image_tag is not None else None
    # An <img> without a usable src is treated the same as a missing image.
    headshot_image_url = urljoin(BASE_URL, headshot_src) if headshot_src else ''

    return {'headshot_image_url': headshot_image_url,
            'name': text_of("lblName"),
            'url': response.url,
            'office': text_of("lblOffice"),
            'department': text_of("lblDepartment"),
            'title': text_of("lblTitle")}
    

In [31]:
def get_faculty_netids(timeout=30):
    """Return the sorted netids listed in the faculty advanced-search page.

    Reads the ``value`` attribute of every ``<option>`` inside the
    "browse by name" ``<select>`` and skips options whose value is missing
    or empty.

    Parameters
    ----------
    timeout : float, optional
        Seconds ``requests`` waits on the server before giving up.
        Defaults to 30.

    Returns
    -------
    list
        Netid strings in ascending order.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    ValueError
        If the expected ``<select>`` element is not present in the page.
    """
    response = requests.get(
        "http://www.kellogg.northwestern.edu/faculty/advanced_search.aspx",
        timeout=timeout)
    # Fail on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    name_select = soup.find(id="plcprimarymaincontent_1_selBrowseByName")
    if name_select is None:
        raise ValueError(
            "Could not find the faculty name <select> "
            "(id=plcprimarymaincontent_1_selBrowseByName) in the search page")

    # .get() tolerates <option> tags that carry no value attribute at all.
    return sorted(option['value']
                  for option in name_select.find_all("option")
                  if option.get('value'))
        

In [33]:
# Fetch the netid list once and keep it at module level; later cells index
# into it and iterate over it.
netids = get_faculty_netids()

In [59]:
# Smoke-test the scraper on the first netid in the list and show the result.
# NOTE(review): the recorded output below includes an 'email' key that
# get_faculty_info_by_netid does not return -- it looks like it came from a
# different version of the function; re-run to confirm.
faculty_info = get_faculty_info_by_netid(netids[0])
print faculty_info

{'name': u'Joel K. Shapiro', 'office': u'Jacobs Center Room 495', 'title': u'Clinical Associate Professor and Executive Director for the Program on Data Analytics at Kellogg', 'url': u'http://www.kellogg.northwestern.edu/faculty/directory/shapiro_joel_k.aspx', 'headshot_image_url': 'http://www.kellogg.northwestern.edu/~/media/Images/faculty/headshot/Shapiro_Joel_51315.ashx', 'department': u'MARKETS & CUSTOMERS', 'email': 'jshapiro@kellogg.northwestern.edu'}


In [51]:
# Request the headshot URL stored in faculty_info and print the response
# headers.
# NOTE(review): the recorded output names a different file
# (Albert_Terri_0913.jpg) than the Shapiro URL shown for faculty_info, and
# this cell's execution count (51) is lower than the cell that assigns
# faculty_info (59) -- presumably it ran against an earlier value; re-run
# to confirm.
r = requests.get(faculty_info['headshot_image_url'])
print r.headers

{'content-length': '16351', 'content-disposition': 'inline; filename="Albert_Terri_0913.jpg"', 'x-aspnet-version': '4.0.30319', 'accept-ranges': 'bytes', 'expires': 'Tue, 21 Jul 2015 15:59:47 GMT', 'server': 'Microsoft-IIS/7.5', 'last-modified': 'Tue, 17 Sep 2013 18:45:36 GMT', 'cache-control': 'private, max-age=604800', 'date': 'Tue, 14 Jul 2015 15:59:47 GMT', 'x-frame-options': 'SAMEORIGIN', 'x-powered-by': 'ASP.NET', 'content-type': 'image/jpeg'}


In [56]:
def save_list_to_excel_workbook(dict_list, filename):
    """Write a list of dicts to an Excel workbook, one dict per row.

    The first row is a header. Columns are the keys of the first dict, in
    its iteration order, followed by any keys that only appear in later
    dicts. A dict that lacks a column gets an empty cell there.

    Parameters
    ----------
    dict_list : iterable of dict
        Rows to write. May be empty, in which case the sheet has no rows.
    filename : str
        Path of the workbook file to create.
    """
    rows = list(dict_list)

    # Collect the column order once so every row lines up with the header.
    columns = []
    for row in rows:
        for key in row:
            if key not in columns:
                columns.append(key)

    wb = Workbook()
    # Reuse the sheet a new Workbook already has instead of adding a second
    # one and leaving an empty default sheet in the file.
    worksheet = wb.active
    worksheet.title = "Kellogg Faculty"
    if columns:
        worksheet.append(columns)
    for row in rows:
        # append() expects one cell value per item; pass the values in
        # header order rather than (key, value) pairs.
        worksheet.append([row.get(column, '') for column in columns])
    wb.save(filename=filename)

def save_list_to_csv(dict_list, filename):
    """Write a list of dicts to a UTF-8 encoded CSV file with a header row.

    Columns are the keys of the first dict, in its iteration order, followed
    by any keys that only appear in later dicts. A dict that lacks a column
    gets an empty field there.

    Parameters
    ----------
    dict_list : iterable of dict
        Rows to write. Any iterable is accepted (for example the result of
        ``some_dict.values()``). If it is empty, an empty file is created.
    filename : str
        Path of the CSV file to create or overwrite.
    """
    import csv
    import sys

    rows = list(dict_list)

    # Build the header from every row, not just the first one, so a row with
    # an extra key does not make DictWriter raise ValueError.
    fieldnames = []
    for row in rows:
        for key in row:
            if key not in fieldnames:
                fieldnames.append(key)

    if sys.version_info[0] == 2:
        # Python 2's csv module writes byte strings to a binary file;
        # encode unicode text up front so non-ASCII names do not raise
        # UnicodeEncodeError.
        text_type = type(u'')

        def encode(value):
            return value.encode('utf-8') if isinstance(value, text_type) else value

        rows = [dict((encode(key), encode(value)) for key, value in row.items())
                for row in rows]
        fieldnames = [encode(key) for key in fieldnames]
        out = open(filename, 'wb')
    else:
        # Python 3's csv module writes text; newline='' lets it control the
        # line endings itself.
        out = open(filename, 'w', newline='', encoding='utf-8')

    with out:
        if fieldnames:
            dw = csv.DictWriter(out, fieldnames=fieldnames)
            dw.writeheader()
            dw.writerows(rows)

In [70]:
# Cache of scraped profile dicts keyed by netid; the scraping loop fills it.
# Re-running this cell rebinds the name to an empty dict, discarding
# anything already fetched.
info_by_netid = {}

In [81]:
print "Already have info for netids: {0}".format(', '.join(sorted(info_by_netid.keys())))
for netid in netids:
    if netid not in info_by_netid:
        print "Getting info for {0}".format(netid)
        info_by_netid[netid] = get_faculty_info_by_netid(netid)
        time.sleep(0.2)

Already have info for netids: JKS987, aad400, aal953, aaz426, aaz777, aba747, aba897, acc675, acd937, ach465, aci140, acm104, adc128, ads232, aea450, aes711, aes797, aew632, agm111, agw812, aja886, ajg877, ajr484, ajr908, ajx391, akr857, alo137, amtybout, amv553, andtoga, ane686, ard, aro357, asa906, asr279, ath454, avi743, awg221, awi974, ayl676, bab234, bbala, bbm180, bci802, bdc773, besanko, beu104, bfj879, bgb655, bgk766, bha822, bjc943, bjl470, bmu055, bmw430, bov740, bre616, brg253, bsb220, bsc536, bshwom, bst047, bte632, btm278, buzzi, bwa017, bwr295, cbl566, cbr943, cch274, cfa496, cfu111, cga846, cgn482, cjo754, ckv816, cld280, cmb816, costis, coughlan, css661, cts823, cwo918, dad618, dad899, dae324, dag246, dag951, dam299, das714, dau958, dbm330, dch687, dco291, ddr741, dhh667, dic002, djd155, djg894, djm724, djw649, dla634, dli309, dli989, dmessick, dpa674, dpj782, dps631, dra, dranove, drc156, dsf419, dsg456, dvb429, dya741, eal780, eby668, ecs693, edb628, edn755, edp105, e

In [86]:
# Write every cached profile dict to faculty.csv (path relative to the
# working directory); row order follows the dict's iteration order.
save_list_to_csv(info_by_netid.values(), "faculty.csv")