In [1]:
# https://medium.com/technofunnel/web-scraping-with-python-using-beautifulsoup-76b710e3e92f
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [2]:
import requests 
from bs4 import BeautifulSoup 
import re
from statistics import mode
import pandas as pd
import numpy as np

# Scraping manually

## Columbia Stats

In [217]:
URL = "https://stat.columbia.edu/department-directory/faculty-and-lecturers/"

r = requests.get(URL) 

c_soup = BeautifulSoup(r.content, 'html5lib') 

In [218]:
c_profs = c_soup.findAll(attrs = {'class':'container-fluid cn-container'}) 

In [219]:
columbia_stats = []
for prof in c_profs:
    # name
    name = prof.find(attrs = {'class':'cn-entry-name'}).text

    # title
    title = prof.findAll(attrs = {'class': 'col-md-3'})[1].text.strip('\n').strip()

    # img
    try:
        img = prof.find('img')
        img = img['src']
    except:
        img = None

    item = {'name':name, 'title':title, 'img':img}
    columbia_stats.append(item)

## UPenn Stats

In [221]:
URL = "https://statistics.wharton.upenn.edu/faculty/faculty-list/"

r = requests.get(URL) 

p_soup = BeautifulSoup(r.content, 'html5lib') 

In [222]:
p_profs = p_soup.findAll(attrs = {'class':'wdp_listing-row vc_row vc_row-fluid'}) 

In [223]:
upenn_stats = []
for prof in p_profs:
    # name
    name = prof.find(attrs = {'class':'wdp_listing-name'}).text

    # title
    s = str(prof)
    m = re.search("[a-zA-Z0-9 ]*(Professor|Lecturer)[a-zA-Z0-9_ ]*", s)
    if m:
        title = m.group(0)
    else:
        title = None


    # img
    try:
        img = prof.find('img')
        img = img['src']
    except:
        img = None

    item = {'name':name, 'title':title, 'img':img}
    upenn_stats.append(item)

In [224]:
columbia_stats[:3]

[{'name': 'David Blei',
  'title': 'Professor',
  'img': 'https://stat.columbia.edu/wp-content/uploads/connections-images/david-blei/Blei-David-1.jpg'},
 {'name': 'John P. Cunningham',
  'title': 'Professor',
  'img': 'https://stat.columbia.edu/wp-content/uploads/connections-images/john-cunningham/cunningham_original.jpg'},
 {'name': 'Richard A. Davis',
  'title': 'Howard Levene Professor',
  'img': 'https://stat.columbia.edu/wp-content/uploads/connections-images/richard-davis/Davis-Richard.jpg'}]

In [225]:
upenn_stats[:3]

[{'name': 'Bhaswar B. Bhattacharya',
  'title': 'Assistant Professor of Statistics and Data Science',
  'img': 'https://faculty.wharton.upenn.edu/wp-content/uploads/2016/09/Bhaswar_Homepage_Cropped.jpg'},
 {'name': 'Tony Cai',
  'title': ' Silberberg Professor in Applied Mathematics and Statistics',
  'img': 'https://faculty.wharton.upenn.edu/wp-content/uploads/2016/11/Tony_Cai_Crop7.jpg'},
 {'name': 'Yuxin Chen',
  'title': 'Associate Professor of Statistics and Data Science',
  'img': 'https://faculty.wharton.upenn.edu/wp-content/uploads/2016/11/Yuxin_Chen.jpeg'}]

# Automating

## Define functions

In [43]:
def get_soup(URL):
    r = requests.get(URL) 
    return BeautifulSoup(r.content, 'html5lib') 

In [168]:
# return 3 images, to make sure images selected are the headshots 
def select_headshots(soup):
    headshots = soup.body.find_all('img')
    return headshots[len(headshots)//6], headshots[len(headshots)//3], headshots[len(headshots)//2]

In [205]:
# check if the link for imgs are src or srcset
def src_or_srcset(tag):
    try: 
        tag['src']
        return True
    except:
        return False

In [87]:
# for each tag containing imgs, find their first parents that contains the the keyword 
# (professor, lecturer, student (some pages include grad students), ...) 
# My rational is since the tag includes the title already, it likely also includes the name of the person. 

def find_profile_class(tag):
    s = str(tag)
    if re.search(r'(Professor|Lecturer|Student|Director)', s):
        m = re.findall(r'class="(.+?)"', s)
        return tag, m[0]
    else:
        return find_profile_class(tag.parent)

In [268]:
# possible to use ML instead of regex to find name?
def find_name_pos(tag):
    pos = 0
    strs = tag.stripped_strings
    while True: 
        s = next(strs)
        if re.match("^([a-zA-Z.]*[,]* [a-zA-Z.]* [a-zA-Z.]*|[a-zA-Z.]*[,]* [a-zA-Z.]*)[, Ph.D.]*$", s):
            if "University" not in s and "College" not in s and 'Department' not in s:
                return pos
        pos += 1

In [102]:
def find_title_pos(tag):
    pos = 0
    strs = tag.stripped_strings
    while True: 
        s = next(strs)
        if re.match("[a-zA-Z0-9 ]*(Professor|Lecturer|Student|Director)[a-zA-Z0-9_ ]*", s):
            return pos
        else:
            pos += 1

In [252]:
def find_by_pos(tag, pos):
    it = tag.stripped_strings
    try:
        item = next(x for i,x in enumerate(it) if i==pos)
    except:
        item = None
    return item

In [214]:
def find_img(tag, src):
    try:
        if src:
            img = tag.find('img')['src']
        else:
            img = tag.find('img')['srcset'].split()[0]
    except:
        img = None
    return img

## Putting it together

In [241]:
def get_department_info(URL):
    # get the soup (entire webpage)
    soup = get_soup(URL)
    
    # get 3 headshot tags in different pos. Because all the pages have headshots ('img' in html), 
    # that's something in common for all pages regardless of the design 
    headshots = select_headshots(soup)
    
    # use the random headshot to find the name of the class containing profiles, name pos and title pos
    profile_classes = []
    tags = []
    src_or_not = None
    
    # for each tag containing imgs, find their first parents that contains the the keyword 
    # (professor, lecturer, student (some pages include grad students), ...) 
    # My rational is since the tag includes the title already, it likely also includes the name of the person. 
    for headshot in headshots:
        tag, profile_c = find_profile_class(headshot)
        profile_classes.append(profile_c)
        tags.append(tag)
        
    # compare the class name of tag containg profile found from ancestors of the 3 headshot tags, 
    # if 2 or more out of 3 are same, select it as the class name of profile tags. If all 3 are different, throw an error
    if len(set(profile_classes)) != 3:
        if profile_classes[0] == profile_classes[1] or profile_classes[0] == profile_classes[2]:
            profile_class = profile_classes[0]
            tmp_tag = tags[0]
            src_or_not = src_or_srcset(headshots[0])
        else:
            profile_class = profile_classes[1]
            tmp_tag = tags[1]
            src_or_not = src_or_srcset(headshots[1])
    else:
        raise Exception("class names for profiles are different")

    # find the position of name and title in the tmp_tag (from 1 of parents of headshots)
    name_pos = find_name_pos(tmp_tag)
    title_pos = find_title_pos(tmp_tag)

    
    # get all profiles
    profs = soup.find_all(attrs = {'class':profile_class})
    items = []
    # get info for each profile
    for p in profs:
        name = find_by_pos(p, name_pos)
        title = find_by_pos(p, title_pos)
        img = find_img(p, src_or_not)
        item = {'name': name, 'title': title, 'img':img}
        items.append(item)
    return items

## Testing

### Stats

In [269]:
c_stats = get_department_info("https://stat.columbia.edu/department-directory/faculty-and-lecturers/")

In [270]:
c_stats[:2]

[{'name': 'David Blei',
  'title': 'Professor',
  'img': 'https://stat.columbia.edu/wp-content/uploads/connections-images/david-blei/Blei-David-1.jpg'},
 {'name': 'John P. Cunningham',
  'title': 'Professor',
  'img': 'https://stat.columbia.edu/wp-content/uploads/connections-images/john-cunningham/cunningham_original.jpg'}]

In [261]:
penn_stats = get_department_info("https://statistics.wharton.upenn.edu/faculty/faculty-list/")

In [262]:
penn_stats[:2]

[{'name': 'Bhaswar B. Bhattacharya',
  'title': 'Assistant Professor of Statistics and Data Science',
  'img': 'https://faculty.wharton.upenn.edu/wp-content/uploads/2016/09/Bhaswar_Homepage_Cropped.jpg'},
 {'name': 'Tony Cai',
  'title': 'Daniel H. Silberberg Professor in Applied Mathematics and Statistics, Professor of Statistics and Data Science',
  'img': 'https://faculty.wharton.upenn.edu/wp-content/uploads/2016/11/Tony_Cai_Crop7.jpg'}]

In [178]:
h_stats = get_department_info("https://statistics.fas.harvard.edu/faculty")

In [179]:
h_stats[:2]

[{'name': 'Morgane Austern',
  'title': 'Assistant Professor',
  'img': '//static.hwpi.harvard.edu/files/styles/profile_thumbnail/public/statistics-2/files/img-20210401-wa0022.jpg?m=1647356154&itok=X-5OHRTO'},
 {'name': 'Joseph K. Blitzstein',
  'title': 'Professor of the Practice in Statistics,',
  'img': '//static.hwpi.harvard.edu/files/styles/profile_thumbnail/public/statistics-2/files/blitzstein_1_0.jpg?m=1629481923&itok=K83gCeQ4'}]

In [180]:
y_stats = get_department_info("https://statistics.yale.edu/people")

In [181]:
y_stats[:2]

[{'name': 'Andrew Barron',
  'title': 'Charles C. and Dorothea S. Dilley Professor of Statistics & Data Science',
  'img': 'https://statistics.yale.edu/sites/default/files/styles/people_user_image/public/pictures/picture-15-1362597227.jpg?itok=sPNt5_6y'},
 {'name': 'Elisa Celis',
  'title': 'Assistant Professor of Statistics & Data Science',
  'img': 'https://statistics.yale.edu/sites/default/files/styles/people_user_image/public/pictures/picture-1274-1563890739.jpg?itok=9WAMwA7S'}]

In [182]:
b_stats = get_department_info("https://www.brown.edu/academics/public-health/biostats/people/faculty")

In [183]:
b_stats[:2]

[{'name': 'Alyssa Bilinski, Ph.D.',
  'title': 'Peterson Family Assistant Professor of Health Policy, Assistant Professor of Health Services, Policy and Practice and Biostatistics',
  'img': 'https://www.brown.edu/academics/public-health/biostats/sites/biostats/files/images/people/abilinsk_photo_.jpg'},
 {'name': 'Stavroula  Chrysanthopoulou, Ph.D.',
  'title': "Assistant Professor of Biostatistics, Director of the Master's Graduate Program in Biostatistics",
  'img': 'https://www.brown.edu/academics/public-health/biostats/sites/biostats/files/images/people/schrysan_photo_%20%281%29.jpg'}]

In [184]:
cornell_stats = get_department_info("https://stat.cornell.edu/people/faculty")

Exception: class names for profiles are different

## Math

In [253]:
c_math = get_department_info("https://www.math.columbia.edu/people/faculty-by-rank/")

In [254]:
c_math[:2]
# Doesn't work because the tag containing name is outside

[{'name': 'New York',
  'title': 'Professor',
  'img': '//www.math.columbia.edu/department/website/wp-content/uploads/connections-images/mohammed-abouzaid/abouzaid_original-4f5f03379ff345dc53d8afd47027fbab.jpg'},
 {'name': 'New York',
  'title': 'Associate Professor (w/ tenure)',
  'img': '//www.math.columbia.edu/department/website/wp-content/uploads/connections-images/amol-aggarwal/AmolAggarwal-450732ed7d7b0ab496fe9e1e33b7ce46.jpg'}]

In [271]:
# only the standing faculty
penn_math = get_department_info("https://www.math.upenn.edu/people/standing-faculty")

In [272]:
penn_math[:3]

[{'name': 'Jonathan  Block',
  'title': 'Professor of Mathematics',
  'img': 'https://www.math.upenn.edu/sites/default/files/styles/profile-thumb/public/Block_0.jpg?itok=8lsKofiB'},
 {'name': 'Eugenio  Calabi',
  'title': None,
  'img': '../sites/www.math.upenn.edu/themes/math/images/penn-shield.png'},
 {'name': 'Ching-Li  Chai',
  'title': 'Professor of Mathematics',
  'img': 'https://www.math.upenn.edu/sites/default/files/styles/profile-thumb/public/Chai.jpg?itok=tsPC4wGh'}]

In [273]:
h_math = get_department_info("https://www.math.harvard.edu/people/")

In [275]:
h_math[:2]

[{'name': 'Alpöge, Levent',
  'title': 'Junior Fellow',
  'img': 'https://www.math.harvard.edu/wp-content/uploads/Levent_Alpoge.jpeg'},
 {'name': 'Antunes, Jonier',
  'title': 'Lecturer',
  'img': 'https://www.math.harvard.edu/wp-content/uploads/Jonier_Amaral_Antunes.jpeg'}]

In [276]:
y_math = get_department_info("https://math.yale.edu/people/all-faculty")

In [278]:
y_math
# Doesn't work because the class names are "even"/"odd"

[{'name': None, 'title': None, 'img': None},
 {'name': 'About', 'title': 'News', 'img': None},
 {'name': 'About', 'title': 'News', 'img': None},
 {'name': 'News', 'title': None, 'img': None},
 {'name': 'Employment', 'title': None, 'img': None},
 {'name': 'People', 'title': 'All Faculty', 'img': None},
 {'name': 'All Faculty', 'title': None, 'img': None},
 {'name': 'Gibbs Assistant Professors', 'title': None, 'img': None},
 {'name': 'Visitors & Other Researchers', 'title': None, 'img': None},
 {'name': 'Emeritus Faculty', 'title': None, 'img': None},
 {'name': 'Staff', 'title': None, 'img': None},
 {'name': 'Courses', 'title': None, 'img': None},
 {'name': 'Ph.D. Requirements', 'title': None, 'img': None},
 {'name': 'The Qualifying Exams', 'title': None, 'img': None},
 {'name': 'Undergraduate',
  'title': 'First year student resources',
  'img': None},
 {'name': 'First year student resources', 'title': None, 'img': None},
 {'name': 'Director of Undergraduate Studies', 'title': None, 'im

In [279]:
b_math = get_department_info("https://www.brown.edu/academics/math/faculty")
# doesn't have a class name for the profile

IndexError: list index out of range