In [6]:
import requests 
from bs4 import BeautifulSoup 
import re
from statistics import mode
import pandas as pd
import numpy as np

## Common functions

In [7]:
def get_soup(URL):
    r = requests.get(URL) 
    return BeautifulSoup(r.content, 'html5lib') 

In [8]:
# return 3 images, to make sure images selected are the headshots 
def select_headshots(soup):
    headshots = soup.body.find_all('img')
    return headshots[len(headshots)//6], headshots[len(headshots)//3], headshots[len(headshots)//2]

In [9]:
# check if the link for imgs are src or srcset
def src_or_srcset(tag):
    try: 
        tag['src']
        return True
    except:
        return False

In [10]:
# possible to use ML instead of regex to find name?
def find_name_pos(tag):
    pos = 0
    strs = tag.stripped_strings
    while True: 
        s = next(strs)
        if re.match("^([a-zA-Z.]*[,]* [a-zA-Z.]* [a-zA-Z.]*|[a-zA-Z.]*[,]* [a-zA-Z.]*)[, Ph.D.]*$", s):
            if "University" not in s and "College" not in s and 'Department' not in s:
                return pos
        pos += 1

In [11]:
def find_title_pos(tag):
    pos = 0
    strs = tag.stripped_strings
    while True: 
        s = next(strs)
        if re.match("[a-zA-Z0-9 ]*(Professor|Lecturer|Student|Director)[a-zA-Z0-9_ ]*", s):
            return pos
        else:
            pos += 1

In [12]:
def find_by_pos(tag, pos):
    it = tag.stripped_strings
    try:
        item = next(x for i,x in enumerate(it) if i==pos)
    except:
        item = None
    return item

In [13]:
def find_img(tag, src):
    try:
        if src:
            img = tag.find('img')['src']
        else:
            img = tag.find('img')['srcset'].split()[0]
    except:
        img = None
    return img

## Using class name

In [14]:
# for each tag containing imgs, find their first parents that contains the the keyword 
# (professor, lecturer, student (some pages include grad students), ...) 
# My rational is since the tag includes the title already, it likely also includes the name of the person. 

def find_profile_class(tag):
    s = str(tag)
    if re.search(r'(Professor|Lecturer|Student|Director)', s):
        m = re.findall(r'class="(.+?)"', s)
        return tag, m[0]
    else:
        return find_profile_class(tag.parent)

In [15]:
def get_department_info_class_name(URL):
    # get the soup (entire webpage)
    soup = get_soup(URL)
    
    # get 3 headshot tags in different pos. Because all the pages have headshots ('img' in html), 
    # that's something in common for all pages regardless of the design 
    headshots = select_headshots(soup)
    
    # use the random headshot to find the name of the class containing profiles, name pos and title pos
    profile_classes = []
    tags = []
    src_or_not = None
    
    # for each tag containing imgs, find their first parents that contains the the keyword 
    # (professor, lecturer, student (some pages include grad students), ...) 
    # My rational is since the tag includes the title already, it likely also includes the name of the person. 
    for headshot in headshots:
        tag, profile_c = find_profile_class(headshot)
        profile_classes.append(profile_c)
        tags.append(tag)
        
    # compare the class name of tag containg profile found from ancestors of the 3 headshot tags, 
    # if 2 or more out of 3 are same, select it as the class name of profile tags. If all 3 are different, throw an error
    if len(set(profile_classes)) != 3:
        if profile_classes[0] == profile_classes[1] or profile_classes[0] == profile_classes[2]:
            profile_class = profile_classes[0]
            tmp_tag = tags[0]
            src_or_not = src_or_srcset(headshots[0])
        else:
            profile_class = profile_classes[1]
            tmp_tag = tags[1]
            src_or_not = src_or_srcset(headshots[1])
    else:
        raise Exception("class names for profiles are different")

    # find the position of name and title in the tmp_tag (from 1 of parents of headshots)
    name_pos = find_name_pos(tmp_tag)
    title_pos = find_title_pos(tmp_tag)

    
    # get all profiles
    profs = soup.find_all(attrs = {'class':profile_class})
    items = []
    # get info for each profile
    for p in profs:
        name = find_by_pos(p, name_pos)
        title = find_by_pos(p, title_pos)
        img = find_img(p, src_or_not)
        item = {'name': name, 'title': title, 'img':img}
        items.append(item)
    return items

## Using siblings

In [16]:
# similar to class name in using recursion, but get all the profiles using siblings instead
def find_profile_siblings(tag):
    s = str(tag)
    if re.search(r'(Professor|Lecturer|Student|Director)', s):
        tags = []
        p_siblings = tag.previous_siblings
        n_siblings = tag.next_siblings
        tags.extend(list(p_siblings))
        tags.append(tag)
        tags.extend(n_siblings)
        return (tag, tags)
    else:
        return find_profile_siblings(tag.parent)

In [17]:
def get_department_info_siblings(URL):
    # get the soup (entire webpage)
    soup = get_soup(URL)
    
    # get 3 headshot tags in different pos. Because all the pages have headshots ('img' in html), 
    # that's something in common for all pages regardless of the design 
    headshots = select_headshots(soup)
    
    # use the random headshot to find the name of the class containing profiles, name pos and title pos
    tmp_tags = []
    lsts_tags = []
    src_or_not = None
    
    for headshot in headshots:
        tmp_tag, lst_tags = find_profile_siblings(headshot)
        tmp_tags.append(tmp_tag)
        lsts_tags.append(lst_tags)
        
    # choose the longest list
    if len(lsts_tags[0]) >= len(lsts_tags[1]) and len(lsts_tags[0]) >= len(lsts_tags[2]):
        tmp_tag = tmp_tags[0]
        profs = lsts_tags[0]
        src_or_not = src_or_srcset(headshots[0])
    elif len(lsts_tags[1]) >= len(lsts_tags[0]) and len(lsts_tags[1]) >= len(lsts_tags[2]):
        tmp_tag = tmp_tags[1]
        profs = lsts_tags[1]
        src_or_not = src_or_srcset(headshots[1])
    else:
        tmp_tag = tmp_tags[2]
        profs = lsts_tags[2]
        src_or_not = src_or_srcset(headshots[2])

        
    # find the position of name and title in the tmp_tag (from 1 of parents of headshots)
    name_pos = find_name_pos(tmp_tag)
    title_pos = find_title_pos(tmp_tag)

    items = []
    # get info for each profile
    for p in profs:
        name = find_by_pos(p, name_pos)
        title = find_by_pos(p, title_pos)
        img = find_img(p, src_or_not)
        item = {'name': name, 'title': title, 'img':img}
        items.append(item)
    return items

In [40]:
get_department_info_siblings("https://www.brown.edu/academics/math/faculty")

[{'name': 'George Daskalopoulos',
  'title': 'Professor',
  'img': 'https://www.brown.edu/academics/math/sites/math/files/George.jpg'},
 {'name': 'Melody Chan',
  'title': 'Associate Professor',
  'img': 'https://www.brown.edu/academics/math/sites/math/files/Melody.jpg'},
 {'name': 'Christine Breiner',
  'title': 'Associate\xa0Professor',
  'img': 'https://www.brown.edu/academics/math/sites/math/files/Christine.png'},
 {'name': 'Madeline Brandt',
  'title': 'Tamarkin Assistant Professor',
  'img': 'https://www.brown.edu/academics/math/sites/math/files/Brandt.png'},
 {'name': 'Dan Abramovich',
  'title': 'L. Herbert Ballou University Professor',
  'img': 'https://www.brown.edu/academics/math/sites/math/files/Abramovich_1.jpg'},
 {'name': 'Charles Daly',
  'title': 'Tamarkin Assistant Professor',
  'img': 'https://www.brown.edu/academics/math/sites/math/files/Screen%20Shot%202021-07-15%20at%2010.44.28%20AM.png'},
 {'name': 'S. James Gates',
  'title': 'Ford Foundation Professor of Physic

## Testing

In [20]:
urls = ["https://stat.columbia.edu/department-directory/faculty-and-lecturers/", 
         "https://statistics.wharton.upenn.edu/faculty/faculty-list/",
         "https://statistics.fas.harvard.edu/faculty",
         "https://statistics.yale.edu/people",
         "https://www.brown.edu/academics/public-health/biostats/people/faculty",
         "https://stat.cornell.edu/people/faculty",
         "https://www.math.columbia.edu/people/faculty-by-rank/",
         "https://www.math.upenn.edu/people/standing-faculty",
         "https://www.math.harvard.edu/people/",
         "https://math.yale.edu/people/all-faculty",
         "https://www.brown.edu/academics/math/faculty"
        ]

In [55]:
class_name_works = []
class_name_fails = []

siblings_works = []
siblings_fails = []

for url in urls:
    # try class name
    try:
        res = get_department_info_class_name(url)
        item = {'url': url, 'res':res}
        class_name_works.append(item)
    except:
        class_name_fails.append(url)
        
    try:
        res = get_department_info_siblings(url)
        item = {'url': url, 'res':res}
        siblings_works.append(item)
    except:
        siblings_fails.append(url)

In [54]:
class_name_fails

['https://stat.cornell.edu/people/faculty',
 'https://www.brown.edu/academics/math/faculty']

In [50]:
siblings_fails

[]

In [61]:
for i in class_name_works:
    print("________________________________________________________________________________________________________")
    print(f"url: {i.get('url')}")
    res = i.get('res')
    print(f"length: {len(res)}")
    print("___________")
    print(res[0])
    print(res[len(res)//2])
    print(res[-1])

________________________________________________________________________________________________________
url: https://stat.columbia.edu/department-directory/faculty-and-lecturers/
length: 60
___________
{'name': 'David Blei', 'title': 'Professor', 'img': 'https://stat.columbia.edu/wp-content/uploads/connections-images/david-blei/Blei-David-1.jpg'}
{'name': 'Banu Baydil', 'title': 'Lecturer in Discipline, Co-Director of M.A. Programs', 'img': 'https://stat.columbia.edu/wp-content/uploads/connections-images/banu-baydil/banu_original.jpg'}
{'name': 'Rongning Wu', 'title': 'Adjunct Assistant Professor', 'img': None}
________________________________________________________________________________________________________
url: https://statistics.wharton.upenn.edu/faculty/faculty-list/
length: 52
___________
{'name': 'Bhaswar B. Bhattacharya', 'title': 'Assistant Professor of Statistics and Data Science', 'img': 'https://faculty.wharton.upenn.edu/wp-content/uploads/2016/09/Bhaswar_Homepage_Cro

In [62]:
# https://www.math.columbia.edu/people/faculty-by-rank/ 
# and https://math.yale.edu/people/all-faculty also doesn't work
class_name_fails.append("https://www.math.columbia.edu/people/faculty-by-rank/")
class_name_fails.append("https://math.yale.edu/people/all-faculty")

In [63]:
for i in siblings_works:
    print("________________________________________________________________________________________________________")
    print(f"url: {i.get('url')}")
    res = i.get('res')
    print(f"length: {len(res)}")
    print("___________")
    print(res[0])
    print(res[len(res)//2])
    print(res[-1])

________________________________________________________________________________________________________
url: https://stat.columbia.edu/department-directory/faculty-and-lecturers/
length: 5
___________
{'name': None, 'title': None, 'img': None}
{'name': None, 'title': None, 'img': None}
{'name': None, 'title': None, 'img': None}
________________________________________________________________________________________________________
url: https://statistics.wharton.upenn.edu/faculty/faculty-list/
length: 17
___________
{'name': 'Shane T. Jensen', 'title': 'Professor of Statistics and Data Science', 'img': 'https://faculty.wharton.upenn.edu/wp-content/uploads/2012/04/Jensen_shane.jpg'}
{'name': 'Zongming Ma', 'title': 'Associate Professor of Statistics and Data Science, Statistics PhD Coordinator for Admissions', 'img': 'https://faculty.wharton.upenn.edu/wp-content/uploads/2016/11/ZongmingMa.jpg'}
{'name': 'Linda Zhao', 'title': "Professor of Statistics and Data Science, Academic Director

In [66]:
siblings_works[-2]

{'url': 'https://math.yale.edu/people/all-faculty',
 'res': [{'name': None, 'title': None, 'img': None},
  {'name': 'Vincent Moncrief',
   'title': 'Professor of Physics and of Mathematics',
   'img': 'https://math.yale.edu/sites/default/files/styles/people_thumbnail/public/pictures/picture-246-1524160789.jpg?itok=7uaP4ADd'},
  {'name': None, 'title': None, 'img': None},
  {'name': 'Yair Minsky',
   'title': 'Einar Hille Professor of Mathematics',
   'img': 'https://math.yale.edu/sites/default/files/styles/people_thumbnail/public/pictures/picture-286-1512077508.jpg?itok=agFFE8ZQ'},
  {'name': None, 'title': None, 'img': None},
  {'name': 'Ivan Loseu',
   'title': 'Director of Graduate Studies, Professor of Mathematics',
   'img': 'https://math.yale.edu/sites/default/files/styles/people_thumbnail/public/pictures/picture-1399-1562000240.jpg?itok=_iXuFibC'},
  {'name': None, 'title': None, 'img': None},
  {'name': 'Richard Kenyon',
   'title': 'Erastus L. DeForest Professor of Mathematics