In [1]:
import requests 
from bs4 import BeautifulSoup 
import re
from statistics import mode
import unidecode

## Common functions

In [2]:
def get_soup(URL, timeout=30):
    """Download a page and return the ``<body>`` of its parsed HTML.

    Parameters
    ----------
    URL : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Defaults to 30 seconds.

    Returns
    -------
    The ``body`` attribute of the ``BeautifulSoup`` tree built with the
    ``html5lib`` parser.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would stall every loop that iterates over many URLs.
    r = requests.get(URL, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html5lib')
    return soup.body

In [3]:
def select_headshots(soup):
    """Pick a spread of ``<img>`` tags sampled across the page.

    Fewer than 3 images raises ``Exception("Not enough imgs")``.
    With 3-9 images, three are taken (one third in, two thirds in, and the
    second to last). With more than 50 images, nine evenly spaced ones are
    taken; otherwise six evenly spaced ones.
    """
    imgs = soup.find_all('img')
    n = len(imgs)

    if n < 3:
        raise Exception("Not enough imgs")

    if n < 10:
        third = n // 3
        return [imgs[third], imgs[third * 2], imgs[n - 2]]

    # Split the list into 10 (large pages) or 7 buckets and take the image
    # at every interior bucket boundary.
    buckets = 10 if n > 50 else 7
    step = n // buckets
    return [imgs[step * k] for k in range(1, buckets)]

In [4]:
# Check whether an img tag exposes its link through `src` (True) or not (False);
# callers fall back to `srcset` in the latter case.
def src_or_srcset(tag):
    """Return True if ``tag['src']`` can be read, False otherwise."""
    try:
        tag['src']
    except Exception:
        # `except Exception` rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not silently turned into "no src".
        return False
    return True

In [5]:
def find_name_pos(tag):
    """Return the index of the first stripped string in ``tag`` that looks like a name.

    A string qualifies when, after whitespace is collapsed and it is
    transliterated with ``unidecode``, it matches the name pattern: two or
    three space-separated words made of letters, dots and hyphens, optionally
    followed by characters from ``", Ph.D."``. It must also contain none of
    the excluded words below.

    Raises
    ------
    Exception
        ``"name position not found"`` when no string qualifies.
    """
    # Raw string: `\.` and `\-` are invalid escapes in a normal string literal
    # and trigger a warning on recent Python versions. The pattern is unchanged.
    name_pattern = r"^([a-zA-Z\.\-]*[,]* [a-zA-Z\.]* [a-zA-Z\.\-]*|[a-zA-Z\.\-]*[,]* [a-zA-Z\.\-]*)[, Ph\.D\.]*$"
    # Short strings that pass the pattern but are headings, not people.
    excluded_words = ("University", "College", "Department", "Faculty",
                      "Research", "Interest", "Profile")

    for pos, s in enumerate(tag.stripped_strings):
        normalized = unidecode.unidecode(" ".join(s.split()))
        if re.match(name_pattern, normalized) and not any(word in s for word in excluded_words):
            return pos
    raise Exception("name position not found")

In [6]:
def find_title_pos(tag):
    """Return the index of the first stripped string in ``tag`` that contains a title keyword.

    The keyword match is case-insensitive (Professor, Lecturer, Student, ...).

    Raises
    ------
    Exception
        ``"title position not found"`` when no string matches.
    """
    title_pattern = "(?i).*(Professor|Lecturer|Student|Director|Fellow|Adjunct|Assistant|Coordinator|Postgraduate|Postdoctoral|Scientist|Visiting|Associate|Staff|PhD).*"
    # A plain for-loop replaces the manual next()/bare-except pair, so only
    # exhaustion of the strings produces the "not found" error. Real errors
    # (and KeyboardInterrupt) are no longer relabelled.
    for pos, s in enumerate(tag.stripped_strings):
        if re.match(title_pattern, s):
            return pos
    raise Exception("title position not found")

In [7]:
def find_pos(tags):
    """Return the most common ``(name_pos, title_pos)`` across sample tags.

    Each tag is probed with ``find_name_pos`` and ``find_title_pos``. A tag
    contributes only when both succeed, so the two lists stay the same length.

    Raises
    ------
    Exception
        ``"failed to find positions"`` when no tag yields both positions.
    """
    name_positions = []
    title_positions = []
    for tag in tags:
        try:
            name_pos = find_name_pos(tag)
            title_pos = find_title_pos(tag)
        except Exception:
            # This sample does not look like a profile; skip it.
            # `except Exception` keeps KeyboardInterrupt/SystemExit propagating.
            continue
        name_positions.append(name_pos)
        title_positions.append(title_pos)

    if len(name_positions) == 0:
        raise Exception("failed to find positions")
    return mode(name_positions), mode(title_positions)

In [8]:
def is_name(s):
    """Return ``s`` if it looks like a person's name, otherwise ``None``.

    ``None`` input yields ``None``. The check collapses whitespace,
    transliterates with ``unidecode`` and matches the name pattern: two or
    three space-separated words of letters, dots and hyphens, optionally
    followed by characters from ``", Ph.D."``.
    """
    if s is None:
        return None
    # Raw string: `\.` and `\-` are invalid escapes in a normal string literal
    # and trigger a warning on recent Python versions. The pattern is unchanged.
    name_pattern = r"^([a-zA-Z\.\-]*[,]* [a-zA-Z\.]* [a-zA-Z\.\-]*|[a-zA-Z\.\-]*[,]* [a-zA-Z\.\-]*)[, Ph\.D\.]*$"
    normalized = unidecode.unidecode(" ".join(s.split()))
    return s if re.match(name_pattern, normalized) else None

In [9]:
def is_title(s):
    """Return ``s`` when it contains a title keyword (case-insensitive), else ``None``.

    ``None`` input is passed through as ``None``.
    """
    if s is None:
        return None
    matched = re.match("(?i).*(Professor|Lecturer|Student|Director|Fellow|Adjunct|Assistant|Coordinator|Postgraduate|Postdoctoral|Scientist|Visiting|Associate|Staff|PhD).*", s)
    return s if matched else None

In [10]:
def find_by_pos(tag, pos):
    """Return the ``pos``-th stripped string of ``tag`` (0-based), or ``None``.

    ``None`` is returned when the tag has fewer than ``pos + 1`` strings or
    when ``pos`` is negative.
    """
    strings = tag.stripped_strings
    # next() with a default handles "position not present" directly. The old
    # bare except would also have hidden unrelated errors and interrupts.
    return next((text for index, text in enumerate(strings) if index == pos), None)

In [11]:
def find_img(tag, src):
    """Return the image link of the first ``<img>`` inside ``tag``, or ``None``.

    Parameters
    ----------
    tag : element exposing ``find``
        Profile container to search.
    src : bool
        True to read the ``src`` attribute; False to read the first entry of
        ``srcset``.

    Returns
    -------
    The link, or ``None`` when there is no ``<img>`` or the requested
    attribute is missing or empty.
    """
    # The original ran this lookup twice. The first copy, wrapped in a bare
    # except, was dead code because its result was always overwritten.
    img_tag = tag.find('img')
    if img_tag is None:
        return None

    if src:
        return img_tag.get('src')

    # srcset looks like "url1 1x, url2 2x"; the first token is the first URL.
    candidates = (img_tag.get('srcset') or '').split()
    return candidates[0] if candidates else None

In [12]:
def get_info(profs, name_pos, title_pos, src_or_not):
    """Extract ``{'name', 'title', 'img'}`` dicts from profile containers.

    Parameters
    ----------
    profs : iterable of tags
        Candidate profile containers.
    name_pos, title_pos : int
        Positions of the name and title among each container's stripped strings.
    src_or_not : bool
        Forwarded to ``find_img`` (True = read ``src``, False = ``srcset``).

    Returns
    -------
    list of dict
        One entry per container where both name and title were found.

    Raises
    ------
    Exception
        ``"FAILED"`` when fewer than 5 profiles were extracted, or when more
        than a third of the containers were missing a name or title.
    """
    items = []
    total = 0
    no_fails = 0  # number of containers that FAILED (name kept from the original)
    for p in profs:
        total += 1
        name = find_by_pos(p, name_pos)
        title = find_by_pos(p, title_pos)
        if name is None or title is None:
            # Originally this counter sat inside the success branch and could
            # never fire, so the failure-ratio check below was dead.
            no_fails += 1
            continue
        img = find_img(p, src_or_not)
        items.append({'name': name, 'title': title, 'img': img})

    # The length check comes first, so an empty `profs` raises "FAILED"
    # instead of a ZeroDivisionError from the ratio.
    if len(items) < 5 or no_fails / total > 1/3:
        raise Exception("FAILED")
    return items

## Using tr

In [13]:
def find_profs_tr(soup):
    """Return every ``<tr>`` in ``soup``; raise ``Exception("No tr")`` if there are none."""
    rows = soup.find_all('tr')
    if len(rows) > 0:
        return rows
    raise Exception("No tr")

In [14]:
def get_department_info_tr(URL):
    """Scrape a department page whose profiles are laid out as table rows.

    Every ``<tr>`` is treated as a candidate profile. The name and title
    positions are learned from a handful of rows sampled across the table
    and then applied to all rows via ``get_info``.

    Returns
    -------
    list of dict
        The ``get_info`` result. Any failure in the helpers propagates.
    """
    # get the soup (entire webpage)
    soup = get_soup(URL)

    profs = find_profs_tr(soup)

    # Decide between `src` and `srcset` from the first row that holds an image.
    # The original looped on an undefined `tag`, which raised NameError or never
    # terminated. If no row has an image the value is irrelevant, because
    # find_img returns None without an <img>.
    src_or_not = True
    for row in profs:
        img_tag = row.find('img')
        if img_tag is not None:
            src_or_not = src_or_srcset(img_tag)
            break

    step = len(profs) // 5
    if step == 0:
        # Fewer than five rows: use them all instead of sampling row 0 five times.
        tags = list(profs)
    else:
        # Clamp the last sample; step * 5 equals len(profs) whenever the row
        # count is a multiple of 5, which used to raise IndexError.
        last = len(profs) - 1
        tags = [profs[min(step * i, last)] for i in range(1, 6)]

    name_pos, title_pos = find_pos(tags)

    return get_info(profs, name_pos, title_pos, src_or_not)

## Using class name

In [15]:
# For each tag containing an img, find its first ancestor whose text contains a title keyword
# (professor, lecturer, student (some pages include grad students), ...).
# My rationale is that since the tag already includes the title, it likely also includes the name of the person.

def find_profile_class(tag, levels_allowed):
    """Climb from ``tag`` to the first ancestor that mentions a title keyword and has a class.

    Starting at ``tag`` itself, each node's text is checked for a title
    keyword. When one is found, the first ``class="..."`` value in the node's
    serialized HTML is returned with the node. A keyword node without any
    ``class=`` costs one unit of ``levels_allowed`` before moving to the
    parent. Nodes without a keyword are climbed past for free.

    Returns
    -------
    (node, class_value)

    Raises
    ------
    Exception
        ``"Class name not found"`` once ``levels_allowed`` reaches 0.
    """
    node = tag
    budget = levels_allowed
    while True:
        if budget == 0:
            raise Exception("Class name not found")

        text = ' '.join(node.stripped_strings)
        if re.search(r"(?i)(Professor|Lecturer|Student|Director|Fellow|Adjunct|Assistant|Coordinator|Postgraduate|Postdoctoral|Scientist|Visiting|Associate|Staff|PhD|Dean|Senior|Preceptor)", text):
            class_values = re.findall(r'class="(.+?)"', str(node))
            if len(class_values) != 0:
                return node, class_values[0]
            # Keyword found but no class anywhere in this node: spend one level.
            budget -= 1

        node = node.parent

In [16]:
def get_department_info_class(URL):
    """Scrape a department page by locating the CSS class shared by profile containers.

    Steps: sample headshots, find the profile container and class for each
    sample, and take the most common class. The name and title positions are
    then learned from the samples carrying that class, and every element with
    that class is extracted.

    Returns
    -------
    list of dict
        The ``get_info`` result.

    Raises
    ------
    Exception
        When no class is found, when every sample yields a different class,
        when positions cannot be determined, or from ``get_info``.
    """
    soup = get_soup(URL)

    headshots = select_headshots(soup)
    src_or_not = src_or_srcset(headshots[0])

    profile_classes = []
    tags = []
    for headshot in headshots:
        try:
            tag, profile_c = find_profile_class(headshot, 2)
        except Exception:
            # This sample has no usable ancestor (e.g. a logo); skip it.
            continue
        profile_classes.append(profile_c)
        tags.append(tag)

    if len(profile_classes) == 0:
        raise Exception("class name not found")
    elif len(set(profile_classes)) == len(profile_classes):
        raise Exception("class names for profiles are different")
    profile_class = mode(profile_classes)

    # Keep only samples carrying the winning class. The original filtered
    # `tags` and then zipped the filtered list against the unfiltered
    # `profile_classes`, which paired tags with the wrong class names.
    matching_tags = [t for (t, c) in zip(tags, profile_classes) if c == profile_class]

    # Shared helper: raises "failed to find positions" when no sample works.
    name_pos, title_pos = find_pos(matching_tags)

    profs = soup.find_all(attrs={'class': profile_class})

    return get_info(profs, name_pos, title_pos, src_or_not)

## Using children

In [17]:
def find_profile_children(tags):
    """Find the container shared by the sample tags and return its profile-like children.

    All tags are walked upwards one level at a time while every parent seen
    is recorded. The first parent encountered twice is taken as the shared
    container. Its direct children with the same tag name as the child that
    led to it are returned.

    Raises
    ------
    Exception
        ``"no common parent found"`` when ``tags`` is empty or the ancestors
        run out without a repeat.
    """
    seen_parents = []
    shared_parent = None
    target_tag = None
    level = list(tags)

    while shared_parent is None:
        if not level:
            # The original looped forever on empty input and hit
            # `None.parent` once the ancestors were exhausted.
            raise Exception("no common parent found")

        next_level = []
        for tag in level:
            candidate = tag.parent
            if candidate is None:
                # Reached the top of the tree on this branch.
                continue
            # NOTE(review): `in` relies on the elements' own equality, which
            # for bs4 tags is presumably content-based rather than identity.
            # Confirm this is the intended comparison.
            if candidate in seen_parents:
                target_tag = tag
                shared_parent = candidate
                break
            seen_parents.append(candidate)
            next_level.append(candidate)
        level = next_level

    tag_name = target_tag.name
    return [child for child in shared_parent.findChildren(recursive=False)
            if child.name == tag_name]

In [18]:
def get_department_info_children(URL):
    """Scrape a department page by treating the children of a shared container as profiles.

    The container's children come from ``find_profile_children``. Those that
    contain one of the sampled headshots are used to learn the name and title
    positions, which are then applied to every child.

    Returns
    -------
    list of dict
        The ``get_info`` result.

    Raises
    ------
    Exception
        ``"children failed"`` when fewer than two thirds of the headshots map
        to distinct children. Errors from the helpers also propagate.
    """
    # get the soup (entire webpage)
    soup = get_soup(URL)

    headshots = select_headshots(soup)

    profs = find_profile_children(headshots)
    src_or_not = src_or_srcset(headshots[0])

    # Serialise each candidate once instead of once per headshot.
    prof_html = [(p, str(p)) for p in profs]

    tags = []
    for h in headshots:
        h_html = str(h)
        for p, html in prof_html:
            if h_html in html:
                # Order-preserving de-duplication replaces list(set(...)),
                # whose arbitrary order made the result non-deterministic.
                if p not in tags:
                    tags.append(p)
                break
    if len(tags) / len(headshots) < 2/3:
        raise Exception("children failed")

    # Shared helper: raises "failed to find positions" when no sample works.
    name_pos, title_pos = find_pos(tags)

    return get_info(profs, name_pos, title_pos, src_or_not)

## Brute Force

In [19]:
def find_profile_bf(tag, count):
    """Climb from ``tag`` until a node's text mentions a title keyword.

    Returns the node together with ``count`` plus the number of levels climbed.
    """
    node = tag
    levels = count
    while not re.search(r"(?i)(Professor|Lecturer|Student|Director|Fellow|Adjunct|Assistant|Coordinator|Postgraduate|Postdoctoral|Scientist|Visiting|Associate|Staff|PhD|Dean|Senior|Preceptor)", ' '.join(node.stripped_strings)):
        levels = levels + 1
        node = node.parent
    return node, levels

In [20]:
def get_department_info_bf(URL):
    """Scrape a department page by brute force.

    For sampled headshots, count how many levels must be climbed to reach an
    ancestor mentioning a title keyword. The most common count is then
    applied to every image on the page to obtain the profile containers.

    Returns
    -------
    list of dict
        The ``get_info`` result.

    Raises
    ------
    Exception
        ``"bf failed"`` when no sample reaches a keyword, ``"counts are all
        different"`` when the samples disagree completely, plus errors from
        the helpers.
    """
    soup = get_soup(URL)

    headshots = select_headshots(soup)
    src_or_not = src_or_srcset(headshots[0])

    counts = []
    tags = []
    for headshot in headshots:
        try:
            tag, count = find_profile_bf(headshot, 0)
        except Exception:
            # No ancestor of this image mentions a title keyword; skip it.
            continue
        counts.append(count)
        tags.append(tag)

    if len(counts) == 0:
        raise Exception("bf failed")
    elif len(set(counts)) == len(counts):
        raise Exception("counts are all different")
    count = mode(counts)

    # Learn positions only from the samples that sit at the winning depth.
    # The shared helper raises "failed to find positions" when none works.
    matching_tags = [t for (t, c) in zip(tags, counts) if c == count]
    name_pos, title_pos = find_pos(matching_tags)

    profs = []
    seen_ids = set()
    for img in soup.find_all('img'):
        ancestor = img
        for _ in range(count):
            ancestor = ancestor.parent
            if ancestor is None:
                # Image sits too close to the root (e.g. a header logo). The
                # original crashed here with `None.parent`.
                break
        if ancestor is None or id(ancestor) in seen_ids:
            # Skip shallow images, and containers already collected through
            # another image, so no profile is emitted twice.
            continue
        seen_ids.add(id(ancestor))
        profs.append(ancestor)

    return get_info(profs, name_pos, title_pos, src_or_not)

## Testing

In [28]:
# Fetch and parse a single faculty page for ad-hoc inspection.
url = "https://arts.columbia.edu/film/faculty"
soup = get_soup(url)

In [35]:
# Take the first <div> whose inline style sets a background-image, then pull
# out whatever sits between the parentheses of that style value.
t = soup.find_all('div', style=lambda value: value and 'background-image' in value)[0]
# Raw string: '\(' is an invalid escape in an ordinary string literal.
url = re.findall(r'\((.*?)\)', t['style'])[0]

In [49]:
url

'https://arts.columbia.edu/sites/arts.columbia.edu/files/film_hs_apetri_bogdan_2.jpeg'

In [231]:
# Run the brute-force strategy on the first ten URLs, recording which pages
# succeed (with their results) and which fail.
bf_works = []
bf_fails = []
for url in urls[:10]:
    try:
        res = get_department_info_bf(url)
    except Exception:
        # `except Exception` (not a bare except), so interrupting the kernel
        # stops the loop instead of being counted as a failed URL.
        bf_fails.append(url)
    else:
        item = {'url': url, 'res': res}
        bf_works.append(item)

In [217]:
# Run the children strategy on the first ten URLs, recording which pages
# succeed (with their results) and which fail.
children_works = []
children_fails = []
for url in urls[:10]:
    try:
        res = get_department_info_children(url)
    except Exception:
        # `except Exception` (not a bare except), so interrupting the kernel
        # stops the loop instead of being counted as a failed URL.
        children_fails.append(url)
    else:
        item = {'url': url, 'res': res}
        children_works.append(item)

In [234]:
children_works

[{'url': 'https://statistics.fas.harvard.edu/faculty',
  'res': [{'name': 'Morgane Austern',
    'title': 'Assistant Professor',
    'img': '//static.hwpi.harvard.edu/files/styles/profile_thumbnail/public/statistics-2/files/img-20210401-wa0022.jpg?m=1647356154&itok=X-5OHRTO'},
   {'name': 'Joseph K. Blitzstein',
    'title': 'Professor of the Practice in Statistics,',
    'img': '//static.hwpi.harvard.edu/files/styles/profile_thumbnail/public/statistics-2/files/blitzstein_1_0.jpg?m=1629481923&itok=K83gCeQ4'},
   {'name': 'Stephen Blyth',
    'title': 'Professor of the Practice in Statistics',
    'img': '//static.hwpi.harvard.edu/files/styles/profile_thumbnail/public/statistics-2/files/blyth_1.jpg?m=1626193422&itok=ZmDqLNx4'},
   {'name': 'Mark E. Glickman',
    'title': 'Senior Lecturer in Statistics,',
    'img': '//static.hwpi.harvard.edu/files/styles/profile_thumbnail/public/statistics-2/files/glickman_resize.jpg?m=1597767942&itok=kwDV0EHW'},
   {'name': 'Kosuke Imai',
    'title':

In [161]:
# Run the class-name strategy on every URL, recording which pages succeed
# (with their results) and which fail.
class_name_works = []
class_name_fails = []
for url in urls:
    try:
        res = get_department_info_class(url)
    except Exception:
        # `except Exception` (not a bare except), so interrupting the kernel
        # stops the loop instead of being counted as a failed URL.
        class_name_fails.append(url)
    else:
        item = {'url': url, 'res': res}
        class_name_works.append(item)

In [175]:
len(class_name_works)

32

In [22]:
# Run the table-row strategy on every URL, recording which pages succeed
# (with their results) and which fail.
tr_works = []
tr_fails = []

for url in urls:
    try:
        res = get_department_info_tr(url)
    except Exception:
        # `except Exception` (not a bare except), so interrupting the kernel
        # stops the loop instead of being counted as a failed URL.
        tr_fails.append(url)
    else:
        item = {'url': url, 'res': res}
        tr_works.append(item)

In [233]:
tr_works

In [43]:
# Spot-check each page the class-name strategy handled: print its URL, the
# number of extracted profiles, and the first, middle and last profile.
for i in class_name_works:
    print("________________________________________________________________________________________________________")
    print(f"url: {i.get('url')}")
    res = i.get('res')
    print(f"length: {len(res)}")
    print("___________")
    print(res[0])
    print(res[len(res)//2])
    print(res[-1])

In [62]:
# https://www.math.columbia.edu/people/faculty-by-rank/
# and https://math.yale.edu/people/all-faculty also do not work,
# so they are added to the failures by hand.
class_name_fails.extend([
    "https://www.math.columbia.edu/people/faculty-by-rank/",
    "https://math.yale.edu/people/all-faculty",
])

## Using siblings

In [19]:
# Similar to the class-name approach in using recursion, but collects all the
# profiles through siblings instead.
def find_profile_siblings(tags):
    """Climb all ``tags`` in lockstep until they share one parent, then return the siblings.

    Once every tag has the same parent, the second tag (``tags[1]``) is used
    as the reference profile.

    Returns
    -------
    (tag, tag_lst)
        The reference tag, and a list holding it together with all of its
        siblings in document order.
    """
    parents = [tag.parent for tag in tags]
    if len(set(parents)) != 1:
        return find_profile_siblings(parents)

    tag = tags[1]
    # previous_siblings yields the nearest sibling first, i.e. in reverse
    # document order, so flip it to keep profiles in page order.
    tag_lst = list(tag.previous_siblings)
    tag_lst.reverse()
    tag_lst.append(tag)
    tag_lst.extend(tag.next_siblings)
    return tag, tag_lst

In [20]:
def get_department_info_siblings(URL):
    """Scrape a department page by treating the siblings of one profile as all profiles.

    Returns a list of ``{'name', 'title', 'img'}`` dicts, one per sibling.
    Values may be ``None``, since no filtering is applied here.
    """
    # Parse the whole page.
    soup = get_soup(URL)

    # Sample headshot tags from different positions. Every page has
    # headshots ('img' in html), so they are common ground whatever the design.
    headshots = select_headshots(soup)

    tmp_tag, profs = find_profile_siblings(headshots)
    src_or_not = src_or_srcset(headshots[1])

    # Learn where the name and title sit from the reference profile
    # (an ancestor of one of the headshots).
    name_pos = find_name_pos(tmp_tag)
    title_pos = find_title_pos(tmp_tag)

    # Build one record per sibling profile.
    return [
        {
            'name': find_by_pos(p, name_pos),
            'title': find_by_pos(p, title_pos),
            'img': find_img(p, src_or_not),
        }
        for p in profs
    ]

In [21]:
# Department people/faculty pages that the scraping strategies are run against.
# NOTE: the test cells earlier in this notebook read `urls`, so this cell has
# to be executed before them.
urls = ["https://stat.columbia.edu/department-directory/faculty-and-lecturers/", 
         "https://statistics.wharton.upenn.edu/faculty/faculty-list/",
         "https://statistics.fas.harvard.edu/faculty",
         "https://statistics.yale.edu/people",
         "https://www.brown.edu/academics/public-health/biostats/people/faculty",
         "https://stat.cornell.edu/people/faculty",
         "https://www.math.columbia.edu/people/faculty-by-rank/",
         "https://www.math.upenn.edu/people/standing-faculty",
         "https://www.math.harvard.edu/people/",
         "https://math.yale.edu/people/all-faculty",
         "https://www.brown.edu/academics/math/faculty", 
         "https://math.dartmouth.edu/people/people-select.php?list=permanent",
         "https://economics.dartmouth.edu/people",
         "https://economics.yale.edu/people/faculty",
         "https://english.columbia.edu/content/faculty#!#%2Fviews-display-37",
         "http://ealac.columbia.edu/people/professors/",
         "https://english.yale.edu/people/ladder-faculty",
         "https://www.ieor.columbia.edu/directory?gsarqfields%5Bbiotypetid%5D=30",
         "https://www.cs.columbia.edu/people/faculty/",
         "https://afamstudies.columbia.edu/content/people",
         "https://anthropology.columbia.edu/content/faculty-directory",
         "https://www.biochem.cuimc.columbia.edu/research/research-faculty",
         "https://www8.gsb.columbia.edu/faculty-research/divisions/decision-risk-operations/people/faculty",
         "https://arts.columbia.edu/film/",
         "https://arth.sas.upenn.edu/people/standing-faculty",
         "https://www.english.upenn.edu/people/faculty",
         "https://www.ling.upenn.edu/people/faculty",
         "https://hss.sas.upenn.edu/people",
         "https://english.dartmouth.edu/people",
         "https://german.dartmouth.edu/people",
         "https://ascl.dartmouth.edu/people",
         "https://chemistry.dartmouth.edu/people",
         "https://web.cs.dartmouth.edu/people",
         "https://ealc.fas.harvard.edu/people/taxonomy/term/6536",
         "https://english.fas.harvard.edu/our-people",
         "https://philosophy.fas.harvard.edu/faculty-1",
         "https://cals.cornell.edu/global-development/about/people/faculty",
         "https://cals.cornell.edu/landscape-architecture/people",
         "https://aap.cornell.edu/academics/faculty",
         "https://anthropology.cornell.edu/anthropology-faculty"
        ]