In [1]:
import requests 
from bs4 import BeautifulSoup 
import re
from statistics import mode
import unidecode

# Common functions

In [2]:
def get_soup(URL, timeout=30):
    """Download a page and return the ``<body>`` element of its parsed HTML.

    Parameters:
        URL: address of the page to fetch.
        timeout: seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a single unresponsive host
            would block the whole crawl indefinitely.

    Returns:
        The ``body`` attribute of the document parsed with ``html5lib``.

    Raises:
        Whatever ``requests.get`` raises on connection problems or timeouts.
    """
    r = requests.get(URL, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html5lib')
    return soup.body

In [3]:
def select_headshots(soup):
    """Pick a small sample of ``<img>`` tags spread across the page.

    The sample size depends on how many images the page contains:
    three images for fewer than 10, nine for more than 50, six otherwise.

    Raises:
        Exception: when the page has fewer than three images.
    """
    imgs = soup.find_all('img')
    n = len(imgs)
    if n < 3:
        raise Exception("Not enough imgs")
    if n < 10:
        # One third in, two thirds in, and the second-to-last image.
        return [imgs[n // 3], imgs[n // 3 * 2], imgs[n - 2]]
    # Split the page into equal parts and take the image at each inner boundary.
    parts = 10 if n > 50 else 7
    step = n // parts
    return [imgs[step * k] for k in range(1, parts)]

In [30]:
def find_name_pos(tag):
    """Return the index of the first string in ``tag`` that looks like a person's name.

    Each entry of ``tag.stripped_strings`` is whitespace-normalised,
    transliterated with ``unidecode`` and matched against a two- or
    three-word name pattern. Strings containing one of the excluded
    institutional words are skipped even when the pattern matches.

    Raises:
        Exception: when no string in ``tag`` qualifies.
    """
    # Raw string: the pattern contains backslash escapes that are not valid
    # Python string escapes, so a plain literal triggers a syntax warning.
    name_pattern = r"^([a-zA-Z\.\-]*[,]* [a-zA-Z\.]* [a-zA-Z\.\-]*|[a-zA-Z\.\-]*[,]* [a-zA-Z\.\-]*)[, Ph\.D\.]*$"
    excluded_words = ("University", "College", "Department", "Faculty", "Research",
                      "Interest", "Staff", "Profile", "Student")

    # Iterating directly (instead of next() inside a bare except) lets
    # unrelated errors such as KeyboardInterrupt propagate unchanged.
    for pos, s in enumerate(tag.stripped_strings):
        normalized = unidecode.unidecode(" ".join(s.split()))
        if re.match(name_pattern, normalized) and not any(word in s for word in excluded_words):
            return pos
    raise Exception("name position not found")

In [6]:
def find_title_pos(tag):
    """Return the index of the first string in ``tag`` that contains a job-title keyword.

    The keyword match is case-insensitive and is applied to every entry of
    ``tag.stripped_strings`` in order.

    Raises:
        Exception: when no string in ``tag`` contains a title keyword.
    """
    title_pattern = "(?i).*(Professor|Lecturer|Student|Director|Fellow|Adjunct|Assistant|Coordinator|Postgraduate|Postdoctoral|Scientist|Visiting|Associate|Staff|PhD).*"

    # A plain for-loop replaces next() inside a bare except, so only running
    # out of strings means "not found"; other errors (e.g. KeyboardInterrupt)
    # are no longer converted into that message.
    for pos, s in enumerate(tag.stripped_strings):
        if re.match(title_pattern, s):
            return pos
    raise Exception("title position not found")

In [7]:
def find_pos(tags):
    """Determine the most common name and title positions across sample tags.

    For every tag both ``find_name_pos`` and ``find_title_pos`` must succeed
    for its positions to be counted; tags where either lookup fails are
    skipped.

    Returns:
        A ``(name_pos, title_pos)`` tuple holding the mode of each list of
        collected positions.

    Raises:
        Exception: when no tag yielded both positions.
    """
    name_positions = []
    title_positions = []
    for tag in tags:
        try:
            name_pos = find_name_pos(tag)
            title_pos = find_title_pos(tag)
        except Exception:
            # Best effort: a sample tag without a recognisable name/title is
            # ignored. Catching Exception (not everything) keeps
            # KeyboardInterrupt and SystemExit working.
            continue
        name_positions.append(name_pos)
        title_positions.append(title_pos)

    if not name_positions:
        raise Exception("failed to find positions")
    return mode(name_positions), mode(title_positions)

In [8]:
def is_name(s):
    """Return ``s`` when it looks like a person's name, otherwise ``None``.

    ``None`` input is passed through as ``None``. The check is done on a
    whitespace-normalised, ``unidecode``-transliterated copy of ``s``; the
    original string is what gets returned.
    """
    if s is None:
        return None
    normalized = unidecode.unidecode(" ".join(s.split()))
    # Raw string: the pattern contains backslash escapes that are not valid
    # Python string escapes, so a plain literal triggers a syntax warning.
    if re.match(r"^([a-zA-Z\.\-]*[,]* [a-zA-Z\.]* [a-zA-Z\.\-]*|[a-zA-Z\.\-]*[,]* [a-zA-Z\.\-]*)[, Ph\.D\.]*$",
                normalized):
        return s
    return None

In [9]:
def is_title(s):
    """Return ``s`` when it contains a job-title keyword, otherwise ``None``.

    ``None`` input yields ``None``. Matching is case-insensitive.
    """
    if s is None:
        return None
    hit = re.match("(?i).*(Professor|Lecturer|Student|Director|Fellow|Adjunct|Assistant|Coordinator|Doctoral|Postgraduate|Postdoctoral|Scientist|Visiting|Associate|Staff|Dean|Senior|Preceptor).*", s)
    return s if hit else None

In [10]:
def find_by_pos(tag, pos):
    """Return the ``pos``-th entry of ``tag.stripped_strings``, or ``None``.

    ``None`` is returned when ``tag`` has fewer than ``pos + 1`` strings.
    """
    # An explicit loop replaces next() wrapped in a bare except, so only
    # "position not present" maps to None and real errors are not hidden.
    for i, text in enumerate(tag.stripped_strings):
        if i == pos:
            return text
    return None

In [72]:
def find_img(tag):
    """Return the image URL of the first ``<img>`` inside ``tag``, or ``None``.

    The ``src`` attribute is preferred; when it is absent the first
    whitespace-separated token of ``srcset`` is used instead.
    """
    img_tag = tag.find('img')
    if img_tag is None:
        return None

    try:
        return img_tag['src']
    except KeyError:
        # No src attribute: fall through to srcset.
        pass

    try:
        return img_tag['srcset'].split()[0]
    except (KeyError, IndexError, AttributeError):
        # Missing, empty or non-string srcset: there is no usable URL.
        return None

In [73]:
def get_info(profs, name_pos, title_pos, min_items=5, max_fail_ratio=None):
    """Extract name, title and image URL from every candidate profile element.

    A candidate contributes an item only when the string at ``name_pos``
    passes ``is_name`` and the string at ``title_pos`` passes ``is_title``.

    Parameters:
        profs: iterable of candidate profile elements.
        name_pos: index of the name within an element's stripped strings.
        title_pos: index of the title within an element's stripped strings.
        min_items: minimum number of extracted items for the result to count
            as a success.
        max_fail_ratio: optional upper bound on the share of candidates that
            yielded no item. ``None`` (default) disables the check.

    Returns:
        A list of ``{'name', 'title', 'img'}`` dictionaries.

    Raises:
        Exception: ("FAILED") when too few items were extracted, including
            the case where nothing matched at all, or when the failure ratio
            exceeds ``max_fail_ratio``.
    """
    items = []
    candidates = 0
    for p in profs:
        candidates += 1
        name = is_name(find_by_pos(p, name_pos))
        title = is_title(find_by_pos(p, title_pos))
        if name is None or title is None:
            continue
        items.append({'name': name, 'title': title, 'img': find_img(p)})

    # Checking the item count first avoids the division by zero that
    # previously surfaced as ZeroDivisionError when no candidate matched.
    if len(items) < min_items:
        raise Exception("FAILED")

    # The failure counter used to sit inside the success branch and could
    # never increase, so the ratio gate never fired. It is kept opt-in so the
    # default outcome for existing callers stays the same.
    if max_fail_ratio is not None and candidates:
        if (candidates - len(items)) / candidates > max_fail_ratio:
            raise Exception("FAILED")
    return items

# Different methods

## tr

In [74]:
def find_profs_tr(soup):
    """Return every ``<tr>`` element found in ``soup``.

    Raises:
        Exception: when the page contains no table rows.
    """
    rows = soup.find_all('tr')
    if not rows:
        raise Exception("No tr")
    return rows

In [75]:
def get_department_info_tr(soup):
    """Extract profiles from a page that lists people as table rows.

    Five rows spread over the table are sampled to learn where the name and
    the title sit inside a row; those positions are then applied to all rows.

    Raises:
        Exception: propagated from the helpers when the page has no rows, no
            positions can be determined, or too few profiles are extracted.
    """
    profs = find_profs_tr(soup)

    step = len(profs) // 5
    last = len(profs) - 1
    # The fifth sample lands at step * 5, which equals len(profs) whenever the
    # row count is a multiple of five; clamp it to the last valid index.
    tags = [profs[min(step * i, last)] for i in range(1, 6)]

    name_pos, title_pos = find_pos(tags)

    return get_info(profs, name_pos, title_pos)

In [78]:
#get_department_info_tr(get_soup("https://statistics.yale.edu/people"))

## Profile class name

In [16]:
def find_profile_class(tag, levels_allowed):
    """Climb from ``tag`` to an ancestor that mentions a title and exposes a class.

    Starting at ``tag`` and moving to ``.parent`` each step, the first element
    whose text contains a title keyword is inspected. If its serialized markup
    contains a ``class="..."`` attribute, the element and the first such class
    value are returned. Otherwise one unit of ``levels_allowed`` is spent and
    the climb continues.

    Returns:
        ``(element, class_value)``.

    Raises:
        Exception: when ``levels_allowed`` reaches zero.
    """
    title_words = r"(?i)(Professor|Lecturer|Student|Director|Fellow|Adjunct|Assistant|Coordinator|Postgraduate|Postdoctoral|Scientist|Visiting|Associate|Staff|PhD|Dean|Senior|Preceptor)"
    node = tag
    budget = levels_allowed
    while True:
        if budget == 0:
            raise Exception("Class name not found")
        text = ' '.join(list(node.stripped_strings))
        if re.search(title_words, text):
            class_values = re.findall(r'class="(.+?)"', str(node))
            if class_values:
                return node, class_values[0]
            # Title text present but no class attribute: only these steps
            # consume the level budget.
            budget -= 1
        node = node.parent

In [80]:
def get_department_info_class(soup):
    """Extract profiles by locating the CSS class shared by profile elements.

    For each sampled headshot the enclosing profile element and its class
    value are looked up. The most common class value is then used to select
    all profile elements on the page.

    Raises:
        Exception: when no class value is found, when every sampled class
            value is distinct, or when a helper fails.
    """
    headshots = select_headshots(soup)

    profile_classes = []
    tags = []
    for headshot in headshots:
        try:
            tag, profile_c = find_profile_class(headshot, 2)
        except Exception:
            # Best effort: skip headshots with no usable enclosing element.
            # Catching Exception rather than everything keeps
            # KeyboardInterrupt and SystemExit working.
            continue
        profile_classes.append(profile_c)
        tags.append(tag)

    if len(profile_classes) == 0:
        raise Exception("class name not found")
    if len(set(profile_classes)) == len(profile_classes):
        # No class value occurs twice (this includes a single sample), so
        # there is no evidence of a shared profile class.
        raise Exception("class names for profiles are different")
    profile_class = mode(profile_classes)

    tags = [t for (t, c) in zip(tags, profile_classes) if c == profile_class]

    name_pos, title_pos = find_pos(tags)

    profs = soup.find_all(attrs = {'class':profile_class})
    return get_info(profs, name_pos, title_pos)

In [82]:
#get_department_info_class(get_soup("https://stat.columbia.edu/department-directory/faculty-and-lecturers/"))

## Children

In [83]:
def find_profile_children(tags):
    """Find the sibling elements that each wrap one profile.

    All tags are moved up one level at a time. The first time a tag's parent
    has already been seen, that parent is taken as the list container. The
    result is its direct children that share the tag name of the element that
    reached the container.

    Raises:
        Exception: when ``tags`` is empty or the ancestors run out before two
            tags meet in a common parent.
    """
    # Parents are tracked by identity. Beautiful Soup compares elements by
    # their markup, so an ``in`` test on a list would treat two distinct but
    # identical-looking wrappers as the same parent. The ids stay valid
    # because the ancestors are kept alive by the tags passed in.
    seen_parent_ids = set()
    level = list(tags)
    while level:
        next_level = []
        for tag in level:
            candidate = tag.parent
            if candidate is None:
                # This branch reached the top of the tree.
                continue
            if id(candidate) in seen_parent_ids:
                return [child for child in candidate.findChildren(recursive=False)
                        if child.name == tag.name]
            seen_parent_ids.add(id(candidate))
            next_level.append(candidate)
        level = next_level
    raise Exception("common parent not found")

In [84]:
def get_department_info_children(soup):
    """Extract profiles by treating siblings under a common container as profiles.

    The sampled headshots determine the container. Each headshot is then
    matched to the first profile element whose markup contains the headshot's
    markup, and the distinct matches are used to learn the name and title
    positions.

    Raises:
        Exception: ("children failed") when fewer than two thirds as many
            distinct profile elements as headshots were matched, or whatever
            the helpers raise.
    """
    headshots = select_headshots(soup)
    profs = find_profile_children(headshots)

    matched = []
    for shot in headshots:
        shot_html = str(shot)
        owner = next((prof for prof in profs if shot_html in str(prof)), None)
        if owner is not None:
            matched.append(owner)
    tags = list(set(matched))

    if len(tags) / len(headshots) < 2/3:
        raise Exception("children failed")

    name_pos, title_pos = find_pos(tags)
    return get_info(profs, name_pos, title_pos)

In [86]:
#get_department_info_children(get_soup("https://statistics.fas.harvard.edu/faculty/"))

## Brute Forcing

In [22]:
def find_profile_bf(tag, count):
    """Climb from ``tag`` to the nearest element whose text mentions a title.

    Returns:
        ``(element, count)`` where ``count`` is the incoming value plus the
        number of ``.parent`` steps taken.
    """
    title_words = r"(?i)(Professor|Lecturer|Student|Director|Fellow|Adjunct|Assistant|Coordinator|Postgraduate|Postdoctoral|Scientist|Visiting|Associate|Staff|PhD|Dean|Senior|Preceptor)"
    node = tag
    steps = count
    while not re.search(title_words, ' '.join(list(node.stripped_strings))):
        steps = steps + 1
        node = node.parent
    return node, steps

In [89]:
def get_department_info_bf(soup):
    """Extract profiles by climbing a fixed number of levels from every image.

    For each sampled headshot the number of ``.parent`` steps needed to reach
    an element that mentions a title is recorded. The most common step count
    is then applied to every ``<img>`` on the page to obtain the candidate
    profile elements.

    Raises:
        Exception: when no step count is found, when every sampled step count
            is distinct, or when a helper fails.
    """
    headshots = select_headshots(soup)

    counts = []
    tags = []
    for headshot in headshots:
        try:
            tag, count = find_profile_bf(headshot, 0)
        except Exception:
            # Best effort: skip headshots with no titled ancestor. Catching
            # Exception rather than everything keeps KeyboardInterrupt and
            # SystemExit working.
            continue
        counts.append(count)
        tags.append(tag)

    if len(counts) == 0:
        raise Exception("bf failed")
    if len(set(counts)) == len(counts):
        raise Exception("counts are all different")
    count = mode(counts)

    tags = [t for (t, c) in zip(tags, counts) if c == count]
    name_pos, title_pos = find_pos(tags)

    profs = []
    for img in soup.find_all('img'):
        ancestor = img
        for _ in range(count):
            ancestor = ancestor.parent
            if ancestor is None:
                break
        # Images with fewer than `count` ancestors cannot be profile
        # headshots. Skipping them avoids an AttributeError on None that
        # would otherwise abort the whole extraction.
        if ancestor is not None:
            profs.append(ancestor)
    return get_info(profs, name_pos, title_pos)

In [92]:
#get_department_info_bf(get_soup("https://cals.cornell.edu/global-development/about/people/faculty"))

# Putting everything together

In [31]:
def get_department_info(URL):
    """Scrape a department page and return the best extraction result.

    The "tr", "children" and "class" strategies are all attempted and the one
    that produced the most items wins (the earliest strategy wins ties). The
    brute-force strategy is used only when all three fail.

    Returns:
        A two-element list ``[method_name, items]``.

    Raises:
        Exception: ("FAILED") when every strategy fails.
    """
    soup = get_soup(URL)
    print("______________________")
    print(URL)

    strategies = [
        ("tr", get_department_info_tr),
        ("children", get_department_info_children),
        ("class", get_department_info_class),
    ]
    res_lst = []
    for label, extract in strategies:
        try:
            res_lst.append([label, extract(soup)])
        except Exception:
            # A strategy that does not fit this page is expected. Catching
            # Exception rather than everything keeps KeyboardInterrupt and
            # SystemExit working.
            continue

    if res_lst:
        # max() returns the first of several equally long results.
        return max(res_lst, key=lambda res: len(res[1]))

    try:
        return ['bf', get_department_info_bf(soup)]
    except Exception:
        print("FAILED")
        raise Exception("FAILED")

In [26]:
# Department directory pages used to evaluate the scraper.
urls = ["https://stat.columbia.edu/department-directory/faculty-and-lecturers/", 
         "https://statistics.wharton.upenn.edu/faculty/faculty-list/",
         "https://statistics.fas.harvard.edu/faculty",
         "https://statistics.yale.edu/people",
         "https://www.brown.edu/academics/public-health/biostats/people/faculty",
         "https://stat.cornell.edu/people/faculty",
         "https://www.math.columbia.edu/people/faculty-by-rank/",
         "https://www.math.upenn.edu/people/standing-faculty",
         "https://www.math.harvard.edu/people/",
         "https://math.yale.edu/people/all-faculty",
         "https://www.brown.edu/academics/math/faculty", 
         "https://math.dartmouth.edu/people/people-select.php?list=permanent",
         "https://economics.dartmouth.edu/people",
         "https://economics.yale.edu/people/faculty",
         "https://english.columbia.edu/content/faculty#!#%2Fviews-display-37",
         "http://ealac.columbia.edu/people/professors/",
         "https://english.yale.edu/people/ladder-faculty",
         "https://www.ieor.columbia.edu/directory?gsarqfields%5Bbiotypetid%5D=30",
         "https://www.cs.columbia.edu/people/faculty/",
         "https://afamstudies.columbia.edu/content/people",
         "https://anthropology.columbia.edu/content/faculty-directory", # headshots are not img
         "https://www8.gsb.columbia.edu/faculty-research/divisions/decision-risk-operations/people/faculty",
         "https://arts.columbia.edu/faculty",
         "https://arth.sas.upenn.edu/people/standing-faculty",
         "https://www.english.upenn.edu/people/faculty",
         "https://www.ling.upenn.edu/people/faculty",
         "https://hss.sas.upenn.edu/people",
         "https://english.dartmouth.edu/people",
         "https://german.dartmouth.edu/people",
         "https://ascl.dartmouth.edu/people",
         "https://chemistry.dartmouth.edu/people",
         "https://web.cs.dartmouth.edu/people",
         "https://ealc.fas.harvard.edu/people/taxonomy/term/6536",
         "https://english.fas.harvard.edu/our-people",
         "https://philosophy.fas.harvard.edu/faculty-1",
         "https://cals.cornell.edu/global-development/about/people/faculty",
         "https://cals.cornell.edu/landscape-architecture/people",
         "https://aap.cornell.edu/academics/faculty",
         "https://anthropology.cornell.edu/anthropology-faculty"
        ]

In [32]:
# Run the scraper on every URL, separating successful results from failures.
works = []
fails = []
for url in urls:
    try:
        method, res = get_department_info(url)
    except Exception:
        # Any scraping error marks the page as failed. Catching Exception
        # rather than everything means Ctrl-C still stops the crawl.
        fails.append(url)
        continue
    item = {'url': url, 'method': method, 'res':res}
    works.append(item)

______________________
https://stat.columbia.edu/department-directory/faculty-and-lecturers/
______________________
https://statistics.wharton.upenn.edu/faculty/faculty-list/
______________________
https://statistics.fas.harvard.edu/faculty
______________________
https://statistics.yale.edu/people
______________________
https://www.brown.edu/academics/public-health/biostats/people/faculty
______________________
https://stat.cornell.edu/people/faculty
______________________
https://www.math.columbia.edu/people/faculty-by-rank/
______________________
https://www.math.upenn.edu/people/standing-faculty
______________________
https://www.math.harvard.edu/people/
______________________
https://math.yale.edu/people/all-faculty
______________________
https://www.brown.edu/academics/math/faculty
______________________
https://math.dartmouth.edu/people/people-select.php?list=permanent
______________________
https://economics.dartmouth.edu/people
______________________
https://economics.yale.edu/

In [33]:
# Print a short report for every page that was scraped successfully.
for i in works:
    print("________________________________________________________________________________________________________")
    print(f"url: {i.get('url')}")
    res = i.get('res')
    method = i.get('method')
    print(f"method: {method}")
    print(f"length: {len(res)}")
    print("___________")
    # Spot check: show the first, middle and last extracted profile.
    print(res[0])
    print(res[len(res)//2])
    print(res[-1])

________________________________________________________________________________________________________
url: https://stat.columbia.edu/department-directory/faculty-and-lecturers/
method: class
length: 59
___________
{'name': 'David Blei', 'title': 'Professor', 'img': 'https://stat.columbia.edu/wp-content/uploads/connections-images/david-blei/Blei-David-1.jpg'}
{'name': 'Banu Baydil', 'title': 'Lecturer in Discipline, Co-Director of M.A. Programs', 'img': 'https://stat.columbia.edu/wp-content/uploads/connections-images/banu-baydil/banu_original.jpg'}
{'name': 'Rongning Wu', 'title': 'Adjunct Assistant Professor', 'img': None}
________________________________________________________________________________________________________
url: https://statistics.wharton.upenn.edu/faculty/faculty-list/
method: class
length: 51
___________
{'name': 'Bhaswar B. Bhattacharya', 'title': 'Assistant Professor of Statistics and Data Science', 'img': 'https://faculty.wharton.upenn.edu/wp-content/uploads/

In [34]:
# Pages that are manually flagged as failures in addition to the ones
# recorded automatically.
also_fails = []
also_fails.append("https://www.math.columbia.edu/people/faculty-by-rank/") # name outside the class
# Add only URLs that are not recorded yet, so re-running this cell does not
# duplicate entries in `fails`.
fails.extend(u for u in also_fails if u not in fails)

In [41]:
# Share of the pages in `urls` that are not listed in `fails`.
success_rate = 1-len(fails)/len(urls)
print(f"sucess rate for ({len(urls)} pages) = {success_rate:.2f}")

sucess rate for (39 pages) = 0.92


In [36]:
# Display the URLs currently recorded as failures.
fails

['https://anthropology.columbia.edu/content/faculty-directory',
 'https://arts.columbia.edu/faculty',
 'https://www.math.columbia.edu/people/faculty-by-rank/']

# Notes

Failed because imgs are embedded as background-image:  
https://anthropology.columbia.edu/content/faculty-directory  
https://arts.columbia.edu/faculty

Faculty name not in the class:
https://www.math.columbia.edu/people/faculty-by-rank/

Every page that did not fail returned profiles from at least its first page of results.

--------------------------------------------------------------------------
Next step: find a way to load all the pages