In [None]:
import requests
from lxml import etree
from tqdm import tqdm
import pandas as pd
import traceback

In [2]:
# Faculty directory to scrape. To target the other department, comment out
# the active assignment and enable the alternative below.
base_url = "https://www.cs.jhu.edu/faculty/"  # CS
# base_url = "https://engineering.jhu.edu/ams/faculty/"  # AMS

# HTTP headers passed along with each requests.get call; the User-Agent
# value is a desktop Chrome identification string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.0.0 Safari/537.36"
    ),
}

In [3]:
# Download the faculty listing page. The timeout keeps the cell from hanging
# on a stalled connection, and raise_for_status() stops here on an HTTP error
# instead of silently parsing an error page into an empty entity list.
listing_response = requests.get(base_url, headers=headers, timeout=30)
listing_response.raise_for_status()

# Parse the HTML and select the <li> elements of the listing. The absolute
# XPath depends on the page's current layout, so an unexpected count below
# is the first thing to check if the scrape stops working.
listing_tree = etree.HTML(listing_response.text)
entity_list = listing_tree.xpath("/html/body/div[2]/main/div/div[2]/div/div/div/ul/li")
len(entity_list)

51

In [4]:
# Visit every profile linked from the listing page, collect its details into
# one dict per person, then build and save a single table.
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled profile request from hanging the loop

records = []  # one dict per successfully scraped profile
for entity in tqdm(entity_list, desc="Scraping..."):
    # Reset for each entity so the error report below never shows values left
    # over from a previous iteration (or hits an unbound name on the first).
    name = profile_url = None
    try:
        # A dict to store all the details
        detail_dict = {}

        # Get the url and name from the listing entry
        profile_url = entity.xpath('./div/div/div/div[1]/div/h2/a/@href')[0]
        name = entity.xpath('./div/div/div/div[1]/div/h2/a/text()')[0]
        detail_dict['name'] = name
        detail_dict['url'] = profile_url

        # Get the details from the profile page; an HTTP error status is
        # raised here and reported by the handler below.
        profile_response = requests.get(profile_url, headers=headers, timeout=REQUEST_TIMEOUT)
        profile_response.raise_for_status()
        profile_tree = etree.HTML(profile_response.text)
        meta_details = profile_tree.xpath("/html/body/div[2]/main/div/div[1]/div/div/div/div/div/div")

        # Each metadata section starts with a heading div that decides how
        # the rest of the section is read.
        for meta_detail in meta_details:
            meta_name = meta_detail.xpath("./div[1]/text()")[0]

            if meta_name == "Education":
                # NOTE(review): tel/email are read from a section titled
                # "Education"; verify that this is the intended heading.
                tel = meta_detail.xpath("./div[2]/a/text()")[0]
                detail_dict['tel'] = tel
                email = meta_detail.xpath("./div[3]/a/text()")[0]
                detail_dict['email'] = email

            elif meta_name == "Location":
                location = meta_detail.xpath("./div[2]/div[2]/text()")[0]
                detail_dict['location'] = location

            elif meta_name == "Research Areas":
                # Skip the heading div; every remaining div holds one area.
                area_nodes = meta_detail.xpath("./div")[1:]
                research_areas = [node.xpath("./text()")[0].strip() for node in area_nodes]
                detail_dict['research_areas'] = ' | '.join(research_areas)

            elif meta_name == "Connect":
                # One column per external link, keyed by the link's label.
                # A separate variable is used for the link target so the
                # profile URL stays intact for error reporting.
                for item in meta_detail.xpath("./ul/li"):
                    anchor = item.xpath("./a")[0]
                    link_url = anchor.xpath("./@href")[0]
                    connect_name = anchor.xpath("./span/span[1]/text()")[0]
                    detail_dict[connect_name] = link_url

        # Get the self introduction
        self_intro = profile_tree.xpath("/html/body/div[2]/main/div/div[2]/div/div/div//text()")
        detail_dict['self_intro'] = '\n'.join(self_intro)

        records.append(detail_dict)

    except Exception:
        # Best effort: report the failing entity and continue with the rest.
        message = f"Something went wrong with {name} and the url is {profile_url} \nError: {traceback.format_exc()}\n"
        print(message)

# Build the table once from the collected records rather than concatenating
# inside the loop. The result stays bound to `all` (which shadows the
# builtin) because the following cell displays it under that name.
all = pd.DataFrame(records)

# Save the data
all.to_excel("cs.xlsx", index=False)
# all.to_excel("ams.xlsx", index=False)
all.shape

Scraping...:   0%|          | 0/51 [00:00<?, ?it/s]

Scraping...: 100%|██████████| 51/51 [00:08<00:00,  6.11it/s]


(51, 12)

In [5]:
# Display the scraped table; `all` is the DataFrame assembled by the scraping cell above.
all

Unnamed: 0,name,url,location,research_areas,Publications,Website,self_intro,Google Scholar,Twitter,LinkedIn,Personal Website,Lab Website
0,Yair Amir,https://www.cs.jhu.edu/faculty/yair-amir/,209 Malone Hall,Critical infrastructure; large-scale survivabl...,http://www.dsn.jhu.edu/publications.html,https://www.cs.jhu.edu/~yairamir/,"\n\t\t\t\t\t\nYair Amir, professor emeritus of...",,,,,
1,Raman Arora,https://www.cs.jhu.edu/faculty/raman-arora/,331 Malone Hall,Machine learning | Statistical signal processi...,,https://www.cs.jhu.edu/~raman/Home.html,\n\t\t\t\t\t\nRaman Arora is an assistant prof...,https://scholar.google.com/citations?user=Spe0...,https://twitter.com/RamanArora_JHU,,,
2,Vladimir Braverman,https://www.cs.jhu.edu/faculty/vladimir-braver...,,Algorithms for massive data | Randomized and s...,,https://www.cs.jhu.edu/~vova/,\n\t\t\t\t\t\nVladimir Braverman is an associa...,https://scholar.google.com/citations?user=DTth...,,,,
3,Randal Burns,https://www.cs.jhu.edu/faculty/randal-burns/,163 Malone Hall,,,https://randalburns.github.io/,\n\t\t\t\t\t\nRandal Burns is a professor and ...,https://scholar.google.com/citations?user=rTJT...,,,,
4,Yinzhi Cao,https://www.cs.jhu.edu/faculty/yinzhi-cao/,305 Malone Hall,Security and Privacy of Machine Learning | Web...,,https://yinzhicao.org/,\n\t\t\t\t\t\nYinzhi Cao is an assistant profe...,https://scholar.google.com/citations?user=0jBP...,,,,
5,Anton Dahbura,https://www.cs.jhu.edu/faculty/anton-dahbura/,167 Malone Hall,Assured Autonomy | Information Security | Faul...,,,\n\t\t\t\t\t\nAnton (Tony) Dahbura is the exec...,https://scholar.google.com/citations?user=-s3H...,,,,
6,Mohammad Ali Darvish,https://www.cs.jhu.edu/faculty/mohammad-ali-da...,205 Malone Hall,,,https://www.cs.jhu.edu/~darvish/,\n\t\t\t\t\t\nMohammed Ali Darvish is a senior...,https://scholar.google.com/citations?user=0Xk_...,,,,
7,Michael Dinitz,https://www.cs.jhu.edu/faculty/michael-dinitz/,217 Malone Hall,,,https://www.cs.jhu.edu/~mdinitz/,\n\t\t\t\t\t\nMichael Dinitz is an associate p...,https://scholar.google.com/citations?user=Q2yN...,https://twitter.com/mdinitz,,,
8,Mark Dredze,https://www.cs.jhu.edu/faculty/mark-dredze/,,,,https://www.cs.jhu.edu/~mdredze/,"\n\t\t\t\t\t\nMark Dredze, the John C. Malone ...",https://scholar.google.com/citations?user=7jNk...,https://twitter.com/mdredze,,,
9,Kevin Duh,https://www.cs.jhu.edu/faculty/kevin-duh/,226 Hackerman Hall,,,http://cs.jhu.edu/~kevinduh/,\n\t\t\t\t\t\nKevin Duh is an assistant resear...,,,,,
