# **02 Doximity Ranker**

In [8]:
# Importing required packages
import os
import glob
import requests
import re
import pandas as pd
pd.set_option('max_colwidth', 400)  #wider pandas columns upon render
import numpy as np
from bs4 import BeautifulSoup
from itertools import islice
import thefuzz
from thefuzz import fuzz, process
from datetime import date
import sklearn

# Printing the working directory
os.getcwd()

'/home/toofastdan/Documents/APSA/2024-07-29_Texas-STAR'

In [9]:
# Finding out package versions
print(f"Pandas version: {pd.__version__}")
print(f"thefuzz version: {thefuzz.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")

Pandas version: 2.2.3
thefuzz version: 0.22.1
Scikit-learn version: 1.6.0


### **Defining Functions for Parsing Doximity rank list html files**

In [3]:
def parse_dox(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, "html.parser")
    return soup

In [4]:
def residency_parser(file):
    # Beautiful Soup
    soup = parse_dox(filename=file)
    
    # Extracting the residency list div from the page
    res_list = soup.find("div", class_ = "residency-program-list")
    
    # Program names
    programs = res_list.find_all("a", class_ = "residency-result-program-title")
    programs = [p.text for p in programs]

    # Program locations
    locations = res_list.find_all("p", class_ = "residency-result-location")
    locations = [l.text for l in locations]

    # Program Sizes
    program_size = res_list.find_all("p", class_ = "residency-result-stats-program-size")
    program_size = [ps.text for ps in program_size]
    pattern = r'\d+'
    program_size = [int(re.findall(pattern, ps)[0]) for ps in program_size]

    # Program ranks
    ranks = np.arange(1, len(programs) + 1, 1)
    
    # Making a dataframe
    df = pd.DataFrame({"program": programs, "location": locations, "program_size": program_size, "rank": ranks})
    return df

### **Parsing all doximity html files**

In [5]:
# Getting all .html files from residency rankings from doximity
html_list = glob.glob("dox_rankings/*.html")
html_list[10:13]

['dox_rankings\\Internal Medicine-Pediatrics_research-output.html',
 'dox_rankings\\Internal Medicine_reputation.html',
 'dox_rankings\\Internal Medicine_research-output.html']

In [6]:
# Looping over each file to get rankings
df_list = []
for h in html_list:
    # Getting specialty and sort metrics from the html file names
    file_name = h.replace("dox_rankings\\", "").replace(".html", "")
    specialty, sort_by = file_name.split("_")
    
    # Beatiful Soup parsing
    print(specialty, sort_by)
    df = residency_parser(file=h)
    df["specialty"] = specialty
    df["sort_by"] = sort_by
    df_list.append(df)
    
# Concatenating things to a single dataframe
df_complete = pd.concat(df_list)
df_complete

Anesthesiology reputation
Anesthesiology research-output
Child Neurology reputation
Child Neurology research-output
Dermatology reputation
Dermatology research-output
Emergency Medicine reputation
Emergency Medicine research-output
Family Medicine reputation
Internal Medicine-Pediatrics reputation
Internal Medicine-Pediatrics research-output
Internal Medicine reputation
Internal Medicine research-output
Neurological Surgery reputation
Neurological Surgery research-output
Neurology reputation
Neurology research-output
Nuclear Medicine reputation
Nuclear Medicine research-output
Obstetrics and Gynecology reputation
Obstetrics and Gynecology research-output
Occupational Medicine reputation
Occupational Medicine research-output
Ophthalmology reputation
Ophthalmology research-output
Orthopaedic Surgery reputation
Orthopaedic Surgery research-output
Otolaryngology reputation
Otolaryngology research-output
Pathology reputation
Pathology research-output
Pediatrics-Medical Genetics size-of-prog

Unnamed: 0,program,location,program_size,rank,specialty,sort_by
0,Mass General Brigham/Massachusetts General Hos...,Boston,116,1,Anesthesiology,reputation
1,University of California (San Francisco),San Francisco,96,2,Anesthesiology,reputation
2,Stanford Health Care-Sponsored Stanford Univer...,Stanford,108,3,Anesthesiology,reputation
3,Mass General Brigham/Brigham and Women's Hospital,Boston,128,4,Anesthesiology,reputation
4,Duke University Hospital,Durham,60,5,Anesthesiology,reputation
...,...,...,...,...,...,...
72,Wake Forest University Baptist Medical Center,Winston Salem,5,73,Vascular Surgery,size-of-program
73,Washington University/B-JH/SLCH Consortium,Saint Louis,5,74,Vascular Surgery,size-of-program
74,Yale-New Haven Medical Center,New Haven,5,75,Vascular Surgery,size-of-program
75,Zucker School of Medicine at Hofstra/Northwell,New Hyde Park,5,76,Vascular Surgery,size-of-program


In [7]:
df_complete[["specialty", "sort_by"]].value_counts()

specialty                             sort_by        
Family Medicine                       reputation         766
Internal Medicine                     research-output    639
                                      reputation         639
Surgery                               research-output    358
                                      reputation         358
Psychiatry                            reputation         317
                                      research-output    317
Obstetrics and Gynecology             research-output    294
                                      reputation         294
Emergency Medicine                    research-output    286
                                      reputation         286
Pediatrics                            research-output    218
                                      reputation         218
Orthopaedic Surgery                   reputation         207
                                      research-output    207
Radiology-Diagnostic           

In [None]:
# Exporting to excel doc
#df_complete.to_excel("tables/doximity_rankings.xlsx", index=False)

#### Test Code for above

In [59]:
soup = parse_dox(filename="dox_rankings/Internal Medicine_research-output.html")
#print(soup)

In [60]:
# Extracting the residency list div from the page
res_list = soup.find("div", class_ = "residency-program-list")

In [61]:
# Program names
programs = res_list.find_all("a", class_ = "residency-result-program-title")
programs = [p.text for p in programs]

# Program locations
locations = res_list.find_all("p", class_ = "residency-result-location")
locations = [l.text for l in locations]

# Program Sizes
program_size = res_list.find_all("p", class_ = "residency-result-stats-program-size")
program_size = [ps.text for ps in program_size]
pattern = r'\d+'
program_size = [int(re.findall(pattern, ps)[0]) for ps in program_size]

# Program ranks
ranks = np.arange(1, len(programs) + 1, 1)

In [62]:
# Making a dataframe
df = pd.DataFrame({"program": programs, "location": locations, "program_size": program_size, "rank": ranks})
df["specialty"] = "Anesthesiology"
df["sort_by"] = "reputation"
df

Unnamed: 0,program,location,program_size,rank,specialty,sort_by
0,Mass General Brigham/Massachusetts General Hos...,Boston,168,1,Anesthesiology,reputation
1,Mass General Brigham/Brigham and Women's Hospital,Boston,174,2,Anesthesiology,reputation
2,University of California (San Francisco),San Francisco,185,3,Anesthesiology,reputation
3,University of Pennsylvania Health System,Philadelphia,182,4,Anesthesiology,reputation
4,Johns Hopkins University,Baltimore,145,5,Anesthesiology,reputation
...,...,...,...,...,...,...
634,WakeMed Health and Hospitals,Raleigh,15,635,Anesthesiology,reputation
635,Western Reserve Hospital,Cuyahoga Falls,18,636,Anesthesiology,reputation
636,Willis-Knighton Health System,Shreveport,30,637,Anesthesiology,reputation
637,Zucker School of Medicine at Hofstra/Northwell...,Mount Kisco,32,638,Anesthesiology,reputation


## **AAMC Table B4 Parser**

Source: https://www.aamc.org/data-reports/students-residents/data/report-residents/2024/table-b4-md-phd-residents-gme-specialty

In [13]:
b4 = pd.read_excel("AAMC_data/AAMC_Table-B4.xlsx", sheet_name="table_b4")
b4 = b4.set_index("Specialties")
b4.head()

Unnamed: 0_level_0,MD-PhD First-Year Residents 2016,MD-PhD Active Residents 2016,Total Active Residents 2016,MD-PhD First-Year Residents 2017,MD-PhD Active Residents 2017,Total Active Residents 2017,MD-PhD First-Year Residents 2018,MD-PhD Active Residents 2018,Total Active Residents 2018,MD-PhD First-Year Residents 2019,...,Total Active Residents 2020,MD-PhD First-Year Residents 2021,MD-PhD Active Residents 2021,Total Active Residents 2021,MD-PhD First-Year Residents 2022,MD-PhD Active Residents 2022,Total Active Residents 2022,MD-PhD First-Year Residents 2023,MD-PhD Active Residents 2023,Total Active Residents 2023
Specialties,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aerospace Medicine,,,,,,,,,,,...,,0.0,1.0,7.0,0.0,2.0,5.0,0.0,1.0,4.0
Allergy and Immunology,0.0,15.0,212.0,0.0,16.0,221.0,0.0,17.0,221.0,1.0,...,224.0,0.0,20.0,236.0,0.0,22.0,240.0,0.0,22.0,228.0
Anesthesiology,16.0,90.0,4487.0,17.0,89.0,4444.0,14.0,81.0,4491.0,24.0,...,4441.0,14.0,90.0,4795.0,18.0,94.0,4969.0,14.0,92.0,5209.0
Adult Cardiothoracic Anesthesiology (Anesthesiology),0.0,5.0,145.0,0.0,0.0,136.0,0.0,2.0,134.0,0.0,...,147.0,0.0,6.0,141.0,0.0,3.0,149.0,0.0,2.0,135.0
Critical Care Medicine (Anesthesiology),0.0,3.0,136.0,0.0,7.0,131.0,0.0,6.0,126.0,0.0,...,130.0,0.0,5.0,140.0,0.0,3.0,142.0,0.0,5.0,127.0


In [16]:
# Formatting to long format per year
year_list = ["2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
b4_list = []
for year in year_list:
    b4_filt = b4.filter(like=year, axis=1).reset_index()
    b4_filt.columns = ["specialties", "mdphd_firstyear", "mdphd_active", "total_active"]
    b4_filt["year"] = int(year)
    b4_list.append(b4_filt)
b4_complete = pd.concat(b4_list)
b4_complete

Unnamed: 0,specialties,mdphd_firstyear,mdphd_active,total_active,year
0,Aerospace Medicine,,,,2016
1,Allergy and Immunology,0.0,15.0,212.0,2016
2,Anesthesiology,16.0,90.0,4487.0,2016
3,Adult Cardiothoracic Anesthesiology (Anesthesiology),0.0,5.0,145.0,2016
4,Critical Care Medicine (Anesthesiology),0.0,3.0,136.0,2016
...,...,...,...,...,...
160,Pediatrics/Physical Medicine and Rehabilitation,0.0,1.0,11.0,2023
161,Pediatrics/Psychiatry/Child and Adolescent Psychiatry,0.0,2.0,97.0,2023
162,Psychiatry/Family Practice,0.0,1.0,49.0,2023
163,Psychiatry/Neurology,,,,2023


In [15]:
# Exporting to excel
#b4_complete.to_excel("AAMC_data/AAMC_Table-B4_processed.xlsx", index=False)