In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import json
import re
import nethelp as nh
import powerlaw
from collections import Counter
import community
import pickle
import graph_tool.all as gt

In [3]:
def extract_years(date_string):
    """
    Extracts start and end years from a string formatted as 'YYYY - YYYY'.

    This function looks for a date range in the format 'YYYY - YYYY' within the given string.
    If found, it extracts and returns the start year and end year as integers.

    Parameters
    ----------
    date_string : str
        The string that may contain a date range in the 'YYYY - YYYY' format.

    Returns
    -------
    tuple or None
        A tuple containing two integers: (start_year, end_year) if the pattern is found.
        Returns None if the pattern is not found.

    Examples
    --------
    >>> extract_years("Event happened between 1995 - 2000")
    (1995, 2000)

    >>> extract_years("No date here")
    None
    """
    # Define a regex pattern for 'YYYY - YYYY'
    pattern = r'(\d{4})\s*-\s*(\d{4})'
    
    # Search for the pattern in the input string
    match = re.search(pattern, date_string)
    
    if match:
        # Extract the start and end years from the match groups
        start_year = int(match.group(1))
        end_year = int(match.group(2))
        return start_year, end_year
    else:
        return None  # Return None if no match is found


In [4]:
def parse_training_data(link_attributes, institution, order):
    """
    Parses a list of educational or training attributes to extract relevant details such as 
    degree level, subject, start year, end year, and awards. The function handles various 
    formats and patterns in the input data and applies specific rules based on the presence 
    of keywords like 'residency', 'fellowship', and 'class of', and adjusts for special cases 
    such as osteopathic institutions ('DO' degrees).

    Parameters:
    -----------
    link_attributes : list of str
        A list containing details about an individual's training or education at a particular 
        institution. Each entry can represent degree level, subject, year range, or other details.
        
    institution : str
        The name of the institution where the training or education took place. This is 
        checked for keywords such as 'osteopath' to infer specific degree levels like 'DO'.
        
    order : int
        An integer representing the order of the training or education in the individual's 
        trajectory, used for tracking sequence in a graph or timeline.

    Returns:
    --------
    dict
        A dictionary with the following keys:
        - 'level' : str or None
            The training level or degree (e.g., 'MD', 'PhD', 'Fellowship', 'Residency').
        - 'order' : int
            The order of the training or education, as passed in the input.
        - 'subject' : str or None
            The subject or field of study (e.g., 'Biology', 'Cardiology').
        - 'start' : int or None
            The start year of the training/education.
        - 'end' : int or None
            The end year of the training/education.
        - 'award' : str or None
            Any awards or honors mentioned (e.g., 'Cum Laude', 'Honors').

    Example:
    --------
    >>> parse_training_data(['Fellowship', 'Cardiology', '2005 - 2007'], 'Harvard University', 1)
    {'level': 'fellowship', 'order': 1, 'subject': 'cardiology', 'start': 2005, 'end': 2007, 'award': None}

    >>> parse_training_data(['Class of 2005'], 'NYIT College Of Osteopathic Medicine', 2)
    {'level': 'do', 'order': 2, 'subject': None, 'start': 2001, 'end': 2005, 'award': None}
    """
    level = None
    subject = None
    start_date = None
    end_date = None
    award = None 
    
    l = -1
    for entry in link_attributes:
        
        # training level 
        l = l + 1
        result = extract_years(entry)
        if 'class of' in entry.lower():
            level = 'md'
            # topic = None
            start_date = int(entry.split()[-1]) - 4
            end_date = int(entry.split()[-1])

        elif l == 0:

            if 'residency' in entry.lower():
                level = 'residency' 
                # start = start = int(entry.split()[-3]) # this needs to be its own if clause becuase the date is not going to be in the same entry as residency
                
            elif 'postdoc' in entry.lower().replace('-', ''):
                level = 'postdoc'
            
            elif 'fellowship' in entry.lower(): 
                level = 'fellowship'
            
            elif 'intern' in entry.lower():
                level = 'internship'

            elif 'intern' in entry.lower():
                level = 'internship'

            elif 'phd' in entry.lower().replace('.', ''):
                level = 'phd'

            elif 'mph' in entry.lower().replace('.', ''):
                level = 'mph'

            elif 'ba' in entry.lower().replace('.', ''):
                level = 'ba'
            
            elif 'bs' in entry.lower().replace('.', ''):
                level = 'bs'
            
            else:
                level = entry.lower().replace('.','')
        
        elif 'do' == entry.lower().replace('.',''):
            level = 'do'
        # start and end dates
        elif result:
            start_date, end_date = result
        
        # awards
        elif 'cum' in entry.lower():
            award = entry.lower()
        elif 'honor' in entry.lower() or 'honour' in entry.lower():
            award = entry.lower()

        # assume anything else is subject 
        else:
            subject = entry.lower()

    if 'osteopath' in institution.lower():
        level = 'do'
        
    parsed_dict = {
        'level': level,
        'order': order,
        'subject': subject,
        'start': start_date,
        'end': end_date,
        'award': award
    }
    return parsed_dict


In [14]:
i = 2
filepath = f'../../physician_trajectories/data/card/{i}.json'
with open(filepath, 'r') as f:
    data = json.load(f) 
state =data[str(i)]['office'].split()[-2]
zipcode = data[str(i)]['office'].split()[-1]
print(state,zipcode)

MN 55905


In [23]:
training_data = data[str(i)]['training']
[list(item.keys())[0] for item in training_data.values()]

['University of Missouri-Kansas City School of Medicine',
 'University of Missouri-Kansas City School of Medicine',
 'Mayo Clinic College of Medicine and Science (Rochester) A Aerospace Medicine',
 'University of Missouri-Kansas City School of Medicine',
 'University of Colorado School of Medicine Anschutz Medical Campus',
 'Mayo Clinic College of Medicine',
 'Zagazig University Faculty of Medicine']

In [41]:
import os
import json
import pandas as pd

base_path = '../../physician_trajectories/data'
folders = [folder for folder in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, folder))]

errors = 0
# tdf_dict = {}  # Dictionary to store tdf for each folder

ids = []
states = []
zipcodes = []
specialties = []
institutions = []

for folder in folders:
    folder_path = os.path.join(base_path, folder)
    sdf = pd.DataFrame({'doctor_id': [], 'state': [], 'zipcode': [], 'specialty': [], 'institutions': []})
    print(f"Processing folder: {folder}")
    
    # Iterate through files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.json') and 'failed' not in file_name and 'next' not in file_name:
            try:
                file_path = os.path.join(folder_path, file_name)
                with open(file_path, 'r') as f:
                    data = json.load(f)
                
                # Assuming the file name format includes the ID
                i = int(file_name.split('.')[0])
                
                id = i
                state = data[str(i)]['office'].split()[-2]
                zipcode = data[str(i)]['office'].split()[-1]
                specialty = data[str(i)]['specialty']
                institution = [list(item.keys())[0] for item in data[str(i)]['training'].values()]
                if state and zipcode and specialty and institution and id:
                    unique_id = f"{id}_{folder}"
                    ids.append(unique_id)
                    states.append(state)
                    zipcodes.append(zipcode)
                    specialties.append(specialty)
                    institutions.append(institution)
                
            except Exception as e:
                print(f"[Error] Error processing file {file_name} in folder {folder}: {e}")
                errors += 1
                continue
    

    #tdf_dict[folder] = tdf  # Store the tdf for this folder
    print(f"Completed processing folder: {folder}.")

sdf = pd.DataFrame({'doctor_id': ids, 'state': states, 'zipcode': zipcodes, 'specialty': specialties, 'institutions': institutions})   

# Example: Access the tdf for a specific folder
# print(tdf_dict['card'])



Processing folder: neurosurgery
[Error] Error processing file 4267.json in folder neurosurgery: string indices must be integers, not 'str'
[Error] Error processing file 791.json in folder neurosurgery: string indices must be integers, not 'str'
[Error] Error processing file 3609.json in folder neurosurgery: string indices must be integers, not 'str'
[Error] Error processing file 2761.json in folder neurosurgery: string indices must be integers, not 'str'
[Error] Error processing file 8442.json in folder neurosurgery: string indices must be integers, not 'str'
[Error] Error processing file 343.json in folder neurosurgery: string indices must be integers, not 'str'
[Error] Error processing file 355.json in folder neurosurgery: string indices must be integers, not 'str'
[Error] Error processing file 617.json in folder neurosurgery: string indices must be integers, not 'str'
[Error] Error processing file 4070.json in folder neurosurgery: string indices must be integers, not 'str'
[Error] E

In [42]:
sdf

Unnamed: 0,doctor_id,state,zipcode,specialty,institutions
0,3721_neurosurgery,CA,93301,Neurosurgery,"[National Capital Consortium, National Capital..."
1,2833_neurosurgery,WA,98101,Neurosurgery,"[University of Washington, University of Texas..."
2,729_neurosurgery,OK,73104,Neurosurgery,[University of California (San Francisco) Scho...
3,6523_neurosurgery,FL,32611,Neurosurgery,[University of Florida College of Medicine]
4,6489_neurosurgery,CA,93291,Neurosurgery,"[University of Miami/Jackson Health System, UC..."
...,...,...,...,...,...
205985,2631_hematology,CA,91316,Hematology,[Other]
205986,2261_hematology,KS,66205,Hematology,[Vidant Medical Center/East Carolina Universit...
205987,881_hematology,OR,97239,Hematology,[Brigham and Women's Hospital/Massachusetts Ge...
205988,182_hematology,IL,60611,Hematology,[McGaw Medical Center of Northwestern Universi...


In [43]:
Counter(sdf['specialty'])

Counter({'Obstetrics & Gynecology': 57954,
         'Cardiology': 38337,
         'Ophthalmology': 25567,
         'Gastroenterology': 21538,
         'Dermatology': 18181,
         'Geriatrics': 10923,
         'Neurosurgery': 8718,
         'Allergy & Immunology': 6238,
         'Psychiatry': 4933,
         'Interventional Radiology': 3642,
         'Child Neurology': 2909,
         'Colon & Rectal Surgery': 2812,
         'Hematology': 2664,
         'Medical Genetics': 1529,
         'Endocrinology': 27,
         'Internal Medicine': 7,
         'Other MD/DO': 5,
         'Resident Physician': 4,
         'Pediatric Endocrinology': 1,
         'Emergency Medicine': 1})

In [None]:
Counter(sdf['state'])

Counter({'CA': 24324,
         'NY': 18189,
         'TX': 14400,
         'FL': 13556,
         'PA': 10062,
         'IL': 8136,
         'OH': 7226,
         'MA': 7105,
         'NJ': 6526,
         'NC': 6396,
         'MI': 6198,
         'GA': 5356,
         'VA': 5230,
         'MD': 5059,
         'WA': 4352,
         'MN': 3881,
         'TN': 3867,
         'MO': 3692,
         'AZ': 3685,
         'CO': 3558,
         'IN': 3403,
         'WI': 3378,
         'CT': 3284,
         'LA': 2710,
         'OR': 2657,
         'SC': 2634,
         'AL': 2234,
         'KY': 2218,
         'OK': 1637,
         'UT': 1523,
         'PR': 1427,
         'IA': 1389,
         'KS': 1359,
         'AR': 1344,
         'DC': 1301,
         'NV': 1242,
         'MS': 1215,
         'HI': 981,
         'NM': 970,
         'NE': 969,
         'WV': 952,
         'NH': 921,
         'RI': 914,
         'ME': 837,
         'ID': 679,
         'DE': 601,
         'MT': 549,
         'SD': 441

In [47]:
Counter(sdf['zipcode'])

Counter({'77030': 1451,
         '55905': 888,
         '02115': 882,
         '02114': 864,
         '19104': 853,
         '10016': 809,
         '60611': 759,
         '60612': 719,
         '44195': 704,
         '10029': 675,
         '10032': 577,
         '80045': 567,
         '15213': 559,
         '90095': 551,
         '19107': 547,
         '75390': 544,
         '10021': 542,
         '78229': 537,
         '94305': 527,
         '46202': 519,
         '21287': 515,
         '48109': 508,
         '63110': 468,
         '33136': 463,
         '97239': 459,
         '37232': 454,
         '02215': 445,
         '10065': 435,
         '72205': 432,
         '94143': 409,
         '53226': 409,
         '92868': 399,
         '92037': 398,
         '90048': 386,
         '92103': 383,
         '52242': 372,
         '90033': 372,
         '30322': 369,
         '06510': 367,
         '76104': 363,
         '35233': 355,
         '94115': 353,
         '44106': 351,
         '