1. Combine Medicare data with ADI

In [None]:
import pandas as pd

# Read the national downloadable clinician file from Google Drive.
file_path = '/content/drive/My Drive/JAMA-ADI-data/DAC_NationalDownloadableFile.csv'
df = pd.read_csv(file_path)

# Restrict the frame to the subset of columns listed below.
# NOTE(review): dtypes are inferred by pandas here; if 'ZIP Code' is parsed as
# a number, leading zeros are lost -- confirm before joining on ZIP.
selected_columns = [
    'NPI',
    'gndr',
    'Med_sch',
    'Grd_yr',
    'pri_spec',
    'ZIP Code',
    'City/Town',
    'State',
]
df_filtered = df[selected_columns]

# Preview the first rows of the reduced frame.
# NOTE(review): a notebook only renders the last expression of a cell; if more
# code follows in the same cell this preview is not shown.
df_filtered.head()


import os
import pandas as pd

# Directory holding the 2015 ADI files (only the 2015 directory is defined here)
adi_2015_dir = '/content/drive/My Drive/JAMA-ADI-data/2015/'

# Function to load every per-state ADI file in a folder and stack them
def load_and_merge_adi_files(directory):
    """Read every ``.txt`` file in ``directory`` and stack them into one DataFrame.

    Each file is parsed with ``pd.read_csv`` using its defaults and receives a
    ``state`` column holding the part of the filename before the first
    underscore (the whole filename when it contains no underscore).

    Files are visited in sorted filename order so the row order of the result
    does not depend on the order in which the OS lists the directory.

    Parameters
    ----------
    directory : str
        Folder to scan; sub-folders are not searched.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh 0..n-1 index, or an empty
        DataFrame when the folder contains no ``.txt`` files.
    """
    frames = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        # State label = filename prefix up to the first underscore.
        state = filename.split('_')[0]

        adi_data = pd.read_csv(os.path.join(directory, filename))
        adi_data['state'] = state
        frames.append(adi_data)

    # pd.concat raises on an empty list, so keep the "no files" result explicit.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames, ignore_index=True)

# Load and merge the 2015 ADI data (only the 2015 directory is loaded here)
adi_2015_data = load_and_merge_adi_files(adi_2015_dir)

# Check the structure of the merged data
adi_2015_data.head()


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load dataset
file_path = '/content/cleaned_medical_population_data.csv'
medical_data = pd.read_csv(file_path)

# NOTE(review): this silences every warning for the rest of the session,
# including any emitted later by the modelling libraries.
warnings.filterwarnings('ignore')

# Step 2: Data Preprocessing
# Drop incomplete rows first and include Grd_yr in the check: casting a column
# that still holds NaN to int raises, so the cast must come after the drop.
required_columns = [
    'Grd_yr',
    'gndr',
    'Med_sch',
    'pri_spec',
    'Avg_ADI_NATRANK_2015',
    'Avg_ADI_NATRANK_2020',
    'Estimate!!SEX AND AGE!!Total population',
]
medical_data = medical_data.dropna(subset=required_columns)
medical_data['Grd_yr'] = medical_data['Grd_yr'].astype(int)

# Step 2a: Build the graduation-year cohorts
# Constant marker columns; they are carried into both cohorts below.
medical_data['Practice_Year_2015'] = 2015
medical_data['Practice_Year_2020'] = 2020

# Selecting on Grd_yr is the only restriction applied. Comparing Grd_yr with the
# constant Practice_Year_<year> column inside a cohort is always true, so it is
# not repeated here.
# NOTE(review): nothing in this frame identifies a "first job placement";
# confirm whether an additional filter is needed.
cohort_2015 = medical_data[medical_data['Grd_yr'] == 2015].copy()
cohort_2020 = medical_data[medical_data['Grd_yr'] == 2020].copy()

# Step 3: Create Low/High ADI placement indicators
# NOTE(review): the low cutoffs are the 20th percentile over ALL rows of
# medical_data, while the high cutoffs are the 80th percentile WITHIN each
# cohort -- confirm this asymmetry is intended.
adi_threshold_low_2015 = medical_data['Avg_ADI_NATRANK_2015'].quantile(0.2)
adi_threshold_low_2020 = medical_data['Avg_ADI_NATRANK_2020'].quantile(0.2)
adi_threshold_high_2015 = cohort_2015['Avg_ADI_NATRANK_2015'].quantile(0.8)
adi_threshold_high_2020 = cohort_2020['Avg_ADI_NATRANK_2020'].quantile(0.8)

# 1 when the row's ADI rank is at or below the low cutoff, else 0.
cohort_2015['Low_ADI_Placement'] = (cohort_2015['Avg_ADI_NATRANK_2015'] <= adi_threshold_low_2015).astype(int)
cohort_2020['Low_ADI_Placement'] = (cohort_2020['Avg_ADI_NATRANK_2020'] <= adi_threshold_low_2020).astype(int)

# 1 when the row's ADI rank is at or above the high cutoff, else 0.
cohort_2015['High_ADI_Placement'] = (cohort_2015['Avg_ADI_NATRANK_2015'] >= adi_threshold_high_2015).astype(int)
cohort_2020['High_ADI_Placement'] = (cohort_2020['Avg_ADI_NATRANK_2020'] >= adi_threshold_high_2020).astype(int)


# Step 4: Group Primary Specializations
# Maps each raw `pri_spec` label to a coarse specialty group. Every key appears
# exactly once: a repeated key in a dict literal is silently overridden by its
# last occurrence, which hides the earlier assignment from the reader.
specialty_group_mapping = {
    'FAMILY PRACTICE': 'Primary Care',
    'GENERAL PRACTICE': 'Primary Care',
    'INTERNAL MEDICINE': 'Primary Care',
    'PEDIATRIC MEDICINE': 'Primary Care',
    'GERIATRIC MEDICINE': 'Primary Care',

    'GENERAL SURGERY': 'Surgery',
    'ORTHOPEDIC SURGERY': 'Surgery',
    'NEUROSURGERY': 'Surgery',
    'CARDIAC SURGERY': 'Surgery',
    'VASCULAR SURGERY': 'Surgery',
    'THORACIC SURGERY': 'Surgery',
    'PLASTIC AND RECONSTRUCTIVE SURGERY': 'Surgery',
    'COLORECTAL SURGERY (PROCTOLOGY)': 'Surgery',
    'HAND SURGERY': 'Surgery',
    'MAXILLOFACIAL SURGERY': 'Surgery',
    'SURGICAL ONCOLOGY': 'Surgery',
    'UROLOGY': 'Surgery',

    'CARDIOVASCULAR DISEASE (CARDIOLOGY)': 'Medical Specialties',
    'ENDOCRINOLOGY': 'Medical Specialties',
    'GASTROENTEROLOGY': 'Medical Specialties',
    'INFECTIOUS DISEASE': 'Medical Specialties',
    'NEPHROLOGY': 'Medical Specialties',
    'PULMONARY DISEASE': 'Medical Specialties',
    'RHEUMATOLOGY': 'Medical Specialties',
    'HEMATOLOGY': 'Medical Specialties',
    'HEMATOLOGY/ONCOLOGY': 'Medical Specialties',
    'MEDICAL ONCOLOGY': 'Medical Specialties',
    'NEUROLOGY': 'Medical Specialties',
    'DERMATOLOGY': 'Medical Specialties',
    'ALLERGY/IMMUNOLOGY': 'Medical Specialties',
    'HOSPITALIST': 'Medical Specialties',
    'CRITICAL CARE (INTENSIVISTS)': 'Medical Specialties',
    'SLEEP MEDICINE': 'Medical Specialties',
    'PAIN MANAGEMENT': 'Medical Specialties',
    'SPORTS MEDICINE': 'Medical Specialties',
    'CARDIAC ELECTROPHYSIOLOGY': 'Medical Specialties',
    'INTERVENTIONAL CARDIOLOGY': 'Medical Specialties',
    'ADVANCED HEART FAILURE AND TRANSPLANT CARDIOLOGY': 'Medical Specialties',
    'PERIPHERAL VASCULAR DISEASE': 'Medical Specialties',
    'MEDICAL GENETICS AND GENOMICS': 'Medical Specialties',
    'UNDERSEA AND HYPERBARIC MEDICINE': 'Medical Specialties',
    'MEDICAL TOXICOLOGY': 'Medical Specialties',
    'INTERVENTIONAL PAIN MANAGEMENT': 'Medical Specialties',
    'ADULT CONGENITAL HEART DISEASE (ACHD)': 'Medical Specialties',
    'HEMATOPOIETIC CELL TRANSPLANTATION AND CELLULAR THERAPY': 'Medical Specialties',

    'PSYCHIATRY': 'Psychiatry',
    'GERIATRIC PSYCHIATRY': 'Psychiatry',
    'NEUROPSYCHIATRY': 'Psychiatry',
    'ADDICTION MEDICINE': 'Psychiatry',
    'CLINICAL PSYCHOLOGIST': 'Psychiatry',
    'MENTAL HEALTH COUNSELOR': 'Psychiatry',
    'MARRIAGE AND FAMILY THERAPIST': 'Psychiatry',
    'CLINICAL SOCIAL WORKER': 'Psychiatry',

    'EMERGENCY MEDICINE': 'Emergency Medicine',

    'ANESTHESIOLOGY': 'Anesthesiology',
    'ANESTHESIOLOGY ASSISTANT': 'Anesthesiology',

    'DIAGNOSTIC RADIOLOGY': 'Radiology',
    'INTERVENTIONAL RADIOLOGY': 'Radiology',
    'NUCLEAR MEDICINE': 'Radiology',
    'RADIATION ONCOLOGY': 'Radiology',

    'OBSTETRICS/GYNECOLOGY': 'Obstetrics/Gynecology',
    'GYNECOLOGICAL ONCOLOGY': 'Obstetrics/Gynecology',
    'CERTIFIED NURSE MIDWIFE (CNM)': 'Obstetrics/Gynecology',

    'OPHTHALMOLOGY': 'Ophthalmology/ENT',
    # The mapping previously also listed OTOLARYNGOLOGY under 'Surgery'; that
    # entry never took effect because this later one overrode it.
    # NOTE(review): confirm 'Ophthalmology/ENT' is the intended group.
    'OTOLARYNGOLOGY': 'Ophthalmology/ENT',
    'QUALIFIED AUDIOLOGIST': 'Ophthalmology/ENT',

    'PHYSICAL THERAPY': 'Rehabilitation/Therapy',
    'OCCUPATIONAL THERAPY': 'Rehabilitation/Therapy',
    'PHYSICAL MEDICINE AND REHABILITATION': 'Rehabilitation/Therapy',
    'QUALIFIED SPEECH LANGUAGE PATHOLOGIST': 'Rehabilitation/Therapy',

    'PATHOLOGY': 'Pathology',

    'NURSE PRACTITIONER': 'Advanced Practice Providers',
    'PHYSICIAN ASSISTANT': 'Advanced Practice Providers',
    'CERTIFIED REGISTERED NURSE ANESTHETIST (CRNA)': 'Advanced Practice Providers',
    'CERTIFIED CLINICAL NURSE SPECIALIST (CNS)': 'Advanced Practice Providers',

    'DENTIST': 'Dentistry',
    'ORAL SURGERY': 'Dentistry',
    'ORAL MEDICINE': 'Dentistry',
    'ORAL AND MAXILLOFACIAL RADIOLOGY': 'Dentistry',

    'OPTOMETRY': 'Other Specialties',
    'REGISTERED DIETITIAN OR NUTRITION PROFESSIONAL': 'Other Specialties',
    'PODIATRY': 'Other Specialties',
    'HOSPICE/PALLIATIVE CARE': 'Other Specialties',
}

# Look up each specialty; labels absent from the mapping fall back to
# 'Other Specialties'.
cohort_2015['pri_spec_grouped'] = (
    cohort_2015['pri_spec'].map(specialty_group_mapping).fillna('Other Specialties')
)
cohort_2020['pri_spec_grouped'] = (
    cohort_2020['pri_spec'].map(specialty_group_mapping).fillna('Other Specialties')
)

# Step 5: Incorporate Top 20 Medical Schools
# School name (upper case) -> two-letter state code.
# NOTE(review): this dict holds 21 entries although it is named "top 20";
# confirm whether one school should be removed or the name is just loose.
top_20_institutions = {
    "HARVARD MEDICAL SCHOOL": "MA",
    "JOHNS HOPKINS UNIVERSITY SCHOOL OF MEDICINE": "MD",
    "PERELMAN SCHOOL OF MED AT THE UNIVERSITY OF PENNSYLVANIA": "PA",
    "COLUMBIA UNIVERSITY COLLEGE OF PHYSICIANS AND SURGEONS": "NY",
    "DUKE UNIVERSITY SCHOOL OF MEDICINE": "NC",
    "STANFORD UNIVERSITY SCHOOL OF MEDICINE": "CA",
    "UNIVERSITY OF CALIFORNIA, SAN FRANCISCO SCHOOL OF MEDICINE": "CA",
    "VANDERBILT UNIVERSITY SCHOOL OF MEDICINE": "TN",
    "WASHINGTON UNIVERSITY SCHOOL OF MEDICINE": "MO",
    "JS WEILL MEDICAL COLLEGE, CORNELL UNIVERSITY": "NY",
    "NEW YORK UNIVERSITY SCHOOL OF MEDICINE": "NY",
    "YALE UNIVERSITY SCHOOL OF MEDICINE": "CT",
    "MAYO MEDICAL SCHOOL": "MN",
    "NORTHWESTERN UNIVERSITY FEINBERG MEDICAL SCHOOL": "IL",
    "UNIVERSITY OF MICHIGAN MEDICAL SCHOOL": "MI",
    "UNIVERSITY OF PITTSBURGH SCHOOL OF MEDICINE": "PA",
    "UNIVERSITY OF WASHINGTON SCHOOL OF MEDICINE": "WA",
    "ICAHN SCHOOL OF MEDICINE AT MOUNT SINAI": "NY",
    "UNIVERSITY OF CALIFORNIA, GEFFEN SCHOOL OF MEDICINE": "CA",
    "UNIVERSITY OF CHICAGO, PRITZKER SCHOOL OF MEDICINE": "IL",
    "UNIVERSITY OF CALIFORNIA, SAN DIEGO SCHOOL OF MEDICINE": "CA"
}

top_20_schools_set = set(top_20_institutions.keys())

# Normalise the school name (upper case, surrounding whitespace removed) so it
# can be compared with the keys above; the match is exact, not fuzzy.
cohort_2015['Med_sch_cleaned'] = cohort_2015['Med_sch'].str.upper().str.strip()
cohort_2020['Med_sch_cleaned'] = cohort_2020['Med_sch'].str.upper().str.strip()

# 1 when the cleaned school name is one of the listed schools, else 0.
# Vectorised membership test instead of a per-row Python lambda.
cohort_2015['top20_institution'] = cohort_2015['Med_sch_cleaned'].isin(top_20_schools_set).astype(int)
cohort_2020['top20_institution'] = cohort_2020['Med_sch_cleaned'].isin(top_20_schools_set).astype(int)

# Step 6: Prepare Data for Logistic Regression

def _prepare_regression_frame(cohort):
    """Normalise text columns, keep rows with positive population, add its log.

    The text clean-up and the `Total_Population` column are written onto the
    frame that is passed in; the returned frame is a filtered copy that also
    carries `Log_Total_Population`.
    """
    cohort['gndr'] = cohort['gndr'].str.upper().str.strip()
    cohort['pri_spec_grouped'] = cohort['pri_spec_grouped'].str.strip()

    # Shorter alias for the population estimate column.
    cohort['Total_Population'] = cohort['Estimate!!SEX AND AGE!!Total population']

    # The log is only defined for positive values, so filter before transforming.
    populated = cohort[cohort['Total_Population'] > 0].copy()
    populated['Log_Total_Population'] = np.log(populated['Total_Population'])
    return populated


cohort_2015 = _prepare_regression_frame(cohort_2015)
cohort_2020 = _prepare_regression_frame(cohort_2020)
