In [1]:
# ==========================================================
# FDA Medical Device Ranking System
# Uses Recall + 510(k) + PMA
# ==========================================================

# -----------------------> Initialize Libraries
import os
import json
import pandas as pd
from urllib.request import urlretrieve
from zipfile import ZipFile
import ijson
import numpy as np

In [2]:
# ==========================================================
# SECTION 1: Download and parse FDA Device Recall JSON
# ==========================================================
url = "https://download.open.fda.gov/device/recall/device-recall-0001-of-0001.json.zip"
zip_filename = "device-recall-0001-of-0001.json.zip"
json_filename = "device-recall-0001-of-0001.json"

if not os.path.exists(zip_filename):
    print("Downloading FDA recall zip...")
    urlretrieve(url, zip_filename)
else:
    print("ZIP already downloaded.")

if not os.path.exists(json_filename):
    print("Extracting JSON...")
    with ZipFile(zip_filename, 'r') as z:
        z.extract(json_filename)
else:
    print("JSON already extracted.")

records = []
with open(json_filename, 'r', encoding='utf-8') as f:
    parser = ijson.items(f, 'results.item')
    for entry in parser:
        manufacturer = entry.get('recalling_firm', '').strip() or "Unknown"

        recall_status = entry.get('recall_status', '').strip().lower()
        if 'terminated' in recall_status:
            rating = 1000
        elif 'completed' in recall_status:
            rating = 800
        elif 'ongoing' in recall_status:
            rating = 500
        elif 'pending' in recall_status:
            rating = 300
        else:
            rating = 600

        records.append({
            'manufacturer': manufacturer,
            'product_description': entry.get('product_description', '').replace('\n', ' ').strip(),
            'recall_status': recall_status,
            'reason_for_recall': entry.get('reason_for_recall', '').replace('\n', ' ').strip(),
            'event_date_initiated': entry.get('event_date_initiated', ''),
            'recall_number': entry.get('recall_number', '').strip(),
            'rating': rating
        })

df = pd.DataFrame(records)
print(f"Parsed {len(df)} recalls")

ZIP already downloaded.
JSON already extracted.
Parsed 56268 recalls


In [3]:
# ==========================================================
# SECTION 2: Download and parse 510(k) approvals
# ==========================================================
k510_url = "https://download.open.fda.gov/device/510k/device-510k-0001-of-0001.json.zip"
k510_zip = "device-510k-0001-of-0001.json.zip"
k510_json = "device-510k-0001-of-0001.json"

if not os.path.exists(k510_zip):
    print("Downloading FDA 510(k) zip...")
    urlretrieve(k510_url, k510_zip)
else:
    print("510(k) ZIP already downloaded.")

if not os.path.exists(k510_json):
    print("Extracting 510(k) JSON...")
    with ZipFile(k510_zip, 'r') as z:
        z.extract(k510_json)
else:
    print("510(k) JSON already extracted.")

k510_records = []
with open(k510_json, 'r', encoding='utf-8') as f:
    parser = ijson.items(f, 'results.item')
    for entry in parser:
        manufacturer = (
            entry.get('applicant') or
            entry.get('applicant_name') or
            entry.get('manufacturer_name')
        )
        if not manufacturer:
            openfda = entry.get('openfda', {})
            manufacturer = openfda.get('manufacturer_name', [None])
            if isinstance(manufacturer, list):
                manufacturer = manufacturer[0]
        if not manufacturer:
            manufacturer = "Unknown"
        manufacturer = str(manufacturer).strip()
        k510_records.append({
            'manufacturer': manufacturer,
            'k_number': entry.get('k_number', '').strip(),
            'device_name': entry.get('device_name', '').strip(),
            'product_code': entry.get('product_code', '').strip(),
            'decision_date': entry.get('decision_date', '')
        })

k510_df = pd.DataFrame(k510_records)
k510_df['manufacturer'] = k510_df['manufacturer'].fillna('Unknown').astype(str).str.strip()

# Aggregate total 510(k) approvals per manufacturer
k510_summary = k510_df.groupby('manufacturer')['k_number'].nunique().reset_index(name='total_approvals')
print(f"Parsed {len(k510_summary)} unique 510(k) manufacturers")

510(k) ZIP already downloaded.
510(k) JSON already extracted.
Parsed 39429 unique 510(k) manufacturers


In [4]:
# ==========================================================
# SECTION 3: Download and parse PMA approvals
# ==========================================================
pma_url = "https://download.open.fda.gov/device/pma/device-pma-0001-of-0001.json.zip"
pma_zip = "device-pma-0001-of-0001.json.zip"
pma_json = "device-pma-0001-of-0001.json"

if not os.path.exists(pma_zip):
    print("Downloading FDA PMA zip...")
    urlretrieve(pma_url, pma_zip)
else:
    print("PMA ZIP already downloaded.")

if not os.path.exists(pma_json):
    print("Extracting PMA JSON...")
    with ZipFile(pma_zip, 'r') as z:
        z.extract(pma_json)
else:
    print("PMA JSON already extracted.")

pma_records = []
with open(pma_json, 'r', encoding='utf-8') as f:
    parser = ijson.items(f, 'results.item')
    for entry in parser:
        manufacturer = entry.get('applicant', '').strip() or "Unknown"
        pma_records.append({
            'manufacturer': manufacturer,
            'pma_number': entry.get('pma_number', '').strip(),
            'trade_name': entry.get('trade_name', '').strip(),
            'decision_date': entry.get('decision_date', '')
        })

pma_df = pd.DataFrame(pma_records)
pma_df['manufacturer'] = pma_df['manufacturer'].fillna('Unknown').astype(str).str.strip()

# Aggregate PMA approvals per manufacturer
pma_summary = pma_df.groupby('manufacturer')['pma_number'].nunique().reset_index(name='total_pma')
print(f"Parsed {len(pma_summary)} unique PMA manufacturers")

PMA ZIP already downloaded.
PMA JSON already extracted.
Parsed 874 unique PMA manufacturers


In [43]:
# ==========================================================
# SECTION 4: Aggregation by nickname
# ==========================================================

# ensure manufacturer cleaned and Company_Short exists
df['manufacturer'] = df['manufacturer'].fillna('Unknown').astype(str).str.strip()
df['Company_Short'] = df['manufacturer'].str.upper().str.slice(0, 4)

# Count rows per Company_Short (reliable total recalls)
recalls_by_rows = df.groupby('Company_Short', as_index=False).size().rename(columns={'size':'total_recalls'})

# Average rating per Company_Short
avg_rating = df.groupby('Company_Short', as_index=False)['rating'].mean().rename(columns={'rating':'avg_rating'})

# Representative manufacturer string (most common full name in group)
rep_manufacturer = df.groupby('Company_Short', as_index=False)['manufacturer'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else 'Unknown')

# Merge those into recall_summary
recall_summary = rep_manufacturer.merge(recalls_by_rows, on='Company_Short', how='left') \
                                 .merge(avg_rating, on='Company_Short', how='left') \
                                 .merge(recalls_by_nunique, on='Company_Short', how='left')

avg_rating_value = recall_summary['avg_rating'].mean()
recall_summary['elo'] = 1000 - 0.06 * (recall_summary['avg_rating'] - avg_rating_value)

# diagnostic print
print(f"Aggregated recall_summary rows: {len(recall_summary)}")
print("Top 10 recall_summary by total_recalls:")
print(recall_summary.sort_values('total_recalls', ascending=False).head(10).to_string(index=False))

Aggregated recall_summary rows: 1854
Top 10 recall_summary by total_recalls:
Company_Short                            manufacturer  total_recalls  avg_rating  unique_recall_numbers         elo
         STRY       Stryker Howmedica Osteonics Corp.           2077  966.201252                      1  996.712165
         PHIL Philips Medical Systems (Cleveland) Inc           2052  866.374269                      1 1002.701784
         SIEM      Siemens Medical Solutions USA, Inc           2020  941.584158                      1  998.189191
         ZIMM                     Zimmer Biomet, Inc.           2015  989.875931                      1  995.291684
         MEDT                      Medtronic Vascular           1815  891.239669                      1 1001.209860
         BIOM                            Biomet, Inc.           1543  960.985094                      1  997.025135
         BOST           Boston Scientific Corporation           1329  869.826938                      1 1002.49

In [44]:
# ==========================================================
# SECTION 5: Aggregate approvals by Company_Short then merge
# ==========================================================

# ensure Company_Short exists on the approvals tables (defensive)
k510_summary['Company_Short'] = k510_summary['manufacturer'].str.upper().str.slice(0, 4)
pma_summary['Company_Short'] = pma_summary['manufacturer'].str.upper().str.slice(0, 4)

# AGGREGATE each approvals table by Company_Short (this is the important change)
k510_agg = k510_summary.groupby('Company_Short', as_index=False).agg(
    total_approvals=('total_approvals', 'sum')
)

pma_agg = pma_summary.groupby('Company_Short', as_index=False).agg(
    total_pma=('total_pma', 'sum')
)

# Merge the two aggregated approvals tables (one row per Company_Short)
approvals_combined = pd.merge(k510_agg, pma_agg, on='Company_Short', how='outer').fillna(0)

# Compute total_devices using the aggregated totals
approvals_combined['total_devices'] = approvals_combined['total_approvals'].astype(int) + approvals_combined['total_pma'].astype(int)

# Merge recall_summary with aggregated approvals
merged_df = pd.merge(recall_summary, approvals_combined[['Company_Short', 'total_devices']], on='Company_Short', how='left')

# Fill missing approvals with zero and compute ratio safely
merged_df['total_devices'] = merged_df['total_devices'].fillna(0).astype(int)
merged_df['recalls_per_device'] = merged_df['total_recalls'] / merged_df['total_devices'].replace(0, np.nan)
merged_df['recalls_per_device'] = merged_df['recalls_per_device'].fillna(0)

In [45]:
# ==========================================================
# SECTION 6: Market Tier Assignment (Fixed Thresholds)
# ==========================================================
print("Assigning market tiers based on total approved devices...")

# Ensure total_devices column exists
if 'total_devices' not in merged_df.columns:
    raise KeyError("Column 'total_devices' not found. Make sure the 510(k) merge step ran successfully.")

# Fill any missing counts with zero
merged_df['total_devices'] = merged_df['total_devices'].fillna(0)

# --- Define fixed threshold logic
def assign_tier_fixed(devices):
    if devices > 800:
        return 'tier1_large'
    elif devices >= 50:
        return 'tier2_medium'
    else:
        return 'tier3_small'

# Apply the function
merged_df['Matched_Tier'] = merged_df['total_devices'].apply(assign_tier_fixed)

# Sort by tier and total devices for readability
tier_order = ['tier1_large', 'tier2_medium', 'tier3_small']
merged_df['Matched_Tier'] = pd.Categorical(merged_df['Matched_Tier'], categories=tier_order, ordered=True)
merged_df = merged_df.sort_values(['Matched_Tier', 'total_devices'], ascending=[True, False]).reset_index(drop=True)

print(" Tier assignment complete using fixed thresholds:")
print("   Tier 1 (Large): > 800 approved devices")
print("   Tier 2 (Medium): 50–1000 approved devices")
print("   Tier 3 (Small): < 50 approved devices")

Assigning market tiers based on total approved devices...
 Tier assignment complete using fixed thresholds:
   Tier 1 (Large): > 800 approved devices
   Tier 2 (Medium): 50–1000 approved devices
   Tier 3 (Small): < 50 approved devices


In [46]:
# ==========================================================
# SECTION 7: Rating Categories + Export
# ==========================================================
percentiles = merged_df['elo'].rank(pct=True)
conditions = [
    percentiles >= 0.8,
    (percentiles >= 0.6) & (percentiles < 0.8),
    (percentiles >= 0.4) & (percentiles < 0.6),
    (percentiles >= 0.2) & (percentiles < 0.4),
    percentiles < 0.2
]
ratings = ['Excellent', 'Great', 'Average', 'Poor', 'Avoid']
merged_df['Rating_Category'] = np.select(conditions, ratings, default='Unrated')

output_file = "medical_device_manufacturer_rankingsDemo.xlsx"
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    merged_df.to_excel(writer, sheet_name="Full_Summary", index=False)
    recall_summary.to_excel(writer, sheet_name="Recalls_Only", index=False)
    k510_summary.to_excel(writer, sheet_name="510k_Only", index=False)
    pma_summary.to_excel(writer, sheet_name="PMA_Only", index=False)

print(f" Aggregated correctly — saved to {output_file}")

 Aggregated correctly — saved to medical_device_manufacturer_rankingsDemo.xlsx
