In [1]:
# -----------------------> initialize libraries
import os
import json
import pandas as pd
from urllib.request import urlretrieve
from zipfile import ZipFile
import ijson
import numpy as np

In [2]:
# -----------------------> Download FDA recall ZIP file
url = "https://download.open.fda.gov/device/recall/device-recall-0001-of-0001.json.zip"
zip_filename = "device-recall-0001-of-0001.json.zip"
json_filename = "device-recall-0001-of-0001.json"

# Reuse a local copy of the archive when one is present; otherwise fetch it.
if os.path.exists(zip_filename):
    print("ZIP already downloaded.")
else:
    print("Downloading FDA recall zip...")
    urlretrieve(url, zip_filename)

# -----------------------> Extract JSON
# Same caching idea for the extracted JSON: only unpack when it is missing.
if os.path.exists(json_filename):
    print("JSON already extracted.")
else:
    print("Extracting JSON...")
    with ZipFile(zip_filename) as archive:
        archive.extract(json_filename)

ZIP already downloaded.
JSON already extracted.


In [42]:
# -----------------------> Stream JSON to avoid MemoryError
def _one_line(text):
    """Replace newlines with spaces and trim surrounding whitespace."""
    return text.replace('\n',' ').strip()


# Keyword -> rating, checked in this order; the first keyword contained in
# the lowercased recall status decides the rating.
status_ratings = (
    ('terminated', 1000),
    ('completed', 800),
    ('ongoing', 500),
    ('pending', 300),
)

records = []
with open(json_filename, 'r', encoding='utf-8') as f:
    # Iterate over the elements under the 'results.item' prefix one by one
    # instead of loading the whole document at once.
    parser = ijson.items(f, 'results.item')
    for entry in parser:
        # Blank or missing firm names are reported as "Unknown".
        manufacturer = entry.get('recalling_firm', '').strip() or "Unknown"

        recall_status = entry.get('recall_status', '').strip().lower()

        # Compute rating
        rating = 600  # unknown/missing
        for keyword, score in status_ratings:
            if keyword in recall_status:
                rating = score
                break

        records.append({
            'manufacturer': manufacturer,
            'product_description': _one_line(entry.get('product_description', '')),
            'recall_status': recall_status,
            'reason_for_recall': _one_line(entry.get('reason_for_recall','')),
            'event_date_initiated': entry.get('event_date_initiated',''),
            'recall_number': entry.get('recall_number','').strip(),
            'rating': rating
        })

print(f"Parsed {len(records)} recalls")

# -----------------------> Convert to DataFrame
# One row per parsed recall; columns follow the key order of the dicts above.
df = pd.DataFrame(records)

Parsed 56268 recalls


In [59]:
# -----------------------> Save Raw Recalls Sheet
output_file = "recalls_with_ratings_summary.xlsx"

# Write the per-recall table (one row per recall, so a manufacturer name can
# appear many times) as the "Raw_Recalls" sheet of a fresh workbook.
df.to_excel(output_file, sheet_name="Raw_Recalls", index=False, engine='openpyxl')

print(f"Saved raw recalls to {output_file}")

Saved raw recalls to recalls_with_ratings_summary.xlsx


In [61]:
# -----------------------> Aggregate per manufacturer
# Nickname = first 4 characters of each company name.
# NOTE(review): different firms sharing the same 4-character prefix end up in
# the same group - confirm this pooling is intended.
short = pd.DataFrame({
    'Company Nicknames': df['manufacturer'].astype(str).str.slice(0, 4)
})

# Sum the ratings per nickname, highest total first. The nicknames are
# attached to a temporary copy (assign) so that `df` keeps the full
# manufacturer names and stays consistent with the Raw_Recalls sheet.
output = (
    df.assign(manufacturer=short['Company Nicknames'])
      .groupby('manufacturer', as_index=False)['rating']
      .sum()
      .sort_values('rating', ascending=False)
)

# Add the summary as its own sheet. Calling output.to_excel(output_file, ...)
# directly would rewrite the whole workbook and drop every existing sheet, so
# append instead, replacing a previous "Manufacturer_Summary" sheet on re-run.
# `if_sheet_exists` is only accepted in append mode, and append mode needs an
# existing file, hence the fallback to a plain write.
if os.path.exists(output_file):
    writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
else:
    writer_kwargs = {'mode': 'w'}

with pd.ExcelWriter(output_file, engine='openpyxl', **writer_kwargs) as writer:
    output.to_excel(writer, sheet_name="Manufacturer_Summary", index=False)

In [66]:
# -----------------------> Compute ELO-style score
newdf = pd.read_excel('recalls_with_ratings_summary.xlsx', sheet_name='Manufacturer_Summary')

# Mean of the 'rating' column across all rows of the summary sheet.
avg_rating = newdf['rating'].mean()

# Score each row by how far its rating sits from the average: rows above the
# average drop below 1000, rows below it rise above 1000.
# Must subtract `avg_rating`; a bare `rating` here would pick up whatever
# value the parsing loop last left in that variable.
# 1000 gives a chess-style looking number; k = 0.04 is an arbitrary,
# heuristic choice.
newdf['elo'] = 1000 - 0.04 * (newdf['rating'] - avg_rating)

In [68]:
# -----------------------> Classify companies based on ELO percentiles
# Fractional rank of each ELO score (largest score -> 1.0); ties share the
# average of their ranks.
percentiles = newdf['elo'].rank(method='average', ascending=True, pct=True)

def classify(p):
    """Map a percentile rank to a rating label.

    Cut-offs are inclusive lower bounds: >= 0.8 "Excellent", >= 0.6 "Great",
    >= 0.4 "Average", >= 0.2 "Poor"; anything else is "Avoid".
    """
    tiers = (
        (0.8, "Excellent"),
        (0.6, "Great"),
        (0.4, "Average"),
        (0.2, "Poor"),
    )
    # Tiers are ordered from highest cut-off down, so the first hit wins.
    for cutoff, label in tiers:
        if p >= cutoff:
            return label
    return "Avoid"

# Label every row by passing its percentile rank through classify().
newdf['rating_class'] = percentiles.map(classify)

In [69]:
# Append the scored table to the existing workbook as the "Results" sheet.
# if_sheet_exists='replace' lets this cell be re-run: without it, append mode
# raises ValueError as soon as a "Results" sheet is already in the file.
with pd.ExcelWriter(output_file, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    newdf.to_excel(writer, sheet_name="Results", index=False)

print(f"Saved manufacturer summary to {output_file}")

Saved manufacturer summary to recalls_with_ratings_summary.xlsx
