In [1]:
import os

# Where the downloaded archives live (input) and where extracted files go (output)
data_directory = './data'
working_directory = './2015_reprocessed'

# Make sure the working directory exists. exist_ok=True means an already
# existing directory is not an error; any other OS-level failure is reported
# and the cell carries on.
try:
    os.makedirs(name=working_directory, exist_ok=True)
except OSError as makedirs_error:
    print(f"Error creating {working_directory}: {makedirs_error}")

In [2]:
from zipfile import ZipFile

# Identify the data directory, working directory, and data files.
# The working directory is spelled exactly as in every other cell
# ('./2015_reprocessed'): on a case-sensitive filesystem a different
# capitalisation would extract into a separate directory that the later
# cells never read from.
data_directory = './data'
working_directory = './2015_reprocessed'

data_files = [
    'foidevproblem.zip',       # 1
    'deviceproblemcodes.zip',  # 2
    'foitext2015.zip',         # 3
    'device2015.zip',          # 4
    'mdrfoithru2021.zip'       # 5
]

# Unzip the data files into the working directory
for archive_name in data_files:
    print(f"Unzipping {archive_name}")
    # Bound to 'archive' rather than 'zip' so the built-in zip() is not
    # shadowed for the rest of the kernel session.
    with ZipFile(f"{data_directory}/{archive_name}", "r") as archive:
        archive.extractall(working_directory)

print("Unzip complete.")

# Change file names in working directory to lower case so that later cells can
# open every extracted file by an all-lower-case name.
for file_name in os.listdir(working_directory):
    lowered_name = file_name.lower()
    # Skip names that are already lower case; nothing to rename.
    if lowered_name != file_name:
        os.rename(f"{working_directory}/{file_name}", f"{working_directory}/{lowered_name}")


Unzipping foidevproblem.zip
Unzipping deviceproblemcodes.zip
Unzipping foitext2015.zip
Unzipping device2015.zip
Unzipping mdrfoithru2021.zip
Unzip complete.


In [3]:
import pandas as pd
import csv

working_directory = './2015_reprocessed'

# Both problem-code files are read with the same parser settings: pipe
# separated, ISO-8859-1 encoded, no quote handling, no header row, and bad
# lines produce a warning instead of an error.
pipe_file_options = dict(
    sep="|",
    encoding="ISO-8859-1",
    on_bad_lines='warn',
    quoting=csv.QUOTE_NONE,
    header=None,
)

# Report-key -> problem-code pairs (column names supplied manually)
foidevproblem = pd.read_csv(
    f"{working_directory}/foidevproblem.txt",
    names=["MDR_REPORT_KEY", "DEVICE_PROBLEM_CODE"],
    **pipe_file_options)

# Problem-code -> problem-text lookup (column names supplied manually)
deviceproblemcodes = pd.read_csv(
    f"{working_directory}/deviceproblemcodes.txt",
    names=["DEVICE_PROBLEM_CODE", "DEVICE_PROBLEM_TEXT"],
    **pipe_file_options)

# Inner-join the two frames on DEVICE_PROBLEM_CODE, then index the result by
# MDR_REPORT_KEY. Remaining columns:
#       DEVICE_PROBLEM_CODE
#       DEVICE_PROBLEM_TEXT
combined_foidevproblem = foidevproblem.merge(
    deviceproblemcodes,
    how="inner",
    on="DEVICE_PROBLEM_CODE",
).set_index('MDR_REPORT_KEY')

print(f"combined_foidevproblem data frame creation complete: {combined_foidevproblem.shape}")

  foidevproblem = pd.read_csv(f"{working_directory}/foidevproblem.txt",


combined_foidevproblem data frame creation complete: (14916688, 2)


In [4]:
import pandas as pd
import csv

# Identify the working directory
working_directory = './2015_reprocessed'

# Load foitext2015.txt (pipe separated; the first row supplies the column
# names), index it by MDR_REPORT_KEY and replace missing values with ''.
foitext2015 = (
    pd.read_csv(
        f"{working_directory}/foitext2015.txt",
        sep="|",
        encoding="ISO-8859-1",
        on_bad_lines='warn',
        quoting=csv.QUOTE_NONE,
    )
    .set_index('MDR_REPORT_KEY')
    .fillna('')
)

print(f"foitext2015 data frame creation complete: {foitext2015.shape}")

foitext2015 data frame creation complete: (2074007, 5)


In [5]:
import pandas as pd
import csv

# Identify the working directory
working_directory = './2015_reprocessed'

# Columns of the device file that are discarded after loading
unwanted_columns = [
    'DEVICE_EVENT_KEY',
    'IMPLANT_FLAG',
    'DATE_REMOVED_FLAG',
    'DATE_RECEIVED',
    'MANUFACTURER_D_ADDRESS_1',
    'MANUFACTURER_D_ADDRESS_2',
    'MANUFACTURER_D_CITY',
    'MANUFACTURER_D_STATE_CODE',
    'MANUFACTURER_D_ZIP_CODE',
    'MANUFACTURER_D_ZIP_CODE_EXT',
    'MANUFACTURER_D_COUNTRY_CODE',
    'MANUFACTURER_D_POSTAL_CODE',
    'DEVICE_OPERATOR',
    'EXPIRATION_DATE_OF_DEVICE',
    'CATALOG_NUMBER',
    'LOT_NUMBER',
    'OTHER_ID_NUMBER',
    'DATE_RETURNED_TO_MANUFACTURER',
    'DEVICE_AGE_TEXT',
    'DEVICE_EVALUATED_BY_MANUFACTUR',
    'COMBINATION_PRODUCT_FLAG']

# Load device2015.txt (pipe separated; the first row supplies the column
# names), index it by MDR_REPORT_KEY, drop the unwanted columns and replace
# missing values with ''.
device2015 = (
    pd.read_csv(
        f"{working_directory}/device2015.txt",
        sep="|",
        encoding="ISO-8859-1",
        on_bad_lines='warn',
        quoting=csv.QUOTE_NONE,
    )
    .set_index('MDR_REPORT_KEY')
    .drop(columns=unwanted_columns)
    .fillna('')
)

print(f"device2015 data frame creation complete: {device2015.shape}")

device2015 data frame creation complete: (862585, 7)


In [6]:
import pandas as pd
import csv

# Identify the working directory
working_directory = './2015_reprocessed'

# Create a dataframe for the master event (mdrfoi) file.
# The file is opened by its all-lower-case name: the unzip cell renames every
# extracted file to lower case, so a mixed-case name cannot be found on a
# case-sensitive filesystem.
mdrfoiThru2021 = pd.read_csv(f"{working_directory}/mdrfoithru2021.txt",
        sep="|", 
        encoding="ISO-8859-1", 
        on_bad_lines='warn',      # malformed lines are reported, not fatal
        quoting=csv.QUOTE_NONE)

mdrfoiThru2021.set_index('MDR_REPORT_KEY', inplace=True)

# Remove the unwanted columns from the mdrfoi dataframe
unwanted_columns = [
    'EVENT_KEY',
    'MANUFACTURER_LINK_FLAG_',
    'NUMBER_PATIENTS_IN_EVENT',
    'ADVERSE_EVENT_FLAG',
    'PRODUCT_PROBLEM_FLAG',
    'DATE_REPORT',
    'DATE_OF_EVENT',
    'REPROCESSED_AND_REUSED_FLAG',
    'REPORTER_OCCUPATION_CODE',
    'HEALTH_PROFESSIONAL',
    'DATE_FACILITY_AWARE',
    'REPORT_DATE',
    'REPORT_TO_FDA',
    'DATE_REPORT_TO_FDA',
    'EVENT_LOCATION',
    'DATE_REPORT_TO_MANUFACTURER',
    'MANUFACTURER_CONTACT_T_NAME',
    'MANUFACTURER_CONTACT_F_NAME',
    'MANUFACTURER_CONTACT_L_NAME',
    'MANUFACTURER_CONTACT_STREET_1',
    'MANUFACTURER_CONTACT_STREET_2',
    'MANUFACTURER_CONTACT_CITY',
    'MANUFACTURER_CONTACT_STATE',
    'MANUFACTURER_CONTACT_ZIP_CODE',
    'MANUFACTURER_CONTACT_ZIP_EXT',
    'MANUFACTURER_CONTACT_COUNTRY',
    'MANUFACTURER_CONTACT_POSTAL',
    'MANUFACTURER_CONTACT_AREA_CODE',
    'MANUFACTURER_CONTACT_EXCHANGE',
    'MANUFACTURER_CONTACT_PHONE_NO',
    'MANUFACTURER_CONTACT_EXTENSION',
    'MANUFACTURER_CONTACT_PCOUNTRY',
    'MANUFACTURER_CONTACT_PCITY',
    'MANUFACTURER_CONTACT_PLOCAL',
    'MANUFACTURER_G1_STREET_1',
    'MANUFACTURER_G1_STREET_2',
    'MANUFACTURER_G1_CITY',
    'MANUFACTURER_G1_STATE_CODE',
    'MANUFACTURER_G1_ZIP_CODE',
    'MANUFACTURER_G1_ZIP_CODE_EXT',
    'MANUFACTURER_G1_COUNTRY_CODE',
    'MANUFACTURER_G1_POSTAL_CODE',
    'DATE_MANUFACTURER_RECEIVED',
    'DEVICE_DATE_OF_MANUFACTURE',
    'SINGLE_USE_FLAG',
    'PREVIOUS_USE_CODE',
    'REMOVAL_CORRECTION_NUMBER',
    'DISTRIBUTOR_NAME',
    'DISTRIBUTOR_ADDRESS_1',
    'DISTRIBUTOR_ADDRESS_2',
    'DISTRIBUTOR_CITY',
    'DISTRIBUTOR_STATE_CODE',
    'DISTRIBUTOR_ZIP_CODE',
    'DISTRIBUTOR_ZIP_CODE_EXT',
    'REPORT_TO_MANUFACTURER',
    'MANUFACTURER_ADDRESS_1',
    'MANUFACTURER_ADDRESS_2',
    'MANUFACTURER_CITY',
    'MANUFACTURER_STATE_CODE',
    'MANUFACTURER_ZIP_CODE',
    'MANUFACTURER_ZIP_CODE_EXT',
    'MANUFACTURER_COUNTRY_CODE',
    'MANUFACTURER_POSTAL_CODE',
    'SOURCE_TYPE',
    'DATE_ADDED',
    'DATE_CHANGED',
    'REPORTER_COUNTRY_CODE',
    'PMA_PMN_NUM',
    'EXEMPTION_NUMBER'
]

mdrfoiThru2021.drop(columns=unwanted_columns, inplace=True)
# Missing values become '' so string operations on the remaining columns
# (e.g. the DATE_RECEIVED filter in the next cell) see only strings.
mdrfoiThru2021.fillna('', inplace=True)

print(f"mdrfoiThru2021 data frame creation complete: {mdrfoiThru2021.shape}")

b'Skipping line 11971429: expected 82 fields, saw 83\n'
  mdrfoiThru2021 = pd.read_csv(f"{working_directory}/mdrfoiThru2021.txt",


mdrfoiThru2021 data frame creation complete: (12830731, 12)


In [7]:
# Keep only the mdrfoi rows whose DATE_RECEIVED text contains '2015'.
# NOTE(review): this is a substring match, not a parsed-year comparison; it
# assumes '2015' can only appear in the year part of the date - confirm
# against the file's date format.
received_in_2015 = mdrfoiThru2021['DATE_RECEIVED'].str.contains('2015')
mdrfoi2015 = mdrfoiThru2021.loc[received_in_2015]
print(f"mdrfoi2015 data frame creation complete: {mdrfoi2015.shape}")

mdrfoi2015 data frame creation complete: (861630, 12)


In [8]:
# Pass 1: inner-join the foitext2015 rows with the combined problem code/text
# rows on MDR_REPORT_KEY, so only report keys present in both frames remain.
pass1 = foitext2015.merge(
    combined_foidevproblem,
    how="inner",
    on="MDR_REPORT_KEY",
)

In [9]:
# Display (rows, columns) of the pass 1 join result as a quick size check
pass1.shape

(1771658, 7)

In [10]:
# pass1

In [11]:
# Pass 2: inner-join the pass 1 result with the device2015 rows on
# MDR_REPORT_KEY.
# NOTE(review): the recorded outputs show more rows after this join than
# before it (1771658 -> 1773340), which suggests some report keys occur more
# than once in device2015 - confirm the duplication is intended.
pass2 = pass1.merge(
    device2015,
    how="inner",
    on="MDR_REPORT_KEY",
)

In [12]:
# Display (rows, columns) of the pass 2 join result as a quick size check
pass2.shape

(1773340, 14)

In [13]:
# pass2

In [14]:
# Pass 3: inner-join the pass 2 result with the 2015 mdrfoi rows on
# MDR_REPORT_KEY, adding the retained mdrfoi columns to each row.
pass3 = pass2.merge(
    mdrfoi2015,
    how="inner",
    on="MDR_REPORT_KEY",
)

In [15]:
# Display (rows, columns) of the pass 3 join result as a quick size check
pass3.shape

(1773340, 26)

In [37]:
# pass3

In [36]:
import pandas as pd

# Identify the working directory
working_directory = './2015_reprocessed'

# Count how often each GENERIC_NAME value occurs in the merged data and lay
# the result out as a two-column table: the name and its count.
# (Passing the value_counts() Series to pd.DataFrame(..., columns=[...]) does
# not label the columns - it selects by the Series' name, which leaves COUNT
# entirely NaN and the names stranded in an unlabeled index.)
generic_names = (
    pass3['GENERIC_NAME']
    .value_counts()
    .rename_axis('GENERIC_NAME')    # label the index holding the names
    .reset_index(name='COUNT')      # names -> column, counts -> 'COUNT'
)

# The names are a regular column now, so the positional index is not written.
generic_names.to_csv(f"{working_directory}/generic_names.csv", index=False)
generic_names.shape

(12687, 2)