In [3]:
import pandas as pd
import numpy as np
import csv
import os 
import xml.etree.ElementTree as ET
import xml.dom.minidom
import zipfile
import glob
import re
import spacy
import time

In [2]:
os.chdir('/Users/veliristimaki/Code/Summer Paper')

In [None]:
tree = ET.parse("0f1653d8-d6bf-4123-a6b0-e165e941d616.xml")
root = tree.getroot()

In [None]:
text_data = []
for element in root.iter():
    text = element.text
    if text is not None:
        text_data.append(text.strip())

Here I am printing the raw text removing the xml formatting to see what is contained within the drug label information files. As you can see there are a lot of useless text information that will be uninformative when applying text analysis. I will have to parse through the xml attributes to get the elements of interest.

In [None]:
for text in text_data:
    print(text)

Here I am setting up the parsing object for the xml files.

In [None]:
domtree = xml.dom.minidom.parse("0f1653d8-d6bf-4123-a6b0-e165e941d616.xml")
group = domtree.documentElement

Here is a function that extracts all the text from the paragraph tags in the xml file format.

In [8]:
def xml_to_str(filename):
    
    domtree = xml.dom.minidom.parse(filename)
    group = domtree.documentElement
    paragraph_elements = group.getElementsByTagName("paragraph")
    
    text_data = []
    
    for i in range(len(paragraph_elements)):
        #if paragraph_elements[i].firstChild is None:
        #    continue
        #print(paragraph_elements[i].firstChild.nodeValue)
        if paragraph_elements[i].firstChild is not None:
            text_data.append(paragraph_elements[i].firstChild.nodeValue)
    
    #Dropping Elements of text_data that only contain empty spaces and \n
    text_filtered = [i for i in text_data if not all(j in ["\n", " "] for j in i)]
    #Dropping empty space and \n within each element of text_data
    text_within = [i.replace("\n", "").replace("  ", " ").strip() for i in text_filtered if not (i.replace("\n", "").replace("  ", " ").strip() == "")]
    
    text_str = ''.join(text_within)
    
    return text_str


In [None]:
text_data = xml_to_str("0f1653d8-d6bf-4123-a6b0-e165e941d616.xml")

In [None]:
text_data

Now that we are able to extract the necessary information from each drug label file, we can move on to the next step. First, there is some pre-processing we have to do. The files that were scraped from DailyMed still need to renamed so that we know what drug each file is associated with. From the scraping script I also generated a dictionary file to match NDC codes with drug label files. I will use this dictionary key to rename the zip files. 

In [10]:
column_types = {
    'NDC': str,
    'filename': str,

}

# Read the CSV file and specify the column types
ref_file = pd.read_csv("D:\\data\\clean data\\ref.csv", dtype=column_types)

# Perform operations on the DataFrame
# ...


In [11]:
ref_file

Unnamed: 0,NDC,filename
0,00228202950,a23063c0-099a-4256-b95f-3a857bbf704b
1,00228305911,c5c2d060-452c-4f76-ad93-2e41fd8f06c2
2,10702010101,1bf37f9f-29cf-4b09-9c81-eb369e35a042
3,13107008305,5b851f67-309d-4ef3-8b4b-ade8908256ea
4,13811070910,c45dc1de-adfa-4b3c-a7dc-cffbc8eac74f
...,...,...
1218,24338003501,1e25f905-6c0b-4b19-a3b6-b2a386afa1c3
1219,65162023509,34de1806-d921-4d2f-8c7f-03625b717a70
1220,53746011005,e0cefc7b-e81c-4e15-89a0-fe71dc94338b
1221,70165010030,33f70f58-c871-42c8-8adb-345caeafefcd


As we can see there are multiple NDC codes related to the some package label. This is because 11-digit NDC codes are associated with a particular package size which is not informative for our purposes. Therefore we can drop some NDC codes from the table. 

In [8]:
print(f"There are {len(ref_file['NDC'].unique())} unique NDC codes")
print(f"There are {len(ref_file['filename'].unique())} drug label files")

There are 1223 unique NDC codes
There are 481 drug label files


In [12]:
grouped_df = ref_file.groupby("filename").agg({'NDC': list})
grouped_df

Unnamed: 0_level_0,NDC
filename,Unnamed: 1_level_1
007bf37f-0e46-426a-ac8c-be63d4b7414c,"[29300035505, 29300035501]"
010905f9-3bcb-4b50-9fe8-a3ad0010f14c,[00054024425]
01db4606-d49c-4b10-a78a-1cf41880a9fb,[31722091701]
01f6690c-1410-42ad-8eb3-e7df32eab3ba,"[42043016103, 42043016003]"
021153ce-fe27-4ed1-8d88-b4157b0ed734,"[71093015504, 71093015604, 71093015606]"
...,...
fcd2b59e-8087-475e-9e6b-911bd846ea96,"[71930002012, 71930001912, 71930001952, 719300..."
fe126254-6ee6-48c3-965c-96e4276a5314,"[58657050016, 58657050004]"
fe874f8a-4b56-4d2b-abf5-8742e2633a3b,[16571067516]
ff1630b4-2044-433c-96bc-0bb4ae8d42a0,"[49884011174, 49884011074]"


In [13]:
for i in range(len(grouped_df)):
    grouped_df['NDC'][i] = list(grouped_df.iloc[i])[0][0]

In [14]:
grouped_df = grouped_df.reset_index()
grouped_df

Unnamed: 0,filename,NDC
0,007bf37f-0e46-426a-ac8c-be63d4b7414c,29300035505
1,010905f9-3bcb-4b50-9fe8-a3ad0010f14c,00054024425
2,01db4606-d49c-4b10-a78a-1cf41880a9fb,31722091701
3,01f6690c-1410-42ad-8eb3-e7df32eab3ba,42043016103
4,021153ce-fe27-4ed1-8d88-b4157b0ed734,71093015504
...,...,...
476,fcd2b59e-8087-475e-9e6b-911bd846ea96,71930002012
477,fe126254-6ee6-48c3-965c-96e4276a5314,58657050016
478,fe874f8a-4b56-4d2b-abf5-8742e2633a3b,16571067516
479,ff1630b4-2044-433c-96bc-0bb4ae8d42a0,49884011174


In [15]:
claims_2021 = pd.read_csv('D:\\data\\clean data\\NH_MA_data_2021.csv', 
                          dtype={"NDC":str}, 
                          usecols = ["NDC", "NDC_PROD_NAME"])

In [16]:
claims_2021

Unnamed: 0,NDC,NDC_PROD_NAME
0,00228202950,ALPRAZOLAM
1,00228202950,ALPRAZOLAM
2,00228305911,AMPHETAMINE/DEXTROAMPHETA
3,10702010101,METHYLPHENIDATE HYDROCHLO
4,13107008305,LORAZEPAM
...,...,...
80206,69315090601,LORAZEPAM
80207,69315090601,LORAZEPAM
80208,69315090601,LORAZEPAM
80209,43547040610,CLONAZEPAM


In [17]:
merged_df = pd.merge(grouped_df, claims_2021, on='NDC', how = 'left').drop_duplicates()
merged_df

Unnamed: 0,filename,NDC,NDC_PROD_NAME
0,007bf37f-0e46-426a-ac8c-be63d4b7414c,29300035505,TRAMADOL HYDROCHLORIDE
10,010905f9-3bcb-4b50-9fe8-a3ad0010f14c,00054024425,CODEINE SULFATE
14,01db4606-d49c-4b10-a78a-1cf41880a9fb,31722091701,OXYCODONE HYDROCHLORIDE
43,01f6690c-1410-42ad-8eb3-e7df32eab3ba,42043016103,MODAFINIL
73,021153ce-fe27-4ed1-8d88-b4157b0ed734,71093015504,ZOLPIDEM TARTRATE
...,...,...,...
29517,fcd2b59e-8087-475e-9e6b-911bd846ea96,71930002012,HYDROCODONE BITARTRATE/AC
29533,fe126254-6ee6-48c3-965c-96e4276a5314,58657050016,CODEINE/GUAIFENESIN
29591,fe874f8a-4b56-4d2b-abf5-8742e2633a3b,16571067516,PHENOBARBITAL
29595,ff1630b4-2044-433c-96bc-0bb4ae8d42a0,49884011174,ALPRAZOLAM ODT


In [18]:
len(merged_df['NDC_PROD_NAME'].unique())

164

In [19]:
grouped_by_prod_name = merged_df.groupby('NDC_PROD_NAME').agg({'filename': list}).reset_index()

In [17]:
print(grouped_by_prod_name.iloc[0][0])
print(grouped_by_prod_name.iloc[0][1])

ACETAMINOPHEN/CODEINE
['02386806-02d4-47e3-9c8b-7810146ee795', '83a536d8-385d-46ed-9cd2-47b7efe96ccf', 'eadc80fa-13c3-471a-a22f-6fe7e1ab290f']


In [18]:
grouped_by_prod_name

Unnamed: 0,NDC_PROD_NAME,filename
0,ACETAMINOPHEN/CODEINE,"[02386806-02d4-47e3-9c8b-7810146ee795, 83a536d..."
1,ACETAMINOPHEN/CODEINE PHO,"[79377617-686b-4640-b2b8-e1358cf82358, 8bc4108..."
2,ADDERALL,[f22635fe-821d-4cde-aa12-419f8b53db81]
3,ADDERALL XR,[aff45863-ffe1-4d4f-8acf-c7081512a6c0]
4,ADHANSIA XR,[80e15bd0-4d56-75f0-ab5a-885879fc56e9]
...,...,...
159,XYOSTED,[8a3d204c-be26-49e0-8599-0ac12a272e81]
160,ZALEPLON,"[47303145-6788-46bd-b9cc-37b59b145cea, 57a1a11..."
161,ZOLPIDEM TARTRATE,"[021153ce-fe27-4ed1-8d88-b4157b0ed734, 0553e26..."
162,ZOLPIDEM TARTRATE ER,"[0c64bc71-2e7f-4b15-a8e1-acb8de04395f, 1a32885..."


From the output below we can see that for some drugs there are multiple products offered. These products tend to be very similar from one another but may differ in their rate of absorption and metabolism, method of ingestion, and other characteristics. 

In [30]:
for i in range(len(grouped_by_prod_name)):
    if len(grouped_by_prod_name.iloc[i][1]) > 1:
        print(f"{grouped_by_prod_name.iloc[i][0]} has {len(grouped_by_prod_name.iloc[i][1])} different products")

ACETAMINOPHEN/CODEINE has 3 different products
ACETAMINOPHEN/CODEINE PHO has 2 different products
ALPRAZOLAM has 5 different products
ALPRAZOLAM ER has 2 different products
ALPRAZOLAM ODT has 2 different products
AMPHETAMINE SULFATE has 2 different products
AMPHETAMINE/DEXTROAMPHETA has 22 different products
ARMODAFINIL has 4 different products
BUPRENORPHINE has 3 different products
BUPRENORPHINE HCL has 5 different products
BUPRENORPHINE HCL/NALOXON has 4 different products
BUPRENORPHINE HYDROCHLORI has 5 different products
BUTALBITAL/ACETAMINOPHEN/ has 10 different products
BUTALBITAL/ASPIRIN/CAFFEI has 3 different products
BUTORPHANOL TARTRATE has 2 different products
CARISOPRODOL has 8 different products
CHLORDIAZEPOXIDE HCL has 2 different products
CLOBAZAM has 9 different products
CLONAZEPAM has 3 different products
CLONAZEPAM ODT has 3 different products
CODEINE/GUAIFENESIN has 2 different products
DEXMETHYLPHENIDATE HCL has 2 different products
DEXMETHYLPHENIDATE HCL ER has 4 d

In [24]:
ref_table = grouped_by_prod_name.head()

In [20]:
def extract_xml_files(folder_path, extract_path, ref_table):
    
    start_time = time.time()
    
    drug_path_list = []
    
    for i in range(len(ref_table)):
        drug_name = ref_table.iloc[i][0]
        sanitized_drug_name = re.sub(r'[\/:*?"<>| ]+', '_', drug_name)
        
        for j in range(len(ref_table.iloc[i][1])):
        #print(ref_table.iloc[i][1][j])
            zip_file_path = glob.glob(f"{folder_path}/*{ref_table.iloc[i][1][j]}*.zip")
            if len(zip_file_path) > 0:
                zip_file_path = zip_file_path[0]
            #print(zip_file_path)
            
                with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
                # Find the XML file within the zip file
                    xml_files = [file for file in zip_ref.namelist() if file.endswith('.xml')]

                # Verify that there is only one XML file
                    if len(xml_files) == 1:
                        xml_file_path = xml_files[0]
                        #print(xml_file_path)
                # Extract the XML file to a temporary location
                        zip_ref.extract(xml_file_path, extract_path)

                        extracted_xml_file = os.path.join(extract_path, xml_file_path)
                        #print(extracted_xml_file)
                        new_file_path = os.path.join(extract_path, f"{sanitized_drug_name}_prod_no_{j+1}.xml")
                        #print(new_file_path)
                        # Rename the extracted XML file
                        os.rename(extracted_xml_file, new_file_path)

                        #print(f"XML file '{xml_file_path}' extracted successfully.")
                    #else:
                        #print("There should be only one XML file in the zip.")

                        drug_path_list.append(new_file_path)
    
    end_time = time.time()
    execution_time = end_time - start_time
    
    print("Files Extracted to", extract_path)
    print("Execution Time:", execution_time, "seconds")
    
    return drug_path_list

In [26]:
folder_path = "D:\\data\\XML"
extract_path =  "C:\\Users\\Arts User\\Downloads\\extract"

drug_path_list = extract_xml_files(folder_path,extract_path,ref_table)

Files Extracted to C:\Users\Arts User\Downloads\extract
Execution Time: 0.21714520454406738 seconds


In [27]:
str_dict = {}

for i in drug_path_list:
    result = xml_to_str(i)
    last_slash_index = i.rfind("/")
    substring = i[last_slash_index + 1:-4]
    str_dict[substring] = result

In [22]:
nlp = spacy.load('en_core_web_lg')

In [23]:
def create_similarity_matrix(str_dict):
    
    start_time = time.time()

    similarity_table = pd.DataFrame(columns=list(str_dict.keys()), index=list(str_dict.keys()))
    
    for i, str_i in enumerate(str_dict):
        for j, str_j in enumerate(str_dict):
            if j >= i:
                t1 = nlp(str_dict[str_i])
                t2 = nlp(str_dict[str_j])
                cell = t1.similarity(t2)
                similarity_table.loc[str_i, str_j] = cell
                if j > i:
                    similarity_table.loc[str_j, str_i] = cell

    end_time = time.time()
    execution_time = end_time - start_time
    
    print("Similarity Table Constructed!")
    print("Execution Time:", execution_time, "seconds")
    
    return similarity_table

In [29]:
similarity_table = create_similarity_matrix(str_dict)

Similarity Table Constructed!
Execution Time: 63.70603275299072 seconds


In [144]:
ref_table = grouped_by_prod_name.head(10)
list(ref_table['NDC_PROD_NAME'])

['ACETAMINOPHEN/CODEINE',
 'ACETAMINOPHEN/CODEINE PHO',
 'ADDERALL',
 'ADDERALL XR',
 'ADHANSIA XR',
 'ADZENYS XR-ODT',
 'ALPRAZOLAM',
 'ALPRAZOLAM ER',
 'ALPRAZOLAM ODT',
 'ALPRAZOLAM XR']

In [143]:
avg_similarity_table = pd.DataFrame(columns=list(ref_table['NDC_PROD_NAME']), index=list(ref_table['NDC_PROD_NAME']))
avg_similarity_table

Unnamed: 0,ACETAMINOPHEN/CODEINE,ACETAMINOPHEN/CODEINE PHO,ADDERALL,ADDERALL XR,ADHANSIA XR,ADZENYS XR-ODT,ALPRAZOLAM,ALPRAZOLAM ER,ALPRAZOLAM ODT,ALPRAZOLAM XR
ACETAMINOPHEN/CODEINE,,,,,,,,,,
ACETAMINOPHEN/CODEINE PHO,,,,,,,,,,
ADDERALL,,,,,,,,,,
ADDERALL XR,,,,,,,,,,
ADHANSIA XR,,,,,,,,,,
ADZENYS XR-ODT,,,,,,,,,,
ALPRAZOLAM,,,,,,,,,,
ALPRAZOLAM ER,,,,,,,,,,
ALPRAZOLAM ODT,,,,,,,,,,
ALPRAZOLAM XR,,,,,,,,,,


In [145]:
for i in range(len(ref_table)):

        i_drug_name = ref_table.iloc[i][0]
        i_sanitized_drug_name = re.sub(r'[\/:*?"<>| ]+', '_', i_drug_name) + "_p"
        
        for g in range(len(ref_table)):
            
            g_drug_name = ref_table.iloc[g][0]
            g_sanitized_drug_name = re.sub(r'[\/:*?"<>| ]+', '_', g_drug_name) + "_p"
            
            if i == g:
                avg_similarity_table.loc[i_drug_name, g_drug_name] = 1
            
            else:
                if i < g:    

                    i_drug_list = []
                    g_drug_list = []

                    for j, j_pkg_path in enumerate(drug_dict):

                        j_pkg_name = os.path.basename(j_pkg_path)

                        if i_sanitized_drug_name in j_pkg_name:
                            i_drug_list.append(drug_dict[j_pkg_path])

                    for k, k_pkg_path in enumerate(drug_dict):
                        k_pkg_name = os.path.basename(k_pkg_path)

                        if g_sanitized_drug_name in k_pkg_name:
                            g_drug_list.append(drug_dict[k_pkg_path])

                    #print(i_drug_name, i_drug_list,"\n")
                    #print(g_drug_name, g_drug_list,"\n\n")

                    total_similarity = 0
                    num_pairs = 0

                    for i_drug in i_drug_list:
                        for g_drug in g_drug_list:
                            i_drug_doc = nlp(i_drug)
                            g_drug_doc = nlp(g_drug)
                            similarity = i_drug_doc.similarity(g_drug_doc)
                            total_similarity += similarity
                            num_pairs += 1

                    average_similarity = total_similarity / num_pairs

                    avg_similarity_table.loc[i_drug_name, g_drug_name] = average_similarity

                    #print(f"The average similarity between {i_drug_name} and {g_drug_name} is {average_similarity} \n\n")
                
                else:
                    avg_similarity_table.loc[i_drug_name, g_drug_name] = avg_similarity_table.loc[g_drug_name, i_drug_name]

In [146]:
avg_similarity_table

Unnamed: 0,ACETAMINOPHEN/CODEINE,ACETAMINOPHEN/CODEINE PHO,ADDERALL,ADDERALL XR,ADHANSIA XR,ADZENYS XR-ODT,ALPRAZOLAM,ALPRAZOLAM ER,ALPRAZOLAM ODT,ALPRAZOLAM XR
ACETAMINOPHEN/CODEINE,1.0,0.998232,0.9943,0.984577,0.959558,0.971307,0.995214,0.991585,0.994113,0.993976
ACETAMINOPHEN/CODEINE PHO,0.998232,1.0,0.992909,0.983404,0.95827,0.969777,0.993816,0.990077,0.992409,0.992201
ADDERALL,0.9943,0.992909,1.0,0.993545,0.975677,0.983818,0.996394,0.995349,0.99613,0.995812
ADDERALL XR,0.984577,0.983404,0.993545,1.0,0.990673,0.994776,0.992633,0.995026,0.993365,0.993032
ADHANSIA XR,0.959558,0.95827,0.975677,0.990673,1.0,0.99297,0.976098,0.981177,0.977951,0.976959
ADZENYS XR-ODT,0.971307,0.969777,0.983818,0.994776,0.99297,1.0,0.981233,0.987774,0.982589,0.980932
ALPRAZOLAM,0.995214,0.993816,0.996394,0.992633,0.976098,0.981233,1.0,0.997204,0.999244,0.999119
ALPRAZOLAM ER,0.991585,0.990077,0.995349,0.995026,0.981177,0.987774,0.997204,1.0,0.997243,0.997137
ALPRAZOLAM ODT,0.994113,0.992409,0.99613,0.993365,0.977951,0.982589,0.999244,0.997243,1.0,0.99903
ALPRAZOLAM XR,0.993976,0.992201,0.995812,0.993032,0.976959,0.980932,0.999119,0.997137,0.99903,1.0


In [44]:
for file_path in list(str_dict.keys()):
    file_name = os.path.basename(file_path)
    print(file_name)

ACETAMINOPHEN_CODEINE_prod_no_1
ACETAMINOPHEN_CODEINE_prod_no_2
ACETAMINOPHEN_CODEINE_prod_no_3
ACETAMINOPHEN_CODEINE_PHO_prod_no_1
ACETAMINOPHEN_CODEINE_PHO_prod_no_2
ADDERALL_prod_no_1
ADDERALL_XR_prod_no_1
ADHANSIA_XR_prod_no_1


In [35]:
list(str_dict.keys())

['C:\\Users\\Arts User\\Downloads\\extract\\ACETAMINOPHEN_CODEINE_prod_no_1',
 'C:\\Users\\Arts User\\Downloads\\extract\\ACETAMINOPHEN_CODEINE_prod_no_2',
 'C:\\Users\\Arts User\\Downloads\\extract\\ACETAMINOPHEN_CODEINE_prod_no_3',
 'C:\\Users\\Arts User\\Downloads\\extract\\ACETAMINOPHEN_CODEINE_PHO_prod_no_1',
 'C:\\Users\\Arts User\\Downloads\\extract\\ACETAMINOPHEN_CODEINE_PHO_prod_no_2',
 'C:\\Users\\Arts User\\Downloads\\extract\\ADDERALL_prod_no_1',
 'C:\\Users\\Arts User\\Downloads\\extract\\ADDERALL_XR_prod_no_1',
 'C:\\Users\\Arts User\\Downloads\\extract\\ADHANSIA_XR_prod_no_1']

In [39]:
extract_path = 'C:\\Users\\Arts User\\Downloads\\extract'
drug_name = "ACETAMINOPHEN_CODEINE"

zip_file_path = glob.glob(f"{extract_path}/*{drug_name}*.zip")
zip_file_path

[]

In [138]:
#Here we do all the files

folder_path = "D:\\data\\XML"
extract_path =  "C:\\Users\\Arts User\\Downloads\\extract"

drug_list = extract_xml_files(folder_path,extract_path,grouped_by_prod_name)

start_time = time.time()

drug_dict = {}

for i in drug_list:
    result = xml_to_str(i)
    last_slash_index = i.rfind("/")
    substring = i[last_slash_index + 1:-4]
    drug_dict[substring] = result

end_time = time.time()
execution_time = end_time - start_time

print("XML to string complete.")
print("Execution Time:", execution_time, "seconds")

similarity_matrix = create_similarity_matrix(drug_dict)

Files Extracted to C:\Users\Arts User\Downloads\extract
Execution Time: 13.807796716690063 seconds
XML to string complete.
Execution Time: 22.31432604789734 seconds


KeyboardInterrupt: 