In [1]:
import glob

# Glob pattern for the exported drug monographs (one .docx per drug).
path_to_docx_drugs = 'Notebooks/installations/Data/Total-Lexi-links-docx/*.docx'
# NOTE: despite the name, `list_pdfs` holds .docx paths; the name is kept because later cells use it.
# glob order depends on the filesystem, so sort to make the extracted row order reproducible.
list_pdfs = sorted(glob.glob(path_to_docx_drugs))

In [4]:
from docx import Document

def get_drug_interactions(paragraphs):
    '''
    Collect the text under the 'Drug Interactions' heading.

    :param paragraphs: iterable of paragraph objects exposing ``.style.name`` and ``.text``
    :return: one string in which every collected paragraph is followed by a blank line
             ('\\n\\n'); empty string when the heading is never found.
    '''
    pieces = []
    inside_section = False
    for para in paragraphs:
        style_name = para.style.name
        # The section opens on a level-1 heading whose text is exactly 'Drug Interactions'.
        if style_name.startswith('Heading 1') and para.text == 'Drug Interactions':
            inside_section = True
            continue
        if not inside_section:
            continue
        # Any other paragraph styled exactly "Heading 1" closes the section.
        if style_name == "Heading 1":
            break
        pieces.append(para.text + '\n\n')
    return "".join(pieces)

def get_paragraphs_lst(paragraphs):
    '''
    :param paragraphs: iterable of paragraph objects exposing ``.text``
    :return: list with the text of every paragraph, in document order
    '''
    return [para.text for para in paragraphs]

def get_drug_interactions2(paragraphs):
    '''
    List variant of get_drug_interactions.

    :param paragraphs: iterable of paragraph objects exposing ``.style.name`` and ``.text``
    :return: list with the text of each paragraph found between the 'Drug Interactions'
             level-1 heading and the next paragraph styled exactly "Heading 1".
    '''
    section_texts = []
    started = False
    for item in paragraphs:
        opens_section = item.style.name.startswith('Heading 1') and item.text == 'Drug Interactions'
        if opens_section:
            started = True
        elif started:
            if item.style.name == "Heading 1":
                break
            section_texts.append(item.text)
    return section_texts

def get_heading_text(paragraphs, heading_name):
    '''
    Collect the text that follows a given level-1 heading.

    :param paragraphs: all paragraphs of the document (objects exposing ``.style.name`` and ``.text``)
    :param heading_name: substring that must occur in the text of the level-1 heading
    :return: the paragraphs between that heading and the next paragraph styled exactly
             "Heading 1", each followed by '\\n\\n'; empty string if no heading matches.
    '''
    body_parts = []
    collecting = False
    for para in paragraphs:
        is_target_heading = para.style.name.startswith('Heading 1') and (heading_name in para.text)
        if is_target_heading:
            collecting = True
            continue
        if collecting:
            if para.style.name == "Heading 1":
                break
            body_parts.extend((para.text, '\n\n'))
    return "".join(body_parts)

def get_title_bypath(path1):
    '''
    Derive the drug title from the path of its .docx file.

    Both '\\' and '/' are treated as directory separators, so the result is the same
    whether the path came from glob on Windows or on a POSIX system. Only the final
    extension is removed, so titles that contain a dot (e.g. '0.9%') are kept intact.

    :param path1: filepath
    :return: the file name without directories and without its last extension,
             which is in our case the title
    '''
    filename = path1.replace('\\', '/').rsplit('/', 1)[-1]
    stem, dot, _extension = filename.rpartition('.')
    # No dot at all: the whole file name is the title.
    return stem if dot else filename

def fill_drug_interaction_data(drug_interactions, title):
    '''
    Turn the raw 'Drug Interactions' entries of one drug into four parallel lists.

    :param drug_interactions: list of text chunks from the Drug Interactions section; each
        line of a chunk is expected to look like
        Drug Interactive Name: Interaction. Risk X: Required action.
        Lines without a ':' are ignored.
    :param title: title of the drug that owns the section.
    :return: four equally long lists:
             Drug_Name: the title, repeated once per kept line.
             Drug_Interactive: the text before the first ':' of the line.
             Current_Interaction: everything after the first ':' (later colons are kept).
             Risk: 'A'/'B'/'C'/'D'/'X' for the first 'Risk <letter>' found in that order,
                   otherwise 'Not determined' (resolved by later cells).
    '''
    risk_letters = ('A', 'B', 'C', 'D', 'X')  # 'A' and 'B' did not occur in our dataset
    names, interacting, descriptions, risks = [], [], [], []

    for chunk in drug_interactions:
        for line in chunk.split('\n'):
            if ":" not in line:
                continue
            risks.append(
                next((letter for letter in risk_letters if 'Risk ' + letter in line),
                     'Not determined')
            )
            head, _colon, tail = line.partition(':')
            names.append(title)
            interacting.append(head)
            descriptions.append(tail)

    return names, interacting, descriptions, risks

def clean_drug_interaction(drug_interactions):
    '''
    Remove blank entries from the list of interaction chunks, in place.

    Only the exact strings "", " " and "  " are treated as blank. The filtering is done
    in a single pass instead of repeated `in` / `remove` scans, which were quadratic on
    long sections.

    :param drug_interactions: list of text chunks; it is modified in place.
    :return: the same list object, without the blank entries.
    '''
    blanks = ("", " ", "  ")
    # Slice assignment keeps the caller's list object, as the original in-place removal did.
    drug_interactions[:] = [entry for entry in drug_interactions if entry not in blanks]
    return drug_interactions

In [5]:
# Parallel accumulators: one entry per extracted interaction line across all documents.
Drug_Name, Drug_Interactive, Current_Interaction, Risk = [], [], [], []

for docx_path in list_pdfs:
    document = Document(docx_path)  ## Read the document.
    title = get_title_bypath(docx_path)

    ## Get the required section; the first chunk is dropped (described as empty by the original author).
    drug_interactions = get_drug_interactions(document.paragraphs).split('\n\n')[1:]

    drug_interactions = clean_drug_interaction(drug_interactions)  ## Drop the blank chunks left by the split
    names, interactives, interactions, risks = fill_drug_interaction_data(drug_interactions, title)

    # extend() appends in place; `lst = lst + new` re-copied the whole accumulator per document.
    Drug_Name.extend(names)
    Drug_Interactive.extend(interactives)
    Current_Interaction.extend(interactions)
    Risk.extend(risks)

In [6]:
Drug_Name

['Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudine',
 'Abacavir, Dolutegravir, and Lamivudi

In [7]:
Drug_Interactive

['Note',
 'Betibeglogene Autotemcel',
 'Cabozantinib',
 'Cladribine',
 'Elivaldogene Autotemcel',
 'Erdafitinib',
 'Fexinidazole',
 'Fexinidazole',
 'Gilteritinib',
 'Levomethadone',
 'Methadone',
 'Orlistat',
 'Pacritinib',
 'Riociguat',
 'Risdiplam',
 'Sorbitol',
 'Tafenoquine',
 'Tafenoquine',
 'Trimethoprim',
 'Note',
 'Aluminum Hydroxide',
 'Antidiabetic Agents',
 'Calcium Salts',
 'CarBAMazepine',
 'Increase weight-based dose to twice daily in pediatric patients. Specific recommendations vary for combination products; see interaction monograph for details. Risk D',
 'Cladribine',
 'Clofarabine',
 'Dalfampridine',
 'Dofetilide',
 'Efavirenz',
 'Elivaldogene Autotemcel',
 'Erdafitinib',
 'Etravirine',
 'Fexinidazole',
 'Fexinidazole',
 'Fosamprenavir',
 'Fosphenytoin-Phenytoin',
 'Gilteritinib',
 'Iron Preparations',
 'Isoniazid',
 'Risk C',
 'Levomethadone',
 'Magnesium Salts',
 'MetFORMIN',
 'Methadone',
 'Mitapivat',
 'Multivitamins/Minerals (with ADEK, Folate, Iron)',
 'Multivi

In [8]:
# Snapshots of the columns before the in-place repairs below.
Drug_Interactive0 = Drug_Interactive[:]
Current_Interaction0 = Current_Interaction[:]

# A row is a stray fragment of the previous interaction when its "drug" part is only a
# risk label (ds) or its "interaction" part is only a recommended action (cs).
cs = ['Monitor therapy', 'Consider monitor therapy', 'Avoid combination', 'Consider therapy modification',
      'Consider therapy', 'Consider']
ds = ['Risk C', 'Risk D', 'Risk X', 'C', 'D', 'X', 'Management']

# `anchor` is the last row that is NOT a fragment. Fragments are appended to it, so a run
# of consecutive fragments is no longer appended to a row already overwritten with 'none'.
anchor = 0
for i in range(1, len(Drug_Interactive)):
    d = Drug_Interactive[i]
    c = Current_Interaction[i]
    if d.strip() in ds or c.strip() in cs:
        Current_Interaction[anchor] = str(Current_Interaction[anchor]) + ' ' + str(d) + ':' + str(c)
        Drug_Interactive[i] = 'none'
        Current_Interaction[i] = 'none'
    else:
        anchor = i

## For the undetermined Risk types, search the (now repaired) interaction text for an exact match.
for i, elem in enumerate(Current_Interaction):
    if Risk[i] != 'Not determined':
        continue
    if 'Risk A' in elem:
        Risk[i] = 'A'
    elif 'Risk B' in elem:
        Risk[i] = 'B'
    elif 'Risk C' in elem:
        Risk[i] = 'C'
    elif 'Risk D' in elem or 'D: Consider therapy' in elem:
        Risk[i] = 'D'
    elif 'Risk X' in elem or 'X: Avoid combination' in elem:
        Risk[i] = 'X'
    # otherwise the row stays 'Not determined'

In [9]:
Risk  ## Still we have a percentage of the 'Not determined' value.

['Not determined',
 'X',
 'C',
 'X',
 'X',
 'D',
 'D',
 'D',
 'C',
 'C',
 'C',
 'C',
 'X',
 'C',
 'D',
 'D',
 'D',
 'D',
 'C',
 'Not determined',
 'D',
 'C',
 'D',
 'D',
 'D',
 'X',
 'C',
 'D',
 'X',
 'D',
 'X',
 'D',
 'D',
 'D',
 'D',
 'D',
 'X',
 'C',
 'D',
 'C',
 'C',
 'C',
 'D',
 'D',
 'C',
 'C',
 'D',
 'D',
 'C',
 'X',
 'X',
 'X',
 'C',
 'D',
 'C',
 'C',
 'D',
 'D',
 'D',
 'X',
 'D',
 'D',
 'D',
 'D',
 'C',
 'C',
 'D',
 'Not determined',
 'X',
 'C',
 'X',
 'X',
 'C',
 'C',
 'C',
 'C',
 'Not determined',
 'Not determined',
 'X',
 'X',
 'C',
 'C',
 'X',
 'X',
 'X',
 'X',
 'D',
 'X',
 'C',
 'X',
 'X',
 'D',
 'D',
 'X',
 'D',
 'D',
 'X',
 'C',
 'X',
 'C',
 'C',
 'D',
 'D',
 'D',
 'D',
 'X',
 'X',
 'X',
 'X',
 'C',
 'C',
 'C',
 'C',
 'X',
 'C',
 'X',
 'X',
 'D',
 'D',
 'X',
 'D',
 'C',
 'X',
 'X',
 'X',
 'X',
 'X',
 'C',
 'C',
 'X',
 'X',
 'D',
 'X',
 'X',
 'Not determined',
 'X',
 'C',
 'C',
 'C',
 'C',
 'C',
 'D',
 'D',
 'C',
 'C',
 'C',
 'C',
 'C',
 'X',
 'C',
 'D',
 'D',
 'C',
 'C'

In [10]:
# Keep a snapshot of the four columns before this cell mutates them.
Risk2 = Risk[:]
Drug_Interactive2 = Drug_Interactive[:]
Current_Interaction2 = Current_Interaction[:]
Drug_Name2 = Drug_Name[:]

# Work on fresh copies of the snapshot.
Risk = Risk2[:]
Drug_Interactive = Drug_Interactive2[:]
Current_Interaction = Current_Interaction2[:]
Drug_Name = Drug_Name2[:]

### Collect the next item for the not determined cases in order to grasp what's missing.
### A "drug name" longer than 40 characters is taken to be the continuation of the
### previous interaction text rather than a real drug name.
### Stop one row early: the last row has no successor (the old loop raised IndexError there).
for i in range(len(Risk) - 1):
    if Risk[i] == 'Not determined' and len(Drug_Interactive[i+1]) > 40:
        Current_Interaction[i] = Current_Interaction[i] + Drug_Interactive[i+1] + ':' + Current_Interaction[i+1]
        Risk[i] = Risk[i+1]
        # Blank the absorbed row in all four columns.
        Current_Interaction[i+1] = Drug_Interactive[i+1] = Risk[i+1] = Drug_Name[i+1] = ''

In [11]:
# Number of rows whose risk level is still unresolved.
k = Risk.count('Not determined')
k

8

In [1]:
## Put drug name, interactive drug, interaction text and risk type in a dataframe
## called df_interactions (the df_Notes frame is derived from it afterwards).
import pandas as pd  # needed here: the notebook's own pandas import only appears in a later cell

# Column names match the ones the later cells read ('Drug interactive', 'Interaction').
df_interactions = pd.DataFrame({
    'Drug Name': Drug_Name,
    'Drug interactive': Drug_Interactive,
    'Interaction': Current_Interaction,
    'Risk': Risk,
})

In [13]:
# Keep only the rows whose 'Drug interactive' value is exactly 'Note'.
# NOTE(review): requires df_interactions to have a 'Drug interactive' column — confirm against the cell that builds it.
df_Notes = df_interactions[df_interactions['Drug interactive']== 'Note']
df_Notes ## Display the rows that carry the per-drug note

Unnamed: 0,Drug Name,Drug interactive,Interaction,Risk
0,Abacavir and Lamivudine,Note,Interacting drugs may not be individually lis...,Not determined
19,"Abacavir, Dolutegravir, and Lamivudine",Note,Interacting drugs may not be individually lis...,Not determined
67,Abacavir,Note,Interacting drugs may not be individually lis...,Not determined
76,Abaloparatide,Note,Interacting drugs may not be individually lis...,Not determined
77,Abatacept,Note,Interacting drugs may not be individually lis...,Not determined
134,Abciximab,Note,Interacting drugs may not be individually lis...,Not determined
171,Abemaciclib,Note,Interacting drugs may not be individually lis...,Not determined
237,Abiraterone Acetate,Note,Interacting drugs may not be individually lis...,Not determined


## Data review

In [15]:
import pandas as pd

# Load the previously exported interaction table and drop every row that has a missing value in any column.
# NOTE(review): if 'Notes' is genuinely empty (NaN) for most rows, dropna() would remove them — confirm that the blanks are stored as strings.
df = pd.read_excel('Notebooks/installations/Data/Results - Drug Interaction/Extracted Drug Interaction.xlsx').dropna()

In [16]:
df

Unnamed: 0,Drug Name,Drug interactive,Interaction,Risk,Notes
0,Abacavir,Betibeglogene Autotemcel,Antiretroviral Agents may diminish the therape...,X,
1,Abacavir,Cabozantinib,MRP2 Inhibitors may increase the serum concent...,C,
2,Abacavir,Cladribine,Agents that Undergo Intracellular Phosphorylat...,X,
3,Abacavir,Elivaldogene Autotemcel,Antiretroviral Agents may diminish the therape...,X,
4,Abacavir,Levomethadone,May diminish the therapeutic effect of Abacavi...,C,Interacting drugs may not be individually lis...
...,...,...,...,...,...
123454,Zuclopenthixol,Thiazide and Thiazide-Like Diuretics,Anticholinergic Agents may increase the serum ...,C,
123455,Zuclopenthixol,Tiotropium,Anticholinergic Agents may enhance the anticho...,C,
123456,Zuclopenthixol,Umeclidinium,May enhance the anticholinergic effect of Anti...,X,
123457,Zuclopenthixol,Valerian,May enhance the CNS depressant effect of CNS D...,C,


In [19]:
df

Unnamed: 0,Drug Name,Drug interactive,Interaction,Risk,Notes
0,Abacavir,Betibeglogene Autotemcel,Antiretroviral Agents may diminish the therape...,X,
1,Abacavir,Cabozantinib,MRP2 Inhibitors may increase the serum concent...,C,
2,Abacavir,Cladribine,Agents that Undergo Intracellular Phosphorylat...,X,
3,Abacavir,Elivaldogene Autotemcel,Antiretroviral Agents may diminish the therape...,X,
4,Abacavir,Levomethadone,May diminish the therapeutic effect of Abacavi...,C,Interacting drugs may not be individually lis...
...,...,...,...,...,...
123454,Zuclopenthixol,Thiazide and Thiazide-Like Diuretics,Anticholinergic Agents may increase the serum ...,C,
123455,Zuclopenthixol,Tiotropium,Anticholinergic Agents may enhance the anticho...,C,
123456,Zuclopenthixol,Umeclidinium,May enhance the anticholinergic effect of Anti...,X,
123457,Zuclopenthixol,Valerian,May enhance the CNS depressant effect of CNS D...,C,


## Fixing merged-row errors and missing Risk values in the DataFrame

In [None]:
## Fix the errors caused by several interaction rows having been appended together
## in the drug-interaction dataframe.
## Required: the path of the Excel sheet that contains the rows with missing Risk values.
df_errors = pd.read_excel('Notebooks/installations/Data/Results - Drug Interaction/Extracted Drug Interaction - Final Copy.xlsx')
df_errors

In [None]:
import re

# A "Risk C:", "Risk D:" or "Risk X:" marker closes one interaction, so a cell with more
# than one marker holds several interactions that were glued together.
pattern = r"Risk [CDX]:"

# appendix maps the positional row number in df_errors to the number of markers found,
# for the rows that contain more than one marker.
appendix = {}
for i, text in enumerate(df_errors['Interaction'].tolist()):
    if not isinstance(text, str):
        # Non-text cell (e.g. a missing value): nothing to count. The old bare `except:`
        # skipped these rows silently.
        continue
    count = len(re.findall(pattern, text))
    if count > 1:
        appendix[i] = count
appendix

In [None]:
len(appendix)

In [None]:
df_errors

In [None]:
# Plain Python list of the 'Interaction' texts (tolist() already returns a new list).
interaction_toedit = df_errors['Interaction'].tolist()
interaction_toedit

In [None]:
# Worked example: one cell in which four interactions were glued together.
s = 'May enhance the anticoagulant effect of Vitamin K Antagonists. Risk C: Monitor therapy Antithyroid Agents: May diminish the anticoagulant effect of Vitamin K Antagonists. Risk C: Monitor therapy Aprepitant: May decrease the serum concentration of Vitamin K Antagonists. Risk C: Monitor therapy AzaTHIOprine: May diminish the anticoagulant effect of Vitamin K Antagonists. Risk C: Monitor therapy'

# The capturing group makes re.split keep each risk marker as its own piece:
# "Risk C/X:" plus two words, or "Risk D:" plus three words.
pattern = r"(Risk [CX]:\s+\w+\s+\w+|Risk D:\s+\w+\s+\w+\s+\w+)"

result = re.split(pattern, s)
split_strings = [chunk.strip() for chunk in result if chunk.strip()]

# Re-attach every text piece to the marker that follows it; an unpaired trailing piece is dropped.
result_list = [body + marker for body, marker in zip(split_strings[0::2], split_strings[1::2])]

# The first pair belongs to the row's own drug; the remaining pairs are the glued-on interactions.
interaction1 = result_list[0]
drug_inter, interaction, Risks = [], [], []
for sent in result_list[1:]:
    colon_parts = sent.split(':')
    drug_inter.append(colon_parts[0])
    interaction.append(colon_parts[1])
    Risks.append(sent.split('.')[-1])
appendix.keys()

In [None]:
interaction_toedit[20]

In [None]:
# Parallel lists describing every interaction recovered from a glued-together row.
titles, drug_inter, interaction, Risks = [], [], [], []

for i in appendix.keys():
    result = re.split(pattern, interaction_toedit[i])
    split_strings = [piece.strip() for piece in result if piece.strip()]

    # Pair each text piece with the risk marker that follows it; an unpaired tail is dropped.
    result_list = [body + marker for body, marker in zip(split_strings[0::2], split_strings[1::2])]

    # The first pair is the row's own interaction; it is only recorded here, not exported.
    interaction1 = result_list[0] if result_list else ''

    drug_title = df_errors.iloc[i]['Drug Name']
    for sent in result_list[1:]:
        # Everything before the first ':' is the interacting drug and everything after it is
        # the interaction text. Keeping the whole remainder preserves text between later
        # colons (e.g. "Management: ..."), which the old split(':')[1] + last-segment dropped.
        drug, _colon, remainder = sent.partition(':')
        titles.append(drug_title)
        drug_inter.append(drug)
        interaction.append(remainder)
        # Text after the last '.', i.e. the "Risk X: action" label when the sentence before it ends with a period.
        Risks.append(sent.split('.')[-1])
titles

In [None]:
drug_inter

In [None]:
interaction[:]

In [None]:
Risks[:20]

In [None]:
_RISK_LETTERS = ('A', 'B', 'C', 'D', 'X')


def _risk_letter(label):
    """Return the risk letter of a 'Risk X: action' label.

    A label that is already a single risk letter is returned unchanged, so re-running the
    cell is harmless (the old `label[5]` indexing raised IndexError on a second run).
    Labels without a recognisable 'Risk <letter>' become 'Not determined'.
    """
    if label in _RISK_LETTERS:
        return label
    _before, marker, tail = label.partition('Risk ')
    if marker and tail[:1] in _RISK_LETTERS:
        return tail[:1]
    return 'Not determined'


# Reduce every label to its letter, in place so the list object stays the same.
Risks[:] = [_risk_letter(label) for label in Risks]

# The recovered rows carry no note text.
Notes = [''] * len(Risks)

# Complete interactions that were cut right after the risk marker, then drop line breaks.
default_actions = {'Risk C:': 'Monitor therapy', 'Risk X:': 'Avoid combination'}
for j, text in enumerate(interaction):
    for marker, action in default_actions.items():
        if text.endswith(marker):
            text = text + action
    interaction[j] = text.replace('\n', '')

In [None]:
# Assemble the recovered interactions with the same column layout as the exported sheet.
df_doubted = pd.DataFrame({
    'Drug Name': titles,
    'Drug interactive': drug_inter,
    'Interaction': interaction,
    'Risk': Risks,
    'Notes': Notes,
})
df_doubted


In [None]:
# Write the recovered rows to an Excel file in the current working directory.
df_doubted.to_excel('doubted-drug-interactions.xlsx')
# From here on interaction_toedit holds the texts of df_interactions (no longer those of df_errors).
# NOTE(review): requires df_interactions to have an 'Interaction' column — confirm against the cell that builds it.
interaction_toedit = df_interactions['Interaction'].tolist()

In [None]:
# Cut every text back to its first interaction: the leading piece plus the first risk
# marker that follows it. Texts without any marker are left untouched.
for i, text in enumerate(interaction_toedit):
    if not isinstance(text, str):
        continue  # non-text cell (e.g. a missing value): re.split would raise on it
    result = re.split(pattern, text)
    split_strings = [piece.strip() for piece in result if piece.strip()]
    # A first (text, marker) pair exists only when at least two non-empty pieces remain;
    # this explicit check replaces the old bare `except:` around result_list[0].
    if len(split_strings) >= 2:
        interaction_toedit[i] = split_strings[0] + split_strings[1]

In [None]:
interaction_toedit

In [None]:
# Spot-check the truncated texts at the flagged positions, stopping after position 1000.
# NOTE(review): appendix was built from df_errors row positions, while interaction_toedit
# now comes from df_interactions — confirm that both frames share the same row order.
for i in appendix.keys():
    print(interaction_toedit[i])
    if i > 1000:
        break

# Work on an independent copy so df_interactions itself is left unchanged, and assign with
# [] so the column is really written: attribute assignment (`df.Interaction = ...`) only
# sets a Python attribute when no column of that name exists yet.
df_interactions_fixed = df_interactions.copy()
df_interactions_fixed['Interaction'] = interaction_toedit[:]
df_interactions_fixed

In [None]:
# Write the repaired interaction table to an Excel file in the current working directory.
df_interactions_fixed.to_excel('interactionsfixed.xlsx')