In [1]:
import os
import sys
from pathlib import Path
from IPython.display import display

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None) # Display all columns in a pandas dataframe
import matplotlib.pyplot as plt

cwd = Path(os.getcwd())
print(os.listdir("/"))

['bin', 'boot', 'dev', 'etc', 'home', 'lib', 'lib64', 'media', 'mnt', 'opt', 'proc', 'root', 'run', 'sbin', 'srv', 'sys', 'tmp', 'usr', 'var', 'get-docker.sh', 'dxdata-0.36.1-py2.py3-none-any.whl', 'install_r_kernel.R', 'install_r_packages.R', '.dockerenv']


In [2]:
print(os.listdir("/mnt/project/data"))

['anamnesis.csv', 'cohorts', 'densitometry_participant.csv', 'icd10-codes_participant.csv', 'medicaments_participant.csv', 'processed']


In [3]:
df_medicaments = pd.read_csv("/mnt/project/data/medicaments_participant.csv", parse_dates=["p53_i0", "p53_i1", "p53_i2"])
df_medicaments.rename(columns={
    "eid": "patientId",
    "p53_i0": "date_i0",
    "p53_i1": "date_i1", 
    "p53_i2": "date_i2",
    "p21003_i2": "age_i2"
}, inplace=True)
df_medicaments

Unnamed: 0,patientId,date_i0,date_i1,date_i2,p2724_i0,p2724_i1,p2724_i2,p20003_i0,p20003_i1,p20003_i2,p3536_i0,p3536_i1,p3536_i2,p3546_i0,p3546_i1,p3546_i2,age_i2
0,1000621,2007-10-29,2013-04-04,2014-10-30,Not sure - had a hysterectomy,Not sure - had a hysterectomy,Not sure - had a hysterectomy,ginkgo forte tablet|levothyroxine sodium|multi...,levothyroxine sodium|vagifem 25mcg pessary,levothyroxine sodium|vagifem 25mcg pessary,38,40,40,38,41,40,58
1,1001643,2009-06-15,NaT,2015-08-04,Yes,,Yes,ferrous sulphate|food supplement/plant/herbal ...,,ferrous sulphate|levothyroxine sodium|omeprazole,47,,49,48,,50,61
2,1004131,2009-11-09,NaT,2015-05-16,Yes,,Yes,calcium salts|glucosamine product|ibuprofen|mi...,,,,,,,,,65
3,1004471,2006-06-02,2012-09-13,2016-06-22,Yes,Yes,Yes,,,,,,,,,,67
4,1005492,2006-06-08,NaT,2016-06-13,No,,Yes,,,clopidogrel|perindopril|simvastatin|stemetil 5...,,,,,,,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28031,6018961,2008-11-05,2013-03-11,2016-01-23,No,Yes,Yes,,aciclovir|omeprazole|paracetamol|propranolol,,,,,,,,54
28032,6020935,2009-02-05,NaT,2017-10-17,Yes,,Yes,fybogel mebeverine sachet,,"Free-text entry, unable to be coded|aspirin|at...",48,,47,53,,50,69
28033,6021762,2008-04-01,NaT,2019-06-09,Yes,,Yes,,,,50,,50,50,,50,74
28034,6022133,2007-07-27,NaT,2016-12-12,No,,No,,,warfarin,,,,,,,53


In [4]:
def find_menopause_status(row):
    if row["p2724_i0"] == "Yes" or\
    row["p2724_i1"] == "Yes" or\
    row["p2724_i2"] == "Yes":
        return True
    else:
        return False

In [5]:
# Forward fill menopause status and store value in a new column
menopausal_variables = ["p2724_i0", "p2724_i1", "p2724_i2"]
df_medicaments["postmenopausal"] = df_medicaments[menopausal_variables].apply(lambda x: find_menopause_status(x), axis=1)

# Drop old columns
df_medicaments.drop(menopausal_variables, axis=1, inplace=True)

# Filter for menopausal women and drop columns
df_medicaments = df_medicaments[df_medicaments["postmenopausal"] == True].reset_index(drop=True)
df_medicaments

Unnamed: 0,patientId,date_i0,date_i1,date_i2,p20003_i0,p20003_i1,p20003_i2,p3536_i0,p3536_i1,p3536_i2,p3546_i0,p3546_i1,p3546_i2,age_i2,postmenopausal
0,1001643,2009-06-15,NaT,2015-08-04,ferrous sulphate|food supplement/plant/herbal ...,,ferrous sulphate|levothyroxine sodium|omeprazole,47,,49,48,,50,61,True
1,1004131,2009-11-09,NaT,2015-05-16,calcium salts|glucosamine product|ibuprofen|mi...,,,,,,,,,65,True
2,1004471,2006-06-02,2012-09-13,2016-06-22,,,,,,,,,,67,True
3,1005492,2006-06-08,NaT,2016-06-13,,,clopidogrel|perindopril|simvastatin|stemetil 5...,,,,,,,58,True
4,1007944,2009-11-26,NaT,2018-11-17,,,,,,,,,,73,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23532,6018670,2009-01-16,2013-04-10,2016-01-04,ibuprofen,multivitamins|omega-3/fish oil supplement,,53,56,52,56,58,56,65,True
23533,6018961,2008-11-05,2013-03-11,2016-01-23,,aciclovir|omeprazole|paracetamol|propranolol,,,,,,,,54,True
23534,6020935,2009-02-05,NaT,2017-10-17,fybogel mebeverine sachet,,"Free-text entry, unable to be coded|aspirin|at...",48,,47,53,,50,69,True
23535,6021762,2008-04-01,NaT,2019-06-09,,,,50,,50,50,,50,74,True


In [6]:
df_medicaments.loc[0, "p20003_i2"]

'ferrous sulphate|levothyroxine sodium|omeprazole'

In [7]:
# look for bisphosphonates (zoledron, ibandron, alendron, risedron)

def find_treatments(df, treatment_name, medicaments):
    print(f"Looking for {treatment_name} medicaments.")
    medicament_columns = ["p20003_i0", "p20003_i1", "p20003_i2"]

    df_treatment = pd.concat(
        [
            df[col].str.contains(r'\b(?:{})\b'.format('|'.join(medicaments)))
            for col in medicament_columns
        ], 
        axis=1,
    ).fillna(False)

    # prior treatment = (instance_0 OR instance_1) AND (NOT instance_2)
    df_treatment[f"{treatment_name}_prior"] = (df_treatment["p20003_i0"] | df_treatment["p20003_i1"]) & (df_treatment["p20003_i2"] - 1).astype(bool)
    df_treatment[f"{treatment_name}_current"] = df_treatment["p20003_i2"]
    df_treatment[f"{treatment_name}_new"] = False
    print(df_treatment.loc[:,f"{treatment_name}_prior":].sum())
    print()
    return df_treatment.drop(medicament_columns, axis=1)

In [8]:
# Define active components in treatments
treatments = {
    "bisphosphonates": ["zoledronic acid", "ibandronic acid", "alendronate sodium", "risedronate sodium"], 
    "serm": ["oestrogen product"],
    "teriparatide": ["teriparatide"],
}

for name, treatment in treatments.items():
    df_medicaments = pd.concat([df_medicaments, find_treatments(df_medicaments, name, treatment)], axis=1)
df_medicaments

Looking for bisphosphonates medicaments.
bisphosphonates_prior      339
bisphosphonates_current    424
bisphosphonates_new          0
dtype: int64

Looking for serm medicaments.
serm_prior      24
serm_current    72
serm_new         0
dtype: int64

Looking for teriparatide medicaments.
teriparatide_prior      0
teriparatide_current    0
teriparatide_new        0
dtype: int64



Unnamed: 0,patientId,date_i0,date_i1,date_i2,p20003_i0,p20003_i1,p20003_i2,p3536_i0,p3536_i1,p3536_i2,p3546_i0,p3546_i1,p3546_i2,age_i2,postmenopausal,bisphosphonates_prior,bisphosphonates_current,bisphosphonates_new,serm_prior,serm_current,serm_new,teriparatide_prior,teriparatide_current,teriparatide_new
0,1001643,2009-06-15,NaT,2015-08-04,ferrous sulphate|food supplement/plant/herbal ...,,ferrous sulphate|levothyroxine sodium|omeprazole,47,,49,48,,50,61,True,False,False,False,False,False,False,False,False,False
1,1004131,2009-11-09,NaT,2015-05-16,calcium salts|glucosamine product|ibuprofen|mi...,,,,,,,,,65,True,False,False,False,False,False,False,False,False,False
2,1004471,2006-06-02,2012-09-13,2016-06-22,,,,,,,,,,67,True,False,False,False,False,False,False,False,False,False
3,1005492,2006-06-08,NaT,2016-06-13,,,clopidogrel|perindopril|simvastatin|stemetil 5...,,,,,,,58,True,False,False,False,False,False,False,False,False,False
4,1007944,2009-11-26,NaT,2018-11-17,,,,,,,,,,73,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23532,6018670,2009-01-16,2013-04-10,2016-01-04,ibuprofen,multivitamins|omega-3/fish oil supplement,,53,56,52,56,58,56,65,True,False,False,False,False,False,False,False,False,False
23533,6018961,2008-11-05,2013-03-11,2016-01-23,,aciclovir|omeprazole|paracetamol|propranolol,,,,,,,,54,True,False,False,False,False,False,False,False,False,False
23534,6020935,2009-02-05,NaT,2017-10-17,fybogel mebeverine sachet,,"Free-text entry, unable to be coded|aspirin|at...",48,,47,53,,50,69,True,False,False,False,False,False,False,False,False,False
23535,6021762,2008-04-01,NaT,2019-06-09,,,,50,,50,50,,50,74,True,False,False,False,False,False,False,False,False,False


In [9]:
def find_hrt_status(row):
    s = pd.Series(index=["hrt_prior", "hrt_current", "hrt_new"], data=[False, False, False])
    if row["p3546_i2"].isdigit() or row["p3546_i1"].isdigit() or row["p3546_i1"].isdigit():
        # The patient has a valid integer in the "age last used hrt" 
        # field and has thus ended the HRT therapy. 
        # No patients had an integer entered in instance 0 or 1 and 
        # then started a new therapy by specifying "Still taking HRT"
        # in instance 2
        s["hrt_prior"] = True
    elif row["p3546_i2"] == "Still taking HRT" or row["p3546_i1"] == "Still taking HRT" or row["p3546_i0"] == "Still taking HRT":
        s["hrt_current"] = True
    return s

In [10]:
df_medicaments[["p3546_i0", "p3546_i1", "p3546_i2"]] = df_medicaments[["p3546_i0", "p3546_i1", "p3546_i2"]].fillna("")
df_medicaments = pd.concat([df_medicaments, df_medicaments.apply(lambda x: find_hrt_status(x), axis=1)], axis=1)
df_medicaments.drop(["date_i0", "date_i1", "age_i2", "postmenopausal", "p20003_i0", "p20003_i1", "p20003_i2", "p3536_i0", "p3536_i1", "p3536_i2", "p3546_i0", "p3546_i1", "p3546_i2"], axis=1, inplace=True)
df_medicaments

Unnamed: 0,patientId,date_i2,bisphosphonates_prior,bisphosphonates_current,bisphosphonates_new,serm_prior,serm_current,serm_new,teriparatide_prior,teriparatide_current,teriparatide_new,hrt_prior,hrt_current,hrt_new
0,1001643,2015-08-04,False,False,False,False,False,False,False,False,False,True,False,False
1,1004131,2015-05-16,False,False,False,False,False,False,False,False,False,False,False,False
2,1004471,2016-06-22,False,False,False,False,False,False,False,False,False,False,False,False
3,1005492,2016-06-13,False,False,False,False,False,False,False,False,False,False,False,False
4,1007944,2018-11-17,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23532,6018670,2016-01-04,False,False,False,False,False,False,False,False,False,True,False,False
23533,6018961,2016-01-23,False,False,False,False,False,False,False,False,False,False,False,False
23534,6020935,2017-10-17,False,False,False,False,False,False,False,False,False,True,False,False
23535,6021762,2019-06-09,False,False,False,False,False,False,False,False,False,True,False,False


In [11]:
df_medicaments.loc[:,"bisphosphonates_prior":].sum()

bisphosphonates_prior       339
bisphosphonates_current     424
bisphosphonates_new           0
serm_prior                   24
serm_current                 72
serm_new                      0
teriparatide_prior            0
teriparatide_current          0
teriparatide_new              0
hrt_prior                  6613
hrt_current                1308
hrt_new                       0
dtype: int64

In [13]:
df_medicaments.to_csv("medicaments.csv", index=False)

In [14]:
%%bash
dx upload medicaments.csv --path /data/processed/

ID                          file-GPbq2J8Jjxx8FjY2Y2KxvQxv
Class                       file
Project                     project-GP77K38Jjxx9XzFP2KzPQyfG
Folder                      /data/processed
Name                        medicaments.csv
State                       closing
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Tue Feb 14 16:05:09 2023
Created by                  ollehman
 via the job                job-GPbkX0QJjxxJf40Z9b6yq9BX
Last modified               Tue Feb 14 16:05:11 2023
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"
