In [1]:
import glob
import os
import pickle

import numpy as np
import pandas as pd

# Locations of the openFDA drug-event extract and of its er_tables sub-directory.
data_dir = "../../data/openFDA_drug_event/"
er_dir = ''.join((data_dir, 'er_tables/'))

In [2]:
# Name of the column used as the join key between the tables loaded below.
primarykey='safetyreportid'

In [3]:
# Load the patient table; the first CSV column becomes the index and the
# report id is kept as text.
patients = pd.read_csv(
    er_dir + 'patient.csv.gz',
    compression='gzip',
    index_col=0,
    dtype={'safetyreportid': 'str', 'patient_custom_master_age': 'float'},
)

  mask |= (ar1 == a)


In [7]:
# Keep only the patients that report an onset age, renumbering the rows 0..n-1.
age_col = 'patient_onsetage'
aged = patients.dropna(subset=[age_col]).reset_index(drop=True)

In [8]:
# Label each patient with an NICHD pediatric developmental stage.
# Every stage is the age interval (lower, upper]; ages outside (0, 21] keep a
# missing value in the new column.
# NOTE(review): the thresholds below read as years, but `patient_onsetageunit`
# is never consulted in this cell -- confirm `patient_onsetage` is already in years.
col = 'nichd'

# One vectorised cast instead of calling float() per row inside .apply().
onset_age = aged[age_col].astype(float)

neonate = (onset_age > 0) & (onset_age <= (1/12))
infant = (onset_age > (1/12)) & (onset_age <= 1)
toddler = (onset_age > 1) & (onset_age <= 2)
echildhood = (onset_age > 2) & (onset_age <= 5)
mchildhood = (onset_age > 5) & (onset_age <= 11)
eadolescence = (onset_age > 11) & (onset_age <= 18)
ladolescence = (onset_age > 18) & (onset_age <= 21)

# Start from an object-dtype column of NaN: the labels are strings, and writing
# them into a float64 column relies on an implicit upcast that recent pandas
# versions deprecate.
aged[col] = pd.Series(np.nan, index=aged.index, dtype=object)

aged.loc[neonate, col] = 'term_neonatal'
aged.loc[infant, col] = 'infancy'
aged.loc[toddler, col] = 'toddler'
aged.loc[echildhood, col] = 'early_childhood'
aged.loc[mchildhood, col] = 'middle_childhood'
aged.loc[eadolescence, col] = 'early_adolescence'
aged.loc[ladolescence, col] = 'late_adolescence'

In [9]:
# Label each patient with an ICH/EMA pediatric age group.
# Every group is the age interval (lower, upper]; ages outside (0, 17] keep a
# missing value in the new column.
# NOTE(review): thresholds read as years; `patient_onsetageunit` is not
# consulted here -- confirm `patient_onsetage` is already in years.
col = 'ich_ema'

# One vectorised cast instead of calling float() per row inside .apply().
onset_age = aged[age_col].astype(float)

term_newborn_infants = (onset_age > 0) & (onset_age <= (1/12))
infants_and_toddlers = (onset_age > (1/12)) & (onset_age <= 2)
children = (onset_age > 2) & (onset_age <= 11)
adolescents = (onset_age > 11) & (onset_age <= 17)

# Object dtype up front: string labels written into a float64 NaN column rely
# on an implicit upcast that recent pandas versions deprecate.
aged[col] = pd.Series(np.nan, index=aged.index, dtype=object)

aged.loc[term_newborn_infants, col] = 'term_newborn_infants'
aged.loc[infants_and_toddlers, col] = 'infants_and_toddlers'
aged.loc[children, col] = 'children'
aged.loc[adolescents, col] = 'adolescents'

In [10]:
# Label each patient with an FDA pediatric age group.
# Unlike the other two schemes the groups are closed on the left,
# [lower, upper), except the first one, which excludes an age of exactly 0.
# Ages outside (0, 16) keep a missing value in the new column.
# NOTE(review): thresholds read as years; `patient_onsetageunit` is not
# consulted here -- confirm `patient_onsetage` is already in years.
col = 'fda'

# One vectorised cast instead of calling float() per row inside .apply().
onset_age = aged[age_col].astype(float)

neonates = (onset_age > 0) & (onset_age < (1/12))
infants = (onset_age >= (1/12)) & (onset_age < 2)
children = (onset_age >= 2) & (onset_age < 11)
adolescents = (onset_age >= 11) & (onset_age < 16)

# Object dtype up front: string labels written into a float64 NaN column rely
# on an implicit upcast that recent pandas versions deprecate.
aged[col] = pd.Series(np.nan, index=aged.index, dtype=object)

aged.loc[neonates, col] = 'neonates'
aged.loc[infants, col] = 'infants'
aged.loc[children, col] = 'children'
aged.loc[adolescents, col] = 'adolescents'

In [11]:
# Pediatric cohort: the rows that carry a value in the 'nichd' column.
pediatric_patients = aged.loc[aged['nichd'].notnull()].reset_index(drop=True)
print(pediatric_patients.shape, pediatric_patients.head(), sep='\n')

(505185, 8)
   patient_onsetage patient_onsetageunit patient_sex  patient_weight  \
0              10.0                 Year        Male            28.0   
1              19.0                 Year      Female             NaN   
2              18.0                 Year      Female             NaN   
3              10.0                 Year        Male             NaN   
4               4.0                 Year        Male             NaN   

  safetyreportid              nichd   ich_ema       fda  
0       10003357   middle_childhood  children  children  
1       10003388   late_adolescence       NaN       NaN  
2       10003401  early_adolescence       NaN       NaN  
3       10003430   middle_childhood  children  children  
4       10003517    early_childhood  children  children  


In [12]:
# Release the references to the full patient table and the age-filtered copy.
del patients, aged

In [13]:
# Preview the first rows of the pediatric cohort.
pediatric_patients.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda
0,10.0,Year,Male,28.0,10003357,middle_childhood,children,children
1,19.0,Year,Female,,10003388,late_adolescence,,
2,18.0,Year,Female,,10003401,early_adolescence,,
3,10.0,Year,Male,,10003430,middle_childhood,children,children
4,4.0,Year,Male,,10003517,early_childhood,children,children


In [16]:
# Load the report table with the report id kept as text, then preview it.
report = pd.read_csv(
    er_dir + 'report.csv.gz',
    compression='gzip',
    dtype={'safetyreportid': 'str'},
)
report.head()

Unnamed: 0_level_0,lastupdate_date,mostrecent_receive_date,receive_date
safetyreportid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10003300,20141002,20140306,20140306
10003301,20141002,20140228,20140228
10003302,20141002,20140312,20140312
10003304,20141212,20140424,20140312
10003305,20141002,20140312,20140312


In [19]:
# Attach the report-level columns to every pediatric patient row.
# Copies are used so the key cast below does not modify the source frames.
df1 = pediatric_patients.copy()
df2 = report.copy()
print(df1.shape)
print(df2.shape)
df1[primarykey] = df1[primarykey].astype(str)
df2[primarykey] = df2[primarykey].astype(str)
# Collected after the cast so the ids are strings regardless of how the
# patient table was parsed.
ped_reports = df1[primarykey].unique()
# An inner merge can only return keys that exist in df1, i.e. ids already in
# ped_reports, so the result needs no additional filtering by ped_reports.
pediatric_patients_report = pd.merge(df1,
                                     df2,
                                     on=primarykey,
                                     how='inner')
print(pediatric_patients_report.shape)

(505185, 8)
(11131248, 4)
(505185, 11)


In [20]:
# Release the references to the two frames that were just merged.
del pediatric_patients, report

In [22]:
# Load the per-report seriousness flags and preview them.
# The report id is read as text, like in the other table loads: ids are not
# all numeric (e.g. '4443927-7'), so leaving the dtype to inference can yield
# a column that mixes numbers and strings.
report_serious = pd.read_csv(er_dir+'report_serious.csv.gz',
                             compression='gzip',
                             dtype={
                                 'safetyreportid' : 'str'
                             })
report_serious.head()

Unnamed: 0_level_0,death,disabling,life_threatening,other,safetyreportid,serious
congenital_anomali,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,,1.0,,,10003300,"The adverse event resulted in death, a life th..."
,,,,1.0,10003301,"The adverse event resulted in death, a life th..."
,,,,,10003302,The adverse event did not result in any of the...
,,,,,10003304,The adverse event did not result in any of the...
,,,,,10003305,The adverse event did not result in any of the...


In [23]:
# Add the seriousness flags to the pediatric patient/report rows.
# astype with a column mapping returns new frames whose key column is text,
# leaving the source frames untouched.
df1 = pediatric_patients_report.astype({primarykey: str})
df2 = report_serious.astype({primarykey: str})
print(df1.shape)
print(df2.shape)
pediatric_patients_report_serious = df1.merge(df2, how='inner', on=primarykey)
print(pediatric_patients_report_serious.shape)

(505185, 11)
(11131248, 7)
(505185, 17)


In [24]:
# Preview the first rows of the patient/report/seriousness table.
pediatric_patients_report_serious.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda,lastupdate_date,mostrecent_receive_date,receive_date,congenital_anomali,death,disabling,life_threatening,other,serious
0,10.0,Year,Male,28.0,10003357,middle_childhood,children,children,20141002,20140312,20140312,,,,,1.0,"The adverse event resulted in death, a life th..."
1,19.0,Year,Female,,10003388,late_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...
2,18.0,Year,Female,,10003401,early_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...
3,10.0,Year,Male,,10003430,middle_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...
4,4.0,Year,Male,,10003517,early_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...


In [25]:
# Release the references to the two frames that were just merged.
del report_serious, pediatric_patients_report

In [26]:
# Load the reporter table and preview it.
# Both id-like columns are read as text: report ids are not all numeric
# (e.g. '4443927-7') and reporter_company holds values such as 1289378 next to
# 'US-PFIZER INC-2014068976', so dtype inference can produce mixed-type columns.
reporter = pd.read_csv(er_dir+'reporter.csv.gz',
                       compression='gzip',
                       dtype={
                           'safetyreportid' : 'str',
                           'reporter_company' : 'str'
                       })
reporter.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0_level_0,reporter_country,reporter_qualification,safetyreportid
reporter_company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1289378,US,Consumer or non-health professional,10003300
US-JNJFOC-20130719067,US,Consumer or non-health professional,10003301
US-PFIZER INC-2014068976,US,Consumer or non-health professional,10003302
US-PFIZER INC-2014063856,US,Physician,10003304
US-PFIZER INC-2014069067,US,Physician,10003305


In [27]:
# Add the reporter columns to the pediatric patient/report/seriousness rows.
# astype with a column mapping returns new frames whose key column is text,
# leaving the source frames untouched.
df1 = pediatric_patients_report_serious.astype({primarykey: str})
df2 = reporter.astype({primarykey: str})
print(df1.shape)
print(df2.shape)
pediatric_patients_report_serious_reporter = df1.merge(df2, how='inner', on=primarykey)
print(pediatric_patients_report_serious_reporter.shape)

(505185, 17)
(11131248, 4)
(505185, 20)


In [28]:
# Preview the first rows of the fully merged cohort table.
pediatric_patients_report_serious_reporter.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda,lastupdate_date,mostrecent_receive_date,receive_date,congenital_anomali,death,disabling,life_threatening,other,serious,reporter_company,reporter_country,reporter_qualification
0,10.0,Year,Male,28.0,10003357,middle_childhood,children,children,20141002,20140312,20140312,,,,,1.0,"The adverse event resulted in death, a life th...",US-ACTAVIS-2014-04163,US,Other health professional
1,19.0,Year,Female,,10003388,late_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0061242,US,Consumer or non-health professional
2,18.0,Year,Female,,10003401,early_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0063166,US,Consumer or non-health professional
3,10.0,Year,Male,,10003430,middle_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0061944,US,Consumer or non-health professional
4,4.0,Year,Male,,10003517,early_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0059314,US,Physician


In [29]:
# Print the column dtypes and non-null counts of the merged cohort table.
pediatric_patients_report_serious_reporter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 505185 entries, 0 to 505184
Data columns (total 20 columns):
patient_onsetage           505185 non-null float64
patient_onsetageunit       504054 non-null object
patient_sex                489349 non-null object
patient_weight             156438 non-null float64
safetyreportid             505185 non-null object
nichd                      505185 non-null object
ich_ema                    367367 non-null object
fda                        301739 non-null object
lastupdate_date            505185 non-null int64
mostrecent_receive_date    505185 non-null int64
receive_date               505185 non-null int64
congenital_anomali         4607 non-null float64
death                      36443 non-null float64
disabling                  9541 non-null float64
life_threatening           22291 non-null float64
other                      194514 non-null float64
serious                    505185 non-null object
reporter_company           473715 non-nul

In [30]:
# Release the reference to the raw reporter table.
del reporter

In [31]:
# Release the reference to the intermediate patient/report/seriousness table.
del pediatric_patients_report_serious

In [32]:
# Write the merged cohort table (index included) as a gzip-compressed CSV.
pediatric_patients_report_serious_reporter.to_csv(
    '../../data/pediatric_patients_report_serious_reporter.csv.gz',
    compression='gzip',
)

In [33]:
# Distinct report ids of the pediatric cohort, as strings, and their count.
ped_reports = (
    pediatric_patients_report_serious_reporter['safetyreportid']
    .astype(str)
    .unique()
)
len(ped_reports)

505185

In [34]:
# Reload the merged cohort table from disk and preview it.
# The report id is read as text: left to inference it can come back numeric
# (or mixed), and would then fail to match the string-typed ids of other
# tables when they are joined on 'safetyreportid'.
pediatric_patients_report_serious_reporter = \
(pd.
 read_csv('../../data/pediatric_patients_report_serious_reporter.csv.gz',
          compression='gzip',
          index_col=0,
          dtype={
              'safetyreportid' : 'str'
          })
)
pediatric_patients_report_serious_reporter.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda,lastupdate_date,mostrecent_receive_date,receive_date,congenital_anomali,death,disabling,life_threatening,other,serious,reporter_company,reporter_country,reporter_qualification
0,10.0,Year,Male,28.0,10003357,middle_childhood,children,children,20141002,20140312,20140312,,,,,1.0,"The adverse event resulted in death, a life th...",US-ACTAVIS-2014-04163,US,Other health professional
1,19.0,Year,Female,,10003388,late_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0061242,US,Consumer or non-health professional
2,18.0,Year,Female,,10003401,early_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0063166,US,Consumer or non-health professional
3,10.0,Year,Male,,10003430,middle_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0061944,US,Consumer or non-health professional
4,4.0,Year,Male,,10003517,early_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0059314,US,Physician


In [35]:
# ATC-coded drug rows restricted to the report ids in ped_reports, with the
# report id as text and the ATC concept id as an integer.
pediatric_standard_drugs_atc = (
    pd.read_csv(
        '../../data/openFDA_drug_event/er_tables/standard_drugs_atc.csv.gz',
        compression='gzip',
        dtype={'safetyreportid': 'str'},
    )
    .query('safetyreportid in @ped_reports')
    .astype({'safetyreportid': str, 'ATC_concept_id': int})
)
pediatric_standard_drugs_atc.head()

Unnamed: 0_level_0,ATC_concept_code,ATC_concept_id,ATC_concept_name,safetyreportid
ATC_concept_class_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ATC 5th,N05AC02,21604512,thioridazine,11660526
ATC 5th,N05AC02,21604512,thioridazine,15813768
ATC 5th,N05AF04,21604534,tiotixene,12082678
ATC 5th,N05AF04,21604534,tiotixene,4443927-7
ATC 5th,N05AF04,21604534,tiotixene,7483968-9


In [36]:
# MedDRA-coded reaction rows restricted to the report ids in ped_reports.
# The report id must already be text when the filter runs: ped_reports holds
# strings, so ids parsed as numbers would never match and their rows would be
# dropped silently. Reading the column as 'str' guarantees the comparison is
# string-to-string.
pediatric_standard_reactions = (pd.
                  read_csv(er_dir+'standard_reactions.csv.gz',
                           compression='gzip',
                           dtype={
                               'safetyreportid' : 'str'
                           })
                      ).query('safetyreportid in @ped_reports')
pediatric_standard_reactions.safetyreportid = pediatric_standard_reactions.safetyreportid.astype(str)
pediatric_standard_reactions.MedDRA_concept_id = pediatric_standard_reactions.MedDRA_concept_id.astype(int)
pediatric_standard_reactions.head()

Unnamed: 0_level_0,MedDRA_concept_code,MedDRA_concept_id,MedDRA_concept_name,reaction_outcome,safetyreportid
MedDRA_concept_class_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PT,10063264,36312139,17-Hydroxyprogesterone Decreased,,13096952
PT,10063263,36312140,17-Hydroxyprogesterone Increased,Recovered/resolved,10723227
PT,10063263,36312140,17-Hydroxyprogesterone Increased,Unknown,12434437
PT,10063263,36312140,17-Hydroxyprogesterone Increased,Unknown,15287850
PT,10063263,36312140,17-Hydroxyprogesterone Increased,,5211614-3


In [37]:
# Print the first rows of the three tables that are joined next.
print(
    pediatric_patients_report_serious_reporter.head(),
    pediatric_standard_drugs_atc.head(),
    pediatric_standard_reactions.head(),
    sep='\n',
)

   patient_onsetage patient_onsetageunit patient_sex  patient_weight  \
0              10.0                 Year        Male            28.0   
1              19.0                 Year      Female             NaN   
2              18.0                 Year      Female             NaN   
3              10.0                 Year        Male             NaN   
4               4.0                 Year        Male             NaN   

  safetyreportid              nichd   ich_ema       fda  lastupdate_date  \
0       10003357   middle_childhood  children  children         20141002   
1       10003388   late_adolescence       NaN       NaN         20151125   
2       10003401  early_adolescence       NaN       NaN         20151125   
3       10003430   middle_childhood  children  children         20151125   
4       10003517    early_childhood  children  children         20151125   

   mostrecent_receive_date  receive_date  congenital_anomali  death  \
0                 20140312      2014031

In [38]:
# Number of report ids that appear in both the drug and the reaction tables.
np.intersect1d(
    pediatric_standard_drugs_atc['safetyreportid'].astype(str).unique(),
    pediatric_standard_reactions['safetyreportid'].astype(str).unique(),
).size

343218

In [39]:
# Join the cohort with its drugs and then its reactions on the report id.
# Rows without a drug match are dropped before the reaction join, and rows
# without a reaction match are dropped after it. Columns are then put in
# sorted order and the three concept id/code columns are cast to integers.
pediatric_patients_report_serious_reporter_drugs_reactions = (
    pediatric_patients_report_serious_reporter
    .set_index('safetyreportid')
    .join(pediatric_standard_drugs_atc.set_index('safetyreportid'))
    .dropna(subset=['ATC_concept_id'])
    .join(pediatric_standard_reactions.set_index('safetyreportid'))
    .dropna(subset=['MedDRA_concept_id'])
    .reset_index()
    .pipe(lambda joined: joined.reindex(np.sort(joined.columns), axis=1))
    .astype({
        'ATC_concept_id': int,
        'MedDRA_concept_code': int,
        'MedDRA_concept_id': int,
    })
)

print(pediatric_patients_report_serious_reporter_drugs_reactions.shape)
print(pediatric_patients_report_serious_reporter_drugs_reactions.head())
print(pediatric_patients_report_serious_reporter_drugs_reactions['safetyreportid'].nunique())

(966985, 27)
  ATC_concept_code  ATC_concept_id ATC_concept_name  MedDRA_concept_code  \
0          D10AD03        21602295        adapalene             10013709   
1          D10AD03        21602295        adapalene             10014184   
2          D10AD03        21602295        adapalene             10013786   
3          D10AD03        21602295        adapalene             10037867   
4          D10AD03        21602295        adapalene             10040880   

   MedDRA_concept_id MedDRA_concept_name  congenital_anomali  death  \
0           35809327    Drug Ineffective                 NaN    NaN   
1           37320143              Eczema                 NaN    NaN   
2           37320109            Dry Skin                 NaN    NaN   
3           37320214        Rash Macular                 NaN    NaN   
4           37320154     Skin Irritation                 NaN    NaN   

   disabling          fda  ... patient_onsetageunit  patient_sex  \
0        NaN  adolescents  ...     

In [40]:
# Write the joined drug/reaction table (index included) as a gzip-compressed CSV.
pediatric_patients_report_serious_reporter_drugs_reactions.to_csv(
    '../../data/pediatric_patients_report_serious_reporter_drugs_reactions.csv.gz',
    compression='gzip',
)

In [41]:
# Release the reference to the merged cohort table.
del pediatric_patients_report_serious_reporter

In [42]:
# RxNorm-coded drug rows restricted to the report ids in ped_reports, with the
# report id as text and the RxNorm concept id as an integer.
pediatric_standard_drugs = (
    pd.read_csv(
        '../../data/openFDA_drug_event/er_tables/standard_drugs.csv.gz',
        compression='gzip',
        dtype={'safetyreportid': 'str'},
    )
    .query('safetyreportid in @ped_reports')
    .astype({'safetyreportid': str, 'RxNorm_concept_id': int})
)
pediatric_standard_drugs.head()

Unnamed: 0,RxNorm_concept_class_id,RxNorm_concept_code,RxNorm_concept_id,RxNorm_concept_name,safetyreportid
1,Clinical Drug,91349,1776544,Hydrogen Peroxide 30 MG/ML Topical Solution,10037760
6,Clinical Drug,91349,1776544,Hydrogen Peroxide 30 MG/ML Topical Solution,10403090
7,Clinical Drug,91349,1776544,Hydrogen Peroxide 30 MG/ML Topical Solution,10417567
8,Clinical Drug,91349,1776544,Hydrogen Peroxide 30 MG/ML Topical Solution,10434099
37,Clinical Drug,91349,1776544,Hydrogen Peroxide 30 MG/ML Topical Solution,12963875


In [51]:
# Load every RxNorm relationship table found in the directory into a dict
# keyed by the file name up to its first '.'.
rx_dir = '../../RxNorm_relationships_tables/'
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the load order reproducible. Hidden entries (e.g. '.DS_Store') and
# sub-directories are skipped because they cannot be read as tables.
rxfiles = sorted(
    name for name in os.listdir(rx_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(rx_dir, name))
)
rxfile_dict = {}
for rxfile in rxfiles:
    key = rxfile.split('.')[0]
    rxfile_dict[key] = pd.read_csv(rx_dir + rxfile, engine='c', index_col=0)

In [58]:
# From each relationship table keep the rows whose second concept is a brand name.
tobrand = [
    table.query('concept_class_id_2=="Brand Name"')
    for table in rxfile_dict.values()
]

In [69]:
# Map each pediatric drug row to brand-name concepts by matching its RxNorm
# concept id against `concept_id_1` of the stacked relationship rows.
# The distinct report count is shown before (printed) and after (cell output).
a = pediatric_standard_drugs.copy()
print(a[primarykey].nunique())
m = a.merge(
    pd.concat(tobrand),
    left_on='RxNorm_concept_id',
    right_on='concept_id_1',
)
m[primarykey].nunique()

383470


340824

In [74]:
# Keep the report id plus the brand-side concept columns, renamed to the
# RxNorm_* column names used by the standard drug table.
m_renamed = m[
    [primarykey, 'concept_class_id_2', 'concept_code_2', 'concept_name_2', 'concept_id_2']
].rename(columns={
    'concept_class_id_2': 'RxNorm_concept_class_id',
    'concept_code_2': 'RxNorm_concept_code',
    'concept_name_2': 'RxNorm_concept_name',
    'concept_id_2': 'RxNorm_concept_id',
})

In [75]:
# Write the report-to-brand table (index included) as a gzip-compressed CSV.
m_renamed.to_csv(
    '../../data/pediatric_patients_report_drug_brands.csv.gz',
    compression='gzip',
)