In [1]:
import glob
import numpy as np
import pandas as pd
import pickle

data_dir = "../../data/openFDA_drug_event/"
er_dir = data_dir+'er_tables/'

In [2]:
primarykey='safetyreportid'

In [3]:
patients = pd.read_csv(er_dir+'patient.csv.gz',
                       compression='gzip',
                       index_col=0,dtype={
                           'safetyreportid' : 'str',
                           'patient_custom_master_age' : 'float'
                       })

In [4]:
age_col='patient_onsetage'
aged = patients[patients[age_col].notnull()].reset_index(drop=True).copy()

In [5]:
col = 'nichd'

neonate = aged[age_col].apply(lambda x : float(x)>0 and float(x)<=(1/12))
infant = aged[age_col].apply(lambda x : float(x)>(1/12) and float(x)<=1)
toddler = aged[age_col].apply(lambda x : float(x)>1 and float(x)<=2)
echildhood = aged[age_col].apply(lambda x : float(x)>2 and float(x)<=5)
mchildhood = aged[age_col].apply(lambda x : float(x)>5 and float(x)<=11)
eadolescence = aged[age_col].apply(lambda x : float(x)>11 and float(x)<=18)
ladolescence = aged[age_col].apply(lambda x : float(x)>18 and float(x)<=21)

aged[col] = np.nan

aged.loc[neonate,col] = 'term_neonatal'
aged.loc[infant,col] = 'infancy'
aged.loc[toddler,col] = 'toddler'
aged.loc[echildhood,col] = 'early_childhood'
aged.loc[mchildhood,col] = 'middle_childhood'
aged.loc[eadolescence,col] = 'early_adolescence'
aged.loc[ladolescence,col] = 'late_adolescence'

In [6]:
col = 'ich_ema'

term_newborn_infants = (aged[age_col].
                        apply(lambda x : float(x)>0 and float(x)<=(1/12)))
infants_and_toddlers = (aged[age_col].
                       apply(lambda x : float(x)>(1/12) and float(x)<=2))
children = aged[age_col].apply(lambda x : float(x)>2 and float(x)<=11)
adolescents = aged[age_col].apply(lambda x : float(x)>11 and float(x)<=17)

aged[col] = np.nan

aged.loc[term_newborn_infants,col] = 'term_newborn_infants'
aged.loc[infants_and_toddlers,col] = 'infants_and_toddlers'
aged.loc[children,col] = 'children'
aged.loc[adolescents,col] = 'adolescents'

In [7]:
col = 'fda'

neonates = (aged[age_col].
                        apply(lambda x : float(x)>0 and float(x)<(1/12)))
infants = (aged[age_col].
                       apply(lambda x : float(x)>=(1/12) and float(x)<2))
children = aged[age_col].apply(lambda x : float(x)>=2 and float(x)<11)
adolescents = aged[age_col].apply(lambda x : float(x)>=11 and float(x)<16)

aged[col] = np.nan

aged.loc[neonates,col] = 'neonates'
aged.loc[infants,col] = 'infants'
aged.loc[children,col] = 'children'
aged.loc[adolescents,col] = 'adolescents'

In [8]:
pediatric_patients = (aged.
                      dropna(subset=['nichd']).
                      reset_index(drop=True))
print(pediatric_patients.shape)
print(pediatric_patients.head())

(2478, 8)
   patient_onsetage patient_onsetageunit patient_sex  patient_weight  \
0               1.0                  Day        Male            2.87   
1              11.0                 Year      Female             NaN   
2               8.0               Decade      Female           59.42   
3               7.0                 Year        Male             NaN   
4              14.0                 Year        Male           42.30   

  safetyreportid              nichd               ich_ema          fda  
0       11176943            infancy  infants_and_toddlers      infants  
1       11794406   middle_childhood              children  adolescents  
2       11993585   middle_childhood              children     children  
3       12303048   middle_childhood              children     children  
4       12882341  early_adolescence           adolescents  adolescents  


In [9]:
del patients
del aged

In [10]:
pediatric_patients.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda
0,1.0,Day,Male,2.87,11176943,infancy,infants_and_toddlers,infants
1,11.0,Year,Female,,11794406,middle_childhood,children,adolescents
2,8.0,Decade,Female,59.42,11993585,middle_childhood,children,children
3,7.0,Year,Male,,12303048,middle_childhood,children,children
4,14.0,Year,Male,42.3,12882341,early_adolescence,adolescents,adolescents


In [11]:
report = (pd.read_csv(er_dir+'report.csv.gz',
                      compression='gzip',
                     dtype={
                         'safetyreportid' : 'str'
                     }))
report.head()

Unnamed: 0,safetyreportid,lastupdate_date,mostrecent_receive_date,receive_date
0,10012555,20200713,20200427,20140314
1,10017377,20200713,20200609,20140318
2,10024623,20200713,20200608,20140320
3,10027457,20200713,20200603,20140321
4,10071936,20200713,20200527,20140411


In [12]:
df1 = pediatric_patients.copy()
ped_reports = df1.safetyreportid.unique()
df2 = report.copy()
print(df1.shape)
print(df2.shape)
df1[primarykey] = df1[primarykey].astype(str)
df2[primarykey] = df2[primarykey].astype(str)
pediatric_patients_report = \
pd.merge(df1,
         df2,
         on=primarykey,
         how='inner').query('safetyreportid in @ped_reports')
print(pediatric_patients_report.shape)

(2478, 8)
(35997, 4)
(2478, 11)


In [13]:
del pediatric_patients
del report

In [14]:
report_serious = pd.read_csv(er_dir+'report_serious.csv.gz',compression='gzip')
report_serious.head()

Unnamed: 0,congenital_anomali,death,disabling,life_threatening,other,safetyreportid,serious
0,,,,,1.0,10012555,"The adverse event resulted in death, a life th..."
1,,,,,,10017377,The adverse event did not result in any of the...
2,,,,,1.0,10024623,"The adverse event resulted in death, a life th..."
3,,,,,,10027457,The adverse event did not result in any of the...
4,,,,,1.0,10071936,"The adverse event resulted in death, a life th..."


In [15]:
df1 = pediatric_patients_report.copy()
df2 = report_serious.copy()
print(df1.shape)
print(df2.shape)
df1[primarykey] = df1[primarykey].astype(str)
df2[primarykey] = df2[primarykey].astype(str)
pediatric_patients_report_serious = \
pd.merge(df1,
         df2,
         on=primarykey,
         how='inner')
print(pediatric_patients_report_serious.shape)

(2478, 11)
(35997, 7)
(2478, 17)


In [16]:
pediatric_patients_report_serious.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda,lastupdate_date,mostrecent_receive_date,receive_date,congenital_anomali,death,disabling,life_threatening,other,serious
0,1.0,Day,Male,2.87,11176943,infancy,infants_and_toddlers,infants,20200713,20200414,20150610,1.0,,,,,"The adverse event resulted in death, a life th..."
1,11.0,Year,Female,,11794406,middle_childhood,children,adolescents,20200713,20200611,20151202,,1.0,,,1.0,"The adverse event resulted in death, a life th..."
2,8.0,Decade,Female,59.42,11993585,middle_childhood,children,children,20200713,20200407,20160203,,,,,,The adverse event did not result in any of the...
3,7.0,Year,Male,,12303048,middle_childhood,children,children,20200713,20200401,20160426,,,,,,The adverse event did not result in any of the...
4,14.0,Year,Male,42.3,12882341,early_adolescence,adolescents,adolescents,20200713,20200403,20161025,,,,,1.0,"The adverse event resulted in death, a life th..."


In [17]:
del report_serious
del pediatric_patients_report

In [18]:
reporter = pd.read_csv(er_dir+'reporter.csv.gz',compression='gzip')
reporter.head()

Unnamed: 0,reporter_company,reporter_country,reporter_qualification,safetyreportid
0,JP-JNJFOC-20140308779,JP,Physician,10012555
1,US-PFIZER INC-2014074940,US,Physician,10017377
2,US-PFIZER INC-2014073489,US,Lawyer,10024623
3,US-PFIZER INC-2014081186,US,Physician,10027457
4,CA-009507513-1404CAN005185,CA,Consumer or non-health professional,10071936


In [19]:
df1 = pediatric_patients_report_serious.copy()
df2 = reporter.copy()
print(df1.shape)
print(df2.shape)
df1[primarykey] = df1[primarykey].astype(str)
df2[primarykey] = df2[primarykey].astype(str)
pediatric_patients_report_serious_reporter = \
pd.merge(df1,
         df2,
         on=primarykey,
         how='inner')
print(pediatric_patients_report_serious_reporter.shape)

(2478, 17)
(35997, 4)
(2478, 20)


In [20]:
pediatric_patients_report_serious_reporter.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda,lastupdate_date,mostrecent_receive_date,receive_date,congenital_anomali,death,disabling,life_threatening,other,serious,reporter_company,reporter_country,reporter_qualification
0,1.0,Day,Male,2.87,11176943,infancy,infants_and_toddlers,infants,20200713,20200414,20150610,1.0,,,,,"The adverse event resulted in death, a life th...",DE-SUN PHARMACEUTICAL INDUSTRIES LTD-2015RR-98262,NL,Other health professional
1,11.0,Year,Female,,11794406,middle_childhood,children,adolescents,20200713,20200611,20151202,,1.0,,,1.0,"The adverse event resulted in death, a life th...",US-AUROBINDO-AUR-APL-2015-11183,US,Other health professional
2,8.0,Decade,Female,59.42,11993585,middle_childhood,children,children,20200713,20200407,20160203,,,,,,The adverse event did not result in any of the...,US-PFIZER INC-2016012645,US,Consumer or non-health professional
3,7.0,Year,Male,,12303048,middle_childhood,children,children,20200713,20200401,20160426,,,,,,The adverse event did not result in any of the...,US-JNJFOC-20160314552,US,Consumer or non-health professional
4,14.0,Year,Male,42.3,12882341,early_adolescence,adolescents,adolescents,20200713,20200403,20161025,,,,,1.0,"The adverse event resulted in death, a life th...",US-JNJFOC-20161017561,US,Physician


In [21]:
pediatric_patients_report_serious_reporter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2478 entries, 0 to 2477
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   patient_onsetage         2478 non-null   float64
 1   patient_onsetageunit     2478 non-null   object 
 2   patient_sex              2399 non-null   object 
 3   patient_weight           635 non-null    float64
 4   safetyreportid           2478 non-null   object 
 5   nichd                    2478 non-null   object 
 6   ich_ema                  1869 non-null   object 
 7   fda                      1484 non-null   object 
 8   lastupdate_date          2478 non-null   int64  
 9   mostrecent_receive_date  2478 non-null   int64  
 10  receive_date             2478 non-null   int64  
 11  congenital_anomali       12 non-null     float64
 12  death                    94 non-null     float64
 13  disabling                15 non-null     float64
 14  life_threatening        

In [22]:
del reporter

In [23]:
del pediatric_patients_report_serious

In [24]:
(pediatric_patients_report_serious_reporter.
 to_csv('../../data/pediatric_patients_report_serious_reporter.csv.gz',
       compression='gzip')
)

In [25]:
ped_reports = pediatric_patients_report_serious_reporter.safetyreportid.astype(str).unique()
len(ped_reports)

2478

In [26]:
pediatric_patients_report_serious_reporter = \
(pd.
 read_csv('../../data/pediatric_patients_report_serious_reporter.csv.gz',
       compression='gzip',
         index_col=0)
)
pediatric_patients_report_serious_reporter.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda,lastupdate_date,mostrecent_receive_date,receive_date,congenital_anomali,death,disabling,life_threatening,other,serious,reporter_company,reporter_country,reporter_qualification
0,1.0,Day,Male,2.87,11176943,infancy,infants_and_toddlers,infants,20200713,20200414,20150610,1.0,,,,,"The adverse event resulted in death, a life th...",DE-SUN PHARMACEUTICAL INDUSTRIES LTD-2015RR-98262,NL,Other health professional
1,11.0,Year,Female,,11794406,middle_childhood,children,adolescents,20200713,20200611,20151202,,1.0,,,1.0,"The adverse event resulted in death, a life th...",US-AUROBINDO-AUR-APL-2015-11183,US,Other health professional
2,8.0,Decade,Female,59.42,11993585,middle_childhood,children,children,20200713,20200407,20160203,,,,,,The adverse event did not result in any of the...,US-PFIZER INC-2016012645,US,Consumer or non-health professional
3,7.0,Year,Male,,12303048,middle_childhood,children,children,20200713,20200401,20160426,,,,,,The adverse event did not result in any of the...,US-JNJFOC-20160314552,US,Consumer or non-health professional
4,14.0,Year,Male,42.3,12882341,early_adolescence,adolescents,adolescents,20200713,20200403,20161025,,,,,1.0,"The adverse event resulted in death, a life th...",US-JNJFOC-20161017561,US,Physician


In [27]:
pediatric_standard_drugs_atc = (pd.
                            read_csv('../../data/openFDA_drug_event/er_tables/standard_drugs_atc.csv.gz',
                                     compression='gzip',
                                    dtype={
                                        'safetyreportid' : 'str'
                                    }).
                            query('safetyreportid in @ped_reports')
                           )
pediatric_standard_drugs_atc.safetyreportid = pediatric_standard_drugs_atc.safetyreportid.astype(str) 
pediatric_standard_drugs_atc.ATC_concept_id = pediatric_standard_drugs_atc.ATC_concept_id.astype(int)
pediatric_standard_drugs_atc.head()

Unnamed: 0,ATC_concept_class_id,ATC_concept_code,ATC_concept_id,ATC_concept_name,safetyreportid
3,ATC 5th,N05AX13,21604563,paliperidone,17800752
5,ATC 5th,N05AX13,21604563,paliperidone,17954364
53,ATC 5th,N03AX09,21604431,lamotrigine,14509427
65,ATC 5th,N03AX09,21604431,lamotrigine,16794112
85,ATC 5th,N03AX09,21604431,lamotrigine,17664609


In [28]:
pediatric_standard_reactions = (pd.
                  read_csv(er_dir+'standard_reactions.csv.gz',
                           compression='gzip')
                      ).query('safetyreportid in @ped_reports')
pediatric_standard_reactions.safetyreportid = pediatric_standard_reactions.safetyreportid.astype(str) 
pediatric_standard_reactions.MedDRA_concept_id = pediatric_standard_reactions.MedDRA_concept_id.astype(int)
pediatric_standard_reactions.head()

Unnamed: 0,MedDRA_concept_class_id,MedDRA_concept_code,MedDRA_concept_id,MedDRA_concept_name,reaction_outcome,safetyreportid
2,PT,10049460,35809100,Abasia,,4401665-0
55,PT,10000059,35708164,Abdominal Discomfort,Not recovered/not resolved,16498682
97,PT,10000059,35708164,Abdominal Discomfort,Not recovered/not resolved,17646644
101,PT,10000059,35708164,Abdominal Discomfort,Not recovered/not resolved,17656319
128,PT,10000059,35708164,Abdominal Discomfort,Not recovered/not resolved,17723282


In [29]:
print(pediatric_patients_report_serious_reporter.head())
print(pediatric_standard_drugs_atc.head())
print(pediatric_standard_reactions.head())

   patient_onsetage patient_onsetageunit patient_sex  patient_weight  \
0               1.0                  Day        Male            2.87   
1              11.0                 Year      Female             NaN   
2               8.0               Decade      Female           59.42   
3               7.0                 Year        Male             NaN   
4              14.0                 Year        Male           42.30   

  safetyreportid              nichd               ich_ema          fda  \
0       11176943            infancy  infants_and_toddlers      infants   
1       11794406   middle_childhood              children  adolescents   
2       11993585   middle_childhood              children     children   
3       12303048   middle_childhood              children     children   
4       12882341  early_adolescence           adolescents  adolescents   

   lastupdate_date  mostrecent_receive_date  receive_date  congenital_anomali  \
0         20200713                 202004

In [30]:
len(np.intersect1d(
    pediatric_standard_drugs_atc.safetyreportid.astype(str).unique(),
    pediatric_standard_reactions.safetyreportid.astype(str).unique()
))

1865

In [31]:
pediatric_patients_report_serious_reporter_drugs_reactions = \
(pediatric_patients_report_serious_reporter.
 set_index('safetyreportid').
 join(pediatric_standard_drugs_atc.
      set_index('safetyreportid')
     ).
 dropna(subset=['ATC_concept_id']).
 join(pediatric_standard_reactions.
     set_index('safetyreportid')
     ).
 dropna(subset=['MedDRA_concept_id']).
 reset_index()
)
pediatric_patients_report_serious_reporter_drugs_reactions = \
(pediatric_patients_report_serious_reporter_drugs_reactions.
 reindex(np.sort(pediatric_patients_report_serious_reporter_drugs_reactions.columns),axis=1))

pediatric_patients_report_serious_reporter_drugs_reactions.ATC_concept_id = \
pediatric_patients_report_serious_reporter_drugs_reactions.ATC_concept_id.astype(int).copy()

pediatric_patients_report_serious_reporter_drugs_reactions.MedDRA_concept_code = \
pediatric_patients_report_serious_reporter_drugs_reactions.MedDRA_concept_code.astype(int).copy()

pediatric_patients_report_serious_reporter_drugs_reactions.MedDRA_concept_id = \
pediatric_patients_report_serious_reporter_drugs_reactions.MedDRA_concept_id.astype(int).copy()

print(pediatric_patients_report_serious_reporter_drugs_reactions.shape)
print(pediatric_patients_report_serious_reporter_drugs_reactions.head())
print(pediatric_patients_report_serious_reporter_drugs_reactions.safetyreportid.nunique())

(11493, 29)
  ATC_concept_class_id ATC_concept_code  ATC_concept_id      ATC_concept_name  \
0              ATC 5th          N03AX14        21604436         levetiracetam   
1              ATC 5th          N03AX14        21604436         levetiracetam   
2              ATC 5th          N03AX11        21604433            topiramate   
3              ATC 5th          N03AX11        21604433            topiramate   
4              ATC 5th          N02BA01        21604305  acetylsalicylic acid   

  MedDRA_concept_class_id  MedDRA_concept_code  MedDRA_concept_id  \
0                      PT             10066054           35306001   
1                      PT             10071404           42889636   
2                      PT             10066054           35306001   
3                      PT             10071404           42889636   
4                      PT             10066054           35306001   

                MedDRA_concept_name  congenital_anomali  death  ...  \
0              

In [32]:
(pediatric_patients_report_serious_reporter_drugs_reactions.
 to_csv('../../data/pediatric_patients_report_serious_reporter_drugs_reactions.csv.gz',
       compression='gzip')
)

In [33]:
del pediatric_patients_report_serious_reporter

In [34]:
pediatric_standard_drugs = (pd.
                            read_csv('../../data/openFDA_drug_event/er_tables/standard_drugs.csv.gz',
                                     compression='gzip',
                                    dtype={
                                        'safetyreportid' : 'str'
                                    }).
                            query('safetyreportid in @ped_reports')
                           )
pediatric_standard_drugs.safetyreportid = pediatric_standard_drugs.safetyreportid.astype(str) 
pediatric_standard_drugs.RxNorm_concept_id = pediatric_standard_drugs.RxNorm_concept_id.astype(int)
pediatric_standard_drugs.head()

Unnamed: 0,RxNorm_concept_class_id,RxNorm_concept_code,RxNorm_concept_id,RxNorm_concept_name,safetyreportid
68,Branded Drug,92752,955694,Fluorouracil 10 MG/ML Topical Cream [Fluoroplex],17657703
96,Branded Drug,92752,955694,Fluorouracil 10 MG/ML Topical Cream [Fluoroplex],17806705
118,Branded Drug,92752,955694,Fluorouracil 10 MG/ML Topical Cream [Fluoroplex],17857993
251,Branded Drug,102250,19000845,Hydrocortisone 1.67 MG/ML Enema [Cortenema],15411042
254,Branded Drug,102250,19000845,Hydrocortisone 1.67 MG/ML Enema [Cortenema],16188498


In [35]:
import os
rxfiles = os.listdir('../../RxNorm_relationships_tables/')
rxfile_dict={}
for rxfile in rxfiles:
    key=rxfile.split('.')[0]
    rxfile_dict[key] = pd.read_csv('../../RxNorm_relationships_tables/'+rxfile,engine='c',index_col=0)

In [36]:
tobrand=[]
for rxfile in rxfile_dict.keys():
    tobrand.append(rxfile_dict[rxfile].query('concept_class_id_2=="Brand Name"'))

In [37]:
a = pediatric_standard_drugs.copy()
print(a[primarykey].nunique())
m = \
(pd.merge(
    a,
    pd.concat(tobrand),
    left_on='RxNorm_concept_id',
    right_on='concept_id_1'
)
)
m[primarykey].nunique()

1999


1114

In [38]:
m_renamed = \
(m.
 loc[:,
     [primarykey,'concept_class_id_2','concept_code_2','concept_name_2','concept_id_2']
    ].
 rename(columns={
     'concept_class_id_2' : 'RxNorm_concept_class_id',
     'concept_code_2' : 'RxNorm_concept_code',
     'concept_name_2' : 'RxNorm_concept_name',
     'concept_id_2' : 'RxNorm_concept_id'})
)

In [39]:
(m_renamed.
 to_csv('../../data/pediatric_patients_report_drug_brands.csv.gz',
       compression='gzip')
)