In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
studies_table_0 = pd.read_pickle('int_new_studies_0.pkl').reset_index(drop=True)
studies_table_1 = pd.read_pickle('int_new_studies_1.pkl').reset_index(drop=True)
studies_table_2 = pd.read_pickle('int_new_studies_2.pkl').reset_index(drop=True)

studies_table = pd.concat([studies_table_0, studies_table_1, studies_table_2]).reset_index(drop=True)


In [3]:
studies_table

Unnamed: 0,study_id,official_title,short_title,conditions,verified_date,responsible_party,sponsor,type,description,interventions,purpose,intervention_type,mesh_terms,criteria,min_age,max_age,gender
0,NCT03266419,"A Double-blind, Randomized, Parallel Design to...",Effectiveness of Deep Versus Moderate Neuromus...,[Neuromuscular Blockade],October 2019,Byung-Moon Choi,Asan Medical Center,Interventional,The aim of this study is to evaluate the influ...,[Rocuronium],Treatment,Parallel Assignment,[],Inclusion Criteria:\n\nPatients 20 to 65 years...,20 Years,65 Years,All
1,NCT03262441,Mycophenolate Mofetil Therapy for Reduction of...,MMF for HIV Reservoir Reduction,[Human Immunodeficiency Virus I Infection],November 2020,Joshua Schiffer,Fred Hutchinson Cancer Research Center,Interventional,"This is an open label, randomized Phase II stu...",[Mycophenolic Acid],Treatment,Single Group Assignment,"[Acquired Immunodeficiency Syndrome, HIV Infec...",Inclusion Criteria:\n\nConfirmed HIV infection...,18 Years,65 Years,All
2,NCT03260894,"A Randomized, Open-Label, Phase 3 Study to Eva...",Pembrolizumab (MK-3475) Plus Epacadostat vs St...,[Renal Cell Carcinoma (RCC)],August 2020,,Incyte Corporation,Interventional,The purpose of this study was to evaluate the ...,"[Pembrolizumab, Sunitinib]",Treatment,Parallel Assignment,"[Carcinoma, Carcinoma, Renal Cell]",Inclusion Criteria:\n\nHistologic confirmation...,18 Years,,All
3,NCT03267940,"A Phase 1B, Randomized, Open-Label Study of PE...",Study of PEGPH20 With Cisplatin (CIS) and Gemc...,"[Cholangiocarcinoma Non-resectable, Cholangioc...",January 2020,,Halozyme Therapeutics,Interventional,The study is being conducted to assess the saf...,[Atezolizumab],Treatment,Sequential Assignment,"[Adenocarcinoma, Cholangiocarcinoma]",Inclusion Criteria:\n\nFor both portions of th...,18 Years,,All
4,NCT03264157,"A Prospective, Randomized, Double-Blind Parall...",Safety and Effectiveness of BPL HRIG With Acti...,[Healthy],January 2020,,Bio Products Laboratory,Interventional,"A prospective, randomized, blinded, parallel-g...",[Vaccines],Treatment,Parallel Assignment,[Rabies],Inclusion Criteria:\n\nAble and willing to sig...,18 Years,75 Years,All
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28090,NCT03035708,Human Laboratory Study of Varenicline for Alco...,Human Laboratory Study of Varenicline for Alco...,[Alcohol Use Disorder],October 2019,,National Institute on Alcohol Abuse and Alcoho...,Interventional,"This study is a double-blind, randomized, plac...",[Varenicline],Treatment,Parallel Assignment,"[Alcoholism, Alcohol Drinking]","Inclusion Criteria:\n\nTo be eligible, the sub...",21 Years,,All
28091,NCT03034772,A Randomized Controlled Trial Comparing the Ef...,Dorzolamide-timolol in Combination With Anti-v...,"[Neovascular Age-related Macular Degeneration,...",June 2020,"Jason Hsu, MD",Wills Eye,Interventional,A previous pilot study demonstrated that commo...,"[Timolol, Dorzolamide, Lubricant Eye Drops]",Treatment,Parallel Assignment,"[Macular Degeneration, Wet Macular Degeneration]",Inclusion Criteria:\n\nActive choroidal neovas...,45 Years,,All
28092,NCT03030989,A Double-blind Randomized Placebo Controlled C...,A Double-blind Randomized Placebo Controlled C...,[Stem Cell Transplant Complications],November 2020,,University of Chicago,Interventional,This study evaluates the use of 2% CHG washclo...,"[Chlorhexidine, Chlorhexidine gluconate]",Prevention,Parallel Assignment,[Infection],Inclusion Criteria:\n\nAdmitted to the Univers...,18 Years,,All
28093,NCT03038880,STAIRWAY: Simultaneous Blockade of Angiopoieti...,Study to Evaluate Faricimab (RO6867461; RG7716...,"[Neovascularization, Choroidal, Macular Degene...",December 2020,,Hoffmann-La Roche,Interventional,"This was a Phase II, multicenter, randomized, ...",[Ranibizumab],Treatment,Parallel Assignment,"[Macular Degeneration, Choroidal Neovasculariz...",Inclusion Criteria\n\nTreatment-naive CNV seco...,50 Years,,All


In [4]:
def create_base_measurements(studies):
    df = {
        'study': [],
        'group_id': [],
        'base': [],
        'class': [],
        'category': [],
        'param_type': [],
        'dispersion_type': [],
        'unit': [],
        'value': [],
        'spread': [],
        'upper': [],
        'lower': []
    }
    for study in studies:
        study_id = study['Study']['ProtocolSection']['IdentificationModule']['NCTId']
        measures = study['Study']['ResultsSection']['BaselineCharacteristicsModule'].get('BaselineMeasureList', {'BaselineMeasure': []})['BaselineMeasure']

        for measure in measures:
            classes = measure.get('BaselineClassList', {'BaselineClass': []})['BaselineClass']

            for clss in classes:
                categories = clss.get('BaselineCategoryList', {'BaselineCategory': []})['BaselineCategory']

                for category in categories:
                    measurements = category.get('BaselineMeasurementList', {'BaselineMeasurement': []})['BaselineMeasurement']
                    for measurement in measurements:
                        df['study'].append(study_id)
                        df['group_id'].append(measurement.get('BaselineMeasurementGroupId', 'NA'))
                        df['base'].append(measure.get('BaselineMeasureTitle', 'NA'))
                        df['class'].append(clss.get('BaselineClassTitle', 'NA'))
                        df['category'].append(category.get('BaselineCategoryTitle', 'NA'))
                        df['param_type'].append(measure.get('BaselineMeasureParamType', 'NA'))
                        df['dispersion_type'].append(measure.get('BaselineMeasureDispersionType', 'NA'))
                        df['unit'].append(measure.get('BaselineMeasureUnitOfMeasure', 'NA'))
                        df['value'].append(measurement.get('BaselineMeasurementValue', 'NA'))
                        df['spread'].append(measurement.get('BaselineMeasurementSpread', 'NA'))
                        df['upper'].append(measurement.get('BaselineMeasurementUpperLimit', 'NA'))
                        df['lower'].append(measurement.get('BaselineMeasurementLowerLimit', 'NA'))
    return df

In [67]:
def create_studies_table(studies):
    buffer = {
        'study_id': [], 'official_title': [], 'short_title':[], 'conditions': [], 
        'verified_date': [], 'responsible_party': [], 'sponsor':[], 'type': [], 'description': [],
        'interventions': [], 'purpose': [], 'intervention_type': [], 'mesh_terms': [],
        'criteria': [], 'min_age': [], 'max_age': [], 'gender': []}
    for i, study in enumerate(studies):


        try:
            buffer['study_id'].append(study['Study']['ProtocolSection']['IdentificationModule']['NCTId'])
        except KeyError as e:
            buffer['study_id'].append('NA')

        try:
            buffer['official_title'].append(study['Study']['ProtocolSection']['IdentificationModule']['OfficialTitle'])
        except KeyError as e:
            buffer['official_title'].append('NA')
            
        try:
            buffer['short_title'].append(study['Study']['ProtocolSection']['IdentificationModule']['BriefTitle'])
        except KeyError as e:
            buffer['short_title'].append('NA')

        try:
            buffer['verified_date'].append(study['Study']['ProtocolSection']['StatusModule']['StatusVerifiedDate'])
        except KeryError as e:
            buffer['verified_date'].append('NA')

        try:
            buffer['responsible_party'].append(study['Study']['ProtocolSection']['SponsorCollaboratorsModule']['ResponsibleParty']['ResponsiblePartyInvestigatorFullName'])
        except KeyError as e:
            buffer['responsible_party'].append('NA')
            
        try:
            buffer['sponsor'].append(study['Study']['ProtocolSection']['SponsorCollaboratorsModule']['LeadSponsor']['LeadSponsorName'])
        except KeyError as e:
            buffer['sponsor'].append('NA')

        try: 
            buffer['conditions'].append(study['Study']['ProtocolSection']['ConditionsModule']['ConditionList']['Condition'])
        except KeyError as e:
            buffer['conditions'].append('NA')

        try:
            buffer['type'].append(study['Study']['ProtocolSection']['DesignModule']['StudyType'])
        except KeyError as e:
            buffer['type'].append('NA')
        
        try:
            buffer['purpose'].append(study['Study']['ProtocolSection']['DesignModule']['DesignInfo'].get('DesignPrimaryPurpose', 'NA'))
        except KeyError as e:
            buffer['purpose'].append('NA')
            
        try:
            buffer['intervention_type'].append(study['Study']['ProtocolSection']['DesignModule']['DesignInfo'].get('DesignInterventionModel', 'NA'))
        except KeyError as e:
            buffer['intervention_type'].append('NA')
        
        try:
            buffer['mesh_terms'].append([x.get('ConditionMeshTerm', 'NA') for x in study['Study']['DerivedSection']['ConditionBrowseModule']['ConditionMeshList']['ConditionMesh']])
        except KeyError as e:
            buffer['mesh_terms'].append([])
            
        try:
            buffer['description'].append(study['Study']['ProtocolSection']['DescriptionModule']['BriefSummary'])
        except KeyError as e:
            buffer['description'].append('NA')
            
        try: 
            buffer['interventions'].append([x.get('InterventionMeshTerm', 'NA') for x in study['Study']['DerivedSection']['InterventionBrowseModule']['InterventionMeshList']['InterventionMesh']])
        except KeyError as e:
            buffer['interventions'].append([])
            
        try:
            buffer['criteria'].append(study['Study']['ProtocolSection']['EligibilityModule']['EligibilityCriteria'])
        except KeyError as e:
            buffer['criteria'].append('NA')
            
        try:
            buffer['gender'].append(study['Study']['ProtocolSection']['EligibilityModule']['Gender'])
        except KeyError as e:
            buffer['gender'].append('NA')
        
        try:
            buffer['min_age'].append(study['Study']['ProtocolSection']['EligibilityModule']['MinimumAge'])
        except KeyError as e:
            buffer['min_age'].append('NA')
            
        try:
            buffer['max_age'].append(study['Study']['ProtocolSection']['EligibilityModule']['MaximumAge'])
        except KeyError as e:
            buffer['max_age'].append('NA')
    return buffer
    

In [1]:
def create_effects_table(studies):
    df = {
        'study_id': [],
        'group_id': [],
        'effect_name': [],
        'type': [], #Serious or other
        'organ_system': [],
        'assesment': [],
        'no_effected': [],
        'collection_threshold': [],
        'no_at_risk':[]
    }
    for i, study in enumerate(studies):
        study_id = study['Study']['ProtocolSection']['IdentificationModule']['NCTId']
        adverse_module = study['Study']['ResultsSection'].get('AdverseEventsModule', {}) #Small risk here
        for event in adverse_module.get('OtherEventList', {'OtherEvent': []})['OtherEvent']:
            for stat in event.get('OtherEventStatsList', {'OtherEventStats': []})['OtherEventStats']:
                df['study_id'].append(study_id)
                df['group_id'].append(stat.get('OtherEventStatsGroupId', 'NA'))
                df['effect_name'].append(event.get('OtherEventTerm', 'NA'))
                df['type'].append('other')
                df['organ_system'].append(event.get('OtherEventOrganSystem', 'NA'))
                df['assesment'].append(event.get('OtherEventAssessmentType', 'NA'))
                df['no_effected'].append(float(stat.get('OtherEventStatsNumAffected', 0)) or float(stat.get('OtherEventStatsNumEvents', 0)))
                df['collection_threshold'].append(float(adverse_module.get('EventsFrequencyThreshold', -1)))
                df['no_at_risk'].append(int(stat.get('OtherEventStatsNumAtRisk', -1)))
        for event in adverse_module.get('SeriousEventList', {'SeriousEvent': []})['SeriousEvent']:
            for stat in event.get('SeriousEventStatsList', {'SeriousEventStats': []})['SeriousEventStats']:
                df['study_id'].append(study_id)
                df['group_id'].append(stat.get('SeriousEventStatsGroupId', 'NA'))
                df['effect_name'].append(event.get('SeriousEventTerm', 'NA'))
                df['type'].append('serious')
                df['organ_system'].append(event.get('SeriousEventOrganSystem', 'NA'))
                df['assesment'].append(event.get('SeriousEventAssessmentType', 'NA'))
                df['no_effected'].append(float(stat.get('SeriousEventStatsNumAffected', 0)) or float(stat.get('OtherEventStatsNumEvents', 0)))
                df['collection_threshold'].append(float(adverse_module.get('EventsFrequencyThreshold', -1)))
                df['no_at_risk'].append(int(stat.get('SeriousEventStatsNumAtRisk', -1)))

    return df

In [1]:
def create_effects_groups_table(studies):
    df = {
        'study_id': [],
        'group_id': [],
        'title': [],
        'description': [], #Serious or other
    }
    for i, study in enumerate(studies):
        study_id = study['Study']['ProtocolSection']['IdentificationModule']['NCTId']
        adverse_module = study['Study']['ResultsSection'].get('AdverseEventsModule', {}) #Small risk here
        for group in adverse_module.get('EventGroupList', {'EventGroup': []})['EventGroup']:
            df['study_id'].append(study_id)
            df['group_id'].append(group.get('EventGroupId', 'NA'))
            df['title'].append(group.get('EventGroupTitle', 'NA'))
            df['description'].append(group.get('EventGroupDescription', 'NA'))
            
    return df

In [22]:
def get_outcome_modules(studies):
    outcome_modules = []
    for study in studies:
        if 'OutcomeMeasuresModule' in study['Study']['ResultsSection']:
            outcome_modules.append(study['Study']['ResultsSection']['OutcomeMeasuresModule'])
            continue 
        print('No Results: ', study['Study']['ProtocolSection']['IdentificationModule']['OfficialTitle'])
        
    return outcome_modules

def create_outcomes_table(studies):
    outcome_modules = get_outcome_modules(studies)
    admin_df = {
        'study_id': [],
        'group_id': [],
        'measure': [],
        'title': [],
        'description': [],
    }
    
    outcome_df = {
        'study_id': [],
        'group_title': [],
        'group_no': [],
        'measure': [],
        'title': [],
        'value': [],
        'dispersion': [],
        'upper': [],
        'lower': [],
        'participants': []
    }
    
    for i, module in enumerate(outcome_modules):
        study_id = studies[i]['Study']['ProtocolSection']['IdentificationModule']['NCTId']
        for measure in module['OutcomeMeasureList']['OutcomeMeasure']:
            try:
                overall_group_to_no = {}
                for denom in measure.get('OutcomeDenomList', {'OutcomeDenom': []})['OutcomeDenom']:
                    if denom.get('OutcomeDenomUnits', 'NA') == 'Participants':
                        for count in denom.get('OutcomeDenomCountList', {'OutcomeDenomCount': []})['OutcomeDenomCount']:
                            overall_group_to_no[count['OutcomeDenomCountGroupId']] = count['OutcomeDenomCountValue']
                
                group_to_title = {}
                for admin in measure.get('OutcomeGroupList', {'OutcomeGroup': []})['OutcomeGroup']:
                    admin_df['study_id'].append(study_id)
                    admin_df['group_id'].append(admin.get('OutcomeGroupId', 'NA'))
                    admin_df['measure'].append(measure.get('OutcomeMeasureTitle', 'NA'))
                    admin_df['title'].append(admin.get('OutcomeGroupTitle', 'NA'))
                    admin_df['description'].append(admin.get('OutcomeGroupDescription', 'NA'))
                    group_to_title[admin.get('OutcomeGroupId', 'NA')] = admin.get('OutcomeGroupTitle', 'NA')
                    
                # Sometimes the participants are just listed one time before all the others - not just in the class
                for group in measure.get('OutcomeClassList', {'OutcomeClass': []})['OutcomeClass']:
    
                    group_to_no = {}
                    for denom in group.get('OutcomeClassDenomList', {'OutcomeClassDenom': []})['OutcomeClassDenom']:
                        for count in denom.get('OutcomeClassDenomCountList', {'OutcomeClassDenomCount': []})['OutcomeClassDenomCount']:
                            group_to_no[count['OutcomeClassDenomCountGroupId']] = count['OutcomeClassDenomCountValue']

                    for cat in group.get('OutcomeCategoryList', {'OutcomeCategory': []})['OutcomeCategory']:
                        for outcome in cat['OutcomeMeasurementList']['OutcomeMeasurement']:
                            outcome_df['study_id'].append(study_id)
                            outcome_df['group_title'].append(group_to_title[outcome.get('OutcomeMeasurementGroupId', 'NA')])
                            outcome_df['group_no'].append(outcome.get('OutcomeMeasurementGroupId', 'NA'))
                            outcome_df['measure'].append(measure.get('OutcomeMeasureTitle', 'NA'))
                            outcome_df['value'].append(outcome.get('OutcomeMeasurementValue', 'NA'))
                            outcome_df['dispersion'].append(outcome.get('OutcomeMeasurementSpread', 'NA'))
                            outcome_df['upper'].append(outcome.get('OutcomeMeasurementUpperLimit', 'NA'))
                            outcome_df['lower'].append(outcome.get('OutcomeMeasurementLowerLimit', 'NA'))
                            outcome_df['participants'].append(group_to_no.get(outcome.get('OutcomeMeasurementGroupId', 'NA'), None) or overall_group_to_no.get(outcome.get('OutcomeMeasurementGroupId', 'NA'), 'NA'))
                            outcome_df['title'].append(group.get('OutcomeClassTitle', 'NA'))
                        
                    
            except KeyError as e:
                print(e)
                continue
                
    return outcome_df

## New Studies

In [3]:
from os import listdir
from os.path import isfile, join, isdir
import json

def sample_int_studies(no_studies, table_func, table_name):
    # Need to add in MESH terms as a requirement
    study_directories = [f for f in listdir('AllAPIJSON/') if isdir(join('AllAPIJSON/', f))]
    studies = []
    write_counter = 0
    for directory in study_directories:
        studyFileNames = [f for f in listdir('AllAPIJSON/'+directory+'/')]
        for file in studyFileNames:
            try:
                with open('AllAPIJSON/'+directory+'/'+file) as f:
                    data = json.load(f)['FullStudy']
                    has_results = 'ResultsSection' in data['Study'] and 'OutcomeMeasuresModule' in data['Study']['ResultsSection']
                    interventions = [x.get('InterventionMeshTerm', 'NA') for x in data['Study']['DerivedSection']['InterventionBrowseModule']['InterventionMeshList']['InterventionMesh']]
                    conditions = data['Study']['ProtocolSection']['ConditionsModule']['ConditionList']['Condition']
                    study_type = data['Study']['ProtocolSection']['DesignModule']['StudyType']
                    if has_results and interventions and conditions and (study_type == 'Interventional'):
                        studies.append(data)

                    f.close()
                    
                if len(studies) + len(studies)*write_counter >= no_studies:
                    print('writing study', write_counter)
                    table = pd.DataFrame.from_dict(table_func(studies))
                    table.to_pickle('int_'+table_name+'_'+str(write_counter)+'.pkl')
                    studies = []
                    return
                    
                if len(studies) >= 10000:
                    print('writing study', write_counter)
                    table = pd.DataFrame.from_dict(table_func(studies))
                    table.to_pickle('int_'+table_name+'_'+str(write_counter)+'.pkl')
                    write_counter += 1
                    studies = []

            except KeyError: 
                continue

    table = pd.DataFrame.from_dict(table_func(studies))
    table.to_pickle('int_'+table_name+'_'+str(write_counter)+'.pkl')

    write_counter += 1
    studies = []

In [6]:
sample_int_studies(float('inf'), create_effects_table, 'newer_effects')

writing study 0
writing study 1


## Study Treatments

In [23]:
# We're going to have to do a direct connection due to the nans in the float column
from sqlalchemy import create_engine

db = create_engine("postgresql://meditreats:meditreats@localhost:5432/meditreats")

In [6]:
# Load up the conditions from the database
db_treatments = pd.read_csv('meditreats_public_treatments.csv')

In [7]:
db_treatments.head()

Unnamed: 0,id,name,from_study,no_studies
0,0,Cyclophosphamide,True,678
1,1,Vaccines,True,644
2,2,Paclitaxel,True,621
3,3,Bevacizumab,True,607
4,4,Dexamethasone,True,542


In [8]:
studies_conditions = studies_table[['study_id', 'interventions']].explode('interventions')

In [12]:
study_treats = studies_conditions.merge(db_treatments[['id', 'name']].rename(columns={
    'name': 'interventions',
    'id': 'treatment'
}), 'left',)[['study_id','treatment']].rename(columns={
    'study_id':'study'
})

In [14]:
study_treats.to_sql('study_treatments', db, index=False, if_exists='append')

## Cleaning Conditions

In [84]:
conditions = studies_table.explode('conditions')[['conditions','study_id']]

In [85]:
# Just combine those with the same words for now
import re

conditions['alpha_num'] = conditions['conditions'].apply(lambda x: re.sub(r'[^a-zA-Z0-9 ]', '', x))

In [86]:
conditions['sorted'] = conditions['alpha_num'].str.split().apply(sorted).apply(tuple)

In [87]:
conditions_dict = {k:i for i,k in enumerate(conditions['sorted'].unique())}

In [88]:
import re

def hash_condition(condition):
    return tuple(sorted(re.sub(r'[^a-zA-Z0-9 ]', '', condition).split()))

def most_common(lst):
    return max(set(lst), key=lst.count)
    

In [89]:
conditions['condition_id'] = conditions['sorted'].apply(lambda x: conditions_dict[x])

In [90]:
conditions['l_conditions'] = conditions['conditions'].apply(lambda x: [x])

In [91]:
conditions_table = pd.DataFrame(conditions.groupby('condition_id')['l_conditions'].apply(sum).apply(most_common))

In [92]:
conditions_table = conditions_table.set_axis(['name'], axis=1, inplace=False)

In [93]:
conditions_table = conditions_table.rename_axis(['id'], axis=0)

In [94]:
study_conditions = conditions[['study_id', 'condition_id']].rename(columns={
    'study_id': 'study',
    'condition_id': 'condition'
}).reset_index(drop=True)

In [98]:
conditions_table.to_csv('db_int_conditions.csv', header=False)
study_conditions.to_csv('db_int_study_conditions.csv', header=False)

In [106]:
study_conditions

Unnamed: 0,study,condition
0,NCT03266419,0
1,NCT03262441,1
2,NCT03260894,2
3,NCT03267940,3
4,NCT03267940,4
...,...,...
48275,NCT03034772,4407
48276,NCT03030989,8836
48277,NCT03038880,1313
48278,NCT03038880,3414


## Cleaning Treatments

In [14]:
studies_table

Unnamed: 0,study_id,official_title,short_title,conditions,verified_date,responsible_party,sponsor,type,description,interventions,purpose,intervention_type,mesh_terms,criteria,min_age,max_age,gender
0,NCT03266419,"A Double-blind, Randomized, Parallel Design to...","A Double-blind, Randomized, Parallel Design to...",[Neuromuscular Blockade],October 2019,Byung-Moon Choi,Asan Medical Center,Interventional,All patients were fasted from midnight without...,[Rocuronium],Treatment,Parallel Assignment,[],Inclusion Criteria:\n\nPatients 20 to 65 years...,20 Years,65 Years,All
1,NCT03262441,Mycophenolate Mofetil Therapy for Reduction of...,Mycophenolate Mofetil Therapy for Reduction of...,[Human Immunodeficiency Virus I Infection],November 2020,Joshua Schiffer,Fred Hutchinson Cancer Research Center,Interventional,"This is an open-label, randomized pilot trial ...",[Mycophenolic Acid],Treatment,Single Group Assignment,"[Acquired Immunodeficiency Syndrome, HIV Infec...",Inclusion Criteria:\n\nConfirmed HIV infection...,18 Years,65 Years,All
2,NCT03260894,"A Randomized, Open-Label, Phase 3 Study to Eva...","A Randomized, Open-Label, Phase 3 Study to Eva...",[Renal Cell Carcinoma (RCC)],August 2020,,Incyte Corporation,Interventional,,"[Pembrolizumab, Sunitinib]",Treatment,Parallel Assignment,"[Carcinoma, Carcinoma, Renal Cell]",Inclusion Criteria:\n\nHistologic confirmation...,18 Years,,All
3,NCT03267940,"A Phase 1B, Randomized, Open-Label Study of PE...","A Phase 1B, Randomized, Open-Label Study of PE...","[Cholangiocarcinoma Non-resectable, Cholangioc...",January 2020,,Halozyme Therapeutics,Interventional,The study will have a Run-in portion and an Ex...,[Atezolizumab],Treatment,Sequential Assignment,"[Adenocarcinoma, Cholangiocarcinoma]",Inclusion Criteria:\n\nFor both portions of th...,18 Years,,All
4,NCT03264157,"A Prospective, Randomized, Double-Blind Parall...","A Prospective, Randomized, Double-Blind Parall...",[Healthy],January 2020,,Bio Products Laboratory,Interventional,Each subject will undergo a total of 9 visits....,[Vaccines],Treatment,Parallel Assignment,[Rabies],Inclusion Criteria:\n\nAble and willing to sig...,18 Years,75 Years,All
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,NCT02186665,"A Multicenter, Randomized, Double Blind, Paral...","A Multicenter, Randomized, Double Blind, Paral...",[Plaque Psoriasis],March 2017,,Galderma R&D,Interventional,,[Calcitriol],Treatment,Parallel Assignment,[Psoriasis],Inclusion Criteria:\n\nMale or female 2 to 12 ...,2 Years,12 Years,All
9996,NCT02184195,"A Phase III, Randomised, Double Blind, Placebo...","A Phase III, Randomised, Double Blind, Placebo...","[Germline BRCA1/2 Mutations and, Metastatic Ad...",November 2020,,AstraZeneca,Interventional,Approximately 145 patients will be randomised ...,[Olaparib],Treatment,Parallel Assignment,"[Adenocarcinoma, Pancreatic Neoplasms]",Key Inclusion Criteria\n\nHistologically or cy...,18 Years,130 Years,All
9997,NCT02181790,Addition of 308-nm Excimer Laser to Acitretin ...,Addition of 308-nm Excimer Laser to Acitretin ...,"[Psoriasis, Skin or Nails]",February 2021,Mark Lebwohl,Icahn School of Medicine at Mount Sinai,Interventional,This open label study evaluated whether a redu...,[Acitretin],Treatment,Parallel Assignment,[Psoriasis],Inclusion Criteria:\n\nMale or female subject ...,18 Years,,All
9998,NCT00265109,Open-Label Study of Levetiracetam in Body Dysm...,Open-Label Study of Levetiracetam in Body Dysm...,[Body Dysmorphic Disorder],August 2019,,Butler Hospital,Interventional,"Body dysmorphic disorder (BDD), a perceived de...",[Levetiracetam],Treatment,Single Group Assignment,"[Disease, Body Dysmorphic Disorders]",Inclusion Criteria:\n\nMen and women age 18-65...,18 Years,65 Years,All


In [139]:
treatments = studies_table.explode('interventions')[['study_id', 'interventions']]
treatments = pd.DataFrame(treatments.groupby('interventions')['study_id'].apply(list).apply(len)).reset_index()

In [140]:
treatment_table = treatments.sort_values(by=['study_id'], ascending=False).rename(columns={
    'study_id': 'no_studies',
    'interventions':'name'
}).reset_index(drop=True)
treatment_table['from_study'] = True

In [87]:
treatment_table.to_csv('int_treatments_db.csv')

## Groups Table

In [78]:
# We want to get the arm description and type, the baseline population numbers, and the
int_admins_sample = pd.read_pickle('int_admins_sample.pkl').reset_index(drop=True)

In [79]:
int_admins_sample['title_treats'] = [[z for z in y if z in x.lower()] for x,y in zip(int_admins_sample['title'], int_admins_sample['treatments'])]
title_treats_df = int_admins_sample.groupby('study_id')['title_treats'].apply(list).reset_index()
title_treats_df['title_treats_flat'] = [list(set([item for sublist in x for item in sublist])) for x in title_treats_df['title_treats']]
with_title_treats = int_admins_sample.merge(title_treats_df[['study_id', 'title_treats_flat']], 'inner', ['study_id'])
with_title_treats['subtracted'] = [[z for z in x if z not in y] for x,y in zip(with_title_treats['treatments'], with_title_treats['title_treats_flat'])]
with_title_treats['added'] = with_title_treats['subtracted'] + with_title_treats['title_treats']
adjusted_int_admins = with_title_treats.drop(columns=['title_treats', 'title_treats_flat', 'subtracted']).rename(columns={'added':'adjusted'})
adjusted_int_admins = adjusted_int_admins.drop_duplicates(['study_id', 'group_id', 'measure'])
adjusted_int_admins.head()

Unnamed: 0,study_id,group_id,measure,title,description,desc_plus_title,treatments,adjusted
0,NCT00262834,OG000,Number of Participants With Adverse Events,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],[vorinostat]
1,NCT00262834,OG000,Change in Tissue Proliferation After 3 Days of...,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],[vorinostat]
2,NCT00262834,OG001,Change in Tissue Proliferation After 3 Days of...,Tissue Only,Women who declined vorinostat but agreed to do...,Tissue Only Women who declined vorinostat but ...,[vorinostat],[]
3,NCT00262834,OG000,Change in Tissue Apoptosis After 3 Days of Tre...,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],[vorinostat]
4,NCT00262834,OG001,Change in Tissue Apoptosis After 3 Days of Tre...,Tissue Only,Women who declined vorinostat but agreed to do...,Tissue Only Women who declined vorinostat but ...,[vorinostat],[]


In [13]:
# Get the outcome groups
outcome_groups = adjusted_int_admins[['study_id', 'group_id', 'title', 'description']].drop_duplicates().reset_index(drop=True)

In [61]:
outcome_groups.to_csv('db_int_groups.csv') # There can be multiple titles for an id... wow

## Administrations Table

In [66]:
admin_table = adjusted_int_admins[['group_id', 'adjusted', 'description']].rename(columns={
    'adjusted': 'treatments'}).drop_duplicates(['group_id', 'description']).reset_index(drop=True)


In [88]:
treats = treatment_table.rename(columns={'name': 'treatments'})
treats['treatments'] = treats['treatments'].str.lower()

In [89]:
# We need to get the new treatments migrated over to our table
admin_treats = admin_table.explode('treatments').merge(treats, 'left')

In [90]:
treats_to_add = admin_treats[~admin_treats['treatments'].isnull() & admin_treats['no_studies'].isnull()]['treatments'].unique()

In [96]:
len(treats_to_add)

10897

In [102]:
no_studies_dummy = pd.Series(1, range(len(treats_to_add)))
from_study = pd.Series(False, range(len(treats_to_add)))
add_treats = pd.concat([pd.Series(treats_to_add), no_studies_dummy, from_study], axis=1).rename(columns={
    0: 'name',
    1: 'no_studies',
    2: 'from_study'
})

In [104]:
treatment_table = pd.concat([treatment_table, add_treats])

In [106]:
treatment_table.to_csv('int_treatments_db.csv')

In [None]:
# Now let's add on the 

## Effects Table

In [111]:
sample_int_studies(float('inf'), create_effects_table, 'new_effects')

writing study 0
writing study 1


In [120]:
effects_table['no_effected'].apply(int)

0         10
1         12
2          2
3          4
4          5
          ..
909182     0
909183     2
909184     0
909185     2
909186     0
Name: no_effected, Length: 909187, dtype: int64

In [117]:
effects_table['organ_system'].value_counts()

Gastrointestinal disorders                                             122097
Infections and infestations                                            102008
General disorders                                                       89102
Skin and subcutaneous tissue disorders                                  69214
Nervous system disorders                                                68276
Investigations                                                          66467
Respiratory, thoracic and mediastinal disorders                         64564
Musculoskeletal and connective tissue disorders                         61054
Metabolism and nutrition disorders                                      49236
Psychiatric disorders                                                   29424
Injury, poisoning and procedural complications                          28941
Eye disorders                                                           28137
Blood and lymphatic system disorders                            

## Upload to Database

### Studies

In [72]:
db_studies_table = studies_table[['study_id', 'verified_date', 'short_title', 'official_title', 'description', 'responsible_party', 'sponsor',
              'type', 'purpose', 'intervention_type', 'min_age', 'max_age', 'gender']].rename(columns={
    'study_id': 'id',
    'verified_date': 'upload_date',
})

In [73]:
db_studies_table['max_age_units'] = db_studies_table['max_age'].str.split(' ').apply(lambda x: x[1] if x != ['NA'] else 'NA')
db_studies_table['max_age_units'] = db_studies_table['max_age_units'].apply(lambda x: x + 's' if (x != 'NA' and x[-1] != 's') else x)
db_studies_table['max_age'] = db_studies_table['max_age'].str.split(' ').apply(lambda x: x[0] if x != ['NA'] else -1)


In [74]:
db_studies_table['min_age_units'] = db_studies_table['min_age'].str.split(' ').apply(lambda x: x[1] if x != ['NA'] else 'NA')
db_studies_table['min_age_units'] = db_studies_table['min_age_units'].apply(lambda x: x + 's' if (x != 'NA' and x[-1] != 's') else x)
db_studies_table['min_age'] = db_studies_table['min_age'].str.split(' ').apply(lambda x: x[0] if x != ['NA'] else -1)

In [75]:
db_studies_table['min_age'] = db_studies_table['min_age'].apply(int)
db_studies_table['max_age'] = db_studies_table['max_age'].apply(int)

In [76]:
db_studies_table = db_studies_table.set_index('id')

In [77]:
month_dict = {
    'January':1,
    'February':2,
    'March':3,
    'April':4,
    'May':5,
    'June':6,
    'July':7,
    'August':8,
    'September':9,
    'October':10,
    'November':11,
    'December':12,
}

db_studies_table['upload_date'] = db_studies_table['upload_date'].str.split(' ').apply(lambda x: x[-1]) + '-' + db_studies_table['upload_date'].str.split(' ').apply(lambda x: str(month_dict[x[0]])) + "-01" 

In [81]:
db_studies_table = db_studies_table[['upload_date', 'short_title', 'official_title', 'description', 'responsible_party', 'sponsor',
              'type', 'purpose', 'intervention_type', 'min_age', 'min_age_units', 'max_age', 'max_age_units', 'gender']]
db_studies_table['intervention_type'] = db_studies_table['intervention_type'].str.upper()
db_studies_table['intervention_type'] = db_studies_table['intervention_type'].str.replace(' ', '_')
db_studies_table['type'] = db_studies_table['type'].str.upper()
db_studies_table['type'] = db_studies_table['type'].str.replace(' ', '_')
db_studies_table['purpose'] = db_studies_table['purpose'].str.upper()
db_studies_table['purpose'] = db_studies_table['purpose'].str.replace(' ', '_')
db_studies_table['min_age_units'] = db_studies_table['min_age_units'].str.upper()
db_studies_table['max_age_units'] = db_studies_table['max_age_units'].str.upper()
db_studies_table['gender'] = db_studies_table['gender'].str.upper()

In [82]:
db_studies_table.head()

Unnamed: 0_level_0,upload_date,short_title,official_title,description,responsible_party,sponsor,type,purpose,intervention_type,min_age,min_age_units,max_age,max_age_units,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
NCT03266419,2019-10-01,Effectiveness of Deep Versus Moderate Neuromus...,"A Double-blind, Randomized, Parallel Design to...",The aim of this study is to evaluate the influ...,Byung-Moon Choi,Asan Medical Center,INTERVENTIONAL,TREATMENT,PARALLEL_ASSIGNMENT,20,YEARS,65,YEARS,ALL
NCT03262441,2020-11-01,MMF for HIV Reservoir Reduction,Mycophenolate Mofetil Therapy for Reduction of...,"This is an open label, randomized Phase II stu...",Joshua Schiffer,Fred Hutchinson Cancer Research Center,INTERVENTIONAL,TREATMENT,SINGLE_GROUP_ASSIGNMENT,18,YEARS,65,YEARS,ALL
NCT03260894,2020-8-01,Pembrolizumab (MK-3475) Plus Epacadostat vs St...,"A Randomized, Open-Label, Phase 3 Study to Eva...",The purpose of this study was to evaluate the ...,,Incyte Corporation,INTERVENTIONAL,TREATMENT,PARALLEL_ASSIGNMENT,18,YEARS,-1,,ALL
NCT03267940,2020-1-01,Study of PEGPH20 With Cisplatin (CIS) and Gemc...,"A Phase 1B, Randomized, Open-Label Study of PE...",The study is being conducted to assess the saf...,,Halozyme Therapeutics,INTERVENTIONAL,TREATMENT,SEQUENTIAL_ASSIGNMENT,18,YEARS,-1,,ALL
NCT03264157,2020-1-01,Safety and Effectiveness of BPL HRIG With Acti...,"A Prospective, Randomized, Double-Blind Parall...","A prospective, randomized, blinded, parallel-g...",,Bio Products Laboratory,INTERVENTIONAL,TREATMENT,PARALLEL_ASSIGNMENT,18,YEARS,75,YEARS,ALL


In [83]:
db_studies_table.to_csv('db_int_studies.csv', header=False)

### Conditions

### Measures

In [107]:
measures_table_0 = pd.read_pickle('int_measures_0.pkl')
measures_table_1 = pd.read_pickle('int_measures_1.pkl')
measures_table_2 = pd.read_pickle('int_measures_2.pkl')

measures_table = pd.concat([measures_table_0, measures_table_1, measures_table_2]).reset_index(drop=True)

In [111]:
db_measures_table = measures_table.rename(columns={
    'study_id': 'study',
    'measure': 'title',
    'dispersion_param': 'dispersion',
    'measure_param': 'param'
}).rename_axis(['id'], axis=0)

In [120]:
dispersion_map = {
    'Standard Deviation': 'STANDARD_DEVIATION',
    '95% Confidence Interval': 'CONFIDENCE_INTERVAL_95',
    'Standard Error': 'STANDARD_ERROR',
    'Full Range': 'FULL_RANGE',
    'Geometric Coefficient of Variation': 'GEOMETRIC_COEFFICIENT_OF_VARIATION',
    'Inter-Quartile Range': 'INTER_QUARTILE_RANGE',
    '90% Confidence Interval': 'CONFIDENCE_INTERVAL_90',
    '80% Confidence Interval': 'CONFIDENCE_INTERVAL_80',
    '97% Confidence Interval': 'CONFIDENCE_INTERVAL_97',
    '99% Confidence Interval': 'CONFIDENCE_INTERVAL_99',
    '60% Confidence Interval': 'CONFIDENCE_INTERVAL_60',
    '96% Confidence Interval': 'CONFIDENCE_INTERVAL_96',
    '98% Confidence Interval': 'CONFIDENCE_INTERVAL_98',
    '70% Confidence Interval': 'CONFIDENCE_INTERVAL_70',
    '85% Confidence Interval': 'CONFIDENCE_INTERVAL_85',
    '75% Confidence Interval': 'CONFIDENCE_INTERVAL_75',
    '94% Confidence Interval': 'CONFIDENCE_INTERVAL_94',
    '100% Confidence Interval': 'CONFIDENCE_INTERVAL_100',
    'NA': 'NA'
}

db_measures_table['dispersion'] = db_measures_table['dispersion'].apply(lambda x: x if '.' not in x else x[:x.index('.')] + x[x.index('%'):]).apply(lambda x: dispersion_map[x])

In [134]:
db_measures_table['param'] = db_measures_table['param'].str.upper().str.replace(' ', '_')

In [127]:
measure_type_map = {
    'Primary': 'PRIMARY',
    'Secondary': 'SECONDARY',
    'Other Pre-specified': 'OTHER',
    'Post-Hoc': 'OTHER'
}


db_measures_table['type'] = db_measures_table['type'].apply(lambda x: measure_type_map[x])

In [129]:
db_measures_table = db_measures_table[['study', 'title', 'description', 'dispersion',  'type', 'param', 'units']]

In [138]:
db_measures_table['description'].str.len().max()

1004

In [130]:
db_measures_table.head()

Unnamed: 0_level_0,study,title,description,dispersion,type,param,units
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,NCT03266419,Minimum Effective Analgesic Dose (MEAD) of Oxy...,The patient was administered intravenous oxyco...,FULL_RANGE,PRIMARY,MEDIAN,mg
1,NCT03266419,Mean Visual Analogue Scale (VAS) Score for Wou...,The patient was administered intravenous oxyco...,FULL_RANGE,SECONDARY,MEAN,mm
2,NCT03262441,Change in Cell-associated HIV DNA (Ca-DNA) Lev...,Regression slope of change in cell-associated ...,CONFIDENCE_INTERVAL_95,PRIMARY,MEAN,log10 caDNA copies per 10^6 T-cells/week
3,NCT03262441,Change in Cell-associated HIV DNA (Ca-DNA) Lev...,Regression slope of change in cell-associated ...,CONFIDENCE_INTERVAL_95,PRIMARY,MEAN,log10 caDNA copies per 10^6 T-cells/week
4,NCT03262441,Change in Cell-associated Intact HIV DNA (Ca-i...,Regression slope of change in cell-associated ...,CONFIDENCE_INTERVAL_95,PRIMARY,MEAN,log10 caDNA copies per 10^6 T-cells/week


In [136]:
db_measures_table.to_csv('db_int_measures.csv', header=False)

### Treatments

In [45]:
treatments = studies_table.explode('interventions')[['study_id', 'interventions']]
treatments = pd.DataFrame(treatments.groupby('interventions')['study_id'].apply(list).apply(len)).reset_index()
treatment_table = treatments.sort_values(by=['study_id'], ascending=False).rename(columns={
    'study_id': 'no_studies',
    'interventions':'name'
}).reset_index(drop=True)
treatment_table['from_study'] = True

In [6]:
analytics_table = pd.read_pickle('intervention_analytics.pkl')

In [22]:
treats_a = analytics_table.explode('treatments_a')['treatments_a']
treats_b = analytics_table.explode('treatments_b')['treatments_b']
treats = pd.Series(pd.concat([treats_a, treats_b]).unique())

In [23]:
treats = treats.str.capitalize()

In [33]:
treats = pd.Series([x for x in treats if x not in set(treatment_table['name'])])

In [46]:
treats_df = pd.DataFrame(treats)

In [47]:
treats_df['from_study'] = False

In [48]:
treats_df['no_studies'] = -1

In [49]:
treats_df = treats_df.rename(columns={0: 'name'})

In [50]:
treatment_table = pd.concat([treatment_table, treats_df])

In [54]:
treatment_table = treatment_table.rename(columns = {
    'name': 'title'
})[['title', 'from_study', 'no_studies']]

In [59]:
treatment_table = treatment_table.reset_index(drop=True)

In [60]:
treatment_table.to_csv('db_int_treatments.csv', header=False)

### Groups

In [61]:
groups_table = pd.read_csv('db_int_groups.csv')

In [66]:
groups_table = groups_table.rename_axis(['id'], axis=0).drop(columns=['Unnamed: 0'])

In [68]:
groups_table = groups_table.rename(columns={
    'study_id': 'study',
    'group_id': 'study_id',
})[['title', 'study_id', 'description', 'study']]

In [71]:
groups_table.to_csv('db_int_groups.csv', header=False)

### Administration

In [81]:
adjusted_int_admins.head()

Unnamed: 0,study_id,group_id,measure,title,description,desc_plus_title,treatments,adjusted
0,NCT00262834,OG000,Number of Participants With Adverse Events,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],[vorinostat]
1,NCT00262834,OG000,Change in Tissue Proliferation After 3 Days of...,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],[vorinostat]
2,NCT00262834,OG001,Change in Tissue Proliferation After 3 Days of...,Tissue Only,Women who declined vorinostat but agreed to do...,Tissue Only Women who declined vorinostat but ...,[vorinostat],[]
3,NCT00262834,OG000,Change in Tissue Apoptosis After 3 Days of Tre...,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],[vorinostat]
4,NCT00262834,OG001,Change in Tissue Apoptosis After 3 Days of Tre...,Tissue Only,Women who declined vorinostat but agreed to do...,Tissue Only Women who declined vorinostat but ...,[vorinostat],[]


In [89]:
# Merge with treatments to get the id
expl_admins = adjusted_int_admins.explode('adjusted')

In [91]:
expl_admins['adjusted'] = expl_admins['adjusted'].str.capitalize()
expl_admins.head()

Unnamed: 0,study_id,group_id,measure,title,description,desc_plus_title,treatments,adjusted
0,NCT00262834,OG000,Number of Participants With Adverse Events,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],Vorinostat
1,NCT00262834,OG000,Change in Tissue Proliferation After 3 Days of...,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],Vorinostat
2,NCT00262834,OG001,Change in Tissue Proliferation After 3 Days of...,Tissue Only,Women who declined vorinostat but agreed to do...,Tissue Only Women who declined vorinostat but ...,[vorinostat],
3,NCT00262834,OG000,Change in Tissue Apoptosis After 3 Days of Tre...,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],Vorinostat
4,NCT00262834,OG001,Change in Tissue Apoptosis After 3 Days of Tre...,Tissue Only,Women who declined vorinostat but agreed to do...,Tissue Only Women who declined vorinostat but ...,[vorinostat],


In [94]:
treatment_table.head()

Unnamed: 0,title,from_study,no_studies
0,Cyclophosphamide,True,678
1,Vaccines,True,644
2,Paclitaxel,True,621
3,Bevacizumab,True,607
4,Dexamethasone,True,542


In [103]:
treatment_merge = treatment_table.rename_axis(['id'], axis=0).rename(columns={
    'title': 'adjusted'
})


In [109]:
treatment_merge['treatment'] = treatment_merge.index

In [113]:
admins_treats = expl_admins.merge(treatment_merge[['adjusted', 'treatment']], 'left', ['adjusted'])

In [120]:
# losing a little less than 1% but still not good practice
admins_treats = admins_treats[~admins_treats['treatment'].isna()]

In [125]:
admins_treats.head()

Unnamed: 0,study_id,group_id,measure,title,description,desc_plus_title,treatments,adjusted,treatment
0,NCT00262834,OG000,Number of Participants With Adverse Events,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],Vorinostat,111.0
1,NCT00262834,OG000,Change in Tissue Proliferation After 3 Days of...,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],Vorinostat,111.0
2,NCT00262834,OG001,Change in Tissue Proliferation After 3 Days of...,Tissue Only,Women who declined vorinostat but agreed to do...,Tissue Only Women who declined vorinostat but ...,[vorinostat],,2182.0
3,NCT00262834,OG000,Change in Tissue Apoptosis After 3 Days of Tre...,Vorinostat,Women in the vorinostat group were scheduled t...,Vorinostat Women in the vorinostat group were ...,[vorinostat],Vorinostat,111.0
4,NCT00262834,OG001,Change in Tissue Apoptosis After 3 Days of Tre...,Tissue Only,Women who declined vorinostat but agreed to do...,Tissue Only Women who declined vorinostat but ...,[vorinostat],,2182.0


In [123]:
groups_table['group'] = groups_table.index

In [126]:
groups_merge = groups_table.rename(columns = {
    'study_id': 'group_id',
    'study': 'study_id'
})[['title', 'group_id', 'study_id', 'group']]

In [132]:
admin_db = admins_treats.merge(groups_merge, 'left', ['study_id', 'group_id', 'title'])[['group', 'treatment', 'description']].reset_index(drop=True).drop_duplicates()

In [135]:
admin_db['treatment'] = admin_db['treatment'].apply(int)

In [138]:
admin_db['description'].str.len().max()

1490

In [137]:
admin_db.to_csv('db_int_admin.csv', header=False)

### Outcomes

In [25]:
outcomes_0 = pd.read_pickle('int_new_outcomes_0.pkl')
outcomes_1 = pd.read_pickle('int_new_outcomes_1.pkl')
outcomes_2 = pd.read_pickle('int_new_outcomes_2.pkl')

outcomes = pd.concat([outcomes_0, outcomes_1, outcomes_2]).reset_index(drop=True)

In [26]:
outcomes

Unnamed: 0,study_id,group_title,group_no,measure,title,value,dispersion,upper,lower,participants
0,NCT03266419,Deep NMB Using Rocuronium,OG000,Minimum Effective Analgesic Dose (MEAD) of Oxy...,,8.0,,26,2,51
1,NCT03266419,Moderate NMB Using Rocuronium,OG001,Minimum Effective Analgesic Dose (MEAD) of Oxy...,,8.0,,27,2,49
2,NCT03266419,Deep NMB Using Rocuronium,OG000,Mean Visual Analogue Scale (VAS) Score for Wou...,,6.1,,8.4,2.8,51
3,NCT03266419,Moderate NMB Using Rocuronium,OG001,Mean Visual Analogue Scale (VAS) Score for Wou...,,6.1,,7.3,2.7,49
4,NCT03262441,Mycophenolate Mofetil,OG000,Change in Cell-associated HIV DNA (Ca-DNA) Lev...,,-0.00033,,0.0014,-0.0020,4
...,...,...,...,...,...,...,...,...,...,...
1707752,NCT03031496,Treatment B,OG001,Body Temperature Values at Indicated Time Points,"P2; Day1; 8 hour post-dose; n= 19, 19",36.65,0.399,,,19
1707753,NCT03031496,Treatment A,OG000,Body Temperature Values at Indicated Time Points,"P2; Day2; 24 hour post-dose; n= 19, 19",36.15,0.301,,,19
1707754,NCT03031496,Treatment B,OG001,Body Temperature Values at Indicated Time Points,"P2; Day2; 24 hour post-dose; n= 19, 19",36.17,0.437,,,19
1707755,NCT03031496,Treatment A,OG000,Body Temperature Values at Indicated Time Points,"P2; Day3; 48 hour post-dose; n= 18, 19",36.08,0.323,,,18


In [27]:
groups = pd.read_csv('meditreats_public_groups.csv')

In [29]:
groups.head()

Unnamed: 0,id,title,study_id,description,study
0,0,Vorinostat,OG000,Women in the vorinostat group were scheduled t...,NCT00262834
1,1,Tissue Only,OG001,Women who declined vorinostat but agreed to do...,NCT00262834
2,2,Arm I,OG000,Patients receive oral vorinostat twice daily o...,NCT00262834
3,3,Low CIWA Flumazenil/Gabapentin,OG000,2 mg flumazenil given over 20 minutes on Day 1...,NCT00262639
4,4,Low CIWAar Placebo,OG001,20 mg Saline infused slowly over 20 minutes. P...,NCT00262639


In [39]:
groups[groups['study'] == 'NCT02460380']

Unnamed: 0,id,title,study_id,description,study
39489,39360,Vitamin D3,OG000,Women allocated to vitamin D3 group received o...,NCT02460380
39490,39361,Placebo,OG001,Women in the placebo group received once capsu...,NCT02460380


In [30]:
groups_merge = groups.rename(columns={
    'title': 'group_title',
    'study': 'study_id',
    'study_id': 'group_no'
})

In [38]:
groups_merge.head()

Unnamed: 0,id,group_title,group_no,description,study_id
0,0,Vorinostat,OG000,Women in the vorinostat group were scheduled t...,NCT00262834
1,1,Tissue Only,OG001,Women who declined vorinostat but agreed to do...,NCT00262834
2,2,Arm I,OG000,Patients receive oral vorinostat twice daily o...,NCT00262834
3,3,Low CIWA Flumazenil/Gabapentin,OG000,2 mg flumazenil given over 20 minutes on Day 1...,NCT00262639
4,4,Low CIWAar Placebo,OG001,20 mg Saline infused slowly over 20 minutes. P...,NCT00262639


In [32]:
merged = outcomes.merge(groups_merge[['group_title', 'study_id', 'id', 'group_no']], 'left', ['study_id', 'group_title', 'group_no']).drop_duplicates()



In [40]:
outcome_table = merged[~merged['id'].isna()]

In [43]:
# Now let's merge in the meeasures 
measures = pd.read_csv('meditreats_public_measures.csv')

In [47]:
measures.head()

Unnamed: 0,id,study,title,description,dispersion,type,param,units
0,278,NCT03322566,Duration of Response of Pembrolizumab + Chemot...,Defined as the time from the earliest date of ...,CONFIDENCE_INTERVAL_95,SECONDARY,MEDIAN,months
1,279,NCT03322566,Safety and Tolerability of Pembrolizumab + Che...,An AE is defined as any untoward medical occur...,,SECONDARY,COUNT_OF_PARTICIPANTS,Participants
2,280,NCT03322566,Safety and Tolerability of Pembrolizumab + Che...,An AE is defined as any untoward medical occur...,,SECONDARY,COUNT_OF_PARTICIPANTS,Participants
3,281,NCT03324451,Change in Glycosylated Hemoglobin (HbA1c) Levels,Collected from blood test to assess the Glycos...,STANDARD_DEVIATION,PRIMARY,MEAN,percent of glycosylated Hb
4,282,NCT03324451,Change in Diabetes Empowerment Process,A 13-item Chinese version of the Diabetes Empo...,STANDARD_DEVIATION,SECONDARY,MEAN,score on a scale


In [52]:
measures_merge = measures.rename(columns = {
    'study': 'study_id',
    'title': 'measure'
})[['id', 'study_id', 'measure']].drop_duplicates(['study_id', 'measure'])

In [54]:
outcome_table = outcome_table.merge(measures_merge, 'left', ['study_id', 'measure'])

In [56]:
outcome_table.head()

Unnamed: 0,study_id,group_title,group_no,measure,title,value,dispersion,upper,lower,participants,id_x,id_y
0,NCT03266419,Deep NMB Using Rocuronium,OG000,Minimum Effective Analgesic Dose (MEAD) of Oxy...,,8.0,,26.0,2.0,51,35418.0,0
1,NCT03266419,Moderate NMB Using Rocuronium,OG001,Minimum Effective Analgesic Dose (MEAD) of Oxy...,,8.0,,27.0,2.0,49,35419.0,0
2,NCT03266419,Deep NMB Using Rocuronium,OG000,Mean Visual Analogue Scale (VAS) Score for Wou...,,6.1,,8.4,2.8,51,35418.0,1
3,NCT03266419,Moderate NMB Using Rocuronium,OG001,Mean Visual Analogue Scale (VAS) Score for Wou...,,6.1,,7.3,2.7,49,35419.0,1
4,NCT03262441,Mycophenolate Mofetil,OG000,Change in Cell-associated HIV DNA (Ca-DNA) Lev...,,-0.00033,,0.0014,-0.002,4,35420.0,2


In [60]:
outcome_db = outcome_table.rename(columns={
    'study_id': 'study',
    'id_x': 'group',
    'id_y': 'measure',
    'measure': 'measure_title',
    'participants': 'no_participants'
})[['study', 'group', 'measure', 'title', 'value', 'dispersion', 'upper', 'lower', 'no_participants']]

In [61]:
outcome_db['group'] = outcome_db['group'].apply(int)

In [17]:
def string2float(string):
    try:
        return float(string.replace(',', ''))
    except Exception as e:
        return float('nan')
    
    
def string2int(string):
    try:
        return int(string.replace(',',''))
    except Exception as e:
        return -1
    

In [84]:
outcome_db.head()

Unnamed: 0,study,group,measure,title,value,dispersion,upper,lower,no_participants
0,NCT03266419,35418,0,,8.0,,26.0,2.0,51
1,NCT03266419,35419,0,,8.0,,27.0,2.0,49
2,NCT03266419,35418,1,,6.1,,8.4,2.8,51
3,NCT03266419,35419,1,,6.1,,7.3,2.7,49
4,NCT03262441,35420,2,,-0.00033,,0.0014,-0.002,4


In [67]:
outcome_db['value'] = outcome_db['value'].apply(string2float)

In [69]:
outcome_db['dispersion'] = outcome_db['dispersion'].apply(string2float)

In [71]:
outcome_db['upper'] = outcome_db['upper'].apply(string2float)

In [74]:
outcome_db['lower'] = outcome_db['lower'].apply(string2float)

In [78]:
outcome_db['no_participants'] = outcome_db['no_participants'].apply(string2int)

In [89]:
outcome_db.to_csv('int_db_outcome.csv', header=False)

In [88]:
outcome_db['no_participants']

0          51
1          49
2          51
3          49
4           4
           ..
2023779    19
2023780    19
2023781    19
2023782    18
2023783    19
Name: no_participants, Length: 2023784, dtype: int64

In [42]:
# We're going to have to do a direct connection due to the nans in the float column
from sqlalchemy import create_engine

db = create_engine("postgresql://meditreats:meditreats@localhost:5432/meditreats")

In [99]:
outcome_db.to_sql('outcomes', db, index=False, if_exists='append')

### Analytics Table

In [2]:
# This is going to be a pain
# There aare so little studies that can actually be analyzed with this current method...

non_study_analytics = pd.read_pickle('int_analytics.pkl')
non_study_analytics = non_study_analytics[non_study_analytics['pval'] > 0].reset_index(drop=True)

In [3]:
non_study_analytics.head()

Unnamed: 0,study_id,measure,groups,description,method,param_type,fromStudy,pval,group_titles
0,NCT00262639,Percent Days Abstinent,"[OG001, OG003]",,t-test,?,False,0.236682,"{'OG001': 'Low CIWAar Placebo', 'OG003': 'High..."
1,NCT00262639,Percent Days Abstinent,"[OG000, OG003]",,t-test,?,False,0.014809,"{'OG000': 'Low CIWA Flumazenil/Gabapentin', 'O..."
2,NCT00262639,Percent Days Abstinent,"[OG000, OG001]",,t-test,?,False,0.02979,"{'OG000': 'Low CIWA Flumazenil/Gabapentin', 'O..."
3,NCT00262639,Percent Days Abstinent,"[OG002, OG003]",,t-test,?,False,0.084525,"{'OG002': 'High CIWAar Placebo', 'OG003': 'Hig..."
4,NCT00262639,Percent Days Abstinent,"[OG002, OG001]",,t-test,?,False,0.247262,"{'OG002': 'High CIWAar Placebo', 'OG001': 'Low..."


In [4]:
analytics_0 = pd.read_pickle('int_new_analytics_0.pkl')
analytics_1 = pd.read_pickle('int_new_analytics_1.pkl')
analytics_2 = pd.read_pickle('int_new_analytics_2.pkl')

study_analytics = pd.concat([analytics_0, analytics_1, analytics_2]).reset_index(drop=True)

In [5]:
study_analytics.head()

Unnamed: 0,study_id,measure,groups,description,method,param_type,fromStudy,pval,group_titles,is_non_inferiority,non_inferiority_type,non_inferiority_comment,param_value,ci_pct,ci_lower,ci_upper
0,NCT03262441,Change in Cell-associated HIV DNA (Ca-DNA) Lev...,[OG000],,,Slope,True,,{'OG000': 'Mycophenolate Mofetil'},,Other,,-0.00033,95,-0.002,0.0014
1,NCT03262441,Change in Cell-associated HIV DNA (Ca-DNA) Lev...,[OG000],,,Slope,True,,{'OG000': 'Mycophenolate Mofetil'},,Other,,0.001,95,-0.0036,0.0056
2,NCT03262441,Change in Cell-associated Intact HIV DNA (Ca-i...,[OG000],,,Slope,True,,{'OG000': 'Mycophenolate Mofetil'},,Other,,0.0024,95,-0.003,0.0078
3,NCT03264157,Proportion of Subjects With Anti-rabies Antibo...,"[OG000, OG001]",,Farrington and Manning test,lower 95% CI,True,0.0006,"{'OG000': 'BPL HRIG + RabAvert', 'OG001': 'Com...",,Non-Inferiority,The null hypothesis is p-p0 ≤ -0.1. The altern...,-0.05,95,-0.05,
4,NCT03264157,Analysis of AUC0-7d,"[OG000, OG001]",,,lower 95% CI,True,,"{'OG000': 'BPL HRIG + RabAvert', 'OG001': 'Com...",,Non-Inferiority,The prespecified non inferiority margin was 20...,0.74,95,0.74,0.94


In [6]:
# 1. Get both in the same table
# 2. Merge in the measures 
non_study_analytics['is_non_inferiority'] = False
non_study_analytics['non_inferiority_type'] = 'NA'
non_study_analytics['non_inferiority_comment'] = 'NA'
non_study_analytics['param_value'] = float('nan')
non_study_analytics['ci_pct'] = -1
non_study_analytics['ci_lower'] = -1
non_study_analytics['ci_upper'] = -1

non_study_analytics.head()

Unnamed: 0,study_id,measure,groups,description,method,param_type,fromStudy,pval,group_titles,is_non_inferiority,non_inferiority_type,non_inferiority_comment,param_value,ci_pct,ci_lower,ci_upper
0,NCT00262639,Percent Days Abstinent,"[OG001, OG003]",,t-test,?,False,0.236682,"{'OG001': 'Low CIWAar Placebo', 'OG003': 'High...",False,,,,-1,-1,-1
1,NCT00262639,Percent Days Abstinent,"[OG000, OG003]",,t-test,?,False,0.014809,"{'OG000': 'Low CIWA Flumazenil/Gabapentin', 'O...",False,,,,-1,-1,-1
2,NCT00262639,Percent Days Abstinent,"[OG000, OG001]",,t-test,?,False,0.02979,"{'OG000': 'Low CIWA Flumazenil/Gabapentin', 'O...",False,,,,-1,-1,-1
3,NCT00262639,Percent Days Abstinent,"[OG002, OG003]",,t-test,?,False,0.084525,"{'OG002': 'High CIWAar Placebo', 'OG003': 'Hig...",False,,,,-1,-1,-1
4,NCT00262639,Percent Days Abstinent,"[OG002, OG001]",,t-test,?,False,0.247262,"{'OG002': 'High CIWAar Placebo', 'OG001': 'Low...",False,,,,-1,-1,-1


In [7]:
total_analytics = pd.concat([study_analytics, non_study_analytics]).reset_index(drop=True)

In [8]:
measures = pd.read_csv('meditreats_public_measures.csv')

In [9]:
measures.head()

Unnamed: 0,id,study,title,description,dispersion,type,param,units
0,278,NCT03322566,Duration of Response of Pembrolizumab + Chemot...,Defined as the time from the earliest date of ...,CONFIDENCE_INTERVAL_95,SECONDARY,MEDIAN,months
1,279,NCT03322566,Safety and Tolerability of Pembrolizumab + Che...,An AE is defined as any untoward medical occur...,,SECONDARY,COUNT_OF_PARTICIPANTS,Participants
2,280,NCT03322566,Safety and Tolerability of Pembrolizumab + Che...,An AE is defined as any untoward medical occur...,,SECONDARY,COUNT_OF_PARTICIPANTS,Participants
3,281,NCT03324451,Change in Glycosylated Hemoglobin (HbA1c) Levels,Collected from blood test to assess the Glycos...,STANDARD_DEVIATION,PRIMARY,MEAN,percent of glycosylated Hb
4,282,NCT03324451,Change in Diabetes Empowerment Process,A 13-item Chinese version of the Diabetes Empo...,STANDARD_DEVIATION,SECONDARY,MEAN,score on a scale


In [10]:
measures_merge = measures[['id', 'study', 'title']].rename(columns={
    'study': 'study_id',
    'title': 'measure'
})

In [11]:
len(total_analytics)

1003970

In [12]:
total_analytics = total_analytics.merge(measures_merge, 'left', ['study_id', 'measure']).drop(columns=['measure']).rename(columns={
    'id': 'measure'
})

In [13]:
db_analytics = total_analytics.rename(columns={
    'study_id': 'study',
    'id': 'measure',
    'fromStudy': 'from_study',
    'pval': 'p_value',
})[['study', 'measure', 'from_study', 'method', 'p_value', 'param_type', 'is_non_inferiority', 'non_inferiority_type', 
    'non_inferiority_comment', 'param_value', 'ci_pct', 'ci_lower', 'ci_upper', 'groups', 'group_titles']]

In [18]:
db_analytics['param_value'] = db_analytics['param_value'].apply(str)
db_analytics['param_value'] = db_analytics['param_value'].apply(string2float)

In [19]:
db_analytics['ci_pct'] = db_analytics['ci_pct'].apply(str)
db_analytics['ci_pct'] = db_analytics['ci_pct'].apply(string2int)

In [20]:
db_analytics['ci_lower'] = db_analytics['ci_lower'].apply(str)
db_analytics['ci_lower'] = db_analytics['ci_lower'].apply(string2float)

In [21]:
db_analytics['ci_upper'] = db_analytics['ci_upper'].apply(str)
db_analytics['ci_upper'] = db_analytics['ci_upper'].apply(string2float)

In [22]:
db_analytics['p_value'] = db_analytics['p_value'].apply(str)
db_analytics['p_value'] = db_analytics['p_value'].apply(string2float)

In [23]:
db_analytics['is_non_inferiority'] = db_analytics['is_non_inferiority'].apply(lambda x: False if (x=='NA') or (not x) else True)

In [24]:
db_analytics['non_inferiority_type'] = db_analytics['non_inferiority_type'].str.upper().str.replace('-','_',regex=False).str.replace(' ','_', regex=False).str.replace('(','', regex=False).str.replace(')', '', regex=False)


In [25]:
db_analytics['id'] = db_analytics.index.to_series()

In [26]:
db_analytics

Unnamed: 0,study,measure,from_study,method,p_value,param_type,is_non_inferiority,non_inferiority_type,non_inferiority_comment,param_value,ci_pct,ci_lower,ci_upper,groups,group_titles,id
0,NCT03262441,2,True,,,Slope,False,OTHER,,-0.00033,95,-0.0020,0.0014,[OG000],{'OG000': 'Mycophenolate Mofetil'},0
1,NCT03262441,3,True,,,Slope,False,OTHER,,0.00100,95,-0.0036,0.0056,[OG000],{'OG000': 'Mycophenolate Mofetil'},1
2,NCT03262441,4,True,,,Slope,False,OTHER,,0.00240,95,-0.0030,0.0078,[OG000],{'OG000': 'Mycophenolate Mofetil'},2
3,NCT03264157,32,True,Farrington and Manning test,0.000600,lower 95% CI,False,NON_INFERIORITY,The null hypothesis is p-p0 ≤ -0.1. The altern...,-0.05000,95,-0.0500,,"[OG000, OG001]","{'OG000': 'BPL HRIG + RabAvert', 'OG001': 'Com...",3
4,NCT03264157,33,True,,,lower 95% CI,False,NON_INFERIORITY,The prespecified non inferiority margin was 20...,0.74000,95,0.7400,0.9400,"[OG000, OG001]","{'OG000': 'BPL HRIG + RabAvert', 'OG001': 'Com...",4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037249,NCT03031496,226580,False,t-test,0.554490,?,False,,,,-1,-1.0000,-1.0000,"[OG000, OG001]","{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",1037249
1037250,NCT03031496,226583,False,t-test,0.312255,?,False,,,,-1,-1.0000,-1.0000,"[OG000, OG001]","{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",1037250
1037251,NCT03031496,226583,False,t-test,0.751883,?,False,,,,-1,-1.0000,-1.0000,"[OG000, OG001]","{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",1037251
1037252,NCT03031496,226583,False,t-test,0.345805,?,False,,,,-1,-1.0000,-1.0000,"[OG000, OG001]","{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",1037252


In [89]:
db_analytics.drop(columns=['groups', 'group_titles']).to_sql('analytics', db, index=False, if_exists='append')

### Comparisons

In [27]:
db_analytics.head()

Unnamed: 0,study,measure,from_study,method,p_value,param_type,is_non_inferiority,non_inferiority_type,non_inferiority_comment,param_value,ci_pct,ci_lower,ci_upper,groups,group_titles,id
0,NCT03262441,2,True,,,Slope,False,OTHER,,-0.00033,95,-0.002,0.0014,[OG000],{'OG000': 'Mycophenolate Mofetil'},0
1,NCT03262441,3,True,,,Slope,False,OTHER,,0.001,95,-0.0036,0.0056,[OG000],{'OG000': 'Mycophenolate Mofetil'},1
2,NCT03262441,4,True,,,Slope,False,OTHER,,0.0024,95,-0.003,0.0078,[OG000],{'OG000': 'Mycophenolate Mofetil'},2
3,NCT03264157,32,True,Farrington and Manning test,0.0006,lower 95% CI,False,NON_INFERIORITY,The null hypothesis is p-p0 ≤ -0.1. The altern...,-0.05,95,-0.05,,"[OG000, OG001]","{'OG000': 'BPL HRIG + RabAvert', 'OG001': 'Com...",3
4,NCT03264157,33,True,,,lower 95% CI,False,NON_INFERIORITY,The prespecified non inferiority margin was 20...,0.74,95,0.74,0.94,"[OG000, OG001]","{'OG000': 'BPL HRIG + RabAvert', 'OG001': 'Com...",4


In [28]:
analytics_group = db_analytics[['id', 'group_titles', 'study']]

In [29]:
analytics_group['group_title'] = analytics_group['group_titles'].apply(lambda x: [(k,v) for k,v in x.items()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analytics_group['group_title'] = analytics_group['group_titles'].apply(lambda x: [(k,v) for k,v in x.items()])


In [30]:
exploded = analytics_group.explode('group_title')

In [31]:
exploded['study_id'] = exploded['group_title'].apply(lambda x: x[0])

In [32]:
exploded['title'] = exploded['group_title'].apply(lambda x: x[1])

In [33]:
exploded

Unnamed: 0,id,group_titles,study,group_title,study_id,title
0,0,{'OG000': 'Mycophenolate Mofetil'},NCT03262441,"(OG000, Mycophenolate Mofetil)",OG000,Mycophenolate Mofetil
1,1,{'OG000': 'Mycophenolate Mofetil'},NCT03262441,"(OG000, Mycophenolate Mofetil)",OG000,Mycophenolate Mofetil
2,2,{'OG000': 'Mycophenolate Mofetil'},NCT03262441,"(OG000, Mycophenolate Mofetil)",OG000,Mycophenolate Mofetil
3,3,"{'OG000': 'BPL HRIG + RabAvert', 'OG001': 'Com...",NCT03264157,"(OG000, BPL HRIG + RabAvert)",OG000,BPL HRIG + RabAvert
3,3,"{'OG000': 'BPL HRIG + RabAvert', 'OG001': 'Com...",NCT03264157,"(OG001, Comparator HyperRab + RabAvert)",OG001,Comparator HyperRab + RabAvert
...,...,...,...,...,...,...
1037251,1037251,"{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",NCT03031496,"(OG001, Treatment B)",OG001,Treatment B
1037252,1037252,"{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",NCT03031496,"(OG000, Treatment A)",OG000,Treatment A
1037252,1037252,"{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",NCT03031496,"(OG001, Treatment B)",OG001,Treatment B
1037253,1037253,"{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",NCT03031496,"(OG000, Treatment A)",OG000,Treatment A


In [34]:
admin_merge = pd.read_csv('admins_join_groups.csv')
admin_merge.head()

Unnamed: 0,id,group,treatment,description,id.1,title,study_id,description.1,study
0,24766,1922,428,Participants in cohort 3 received once 1- to 4...,1922,Cohort 3 Brexpiprazole 4mg,OG002,Participants in cohort 3 received 1- to 4-mg d...,NCT01854944
1,24184,1906,2516,2 subcutaneous injections of Dupilumab 300 mg ...,1906,Dupilumab 300 mg q4w,OG003,2 subcutaneous injections of Dupilumab 300 mg ...,NCT01854047
2,24185,1906,103,2 subcutaneous injections of Dupilumab 300 mg ...,1906,Dupilumab 300 mg q4w,OG003,2 subcutaneous injections of Dupilumab 300 mg ...,NCT01854047
3,24186,1906,2403,2 subcutaneous injections of Dupilumab 300 mg ...,1906,Dupilumab 300 mg q4w,OG003,2 subcutaneous injections of Dupilumab 300 mg ...,NCT01854047
4,24187,1906,2517,2 subcutaneous injections of Dupilumab 300 mg ...,1906,Dupilumab 300 mg q4w,OG003,2 subcutaneous injections of Dupilumab 300 mg ...,NCT01854047


In [35]:
comp_table = admin_merge[['group', 'study_id', 'title', 'study']].drop_duplicates()
comp_table

Unnamed: 0,group,study_id,title,study
0,1922,OG002,Cohort 3 Brexpiprazole 4mg,NCT01854944
1,1906,OG003,Dupilumab 300 mg q4w,NCT01854047
6,1918,OG000,Cohort 1 - Brexpiprazole 4 mg,NCT01854944
8,1928,OG001,Mometasone Furoate 0.1%,NCT01856543
11,1932,OG001,SOF+VEL 100 mg 12 Weeks (GT1),NCT01858766
...,...,...,...,...
215450,1879,OG001,Placebo to 100 mg q2w Due to EE/LE/CO,NCT01856309
215456,1871,OG000,FPNS 200 μg,NCT01916226
215461,1889,OG001,TPV/RBV,NCT01854528
215465,1894,OG002,TXA-500 Group,NCT01850394


In [42]:
comp_table[comp_table['title'] == 'Milnacipram']

Unnamed: 0,group,study_id,title,study


In [39]:
merged = exploded.merge(comp_table, 'left', ['study_id', 'title', 'study'])

In [41]:
merged[merged['group'].isna()]

Unnamed: 0,id,group_titles,study,group_title,study_id,title,group
2572,1125,{'OG000': 'Milnacipram'},NCT01304589,"(OG000, Milnacipram)",OG000,Milnacipram,
2574,1127,{'OG000': 'Milnacipram'},NCT01304589,"(OG000, Milnacipram)",OG000,Milnacipram,
2575,1128,{'OG000': 'Milnacipram'},NCT01304589,"(OG000, Milnacipram)",OG000,Milnacipram,
3208,1315,{'OG000': 'Milciclib'},NCT01301391,"(OG000, Milciclib)",OG000,Milciclib,
29311,12449,"{'OG000': 'Control: Losartan / Week 4,8 -ITT',...",NCT00922480,"(OG000, Control: Losartan / Week 4,8 -ITT)",OG000,"Control: Losartan / Week 4,8 -ITT",
...,...,...,...,...,...,...,...
310792,133315,{'OG000': 'Treatment'},NCT01288807,"(OG000, Treatment)",OG000,Treatment,
310793,133316,{'OG000': 'Treatment'},NCT01288807,"(OG000, Treatment)",OG000,Treatment,
310794,133317,{'OG000': 'Treatment'},NCT01288807,"(OG000, Treatment)",OG000,Treatment,
310795,133318,{'OG000': 'Treatment'},NCT01288807,"(OG000, Treatment)",OG000,Treatment,


In [44]:
merged = merged[~merged['group'].isna()]

In [45]:
merged['group'] = merged['group'].apply(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['group'] = merged['group'].apply(int)


In [46]:
merged

Unnamed: 0,id,group_titles,study,group_title,study_id,title,group
0,0,{'OG000': 'Mycophenolate Mofetil'},NCT03262441,"(OG000, Mycophenolate Mofetil)",OG000,Mycophenolate Mofetil,35420
1,1,{'OG000': 'Mycophenolate Mofetil'},NCT03262441,"(OG000, Mycophenolate Mofetil)",OG000,Mycophenolate Mofetil,35420
2,2,{'OG000': 'Mycophenolate Mofetil'},NCT03262441,"(OG000, Mycophenolate Mofetil)",OG000,Mycophenolate Mofetil,35420
3,3,"{'OG000': 'BPL HRIG + RabAvert', 'OG001': 'Com...",NCT03264157,"(OG000, BPL HRIG + RabAvert)",OG000,BPL HRIG + RabAvert,35435
4,3,"{'OG000': 'BPL HRIG + RabAvert', 'OG001': 'Com...",NCT03264157,"(OG000, BPL HRIG + RabAvert)",OG000,BPL HRIG + RabAvert,35433
...,...,...,...,...,...,...,...
2448666,1037251,"{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",NCT03031496,"(OG001, Treatment B)",OG001,Treatment B,98186
2448667,1037252,"{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",NCT03031496,"(OG000, Treatment A)",OG000,Treatment A,98185
2448668,1037252,"{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",NCT03031496,"(OG001, Treatment B)",OG001,Treatment B,98186
2448669,1037253,"{'OG000': 'Treatment A', 'OG001': 'Treatment B'}",NCT03031496,"(OG000, Treatment A)",OG000,Treatment A,98185


In [47]:
db_comparisons = merged.rename(columns={
    'id': 'analytic',
})[['analytic', 'group']]

In [52]:
db_comparisons.to_sql('comparison', db, index=False, if_exists='append')

### Baselines

In [64]:
baselines = pd.read_csv('db_int_baselines.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [65]:
baselines

Unnamed: 0.1,Unnamed: 0,base,clss,category,param_type,dispersion,unit,value,spread,upper,lower,type,sub_type,study,group
0,3,"Sex: Female, Male",,Female,Count of Participants,,Participants,16.0,,,,GENDER,FEMALE,NCT03266419,BG000
1,4,"Sex: Female, Male",,Female,Count of Participants,,Participants,16.0,,,,GENDER,FEMALE,NCT03266419,BG001
2,5,"Sex: Female, Male",,Female,Count of Participants,,Participants,32.0,,,,GENDER,FEMALE,NCT03266419,BG002
3,6,"Sex: Female, Male",,Male,Count of Participants,,Participants,35.0,,,,GENDER,MALE,NCT03266419,BG000
4,7,"Sex: Female, Male",,Male,Count of Participants,,Participants,33.0,,,,GENDER,MALE,NCT03266419,BG001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721813,849094,Region of Enrollment,United States,,Number,,participants,29.0,,,,OTHER,,NCT03030989,BG001
721814,849095,Region of Enrollment,United States,,Number,,participants,50.0,,,,OTHER,,NCT03030989,BG002
721815,849159,"Race/Ethnicity, Customized",Black or African American Heritage,,Count of Participants,,Participants,29.0,,,,OTHER,,NCT03031496,BG000
721816,849160,"Race/Ethnicity, Customized",White Heritage,,Count of Participants,,Participants,12.0,,,,RACE,WHITE,NCT03031496,BG000


In [66]:
baselines['base'] = baselines['base'].fillna('NA')
baselines['clss'] = baselines['clss'].fillna('NA')
baselines['category'] = baselines['category'].fillna('NA')
baselines['param_type'] = baselines['param_type'].fillna('NA')
baselines['unit'] = baselines['unit'].fillna('NA')
baselines['type'] = baselines['type'].fillna('OTHER')
baselines['sub_type'] = baselines['sub_type'].fillna('NA')


In [71]:
baselines = baselines.drop(columns=['Unnamed: 0'])


KeyError: "['Unnamed: 0'] not found in axis"

In [72]:
baselines = baselines.drop(columns=['group'])

In [76]:
baselines['param_type'] = baselines['param_type'].str.upper().str.replace(' ', '_')

In [79]:
baselines['dispersion'] = baselines['dispersion'].str.replace(' ','_').str.replace('-','_')

In [82]:
baselines['category'].str.len().max()

94

In [83]:
baselines.to_sql('baselines', db, index=False, if_exists='append')

### Effects Groups

In [9]:
effect_groups = pd.read_pickle('effects_groups_treats.pkl')
effect_groups

Unnamed: 0,study_id,group_id,title,description,desc_plus_title,treatments,adjusted
0,NCT03266419,EG000,Deep NMB Using Rocuronium,The abdomen is insufflated to 13 mmHg pneumope...,Deep NMB Using Rocuronium The abdomen is insuf...,[rocuronium],[rocuronium]
1,NCT03266419,EG001,Moderate NMB Using Rocuronium,The abdomen is insufflated to 13 mmHg pneumope...,Moderate NMB Using Rocuronium The abdomen is i...,[rocuronium],[rocuronium]
2,NCT03262441,EG000,Mycophenolate Mofetil,Mycophenolate Mofetil 500mg Tablets once per d...,Mycophenolate Mofetil Mycophenolate Mofetil 50...,"[mycophenolatemofetilmycophenolatemofetil, myc...","[mycophenolatemofetilmycophenolatemofetil, myc..."
3,NCT03260894,EG000,Pembrolizumab + Epacadostat,Pembrolizumab 200 mg administered intravenousl...,Pembrolizumab + Epacadostat Pembrolizumab 200 ...,[epacadostat],[epacadostat]
4,NCT03260894,EG001,SoC (Sunitinib or Pazopanib),Standard of care (SoC) (sunitinib or pazopanib...,SoC (Sunitinib or Pazopanib) Standard of care ...,"[pazopanib, sunitinib]","[pazopanib, sunitinib]"
...,...,...,...,...,...,...,...
68808,NCT03038880,EG000,6 mg Faricimab Q12W,6 mg faricimab was given by intravitreal (IVT)...,6 mg Faricimab Q12W 6 mg faricimab was given b...,[faricimab],[faricimab]
68809,NCT03038880,EG001,6 mg Faricimab Q16W,6 mg faricimab was administered by IVT injecti...,6 mg Faricimab Q16W 6 mg faricimab was adminis...,"[no, faricimabivt, faricimab]","[no, faricimabivt, faricimab]"
68810,NCT03038880,EG002,0.5 mg Ranibizumab Q4W,0.5 mg of ranibizumab was administered by IVT ...,0.5 mg Ranibizumab Q4W 0.5 mg of ranibizumab w...,[],[]
68811,NCT03031496,EG000,Treatment A,Participants received a single oral dose of hy...,Treatment A Participants received a single ora...,"[hydrochlorothiazide, amiloridehydrochloride]","[hydrochlorothiazide, amiloridehydrochloride]"


In [10]:
def is_str_double(string):
    return string[:len(string)//2] == string[len(string)//2:]

In [11]:
# We will just use the raw treatments column for now - maybe we sould do some cleaning of the treatments in the future
# Append these to treatments if they don't exist

db_treats = pd.read_csv('meditreats_public_treatments.csv')

In [12]:
treats = set(db_treats['name'].unique())

In [13]:
effects_treats = effect_groups.explode('treatments')

In [14]:
effects_treats = effects_treats[~effects_treats['treatments'].isna()]['treatments'].str.capitalize().apply(lambda x: x[:len(x)//2] if is_str_double(x) else x)

In [15]:
new_effects_treats = [x for x in effects_treats if x not in treats]

In [16]:
new_treats = pd.Series(list(set(new_effects_treats)))

  new_treats = pd.Series(list(set(new_effects_treats)))


In [70]:
new_db_treats = pd.DataFrame.from_dict({
    'name': new_treats.to_list(),
    'from_study': [False for x in range(len(new_treats))],
    'no_studies': [-1 for x in range(len(new_treats))]
                })

In [38]:
pd.Series(new_db_treats.index + 1)

0            1
1            2
2            3
3            4
4            5
         ...  
10622    10623
10623    10624
10624    10625
10625    10626
10626    10627
Length: 10627, dtype: int64

In [71]:
new_db_treats['id'] = pd.Series(new_db_treats.index+11818)

In [72]:
new_db_treats.to_sql('treatments', db, index=False, if_exists='append')

In [73]:
new_db_treats

Unnamed: 0,name,from_study,no_studies,id
0,Raltegravirtenofovir,False,-1,11818
1,Coprecipitate,False,-1,11819
2,Poly(adp-ribose),False,-1,11820
3,Darbepoetinalfana,False,-1,11821
4,Propranololla,False,-1,11822
...,...,...,...,...
1836,Svn53-67,False,-1,13654
1837,Benzyl,False,-1,13655
1838,Nf54,False,-1,13656
1839,Treosulfane,False,-1,13657


In [74]:
# Load back the treatments and merge them with the groups
db_treats = pd.read_csv('meditreats_public_treatments.csv')

In [75]:
db_treats

Unnamed: 0,id,name,from_study,no_studies
0,0,Cyclophosphamide,True,678
1,1,Vaccines,True,644
2,2,Paclitaxel,True,621
3,3,Bevacizumab,True,607
4,4,Dexamethasone,True,542
...,...,...,...,...
13653,13654,Svn53-67,False,-1
13654,13655,Benzyl,False,-1
13655,13656,Nf54,False,-1
13656,13657,Treosulfane,False,-1


In [17]:
# Upload the groups to get the id then we can merge
effect_groups['id'] = effect_groups.index

In [78]:
effect_groups

Unnamed: 0,study_id,group_id,title,description,desc_plus_title,treatments,adjusted,id
0,NCT03266419,EG000,Deep NMB Using Rocuronium,The abdomen is insufflated to 13 mmHg pneumope...,Deep NMB Using Rocuronium The abdomen is insuf...,[rocuronium],[rocuronium],0
1,NCT03266419,EG001,Moderate NMB Using Rocuronium,The abdomen is insufflated to 13 mmHg pneumope...,Moderate NMB Using Rocuronium The abdomen is i...,[rocuronium],[rocuronium],1
2,NCT03262441,EG000,Mycophenolate Mofetil,Mycophenolate Mofetil 500mg Tablets once per d...,Mycophenolate Mofetil Mycophenolate Mofetil 50...,"[mycophenolatemofetilmycophenolatemofetil, myc...","[mycophenolatemofetilmycophenolatemofetil, myc...",2
3,NCT03260894,EG000,Pembrolizumab + Epacadostat,Pembrolizumab 200 mg administered intravenousl...,Pembrolizumab + Epacadostat Pembrolizumab 200 ...,[epacadostat],[epacadostat],3
4,NCT03260894,EG001,SoC (Sunitinib or Pazopanib),Standard of care (SoC) (sunitinib or pazopanib...,SoC (Sunitinib or Pazopanib) Standard of care ...,"[pazopanib, sunitinib]","[pazopanib, sunitinib]",4
...,...,...,...,...,...,...,...,...
68808,NCT03038880,EG000,6 mg Faricimab Q12W,6 mg faricimab was given by intravitreal (IVT)...,6 mg Faricimab Q12W 6 mg faricimab was given b...,[faricimab],[faricimab],68808
68809,NCT03038880,EG001,6 mg Faricimab Q16W,6 mg faricimab was administered by IVT injecti...,6 mg Faricimab Q16W 6 mg faricimab was adminis...,"[no, faricimabivt, faricimab]","[no, faricimabivt, faricimab]",68809
68810,NCT03038880,EG002,0.5 mg Ranibizumab Q4W,0.5 mg of ranibizumab was administered by IVT ...,0.5 mg Ranibizumab Q4W 0.5 mg of ranibizumab w...,[],[],68810
68811,NCT03031496,EG000,Treatment A,Participants received a single oral dose of hy...,Treatment A Participants received a single ora...,"[hydrochlorothiazide, amiloridehydrochloride]","[hydrochlorothiazide, amiloridehydrochloride]",68811


In [80]:
effect_groups.rename(columns={
    'study_id': 'study',
    'group_id': 'study_id'
})[['id', 'title', 'description', 'study_id']].to_sql('effectsgroups', db, index=False, if_exists='append')

### Effects

In [2]:
effects_0 = pd.read_pickle('int_newer_effects_0.pkl')
effects_1 = pd.read_pickle('int_newer_effects_1.pkl')
effects_2 = pd.read_pickle('int_newer_effects_2.pkl')

effects = pd.concat([effects_0, effects_1, effects_0]).reset_index(drop=True)

In [3]:
effects

Unnamed: 0,study_id,group_id,effect_name,type,organ_system,assesment,no_effected,collection_threshold,no_at_risk
0,NCT03262441,EG000,Finger cellulitis,serious,Skin and subcutaneous tissue disorders,Systematic Assessment,1.0,0.0,5
1,NCT03260894,EG000,Anaemia,other,Blood and lymphatic system disorders,Systematic Assessment,10.0,5.0,64
2,NCT03260894,EG001,Anaemia,other,Blood and lymphatic system disorders,Systematic Assessment,12.0,5.0,63
3,NCT03260894,EG000,Thrombocytopenia,other,Blood and lymphatic system disorders,Systematic Assessment,2.0,5.0,64
4,NCT03260894,EG001,Thrombocytopenia,other,Blood and lymphatic system disorders,Systematic Assessment,4.0,5.0,63
...,...,...,...,...,...,...,...,...,...
4865148,NCT00265148,EG001,MYOCARDIAL INFARCTION,serious,Cardiac disorders,Systematic Assessment,0.0,5.0,40
4865149,NCT00265148,EG000,HYPONATRAEMIA,serious,Metabolism and nutrition disorders,Systematic Assessment,1.0,5.0,40
4865150,NCT00265148,EG001,HYPONATRAEMIA,serious,Metabolism and nutrition disorders,Systematic Assessment,0.0,5.0,40
4865151,NCT00265148,EG000,ABNORMAL BEHAVIOUR,serious,Psychiatric disorders,Systematic Assessment,0.0,5.0,40


In [4]:
# Get DB effect groups 
db_effect_groups = pd.read_csv('effects_groups_study.csv')

In [5]:
db_effect_groups

Unnamed: 0,study,id,study_id
0,NCT00000125,46463,EG000
1,NCT00000125,46464,EG001
2,NCT00000134,46556,EG000
3,NCT00000134,46557,EG001
4,NCT00000134,46558,EG002
...,...,...,...
43234,NCT04534465,15977,EG004
43235,NCT04534491,15978,EG000
43236,NCT04555525,5662,EG000
43237,NCT04555525,5663,EG001


In [6]:
# Merge in the groups
db_effects = effects.merge(db_effect_groups.rename(columns={
    'study_id': 'group_id',
    'study': 'study_id'
})).rename(columns= {
    'id': 'group',
    'study_id': 'study',
    'effect_name': 'name'
})

In [10]:
len(db_effects)

4865153

In [11]:
db_effects = db_effects.drop_duplicates(['study','group','name','no_at_risk'])

In [12]:
# We need to make a bunch of new tables for this to work :(
db_effects['type'].value_counts()

other      1878977
serious    1180731
Name: type, dtype: int64

In [13]:
db_effects

Unnamed: 0,study,group_id,name,type,organ_system,assesment,no_effected,collection_threshold,no_at_risk,group
0,NCT03262441,EG000,Finger cellulitis,serious,Skin and subcutaneous tissue disorders,Systematic Assessment,1.0,0.0,5,2
2,NCT03260894,EG000,Anaemia,other,Blood and lymphatic system disorders,Systematic Assessment,10.0,5.0,64,3
3,NCT03260894,EG000,Thrombocytopenia,other,Blood and lymphatic system disorders,Systematic Assessment,2.0,5.0,64,3
4,NCT03260894,EG000,Hyperthyroidism,other,Endocrine disorders,Systematic Assessment,5.0,5.0,64,3
5,NCT03260894,EG000,Hypothyroidism,other,Endocrine disorders,Systematic Assessment,9.0,5.0,64,3
...,...,...,...,...,...,...,...,...,...,...
4865146,NCT03496974,EG001,Swelling on right hand,other,General disorders,Systematic Assessment,1.0,0.0,28,49099
4865147,NCT03496974,EG001,Urinary Track Infection,other,Infections and infestations,Systematic Assessment,1.0,0.0,28,49099
4865148,NCT03496974,EG001,Vomiting,other,Gastrointestinal disorders,Systematic Assessment,1.0,0.0,28,49099
4865149,NCT03496974,EG001,Wheezing,other,"Respiratory, thoracic and mediastinal disorders",Systematic Assessment,2.0,0.0,28,49099


In [14]:
db_effects['type'] = db_effects['type'].str.upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db_effects['type'] = db_effects['type'].str.upper()


In [15]:
db_effects['organ_system'] = db_effects['organ_system'].apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db_effects['organ_system'] = db_effects['organ_system'].apply(str)


In [16]:
db_effects['organ_system'] = db_effects['organ_system'].apply(lambda x: x.split(' (')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db_effects['organ_system'] = db_effects['organ_system'].apply(lambda x: x.split(' (')[0])


In [17]:
db_effects['organ_system'] = db_effects['organ_system'].str.upper().str.replace(' ','_').str.replace(',','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db_effects['organ_system'] = db_effects['organ_system'].str.upper().str.replace(' ','_').str.replace(',','')


In [18]:
db_effects['assesment'] = db_effects['assesment'].str.upper().str.replace(' ','_').str.replace('-','_')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db_effects['assesment'] = db_effects['assesment'].str.upper().str.replace(' ','_').str.replace('-','_')


In [19]:
db_effects['id'] = db_effects.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db_effects['id'] = db_effects.index


In [20]:
db_effects = db_effects.rename(columns={
    'type': 'effect_type',
    'assesment': 'assessment'
})

In [21]:
db_effects

Unnamed: 0,study,group_id,name,effect_type,organ_system,assessment,no_effected,collection_threshold,no_at_risk,group,id
0,NCT03262441,EG000,Finger cellulitis,SERIOUS,SKIN_AND_SUBCUTANEOUS_TISSUE_DISORDERS,SYSTEMATIC_ASSESSMENT,1.0,0.0,5,2,0
2,NCT03260894,EG000,Anaemia,OTHER,BLOOD_AND_LYMPHATIC_SYSTEM_DISORDERS,SYSTEMATIC_ASSESSMENT,10.0,5.0,64,3,2
3,NCT03260894,EG000,Thrombocytopenia,OTHER,BLOOD_AND_LYMPHATIC_SYSTEM_DISORDERS,SYSTEMATIC_ASSESSMENT,2.0,5.0,64,3,3
4,NCT03260894,EG000,Hyperthyroidism,OTHER,ENDOCRINE_DISORDERS,SYSTEMATIC_ASSESSMENT,5.0,5.0,64,3,4
5,NCT03260894,EG000,Hypothyroidism,OTHER,ENDOCRINE_DISORDERS,SYSTEMATIC_ASSESSMENT,9.0,5.0,64,3,5
...,...,...,...,...,...,...,...,...,...,...,...
4865146,NCT03496974,EG001,Swelling on right hand,OTHER,GENERAL_DISORDERS,SYSTEMATIC_ASSESSMENT,1.0,0.0,28,49099,4865146
4865147,NCT03496974,EG001,Urinary Track Infection,OTHER,INFECTIONS_AND_INFESTATIONS,SYSTEMATIC_ASSESSMENT,1.0,0.0,28,49099,4865147
4865148,NCT03496974,EG001,Vomiting,OTHER,GASTROINTESTINAL_DISORDERS,SYSTEMATIC_ASSESSMENT,1.0,0.0,28,49099,4865148
4865149,NCT03496974,EG001,Wheezing,OTHER,RESPIRATORY_THORACIC_AND_MEDIASTINAL_DISORDERS,SYSTEMATIC_ASSESSMENT,2.0,0.0,28,49099,4865149


In [24]:
db_effects.drop(columns=['group_id']).to_sql('effects', db, index=False, if_exists='append')

### Effects Comparisons

In [137]:
db_treats = pd.read_csv('meditreats_public_treatments.csv')

In [142]:
treats_merge = db_treats.rename(columns={
    'name': 'treatment'
})

In [141]:
db_treats.head()

Unnamed: 0,id,name,from_study,no_studies
0,0,Cyclophosphamide,True,678
1,1,Vaccines,True,644
2,2,Paclitaxel,True,621
3,3,Bevacizumab,True,607
4,4,Dexamethasone,True,542


In [144]:
treats_merge

Unnamed: 0,id,treatment,from_study,no_studies
0,0,Cyclophosphamide,True,678
1,1,Vaccines,True,644
2,2,Paclitaxel,True,621
3,3,Bevacizumab,True,607
4,4,Dexamethasone,True,542
...,...,...,...,...
13653,13654,Svn53-67,False,-1
13654,13655,Benzyl,False,-1
13655,13656,Nf54,False,-1
13656,13657,Treosulfane,False,-1


In [135]:
effect_groups_expl = effect_groups.explode('treatments')

In [161]:
effect_groups_expl['treatments'] = effect_groups_expl['treatments'].fillna('NA').str.capitalize()

In [162]:
effect_groups_expl

Unnamed: 0,study_id,group_id,title,description,desc_plus_title,treatments,adjusted,id,treaments
0,NCT03266419,EG000,Deep NMB Using Rocuronium,The abdomen is insufflated to 13 mmHg pneumope...,Deep NMB Using Rocuronium The abdomen is insuf...,Rocuronium,[rocuronium],0,Rocuronium
1,NCT03266419,EG001,Moderate NMB Using Rocuronium,The abdomen is insufflated to 13 mmHg pneumope...,Moderate NMB Using Rocuronium The abdomen is i...,Rocuronium,[rocuronium],1,Rocuronium
2,NCT03262441,EG000,Mycophenolate Mofetil,Mycophenolate Mofetil 500mg Tablets once per d...,Mycophenolate Mofetil Mycophenolate Mofetil 50...,Mycophenolatemofetilmycophenolatemofetil,"[mycophenolatemofetilmycophenolatemofetil, myc...",2,Mycophenolatemofetilmycophenolatemofetil
2,NCT03262441,EG000,Mycophenolate Mofetil,Mycophenolate Mofetil 500mg Tablets once per d...,Mycophenolate Mofetil Mycophenolate Mofetil 50...,Mycophenolatemofetil500mg,"[mycophenolatemofetilmycophenolatemofetil, myc...",2,Mycophenolatemofetil500mg
2,NCT03262441,EG000,Mycophenolate Mofetil,Mycophenolate Mofetil 500mg Tablets once per d...,Mycophenolate Mofetil Mycophenolate Mofetil 50...,Mycophenolatemofetil,"[mycophenolatemofetilmycophenolatemofetil, myc...",2,Mycophenolatemofetil
...,...,...,...,...,...,...,...,...,...
68810,NCT03038880,EG002,0.5 mg Ranibizumab Q4W,0.5 mg of ranibizumab was administered by IVT ...,0.5 mg Ranibizumab Q4W 0.5 mg of ranibizumab w...,Na,[],68810,Na
68811,NCT03031496,EG000,Treatment A,Participants received a single oral dose of hy...,Treatment A Participants received a single ora...,Hydrochlorothiazide,"[hydrochlorothiazide, amiloridehydrochloride]",68811,Hydrochlorothiazide
68811,NCT03031496,EG000,Treatment A,Participants received a single oral dose of hy...,Treatment A Participants received a single ora...,Amiloridehydrochloride,"[hydrochlorothiazide, amiloridehydrochloride]",68811,Amiloridehydrochloride
68812,NCT03031496,EG001,Treatment B,Participants received a single oral dose of hy...,Treatment B Participants received a single ora...,Hydrochlorothiazide,"[hydrochlorothiazide, amiloridehydrochloride]",68812,Hydrochlorothiazide


In [163]:
db_admins = effect_groups_expl[['id','treatments']].merge(db_treats[['name','id']], left_on='treatments', right_on='name')

In [165]:
db_admins.rename(columns={
    'id_x': 'group',
    'id_y': 'treatment'
})[['group', 'treatment']].to_sql('effectsadministrations', db, index=False, if_exists='append')