# EDA for clinical trial data

In [1]:
import numpy as np
import pandas as pd
import pendulum
import matplotlib.pyplot as plt
import matplotlib as mpl
import re
import os
from datetime import datetime

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [2]:
df = pd.read_csv("SearchResults.csv")
df = df.set_index("Rank")

## First glimpse on the data (count, missing data)

First of all, take a look at what variables do we have

In [3]:
df.columns

Index(['NCT Number', 'Title', 'Acronym', 'Status', 'Study Results',
       'Conditions', 'Interventions', 'Outcome Measures',
       'Sponsor/Collaborators', 'Gender', 'Age', 'Phases', 'Enrollment',
       'Funded Bys', 'Study Type', 'Study Designs', 'Other IDs', 'Start Date',
       'Primary Completion Date', 'Completion Date', 'First Posted',
       'Results First Posted', 'Last Update Posted', 'Locations',
       'Study Documents', 'URL'],
      dtype='object')

In [4]:
df

Unnamed: 0_level_0,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,Sponsor/Collaborators,Gender,Age,Phases,Enrollment,Funded Bys,Study Type,Study Designs,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,URL
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1,NCT04372602,Duvelisib to Combat COVID-19,,Not yet recruiting,No Results Available,COVID-19,Drug: Duvelisib|Procedure: Peripheral blood dr...,Overall survival|Length of hospital stay|Lengt...,Washington University School of Medicine|Veras...,All,"18 Years and older (Adult, Older Adult)",Phase 2,28.0,Other|Industry,Interventional,Allocation: Randomized|Intervention Model: Sin...,202007009,"September 30, 2020","October 31, 2021","March 31, 2022","May 4, 2020",,"September 10, 2020","Washington University School of Medicine, Sain...",,https://ClinicalTrials.gov/show/NCT04372602
2,NCT04364698,Observational Cohort of COVID-19 Patients at R...,COVID-RPC,Recruiting,No Results Available,COVID-19,,"clinical, biological and radiological characte...",Assistance Publique - Hôpitaux de Paris,All,"18 Years and older (Adult, Older Adult)",,500.0,Other,Observational,Observational Model: Cohort|Time Perspective: ...,20SBS-COVID-RPC,"May 7, 2020",June 2020,June 2020,"April 28, 2020",,"May 14, 2020","Department of Infectiology, Raymond Poincaré H...",,https://ClinicalTrials.gov/show/NCT04364698
3,NCT04482621,Decitabine for Coronavirus (COVID-19) Pneumoni...,DART,Recruiting,No Results Available,COVID-19,Drug: Decitabine|Other: Placebo Saline,Change in clinical state as assessed by a 6-po...,Johns Hopkins University,All,"18 Years and older (Adult, Older Adult)",Phase 2,40.0,Other,Interventional,Allocation: Randomized|Intervention Model: Par...,IRB00247544,"August 31, 2020",January 2021,July 2021,"July 22, 2020",,"August 18, 2020","Johns Hopkins University, Baltimore, Maryland,...",,https://ClinicalTrials.gov/show/NCT04482621
4,NCT04459637,COVID-19 Surveillance Based on Smart Wearable ...,COVID-19SWD,Not yet recruiting,No Results Available,COVID-19,,Deterioration of the condition|Mortality|The i...,Peking University First Hospital,All,"18 Years to 75 Years (Adult, Older Adult)",,200.0,Other,Observational,Observational Model: Cohort|Time Perspective: ...,2020055-0615,"July 1, 2020","March 10, 2021","March 10, 2021","July 7, 2020",,"July 7, 2020","Peking University First Hospital, Beijing, Bei...",,https://ClinicalTrials.gov/show/NCT04459637
5,NCT04425538,A Phase 2 Trial of Infliximab in Coronavirus D...,,Recruiting,No Results Available,COVID-19,Drug: Infliximab,Time to improvement in oxygenation|28-day mort...,Tufts Medical Center|National Institutes of He...,All,"18 Years and older (Adult, Older Adult)",Phase 2,17.0,Other|NIH,Interventional,Allocation: N/A|Intervention Model: Single Gro...,STUDY00000564,"June 1, 2020",September 2020,December 2020,"June 11, 2020",,"June 11, 2020","Tufts Medical Center, Boston, Massachusetts, U...",,https://ClinicalTrials.gov/show/NCT04425538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3367,NCT04417153,Who Benefits More? Optimising Mindfulness Base...,,Recruiting,No Results Available,"Stress, Psychological|Sleep Initiation and Mai...",Behavioral: Mindfulness Based Intervention,Change in Subjective measures of Sleep quality...,"National University, Singapore|Potential proje...",All,"21 Years and older (Adult, Older Adult)",,1000.0,Other,Observational,Observational Model: Cohort|Time Perspective: ...,S-19-176,"September 20, 2019","June 30, 2022","June 30, 2022","June 4, 2020",,"June 9, 2020","National University Singapore, Singapore, Sing...",,https://ClinicalTrials.gov/show/NCT04417153
3368,NCT03871491,Azithromycin-Prevention in Labor Use Study (A-...,,Recruiting,No Results Available,Maternal Death|Maternal Infections Affecting F...,Drug: Azithromycin|Drug: Placebo,Maternal: Incidence of maternal death or sepsi...,NICHD Global Network for Women's and Children'...,Female,18 Years to 45 Years (Adult),Phase 3,34000.0,Other,Interventional,Allocation: Randomized|Intervention Model: Par...,CP Azithromycin,"September 1, 2020","September 1, 2023","September 1, 2023","March 12, 2019",,"September 16, 2020","ICDDRB, Dhaka, Bangladesh|Kinshasa School of P...",,https://ClinicalTrials.gov/show/NCT03871491
3369,NCT04386876,Bioequivalence Study of Lopinavir/Ritonavir 20...,Orvical,"Active, not recruiting",No Results Available,Bioequivalence,Drug: Lopinavir 200Mg/Ritonavir 50Mg FT Test|D...,Primary PK End Points,World Medicine ILAC SAN. ve TIC. A.S.|Novageni...,Male,20 Years to 40 Years (Adult),Phase 1,30.0,Industry|Other,Interventional,Allocation: Randomized|Intervention Model: Cro...,NOV2020/01911|FARGE365,"April 30, 2020","May 22, 2020","June 20, 2020","May 13, 2020",,"May 15, 2020","Novagenix Drug R&D Center, Akyurt, Ankara, Tur...",,https://ClinicalTrials.gov/show/NCT04386876
3370,NCT04276987,A Pilot Clinical Study on Inhalation of Mesenc...,,Completed,No Results Available,Coronavirus,Biological: MSCs-derived exosomes,Adverse reaction (AE) and severe adverse react...,Ruijin Hospital|Shanghai Public Health Clinica...,All,"18 Years to 75 Years (Adult, Older Adult)",Phase 1,24.0,Other|Industry,Interventional,Allocation: N/A|Intervention Model: Single Gro...,MEXCOVID,"February 15, 2020","May 31, 2020","July 31, 2020","February 19, 2020",,"September 7, 2020",Ruijin Hospital Shanghai Jiao Tong University ...,,https://ClinicalTrials.gov/show/NCT04276987


### Status and if study results availabe

In [5]:
df["Status"].value_counts()

Recruiting                 1756
Not yet recruiting          845
Completed                   352
Active, not recruiting      212
Enrolling by invitation     105
Withdrawn                    44
Available                    20
Suspended                    18
Terminated                   15
No longer available           3
Approved for marketing        1
Name: Status, dtype: int64

In [7]:
df["Study Results"].value_counts()

No Results Available    3364
Has Results                7
Name: Study Results, dtype: int64

In [8]:
df["Results First Posted"].value_counts()

July 21, 2020        1
August 12, 2020      1
August 11, 2020      1
August 21, 2020      1
September 4, 2020    1
August 27, 2020      1
June 4, 2020         1
Name: Results First Posted, dtype: int64

Sadly, only 7 projects has results (out of 352 completed projects). Most of them done in August.

### Age and gender

Need to figure out what age and gender refers to, but first we can take a look at these variables.

In [None]:
df["Age"].value_counts()

There are too many categories, we can simplify it by extracting the content in brackets.

In [10]:
age = df["Age"]
age = pd.Series([re.findall(r'[(](.*?)[)]',x) for x in age])
age.value_counts()

[Adult, Older Adult]           2573
[]                              308
[Child, Adult, Older Adult]     194
[Adult]                         172
[Child, Adult]                   45
[Older Adult]                    42
[Child]                          37
dtype: int64

We can see most projects involve adults and older adults, only small propotion invlove children.

Half projects are recruiting members(about half), quarter projects not yet recruiting, and 1/10 are completed.

### Phases, and funded by(s)

Need to figure out what "Phases"'s definition is.

In [6]:
df["Phases"].value_counts()

Not Applicable     618
Phase 2            481
Phase 3            302
Phase 2|Phase 3    134
Phase 1            132
Phase 1|Phase 2    116
Phase 4             93
Early Phase 1       31
Name: Phases, dtype: int64

## Analysis with plot

### Date

Transfer date str varibalbes into datetime format.

In [31]:
def rep_m(m):
    months = ["January", "February", "March", "April", "May", "June", "July", 
              "August", "September", "October", "November", "December"]
    for i in months:
        if m == i:
            m = months.index(i) + 1
    return str(m)

def to_date(date_str):

    date_str = date_str.split(" ")
    
    # some date are y-m-d
    if len(date_str) == 3:
        
        Y = date_str[2]
        m = rep_m(date_str[0])
        d = date_str[1][:-1] # drop comma

        date = datetime.strptime(Y + "-" + m + "-" + d, "%Y-%m-%d")
    
    # some date are y-m
    else:
        
        Y = date_str[1]
        m = rep_m(date_str[0])
        
        date = datetime.strptime(Y + "-" + m, "%Y-%m")

    return date

In [37]:
start_date = [to_date(x) for x in df["Start Date"].dropna()]
primary_completion_date = [to_date(x) for x in df["Primary Completion Date"].dropna()]
completion_date = [to_date(x) for x in df["Completion Date"].dropna()]
first_posted = [to_date(x) for x in df["First Posted"].dropna()]
results_first_posted = [to_date(x) for x in df["Results First Posted"].dropna()]
last_update_posted = [to_date(x) for x in df["Last Update Posted"].dropna()]

Further plots here...

### Enronllment

In [41]:
enrollment = df["Enrollment"]

Further plots here...