## Dependencies

In [5]:
import psycopg2

In [6]:
import pandas as pd

## Connecting to Postgres DB

In [7]:
import os

# Connection settings for the local AACT (ClinicalTrials.gov) Postgres copy.
dbname = 'aact'
user = 'postgres'
host = 'localhost'
# The password is deliberately not stored in the notebook. It is read from
# the PGPASSWORD environment variable; when that is unset the value is None,
# which psycopg2 omits from the connection string so libpq can fall back to
# its own lookup (for example a ~/.pgpass file).
password = os.environ.get('PGPASSWORD')

# Open one shared connection (used by pd.read_sql below) and a cursor for
# ad-hoc queries.
conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)
curs = conn.cursor()

In [10]:
# Smoke-test the connection: count the rows in ctgov.studies
query = (
    "SELECT COUNT(*) \n"
    "FROM ctgov.studies;\n"
)
curs.execute(query)
curs.fetchall()

[(309078,)]

## Subsetting Studies Table to Only Active Trials

In [14]:
# Load the full studies table first: the column subset below reads `studies`,
# which is not defined by any earlier cell.
query = 'SELECT * FROM ctgov.studies'
studies = pd.read_sql(sql=query, con=conn)

# Selecting only relevant columns
study_cols = ['nct_id', 'start_date','completion_date',
              'study_type','overall_status','brief_title','phase','source']
studies = studies[study_cols]
studies.shape

(309078, 8)

In [25]:
# Values of overall_status that are treated as "active" for this subset
active_statuses = [
    'Recruiting',
    'Active, not recruiting',
    'Not yet recruiting',
    'Available',
    'Approved for marketing',
]
# Keep only the studies whose status is one of the values above
active_studies = studies.loc[studies['overall_status'].isin(active_statuses)]

In [26]:
# Display (rows, columns) of the status-filtered studies frame
active_studies.shape

(81666, 8)

In [28]:
# nct_id values of the active studies; the other tables are filtered
# against these identifiers.
keys = active_studies.loc[:, 'nct_id']

In [79]:
# Write the active studies subset to disk as JSON
active_studies.to_json(
    path_or_buf='ClinicalTrialFinder-DS/Subsetted_Data/studies.json'
)

## Subsetting Facilities Table

In [74]:
# Read the whole ctgov.facilities table into a DataFrame and show its size
query = 'SELECT * FROM ctgov.facilities'
facilities = pd.read_sql(query, conn)
facilities.shape

(2173072, 8)

In [75]:
# Columns of the facilities table that are kept in the subset
facil_cols = [
    'id',
    'nct_id',
    'name',
    'city',
    'state',
    'country',
]
facilities = facilities.loc[:, facil_cols]

In [76]:
# Keep only facility rows whose nct_id belongs to an active study
active_facilities = facilities.loc[facilities['nct_id'].isin(keys)]

In [77]:
# Display (rows, columns) of the facilities kept for active studies
active_facilities.shape

(669308, 6)

In [78]:
# Write the active facilities subset to disk as JSON
active_facilities.to_json(
    path_or_buf='ClinicalTrialFinder-DS/Subsetted_Data/facilities.json'
)

## Subsetting Contacts Table

In [65]:
# Read the whole ctgov.central_contacts table and show its size
query = 'SELECT * FROM ctgov.central_contacts'
contacts = pd.read_sql(query, conn)
contacts.shape

(120251, 6)

In [66]:
# Columns of the contacts table that are kept in the subset
contact_cols = [
    'id',
    'nct_id',
    'contact_type',
    'name',
    'phone',
    'email',
]
contacts = contacts.loc[:, contact_cols]

In [67]:
# Keep only contact rows whose nct_id belongs to an active study
active_contacts = contacts.loc[contacts['nct_id'].isin(keys)]
active_contacts.shape

(90465, 6)

In [68]:
# Write the active contacts subset to disk as JSON
active_contacts.to_json(
    path_or_buf='ClinicalTrialFinder-DS/Subsetted_Data/contacts.json'
)

## Subsetting Sponsors Table

In [40]:
# Read the whole ctgov.sponsors table and show its size
query = 'SELECT * FROM ctgov.sponsors'
sponsors = pd.read_sql(query, conn)
sponsors.shape

(490861, 5)

In [41]:
# Drop collaborators: keep rows where lead_or_collaborator equals 'lead'
sponsors = sponsors.loc[sponsors['lead_or_collaborator'].eq('lead')]
sponsors.shape

(309078, 5)

In [42]:
# Columns of the sponsors table that are kept in the subset
sponsor_cols = ['id', 'nct_id', 'name']
sponsors = sponsors[ sponsor_cols ]
# Show the resulting dimensions, as the other column-subsetting cells do;
# without this expression the cell produces no output.
sponsors.shape

(309078, 3)

In [43]:
# Keep only sponsor rows whose nct_id belongs to an active study
active_sponsors = sponsors.loc[sponsors['nct_id'].isin(keys)]
active_sponsors.shape

(81666, 3)

In [59]:
# Write the active sponsors subset to disk as JSON
active_sponsors.to_json(
    path_or_buf='ClinicalTrialFinder-DS/Subsetted_Data/sponsors.json'
)

## Subsetting the Eligibilities Table

In [60]:
# Read the whole ctgov.eligibilities table and show its size
query = 'SELECT * FROM ctgov.eligibilities'
eligble = pd.read_sql(query, conn)
eligble.shape

(309078, 11)

In [61]:
# Columns of the eligibilities table that are kept in the subset
eligb_cols = [
    'id',
    'nct_id',
    'gender',
    'minimum_age',
    'maximum_age',
    'healthy_volunteers',
]
eligble = eligble.loc[:, eligb_cols]
eligble.shape

(309078, 6)

In [62]:
# Keep only eligibility rows whose nct_id belongs to an active study
active_eligble = eligble.loc[eligble['nct_id'].isin(keys)]
active_eligble.shape

(81666, 6)

In [63]:
# Write the active eligibilities subset to disk as JSON
active_eligble.to_json(
    path_or_buf='ClinicalTrialFinder-DS/Subsetted_Data/eligibilities.json'
)

## Subsetting the Conditions Table

In [47]:
# Read the whole ctgov.conditions table and show its size
query = 'SELECT * FROM ctgov.conditions'
conditions = pd.read_sql(query, conn)
conditions.shape

(510918, 4)

In [48]:
# Columns of the conditions table that are kept in the subset
condition_cols = [
    'id',
    'nct_id',
    'name',
]
conditions = conditions.loc[:, condition_cols]
conditions.shape

(510918, 3)

In [50]:
# Keep only condition rows whose nct_id belongs to an active study
active_conditions = conditions.loc[conditions['nct_id'].isin(keys)]
active_conditions.shape

(154517, 3)

In [57]:
# Write the active conditions subset to disk as JSON
active_conditions.to_json(
    path_or_buf='ClinicalTrialFinder-DS/Subsetted_Data/conditions.json'
)

## Subsetting Brief Summaries

In [51]:
# Read the whole ctgov.brief_summaries table and show its size
query = 'SELECT * FROM ctgov.brief_summaries'
summaries = pd.read_sql(query, conn)
summaries.shape

(308277, 3)

In [52]:
# Keep only summary rows whose nct_id belongs to an active study
active_summaries = summaries.loc[summaries['nct_id'].isin(keys)]
active_summaries.shape

(81666, 3)

In [56]:
# Write the active brief summaries subset to disk as JSON
active_summaries.to_json(
    path_or_buf='ClinicalTrialFinder-DS/Subsetted_Data/summaries.json'
)