## Dependencies

In [2]:
import getpass
import os

import pandas as pd
import psycopg2

## Connect to Database

In [3]:
# Connection settings are read from the standard libpq environment variables
# so that no credentials are stored in the notebook. The non-secret settings
# fall back to the local defaults used for this project.
dbname = os.environ.get('PGDATABASE', 'aact')
user = os.environ.get('PGUSER', 'postgres')
host = os.environ.get('PGHOST', 'localhost')

# Prompt interactively only when PGPASSWORD is unset; getpass does not echo
# the typed value, so it never ends up in the saved notebook.
password = os.environ.get('PGPASSWORD') or getpass.getpass(
    f'Password for {user}@{host}/{dbname}: ')

# Open the connection and a cursor that the following cells reuse.
conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)
curs = conn.cursor()

In [4]:
# Sanity check: count the rows of ctgov.studies to confirm the connection works.
query = (
    "SELECT COUNT(*) \n"
    "FROM ctgov.studies;\n"
)
curs.execute(query)
curs.fetchall()

[(309078,)]

## Load Studies Table

In [5]:
# Read the complete studies table into a DataFrame over the open connection,
# then display its (rows, columns) shape.
query = 'SELECT * FROM ctgov.studies'
studies = pd.read_sql(query, conn)
studies.shape

(309078, 64)

## Split into Prediction, Test, Validation, and Training Sets

In [6]:
# Frequency of each overall_status value, most common first.
studies.loc[:, 'overall_status'].value_counts()

Completed                    164904
Recruiting                    51010
Unknown status                32018
Terminated                    17438
Active, not recruiting        16655
Not yet recruiting            13675
Withdrawn                      7677
Enrolling by invitation        3306
Suspended                      1027
Withheld                        800
No longer available             222
Available                       205
Approved for marketing          121
Temporarily not available        20
Name: overall_status, dtype: int64

In [7]:
# Status values that select the rows of the prediction set.
active_status = [
    'Recruiting',
    'Active, not recruiting',
    'Not yet recruiting',
    'Enrolling by invitation',
    'Available',
    'Approved for marketing',
]

In [9]:
# Keep only the studies whose status is listed in active_status; these rows
# form the prediction set. Display the resulting shape.
pred_set = studies.loc[studies['overall_status'].isin(active_status)]
pred_set.shape

(84972, 64)

In [10]:
# Status values that select the rows used for training and evaluation.
inactive_status = [
    'Completed',
    'Terminated',
    'Withdrawn',
    'Suspended',
]

In [11]:
# Keep only the studies whose status is listed in inactive_status and display
# the resulting shape.
inactive_set = studies.loc[studies['overall_status'].isin(inactive_status)]
inactive_set.shape

(191046, 64)

In [20]:
# Build a new frame (assign returns a copy) with a boolean target column:
# True when the study's status is 'Completed', False otherwise.
inactive_set = inactive_set.assign(
    completion=inactive_set['overall_status'].eq('Completed'))

In [22]:
# The raw status column is what the target was derived from, so remove it
# from the frame now that 'completion' exists.
inactive_set = inactive_set.drop(columns=['overall_status'])

In [23]:
# Separate the feature columns (X) from the boolean target column (y).
X = inactive_set.drop(columns=['completion'])
y = inactive_set['completion']

In [25]:
from sklearn.model_selection import train_test_split

# The target is imbalanced (most inactive studies are 'Completed'), so both
# splits are stratified on the label to keep the class ratio the same in the
# train, validation and test sets.

# First split: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Second split: take 20% of the remaining rows (16% overall) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Display the shapes of every piece to confirm the split sizes.
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((122268, 63), (30568, 63), (38210, 63), (122268,), (30568,), (38210,))

In [31]:
# Write every split to CSV in the working directory, always without the index.
# Feature frames keep their header row.
for features, path in ((X_train, 'X_train.csv'),
                       (X_val, 'X_val.csv'),
                       (X_test, 'X_test.csv')):
    features.to_csv(path, index=False)

# Label series are written as bare values, with no header row.
for labels, path in ((y_train, 'y_train.csv'),
                     (y_val, 'y_val.csv'),
                     (y_test, 'y_test.csv')):
    labels.to_csv(path, index=False, header=False)

# The prediction set is written last, with its header row.
pred_set.to_csv('pred_set.csv', index=False)