In [77]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [69]:
# Connection settings. Each value can be overridden with an environment
# variable of the same name; host, port and user fall back to the values
# this notebook has always used.
POSTGRES_ADDRESS = os.environ.get('POSTGRES_ADDRESS', 'localhost')
POSTGRES_PORT = os.environ.get('POSTGRES_PORT', '5433')
POSTGRES_USERNAME = os.environ.get('POSTGRES_USERNAME', 'nick')
# The password is intentionally not stored in the notebook: it is taken from
# the environment, or prompted for interactively when the variable is unset.
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD') or getpass('Postgres password: ')

# Username and password are URL-escaped so characters such as '@', ':' or '/'
# in a credential cannot corrupt the connection URL.
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(
    username=quote_plus(POSTGRES_USERNAME),
    password=quote_plus(POSTGRES_PASSWORD),
    ipaddress=POSTGRES_ADDRESS,
    port=POSTGRES_PORT,
    dbname='project'))
# Create the connection
cnx = sqlalchemy.create_engine(postgres_str)

In [95]:
# Pull a random sample of up to 200,000 rows from patient_record, replace
# missing values with 0, and discard the columns listed below.
raw_data = (
    pd.read_sql_query(
        'select * from patient_record order by random() limit 200000',
        cnx,
    )
    .fillna(0)
    .drop(columns=[
        "as_of_date",
        "patient_id",
        "patient_race",
        "patient_language",
        "patient_gender",
    ])
)

In [106]:
# Fixed seed so the train/test partition -- and every score derived from it --
# is identical on each run of the notebook.
SPLIT_SEED = 42
train_data, test_data = train_test_split(raw_data, test_size=0.2, random_state=SPLIT_SEED)

# Drop columns with all same values in the training split. A column is
# constant when it holds at most one distinct value; this also catches
# constant non-zero columns (which would otherwise have std == 0 and turn
# into NaN/inf when standardized) and does not discard a varying column
# whose values merely happen to sum to zero.
constant_mask = train_data.nunique(dropna=False) <= 1
dropped_columns = constant_mask.index[constant_mask].tolist()
train_data = train_data.drop(columns=dropped_columns)
test_data = test_data.drop(columns=dropped_columns)

# standardize data -- mean and std come from the training split only and are
# reused for the test split.
data_mean = train_data.mean()
data_std = train_data.std()
train_data_norm = (train_data - data_mean) / data_std
test_data_norm = (test_data - data_mean) / data_std
# NOTE(review): the label and feature extraction below reads the unscaled
# train_data/test_data, not the *_norm frames -- confirm this is intended.

# Labels
LABEL_COLUMNS = [
    "fut_admission_12mo",
    "fut_admission_6mo",
    "fut_admission_3mo",
    "fut_admission_1mo",
]
train_lab_12mo = train_data["fut_admission_12mo"].values
train_lab_6mo = train_data["fut_admission_6mo"].values
train_lab_3mo = train_data["fut_admission_3mo"].values
train_lab_1mo = train_data["fut_admission_1mo"].values
test_lab_12mo = test_data["fut_admission_12mo"].values
test_lab_6mo = test_data["fut_admission_6mo"].values
test_lab_3mo = test_data["fut_admission_3mo"].values
test_lab_1mo = test_data["fut_admission_1mo"].values

# Feature Data: everything except the label columns.
train_features = train_data.drop(columns=LABEL_COLUMNS)
test_features = test_data.drop(columns=LABEL_COLUMNS)

# Single-column feature set, reshaped to the 2-D (n_samples, 1) layout.
no_temp_train_features = train_features["prev_admission"].values.reshape(-1, 1)
no_temp_test_features = test_features["prev_admission"].values.reshape(-1, 1)

# Four-column feature set built from the per-window prev_admission_* columns.
TEMPORAL_COLUMNS = [
    "prev_admission_12mo",
    "prev_admission_6mo",
    "prev_admission_3mo",
    "prev_admission_1mo",
]
temp_train_features = train_features[TEMPORAL_COLUMNS]
temp_test_features = test_features[TEMPORAL_COLUMNS]

In [108]:
# Compare the two models against each other: fit one logistic regression per
# feature set on the 12-month label, then print each model's score on the
# matching test features.

no_temp_model = LogisticRegression()
no_temp_model.fit(no_temp_train_features, train_lab_12mo)

temp_model = LogisticRegression()
temp_model.fit(temp_train_features, train_lab_12mo)

for fitted_model, held_out_features in (
    (no_temp_model, no_temp_test_features),
    (temp_model, temp_test_features),
):
    print(fitted_model.score(held_out_features, test_lab_12mo))

0.92915
0.92915
