In [2]:
# import necessary libraries

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

# BigQuery settings
from google.cloud import bigquery
from google.cloud.bigquery import dbapi;
# Open a BigQuery client for the project and expose it through a DB-API cursor,
# which the cells below use to run SQL.
PROJECT_ID = "som-nero-phi-jonc101"  # Project identifier
client = bigquery.Client(PROJECT_ID)
conn = dbapi.connect(client)
cursor = conn.cursor()



In [None]:
# Strip the MRN column from the time-matching CSV for security reasons and save the
# result as tmp.csv (skip this cell if tmp.csv already exists)
import pandas as pd
data_frame = pd.read_csv('jon_mapping.csv').drop(columns='MRN')
data_frame.to_csv('tmp.csv', index=False)

In [None]:
# Build a randomised copy of the time mapping (CSV file) to test with

import pandas as pd
data_frame = pd.read_csv('tmp.csv')
print(data_frame.columns.tolist())  # column names present in the mapping

num_row = len(data_frame)
print(num_row)  # number of rows in the mapping

# Swap the JITTER column for a random stand-in: one integer in [0, 10) per row.
data_frame = data_frame.drop(columns='JITTER')
data_frame['JITTER_test'] = np.random.randint(10, size=num_row)

print(data_frame.iloc[:10])  # preview of the first 10 rows

data_frame.to_csv('tmp_rnd_shift.csv', index=False)  # save to CSV

In [None]:
# Create a table of ADT of cohort patients (noshad_test.ADT_cohort_jit):
#   one row per (jc_uid, pat_enc_csn_id_coded, department_id, tpaAdminTime), where
#   time_in / time_out are the earliest / latest jittered ADT event time in that group.
#   Only encounters present in noshad_test.cohort_AL_user_role are kept (inner join).
# NOTE(review): an earlier comment stated that time_out is "either the actual time_out or
#   TPA_admin_time, whichever is earlier". The query does NOT clamp time_out to
#   tpaAdminTime; it is always max(event_time_jittered). Confirm which is intended.
# CREATE OR REPLACE swaps the table in one statement, so there is no window in which the
# table is missing (same pattern as the noshad_test.num_pat query in this notebook).

query =  """
create or replace table noshad_test.ADT_cohort_jit as
(
SELECT ADT.jc_uid, ADT.pat_enc_csn_id_coded, ADT.department_id, CH.tpaAdminTime,
    min(ADT.event_time_jittered) AS time_in, max(ADT.event_time_jittered) AS time_out
FROM `starr_datalake2018.adt` AS ADT
INNER JOIN `noshad_test.cohort_AL_user_role` AS CH
  USING (pat_enc_csn_id_coded)
GROUP BY ADT.jc_uid, ADT.pat_enc_csn_id_coded, ADT.department_id, CH.tpaAdminTime
ORDER BY ADT.jc_uid, ADT.pat_enc_csn_id_coded, time_in
)
"""
cursor.execute(query)

In [3]:
## Main part: build noshad_test.num_pat, a per-encounter access-log transaction-rate feature.
#
# Steps performed by this cell:
#   1. Upload the time mapping (tmp.csv) into the temporary table noshad_test.tmp.
#   2. Shift the jittered ADT and access-log timestamps back by each patient's JITTER
#      (in days), count the access-log rows that fall strictly inside each department
#      stay, and keep, per (jc_uid, pat_enc_csn_id_coded), the highest per-minute rate
#      among stays that started before tpaAdminTime.
#   3. Drop the temporary mapping table. This happens in a `finally` so the mapping
#      (which presumably allows de-jittering - treat as sensitive) never outlives a
#      failed run.

client = bigquery.Client("som-nero-phi-jonc101")  # Project identifier
conn = dbapi.connect(client)
cursor = conn.cursor()

# Upload time_mapping tmp.csv

# NOTE(review): these schema fields document the expected columns but are not attached
# to the load job below, which relies on autodetect. Kept because other cells may read them.
schemafield_col1 = bigquery.schema.SchemaField("ANON_ID","STRING")
schemafield_col2 = bigquery.schema.SchemaField("JITTER","INTEGER")

filename = 'tmp.csv'
table_id = 'tmp'  # the name of the table to create

dataset_ref = client.dataset('noshad_test')
table_ref = dataset_ref.table(table_id)

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.autodetect = True
# Replace any leftover noshad_test.tmp instead of appending to it: duplicated mapping
# rows would duplicate the joined rows below and inflate the transaction counts.
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

# Main part to generate the feature per encounter
query= """

CREATE OR REPLACE TABLE noshad_test.num_pat AS(
WITH

    -- Generate ADT_cohort with actual times
  ADT_real_date AS
  (SELECT ADT.* except(time_in,time_out), DATETIME_SUB(ADT.time_in, INTERVAL TMP.JITTER DAY) as time_in,  
  DATETIME_SUB(ADT.time_out, INTERVAL TMP.JITTER DAY) as time_out
  
  FROM `noshad_test.ADT_cohort_jit` as ADT,
  `noshad_test.tmp` as TMP
  
  WHERE ADT.jc_uid=TMP.ANON_ID
  
  ORDER BY ADT.jc_uid, ADT.pat_enc_csn_id_coded, time_in
  ),
  
    -- Generate AL with actual times
  AL_real_date AS
  (SELECT AL.*, DATETIME_SUB(AL.access_time_jittered, INTERVAL TMP.JITTER DAY) as access_time_real
  FROM `shc_access_log.shc_access_log_de` as AL,
  `noshad_test.tmp` as TMP
  WHERE AL.rit_uid=TMP.ANON_ID
  ORDER BY AL.rit_uid
  ),
  
  --- Count access-log rows per department stay and normalise by stay length in minutes
  --- SAFE_DIVIDE yields NULL (instead of a division-by-zero error) when time_in and
  --- time_out fall within the same minute; max() below ignores those NULLs.
  NUM_PAT_PER_DEP AS 
  (SELECT ADT_real_date.*, count(*) as num_tranx , 
    SAFE_DIVIDE(count(*), DATETIME_DIFF(ADT_real_date.time_out, ADT_real_date.time_in, MINUTE)) as num_tranx_rate 
  FROM ADT_real_date, AL_real_date
  WHERE ADT_real_date.jc_uid=AL_real_date.rit_uid 
    AND ADT_real_date.time_in < AL_real_date.access_time_real 
    AND AL_real_date.access_time_real < ADT_real_date.time_out
  GROUP BY ADT_real_date.jc_uid, ADT_real_date.pat_enc_csn_id_coded, 
    ADT_real_date.department_id, ADT_real_date.tpaAdminTime, 
    ADT_real_date.time_in, ADT_real_date.time_out
  ORDER BY ADT_real_date.department_id)

  -- Main script
SELECT jc_uid, pat_enc_csn_id_coded, max(num_tranx_rate) as max_norm_num_tranx
FROM NUM_PAT_PER_DEP
WHERE time_in < tpaAdminTime
GROUP BY jc_uid, pat_enc_csn_id_coded
)"""

try:
    with open(filename, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_ref, job_config=job_config)

    job.result()  # Waits for table load to complete.

    print('temporary tmp file generated')

    cursor.execute(query)

    print('feature extraced')
finally:
    ## Final step: delete the temporary time mapping, whether or not the steps above
    ## succeeded. IF EXISTS keeps the cleanup from failing when the load never
    ## created the table.
    query = "DROP TABLE IF EXISTS noshad_test.tmp"
    cursor.execute(query)

    print('temporary tmp file deleted')



temporary tmp file generated
feature extraced
temporary tmp file deleted


In [None]:
# Create Patient Dictionary
# We keep a dictionary of patients keyed by the value in AL[:, 1] (patient id, jc_uid).
# Each patient maps to a dictionary of statistics: time2tpa, number of unique users, etc.
# This cell only fills in 't2tpa'.
#
# NOTE(review): `AL` must already exist when this cell runs; it is not created in this
# cell. It is indexed as a 2-D array whose columns 5 and 9 hold timestamp strings in
# '%Y-%m-%d %H:%M:%S' format - presumably a start time and the tPA admin time; confirm.

import datetime

TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
MAX_T2TPA_MINUTES = 60*10  # only keep patients whose t2tpa is below 10 hours

# Unique patient ids and the row index of the first occurrence of each one
Unique_ids, Indx = np.unique(AL[:,1], return_index = True)

# NOTE(review): Time2tpa is allocated but never filled in this cell; kept in case a
# later cell relies on it.
Time2tpa = np.zeros(Indx.shape)
Pat_dic = {}

# The loop variable is deliberately not called `id`: at notebook scope that would
# shadow the builtin id() for every cell executed afterwards.
for row_idx in Indx:
    t1 = AL[row_idx,5]
    t2 = AL[row_idx,9]
    date_time_t1 = datetime.datetime.strptime(t1, TIME_FORMAT)
    date_time_t2 = datetime.datetime.strptime(t2, TIME_FORMAT)
    delta_t = date_time_t2 - date_time_t1
    time_int = int(delta_t.total_seconds() / 60)  # whole minutes, truncated toward zero

    # NOTE(review): negative gaps (t2 earlier than t1) also pass this filter.
    if time_int < MAX_T2TPA_MINUTES:
        Pat_dic[AL[row_idx,1]] = {'t2tpa': time_int}  # each patient has a dictionary

In [None]:
# Pull every row of noshad_test.num_pat and hold the result as a NumPy array
query = "select * from `noshad_test.num_pat`"
cursor.execute(query)
results = cursor.fetchall()
Cohort_AL_np = np.array(results)

# Report the dimensions of the fetched array
print(Cohort_AL_np.shape)