# Merging labs and visits

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import timedelta

matplotlib.style.use('ggplot') # Look Pretty

os.chdir('/Users/marleneguraieb/Google Drive/Healthcare_metrics')

I start from the two databases that contain:
    - CACU: all the lab files (which we cleaned and edited in the last notebook)
    - SIAIS: all the visits to the clinic of our target group (women between 25 and 65)

In [4]:
CACU = pd.read_csv('DATA/CACUwide.csv',low_memory=False)
SIAIS = pd.read_csv('DATA/SIAIS.csv')

Now make dates dates and use the index as patient.

In [11]:
cols = CACU.columns.values.tolist()

cols1 = [cols.index(s) for s in cols if 'Date' in s]
cols2 = [cols.index(s) for s in cols if '_Cper' in s]
cols = cols1 + cols2

for i in range(0,26):
    CACU.iloc[:,cols[i]] = pd.to_datetime(CACU.iloc[:,cols[i]], format='%Y-%m-%d')
    
 CACU = CACU.rename(columns = {'paciente':'Patient'})

In [17]:
SIAIS['fecha'] = pd.to_datetime(SIAIS['fecha'], format='%Y-%m-%d')
SIAIS = SIAIS[['unidadmedica', 'paciente', 'fecha', 'matricula', 'Cacu']]

colsEng = ['Medical_unit','Patient','Date','NurseID','Cacu']

SIAIS.columns = colsEng

SIAIS['NurseID'] = SIAIS['NurseID'].astype('category')

Now we create a df called TWT (the whole thing), that merges the history of every patient in the lab test db, with every visit to the doctor of that patient. This is necessary to determine eligibility for each visit to the doctor. 

In [25]:
TWT = pd.merge(SIAIS, CACU, how='left', on='Patient', 
               sort=True, suffixes=('_x', '_y'), copy=True)

# Determining whether the test got done

Now we have to determine whether each patient was elegible for a test at the time she went to the doctor. For this we will need the new df (TWT), because it contains the patient history for every patient in every medical visit. The first thing we know, is that if the patient has never had a test, then she was eligible to get one, so if the first test date is empty, we know it's ``'NHT'''

In [27]:
TWT['Tested'] = np.nan

TWT.ix[pd.isnull(TWT['Date.0'])==True,'Tested'] = "NHT"

First, if the date of the consult matches the date of the test (+/- 28 days, for capture error and delays), then we consider the test made:

In [29]:
for i in range(0,8):
#first find the relevant columns for each date
    cols = TWT.columns.values.tolist()
    cols = [cols.index(s) for s in cols if str(i) in s]
    TWT.ix[(TWT['Date']>= TWT.iloc[:,cols[0]]-pd.to_timedelta(28,unit ='D')) &
        (TWT['Date']<= TWT.iloc[:,cols[0]]+pd.to_timedelta(28,unit ='D')),'Tested'] = TWT.iloc[:,cols[1]]

Then, if it doesn't match but the visit is inside the "covered periods" determined by the periods between lab visits, then we consider the test not needed:

In [30]:
for i in range(0,8):
#first find the relevant columns for each date
    cols = TWT.columns.values.tolist()
    cols = [cols.index(s) for s in cols if str(i) in s]
    TWT['Tested']= np.where((TWT['Date']>= TWT.iloc[:,cols[2]]) & 
                            (TWT['Date']<= TWT.iloc[:,cols[3]]) & 
                            (pd.isnull(TWT['Tested'])==True),'NHNT',TWT['Tested'])

Finally, if none of these things is true, then the patient needed a test and didn't get one. '

In [31]:
TWT.Tested = TWT.Tested.fillna('NHT')

Now we have TWT.Tested that encodes the four possibilites for every test. Let's see how that looks. It appears we only have 12% average hit rate (patients who needed tests and got them); a further 10% of patients did not get tests but did not need them, and 77% of patients needed a test but did not get one. 

In [34]:
TWT.Tested.value_counts(normalize=True)

NHT     0.767523
HT      0.123668
NHNT    0.101945
HNT     0.006864
Name: Tested, dtype: float64

Now all we need from this dataset is the 'Tested column', I can get rid of everything else because all the information is now contained in that. 

In [None]:
TWT.to_csv('TWT.csv')

TWT = TWT[['Medical_unit','NurseID','Date','Cacu','Tested']]

Now I need to get a) dummies for each of the possible t, and then compare self-reported scores (whether the nurse claimed to have done the test or not) with whether the test was done or not. (These will be our measures of honesty for the nurses). 

In [35]:
TWT = pd.concat([TWT, pd.get_dummies(TWT['Tested'])],axis=1)

TWT['Self_rep'] = np.nan
TWT.ix[(TWT['Cacu'].isin([1,2])) & (TWT['Tested'].isin(['HT','HNT'])),'Self_rep'] = 'Said1or2LabYes'
TWT.ix[(TWT['Cacu'].isin([1,2])) & (TWT['Tested'].isin(['NHT','NHNT'])),'Self_rep'] = 'Said1or2LabNo'
TWT.ix[(TWT['Cacu']==9) & (TWT['Tested'].isin(['HT','HNT'])),'Self_rep'] = 'Said9LabYes'
TWT.ix[(TWT['Cacu']==9) & (TWT['Tested'].isin(['NHT','NHNT'])),'Self_rep'] = 'Said9LabNo'
TWT.ix[(TWT['Cacu']==0) & (TWT['Tested'].isin(['HT','HNT'])),'Self_rep'] = 'Said0LabYes'
TWT.ix[(TWT['Cacu']==0) & (TWT['Tested'].isin(['NHT','NHNT'])),'Self_rep'] = 'Said0LabNo'

TWT = pd.concat([TWT, pd.get_dummies(TWT['Self_rep'])],axis=1)

# Combining with TMT stats to determine relevant dates.

To do this, we need a dataframe that contains every nurse that received a treatment in our sample (for each possible treatment), as well as the treatment date. This is because we will calculate Average Treatment effects as the difference in behavior 30 days before the treatment and 30 days after the treatment. 

In [41]:
TMTstatsN = pd.read_table('DATA/TMTstatsNurses.csv')
TMTstatsN['TMT_Date'] = pd.to_datetime(TMTstatsN['TMT_Date'], format='%Y-%m-%d')

Our sample is composed of:
    - 315 nurses who were working on the median date of our treatments on the clinics selected for the no treatment (cero group) 
    - 107 nurses who received only an exhortation (control group) 
    - 151 nurses that received a letter containing their average score (feedback group)
    - 146 nurses that received a letter containing their average score + a request to give explanations about their behavior (reasons group)

In [42]:
pd.crosstab(TMTstatsN.Received,TMTstatsN.treatment,margins=True)

treatment,cero,control,feedback,reasons,All
Received,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,107,151,146,404
99,315,0,0,0,315
All,315,107,151,146,719


In [43]:
TWT = pd.merge(TWT,TMTstatsN[['NurseID', 'ExpectedUnitAtTMT',
'Grade_received','treatment', 'Received', 'TMT_Date']],on='NurseID',how='left')

Now let's calculate which visits took place 30 days before and after receiving the letter (both in 2013 and in 2014). The past year's performance will be needed to de-trend the data.

In [45]:
TWT['TMT_time'] = np.nan

TWT.ix[(TWT['Date']>=TWT['TMT_Date']-pd.to_timedelta(395, unit='D')) 
     & (TWT['Date']<TWT['TMT_Date']-pd.to_timedelta(365, unit='D')),'TMT_time'] = 't_0_2013'
TWT.ix[(TWT['Date']>=TWT['TMT_Date']-pd.to_timedelta(365, unit='D')) 
     & (TWT['Date']<TWT['TMT_Date']-pd.to_timedelta(335, unit='D')),'TMT_time'] = 't_1_2013'
TWT.ix[(TWT['Date']>=TWT['TMT_Date']-pd.to_timedelta(30, unit='D')) 
     & (TWT['Date']<TWT['TMT_Date']),'TMT_time'] = 't_0_2014'
TWT.ix[(TWT['Date']>=TWT['TMT_Date']) 
     & (TWT['Date']<TWT['TMT_Date']+pd.to_timedelta(30, unit='D')),'TMT_time'] = 't_1_2014'


Now that we have flagged these four groups ($t_0$ and $t_1$ for 2013 and 2014), we can have a dataframe with only those visits, sorted by nurse and TMT_time (in order to calculate the grades based on that time period alone).

In [46]:
TWT['TMT_time'] = TWT['TMT_time'].astype('category')

ATEconsults = TWT[pd.isnull(TWT['TMT_time'])==False]

ATEconsults = ATEconsults.sort_values(['NurseID',
                            'TMT_time','Date','Tested'], axis=0, ascending=True)
list = ATEconsults.groupby(['NurseID','TMT_time']).apply(lambda x:
                                                        range(len(x))).tolist()
loop = [val for sublist in list for val in sublist]
ATEconsults['NumConsPer']=loop

In [None]:
ATEconsults.to_csv('DATA/ATEconsults.csv')

# Calculating dependent variables by nurse

In [47]:
#This is our main dependent variable. How many tests that were needed (HT) did the nurse get done as a percentage 
#percentage of all the patients that went to consult with her and needed a test (HT+NHT).
Grade1 = lambda g: ATEconsults.ix[g.index].HT.sum()/(
            ATEconsults.ix[g.index].HT.sum()+ATEconsults.ix[g.index].NHT.sum())

In [48]:
#This is the ``honesty rate'': the percentage of times that the patient needed a test, did not get one and the nurse 
#correctly said she did not give the patient a test.
Said0LabNo = lambda g: ATEconsults.ix[g.index].Said0LabNo.sum()/(
            ATEconsults.ix[g.index].NHT.sum())

In [49]:
#This is the ``excuse rate'': the percentage of times that the nurse claimed the patient had an (unverifyable) excuse
#and did not give her a test. 
Said9LabNo = lambda g: ATEconsults.ix[g.index].Said9LabNo.sum()/(
            ATEconsults.ix[g.index].NumConsPer.max()+1)

In [50]:
#This is the ``lying rate'' the number of times that a nurse claimed she did a test (said 1 or 2) and did the test
#over the number of times she claimed to have done a test 
Said1or2Lab0_Said1 = lambda g: 'nan' if ATEconsults.ix[g.index].Said1or2LabNo.sum()+ ATEconsults.ix[g.index].Said1or2LabYes.sum()==0 else ATEconsults.ix[g.index].Said1or2LabNo.sum()/(
            ATEconsults.ix[g.index].Said1or2LabNo.sum()+ATEconsults.ix[g.index].Said1or2LabYes.sum())

In [51]:
#these are unique variables that we need in the dataframe that is by nurse:

NumConsPer = lambda g: ATEconsults.ix[g.index].NumConsPer.max()+1
treatment = lambda g: str(ATEconsults.ix[g.index].treatment.unique()[0])
TMT_Date = lambda g: str(ATEconsults.ix[g.index].TMT_Date.unique()[0])
Received = lambda g: str(ATEconsults.ix[g.index].Received.unique()[0])
ExpectedUnitAtTMT = lambda g: str(ATEconsults.ix[g.index].ExpectedUnitAtTMT.unique()[0])

Now we create a dictionary with all these functions to aggregate the by consult table:

In [52]:
f = {'HT': {'Grade1': Grade1},
     'Said0LabNo' : {'Said0LabNo_Toc': Said0LabNo},
     'Said9LabNo': {'Said9LabNo_cons': Said9LabNo},
     'Said1or2LabNo': {'Said1or2Lab0_Said1': Said1or2Lab0_Said1},
     'NumConsPer': {'NumConsPer': NumConsPer},
     'treatment' : {'treatment' : treatment},
     'TMT_Date' : {'TMT_Date': TMT_Date},
     'Received' : {'Received': Received},
     'ExpectedUnitAtTMT' : {'ExpectedUnitAtTMT': ExpectedUnitAtTMT}}

In [56]:
ATEbyNurse = ATEconsults.groupby(['NurseID','TMT_time']).agg(f)

In [57]:
ATEbyNurse.columns.values

array([('Said1or2LabNo', 'Said1or2Lab0_Said1'), ('treatment', 'treatment'),
       ('Said9LabNo', 'Said9LabNo_cons'), ('Said0LabNo', 'Said0LabNo_Toc'),
       ('Received', 'Received'), ('TMT_Date', 'TMT_Date'),
       ('ExpectedUnitAtTMT', 'ExpectedUnitAtTMT'),
       ('NumConsPer', 'NumConsPer'), ('HT', 'Grade1')], dtype=object)

In [58]:
colsEng = ['Said1or2LabNo_Said1','treatment', 'Said9LabNo_cons', 'Said0LabNo_Toc', 'Received', 'TMT_Date',
 'ExpectedUnitAtTMT','NumConsPer', 'Grade1']

ATEbyNurse.columns = colsEng

In [59]:
ATEbyNurse.describe()



Unnamed: 0,Said1or2LabNo_Said1,Said9LabNo_cons,Said0LabNo_Toc,Received,NumConsPer,Grade1
count,2365.0,2417.0,2417.0,2417.0,2417.0,2417.0
mean,0.550431,0.45012,0.277408,40.613571,142.12288,0.174342
std,0.34518,0.271662,0.251965,48.102518,70.231042,0.147205
min,0.0,0.0,0.0,1.0,1.0,0.0
25%,,0.21519,0.094488,1.0,110.0,0.032468
50%,,0.495868,0.207101,1.0,141.0,0.160256
75%,,0.668639,0.380952,99.0,169.0,0.274336
max,1.0,1.0,2.0,99.0,1050.0,0.9


In [None]:
ATEbyNurse.to_csv('DATA/ATEbyNurse.csv')

This completes the code to calculate the grade in the four periods that we have: 

$$t_0^{2013}$$ thirty days before the treatment date the year prior to the treatment (for detrending);

$$t_1^{2013}$$ thirty days after the treatment date the year prior the the treatment (for detrending);

$$t_0^{2014}$$ thirty days before the treatment the year of the treatment;

$$t_1^{2014}$$ thirty days after the treatment date the year of the treatment.

The next notebook will calculate the Average Treatment Effects combining these grades and the matching protocol done prior to the application of the treatment. 

# Calculating grade differences by period

In [2]:
TMTstats = pd.read_table('DATA/TMTstats.csv')

In [4]:
TMTstats.TMTdate = pd.to_datetime(TMTstats.TMTdate,format = '%Y-%m-%d')

In [5]:
ATEbyNurse = pd.read_csv('DATA/ATEbyNurse.csv')

In [7]:
ATEbyNurse = pd.merge(ATEbyNurse,TMTstats[['Medical_unit', 
                                           'Treatment','MatchItCR','MatchItCC']],
                      how = 'left',left_on = 'ExpectedUnitAtTMT', 
                      right_on = 'Medical_unit')

First I have to get all the unique values here, because this dataset is in long format, it has every nurse for each of the time periods that are relevant, so I'm going to make a temporary dataset that contains all the unique values so taht I can merge it later when I transform to wide. 

In [9]:
tempDF = ATEbyNurse[['NurseID','ExpectedUnitAtTMT', 'treatment', 'TMT_Date', 'Medical_unit',
           'Treatment', 'MatchItCR', 'MatchItCC']].copy()
tempDF = tempDF.drop_duplicates()

Now use `pivot_table` to change data from long to wide: each nurse with each of her dependent variables for each time period is one observation. Also, rename the columns.

In [10]:
ATEbyNurse = pd.pivot_table(ATEbyNurse, index=['NurseID'],columns='TMT_time',
                                  values = ['NumConsPer','Grade1',
                                         'Said1or2Lab0_Said1','Said9LabNo_cons',
                                         'Said0LabNo_Toc'])
ATEbyNurse.columns =[s1 +'_'+ str(s2) for (s1,s2) in ATEbyNurse.columns.tolist()]

In [11]:
ATEbyNurse = pd.merge(ATEbyNurse,tempDF,how='inner',left_index=True,right_on='NurseID')

The following is a for loop to calculate the $t^{2014}$ and $t^{2013}$ differences as well as $t^{14}-t^{13}$ 

In [12]:
cols = ATEbyNurse.columns.values.tolist()
t_0_2013 = [cols.index(s) for s in cols if 't_0_2013' in s]
t_1_2013 = [cols.index(s) for s in cols if 't_1_2013' in s]
t_0_2014 = [cols.index(s) for s in cols if 't_0_2014' in s]
t_1_2014 = [cols.index(s) for s in cols if 't_1_2014' in s]


for i in range(0,5):
    ATEbyNurse[ATEbyNurse.columns.tolist()[t_0_2013[i]].replace('_t_0_2013',
    '_Dif2013')] = ATEbyNurse.ix[:,t_1_2013[i]]-ATEbyNurse.ix[:,t_0_2013[i]]
    ATEbyNurse[ATEbyNurse.columns.tolist()[t_0_2014[i]].replace('_t_0_2014',
    '_Dif2014')] = ATEbyNurse.ix[:,t_1_2014[i]]-ATEbyNurse.ix[:,t_0_2014[i]]

In [13]:
cols = ATEbyNurse.columns.values.tolist()
values = ['Grade1','Said1or2Lab0_Said1','Said9LabNo_cons','Said0LabNo_Toc']

In [14]:
for i in range(0,4):                          
    difs = [cols.index(s) for s in cols if values[i] in s]
    ATEbyNurse[cols[difs[5]].replace('Dif2014','d13_14')] = ATEbyNurse[cols[difs[5]]]-ATEbyNurse[cols[difs[4]]]

In [15]:
ATEbyNurse['NumCons2013'] = (ATEbyNurse['NumConsPer_t_0_2013']+ATEbyNurse['NumConsPer_t_1_2013'])/2
ATEbyNurse['NumCons2014'] = (ATEbyNurse['NumConsPer_t_0_2014']+ATEbyNurse['NumConsPer_t_1_2014'])/2

ATEbyNurse['NumCons2013'] = ATEbyNurse['NumCons2013'].fillna(0)
ATEbyNurse['NumCons2014'] = ATEbyNurse['NumCons2014'].fillna(0)

For the results by clinic I'm going to do it in R. So here I save this and will open it in the next notebook. 

In [17]:
ATEbyNurse.columns.values.tolist()

['NumConsPer_t_0_2013',
 'NumConsPer_t_0_2014',
 'NumConsPer_t_1_2013',
 'NumConsPer_t_1_2014',
 'Grade1_t_0_2013',
 'Grade1_t_0_2014',
 'Grade1_t_1_2013',
 'Grade1_t_1_2014',
 'Said1or2Lab0_Said1_t_0_2013',
 'Said1or2Lab0_Said1_t_0_2014',
 'Said1or2Lab0_Said1_t_1_2013',
 'Said1or2Lab0_Said1_t_1_2014',
 'Said9LabNo_cons_t_0_2013',
 'Said9LabNo_cons_t_0_2014',
 'Said9LabNo_cons_t_1_2013',
 'Said9LabNo_cons_t_1_2014',
 'Said0LabNo_Toc_t_0_2013',
 'Said0LabNo_Toc_t_0_2014',
 'Said0LabNo_Toc_t_1_2013',
 'Said0LabNo_Toc_t_1_2014',
 'NurseID',
 'ExpectedUnitAtTMT',
 'treatment',
 'TMT_Date',
 'Medical_unit',
 'Treatment',
 'MatchItCR',
 'MatchItCC',
 'NumConsPer_Dif2013',
 'NumConsPer_Dif2014',
 'Grade1_Dif2013',
 'Grade1_Dif2014',
 'Said1or2Lab0_Said1_Dif2013',
 'Said1or2Lab0_Said1_Dif2014',
 'Said9LabNo_cons_Dif2013',
 'Said9LabNo_cons_Dif2014',
 'Said0LabNo_Toc_Dif2013',
 'Said0LabNo_Toc_Dif2014',
 'Grade1_d13_14',
 'Said1or2Lab0_Said1_d13_14',
 'Said9LabNo_cons_d13_14',
 'Said0LabNo_Toc_

In [None]:
ATEbyNurse.to_csv("ATEbyNurse.csv")