### Advanced Analytics - Heritage Health Network: Year 2


In [1]:
#Importing the packages and csv for Year 2 data
%matplotlib inline

import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
# Configure seaborn plotting: enable the short color codes and use the "whitegrid" style.
sns.set(color_codes=True)
sns.set_style("whitegrid")

# Load the Year 2 CSV; index_col=0 makes the file's first column the row index.
df1 = pd.read_csv('Dummy2.csv', index_col=0)



In [2]:
# Preview the first rows of the loaded frame.
df1.head()

Unnamed: 0,MemberID,DSFS,ProviderID,Vendor,PCP,Specialty,PlaceSvc,PayDelay,LengthOfStay,PrimaryConditionGroup,CharlsonIndex,ProcedureGroup,SupLOS,DrugCount,LabCount,AgeAtFirstClaim,Sex,ClaimsTruncated,DaysInHospital
1,4,1,994608.0,851052.0,31106.0,Pediatrics,Office,43,0,RESPR4,0,EM,0,0,0,5,M,0,0
2,210,1,8448244.0,122401.0,37508.0,Internal,Office,38,0,PRGNCY,0,MED,0,0,1,35,Missing_Sex,0,0
3,210,1,7053364.0,240043.0,37508.0,Laboratory,Independent Lab,31,0,MSC2a3,0,PL,0,0,1,35,Missing_Sex,0,0
4,210,7,6380938.0,142747.0,37508.0,Other,Office,128,0,PRGNCY,0,MED,0,0,0,35,Missing_Sex,0,0
5,210,4,6380938.0,142747.0,37508.0,Other,Office,38,0,PRGNCY,0,EM,0,0,0,35,Missing_Sex,0,0


In [3]:
# List the column labels of the loaded frame.
df1.columns


Index(['MemberID', 'DSFS', 'ProviderID', 'Vendor', 'PCP', 'Specialty',
       'PlaceSvc', 'PayDelay', 'LengthOfStay', 'PrimaryConditionGroup',
       'CharlsonIndex', 'ProcedureGroup', 'SupLOS', 'DrugCount', 'LabCount',
       'AgeAtFirstClaim', 'Sex', 'ClaimsTruncated', 'DaysInHospital'],
      dtype='object')

In [4]:
# Check the frame's dimensions as (rows, columns).
df1.shape

(898872, 19)

In [5]:
# Count how many columns there are of each dtype.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0,
# so it fails on current pandas; dtypes.value_counts() is the documented replacement.
# The counts are the same, though the rows may be listed in a different order.
df1.dtypes.value_counts()

float64     3
int64      10
object      6
dtype: int64

In [6]:
# Show the dtype of every column individually.
df1.dtypes

MemberID                   int64
DSFS                       int64
ProviderID               float64
Vendor                   float64
PCP                      float64
Specialty                 object
PlaceSvc                  object
PayDelay                   int64
LengthOfStay               int64
PrimaryConditionGroup     object
CharlsonIndex             object
ProcedureGroup            object
SupLOS                     int64
DrugCount                  int64
LabCount                   int64
AgeAtFirstClaim            int64
Sex                       object
ClaimsTruncated            int64
DaysInHospital             int64
dtype: object

In [7]:
# Make a real copy of the raw frame to work on.
# Plain assignment (`df2 = df1`) only binds a second name to the same object, so any
# in-place change made through df2 would also show up in df1; .copy() keeps df1 intact.
df2 = df1.copy()
df2.shape

(898872, 19)

In [8]:
# One-hot encode the frame using pandas' default column selection; df2 is rebound to the encoded result.
df2 = pd.get_dummies(df2)
# Print the new (rows, columns) shape to see how many columns the encoding produced.
print(df2.shape)


(898872, 106)


In [9]:
# Preview the first rows of the encoded frame.
df2.head()

Unnamed: 0,MemberID,DSFS,ProviderID,Vendor,PCP,PayDelay,LengthOfStay,SupLOS,DrugCount,LabCount,...,ProcedureGroup_SIS,ProcedureGroup_SMCD,ProcedureGroup_SMS,ProcedureGroup_SNS,ProcedureGroup_SO,ProcedureGroup_SRS,ProcedureGroup_SUS,Sex_F,Sex_M,Sex_Missing_Sex
1,4,1,994608.0,851052.0,31106.0,43,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,210,1,8448244.0,122401.0,37508.0,38,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,210,1,7053364.0,240043.0,37508.0,31,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,210,7,6380938.0,142747.0,37508.0,128,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,210,4,6380938.0,142747.0,37508.0,38,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# List the column labels after encoding (original numeric columns plus the indicator columns).
df2.columns


Index(['MemberID', 'DSFS', 'ProviderID', 'Vendor', 'PCP', 'PayDelay',
       'LengthOfStay', 'SupLOS', 'DrugCount', 'LabCount',
       ...
       'ProcedureGroup_SIS', 'ProcedureGroup_SMCD', 'ProcedureGroup_SMS',
       'ProcedureGroup_SNS', 'ProcedureGroup_SO', 'ProcedureGroup_SRS',
       'ProcedureGroup_SUS', 'Sex_F', 'Sex_M', 'Sex_Missing_Sex'],
      dtype='object', length=106)

In [11]:
# Per-column tally of missing values in the encoded frame.
df2_na = df2.isna().sum()
df2_na

MemberID                                    0
DSFS                                        0
ProviderID                               5066
Vendor                                   8060
PCP                                      2554
PayDelay                                    0
LengthOfStay                                0
SupLOS                                      0
DrugCount                                   0
LabCount                                    0
AgeAtFirstClaim                             0
ClaimsTruncated                             0
DaysInHospital                              0
Specialty_Anesthesiology                    0
Specialty_Diagnostic Imaging                0
Specialty_Emergency                         0
Specialty_General Practice                  0
Specialty_Internal                          0
Specialty_Laboratory                        0
Specialty_Missing_Specialty                 0
Specialty_Obstetrics and Gynecology         0
Specialty_Other                   

In [12]:
# Add a constant helper column (every row = 1) so that rows can be counted per MemberID later.
df2['MemberID_Count'] = 1
df2.head()


Unnamed: 0,MemberID,DSFS,ProviderID,Vendor,PCP,PayDelay,LengthOfStay,SupLOS,DrugCount,LabCount,...,ProcedureGroup_SMCD,ProcedureGroup_SMS,ProcedureGroup_SNS,ProcedureGroup_SO,ProcedureGroup_SRS,ProcedureGroup_SUS,Sex_F,Sex_M,Sex_Missing_Sex,MemberID_Count
1,4,1,994608.0,851052.0,31106.0,43,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,210,1,8448244.0,122401.0,37508.0,38,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
3,210,1,7053364.0,240043.0,37508.0,31,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
4,210,7,6380938.0,142747.0,37508.0,128,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
5,210,4,6380938.0,142747.0,37508.0,38,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [13]:
# df3 is df2 without the six columns that are not summed per member:
# AgeAtFirstClaim, PayDelay and DSFS are averaged, and ProviderID, Vendor and PCP
# are distinct-counted, each in its own pivot built from df2.
df3 = df2.drop(columns=['AgeAtFirstClaim', 'ProviderID', 'Vendor', 'PCP', 'PayDelay', 'DSFS'])

In [14]:
# Number of rows per member: count the constant helper column within each MemberID group.
pivot0 = df3.pivot_table(
    index=['MemberID'],
    values=['MemberID_Count'],
    aggfunc='count',
)
pivot0.head()

Unnamed: 0_level_0,MemberID_Count
MemberID,Unnamed: 1_level_1
4,1
210,6
3197,5
3457,1
3713,10


In [15]:
# Sum every column that remains in df3 (the indicator columns, the count-like columns and
# the MemberID_Count helper) per MemberID in one call. The six columns dropped from df3
# need a mean or a distinct count instead and are aggregated in separate pivots.
#
# aggfunc is the string 'sum' wrapped in a list:
#   * the list keeps the extra top column level labelled 'sum', exactly as before;
#   * the string replaces np.sum, which pandas 2.1+ flags with a FutureWarning because
#     NumPy callables will stop being mapped to the built-in groupby sum.
#
# NOTE(review): DaysInHospital and ClaimsTruncated are summed across a member's rows here.
# If they are member-level values repeated on every row (as AgeAtFirstClaim appears to be),
# the sum is inflated by the row count - confirm the intended aggregation.
pivot1 = pd.pivot_table(df3, index=['MemberID'], aggfunc=['sum'])
pivot1.head()

Unnamed: 0_level_0,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,CharlsonIndex_0,CharlsonIndex_1-2,CharlsonIndex_3-4,CharlsonIndex_5+,ClaimsTruncated,DaysInHospital,DrugCount,LabCount,LengthOfStay,MemberID_Count,...,Specialty_Internal,Specialty_Laboratory,Specialty_Missing_Specialty,Specialty_Obstetrics and Gynecology,Specialty_Other,Specialty_Pathology,Specialty_Pediatrics,Specialty_Rehabilitation,Specialty_Surgery,SupLOS
MemberID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
4,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
210,6,0,0,0,0,0,0,2,0,6,...,1,1,0,0,4,0,0,0,0,0
3197,5,0,0,0,0,0,6,8,0,5,...,1,2,0,0,0,0,1,0,0,0
3457,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3713,10,0,0,0,0,0,36,37,0,10,...,0,2,0,0,0,0,0,0,0,0


In [16]:
# Per-member mean of AgeAtFirstClaim, PayDelay and DSFS, taken from df2 because df3 no
# longer holds these columns.
# The string 'mean' (inside a list, so the 'mean' top column level is kept as before)
# replaces np.mean, which pandas 2.1+ flags with a FutureWarning for NumPy callables.
pivot2 = pd.pivot_table(
    df2,
    index=['MemberID'],
    values=['AgeAtFirstClaim', 'PayDelay', 'DSFS'],
    aggfunc=['mean'],
)
pivot2.head()

Unnamed: 0_level_0,mean,mean,mean
Unnamed: 0_level_1,AgeAtFirstClaim,DSFS,PayDelay
MemberID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,5.0,1.0,43.0
210,35.0,4.5,51.333333
3197,5.0,1.8,29.6
3457,5.0,1.0,63.0
3713,45.0,4.7,56.2


In [17]:
# Place the per-member sums, means and row counts side by side, aligned on the MemberID index.
# NOTE(review): pivot1 and pivot2 carry two-level column labels while pivot0's are flat, so the
# combined frame mixes tuple labels such as (sum, DrugCount) with a plain MemberID_Count, and
# the row count appears twice (as a sum and as a count) - confirm the consumer expects this.
result1 = pd.concat((pivot1, pivot2, pivot0), axis='columns')
result1.head()

Unnamed: 0_level_0,"(sum, CharlsonIndex_0)","(sum, CharlsonIndex_1-2)","(sum, CharlsonIndex_3-4)","(sum, CharlsonIndex_5+)","(sum, ClaimsTruncated)","(sum, DaysInHospital)","(sum, DrugCount)","(sum, LabCount)","(sum, LengthOfStay)","(sum, MemberID_Count)",...,"(sum, Specialty_Other)","(sum, Specialty_Pathology)","(sum, Specialty_Pediatrics)","(sum, Specialty_Rehabilitation)","(sum, Specialty_Surgery)","(sum, SupLOS)","(mean, AgeAtFirstClaim)","(mean, DSFS)","(mean, PayDelay)",MemberID_Count
MemberID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,5.0,1.0,43.0,1
210,6,0,0,0,0,0,0,2,0,6,...,4,0,0,0,0,0,35.0,4.5,51.333333,6
3197,5,0,0,0,0,0,6,8,0,5,...,0,0,1,0,0,0,5.0,1.8,29.6,5
3457,1,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,5.0,1.0,63.0,1
3713,10,0,0,0,0,0,36,37,0,10,...,0,0,0,0,0,0,45.0,4.7,56.2,10


In [18]:
# Per member, count the distinct non-missing values of the three ID columns
# (ProviderID, Vendor, PCP). Series.nunique() leaves NaN out of the count by default.
pivot3 = pd.pivot_table(
    df2,
    index=['MemberID'],
    values=['ProviderID', 'Vendor', 'PCP'],
    aggfunc=lambda ids: ids.nunique(),
)

pivot3.head(20)

Unnamed: 0_level_0,PCP,ProviderID,Vendor
MemberID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,1.0,1.0,1.0
210,1.0,3.0,3.0
3197,1.0,4.0,3.0
3457,1.0,1.0,1.0
3713,2.0,5.0,5.0
3741,1.0,3.0,3.0
5187,1.0,1.0,1.0
8213,1.0,2.0,2.0
10242,2.0,4.0,4.0
11951,1.0,8.0,8.0


In [19]:
# Final per-member table that gets exported for use in R:
# the sums, means and row counts gathered so far plus the distinct-ID counts.
result2 = pd.concat((result1, pivot3), axis='columns')
result2.head(20)

Unnamed: 0_level_0,"(sum, CharlsonIndex_0)","(sum, CharlsonIndex_1-2)","(sum, CharlsonIndex_3-4)","(sum, CharlsonIndex_5+)","(sum, ClaimsTruncated)","(sum, DaysInHospital)","(sum, DrugCount)","(sum, LabCount)","(sum, LengthOfStay)","(sum, MemberID_Count)",...,"(sum, Specialty_Rehabilitation)","(sum, Specialty_Surgery)","(sum, SupLOS)","(mean, AgeAtFirstClaim)","(mean, DSFS)","(mean, PayDelay)",MemberID_Count,PCP,ProviderID,Vendor
MemberID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1,0,0,0,0,0,0,0,0,1,...,0,0,0,5.0,1.0,43.0,1,1.0,1.0,1.0
210,6,0,0,0,0,0,0,2,0,6,...,0,0,0,35.0,4.5,51.333333,6,1.0,3.0,3.0
3197,5,0,0,0,0,0,6,8,0,5,...,0,0,0,5.0,1.8,29.6,5,1.0,4.0,3.0
3457,1,0,0,0,0,0,0,0,0,1,...,0,0,0,5.0,1.0,63.0,1,1.0,1.0,1.0
3713,10,0,0,0,0,0,36,37,0,10,...,0,0,0,45.0,4.7,56.2,10,2.0,5.0,5.0
3741,18,2,0,0,0,0,62,56,0,20,...,0,2,0,75.0,3.65,18.0,20,1.0,3.0,3.0
5187,2,0,0,0,0,0,0,0,0,2,...,0,0,0,5.0,1.0,32.5,2,1.0,1.0,1.0
8213,3,0,0,0,0,0,0,8,0,3,...,0,0,0,15.0,1.666667,48.0,3,1.0,2.0,2.0
10242,13,1,0,0,0,0,1,42,6,14,...,0,0,0,25.0,2.428571,45.0,14,2.0,4.0,4.0
11951,11,0,0,0,0,0,0,9,1,11,...,0,1,0,15.0,1.454545,55.272727,11,1.0,8.0,8.0


In [20]:
# Write the aggregated table to disk; index=True keeps MemberID (the row index) in the file.
result2.to_csv(path_or_buf='out_Agg_dummy_Y2.csv', index=True)

In [21]:
# IGNORE FOR NOW: an earlier attempt that spelled out the aggregation for every column by hand.
# It proved too tedious, so the pivot-and-concat approach in the cells above replaced it;
# the draft is kept here for future reference.
#
# The draft is held in '#' comments rather than a bare triple-quoted string, so running this
# cell evaluates nothing and echoes nothing as cell output.
# NOTE(review): the draft reads from `df4`, which no visible cell defines, and it assigns to
# `pivot2`, which would replace the mean pivot built earlier - resolve both before reviving it.

#pd.pivot_table(df4,index=["MemberID"],columns=["Product"],values=["Quantity","Price"],
#               aggfunc={"Quantity":len,"Price":np.sum},fill_value=0)

# pivot2= pd.pivot_table(df4,index=['MemberID'],
#                      aggfunc={'DSFS' : 'sum', 'ProviderID' : 'count', 'Vendor' : 'count', 'PayDelay':'sum','PCP':'count',
#                               'LengthOfStay': 'sum','SupLOS' : 'sum', 'DrugCount' : 'sum', 'LabCount' : 'sum',
#                               'AgeAtFirstClaim':'mean','ClaimsTruncated': 'sum','DaysInHospital':'sum','Specialty_Anesthesiology':'sum',
#                               'Specialty_Diagnostic Imaging':'sum',  'Specialty_Emergency':'sum', 'Specialty_General Practice':'sum',
#                               'Specialty_Internal':'sum','Specialty_Laboratory':'sum','Specialty_Missing_Specialty':'sum','Specialty_Obstetrics and Gynecology':'sum',
#                               'Specialty_Other':'sum','Specialty_Pathology':'sum','Specialty_Pediatrics':'sum','Specialty_Rehabilitation':'sum',
#                               'Specialty_Surgery':'sum',
#                               'PlaceSvc_Ambulance':'sum', 'PlaceSvc_Home':'sum','PlaceSvc_Independent Lab':'sum','PlaceSvc_Inpatient Hospital':'sum',
#                               'PlaceSvc_Missing_PlaceSvc':'sum','PlaceSvc_Office':'sum','PlaceSvc_Other':'sum','PlaceSvc_Outpatient Hospital':'sum',
#                               'PlaceSvc_Urgent Care':'sum'
#                              })
#
# pivot2.head()

"                      \npivot2= pd.pivot_table(df4,index=['MemberID'],\n                     aggfunc={'DSFS' : 'sum', 'ProviderID' : 'count', 'Vendor' : 'count', 'PayDelay':'sum','PCP':'count',\n                              'LengthOfStay': 'sum','SupLOS' : 'sum', 'DrugCount' : 'sum', 'LabCount' : 'sum',\n                              'AgeAtFirstClaim':'mean','ClaimsTruncated': 'sum','DaysInHospital':'sum','Specialty_Anesthesiology':'sum',\n                              'Specialty_Diagnostic Imaging':'sum',  'Specialty_Emergency':'sum', 'Specialty_General Practice':'sum', \n                              'Specialty_Internal':'sum','Specialty_Laboratory':'sum','Specialty_Missing_Specialty':'sum','Specialty_Obstetrics and Gynecology':'sum',\n                              'Specialty_Other':'sum','Specialty_Pathology':'sum','Specialty_Pediatrics':'sum','Specialty_Rehabilitation':'sum',\n                              'Specialty_Surgery':'sum',\n                              'PlaceSvc_Ambula