In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import matplotlib

# Notebook-wide display defaults. The figure size is set before sns.set(),
# in the same order as before, so the seaborn theme is applied afterwards.
matplotlib.rcParams.update({'figure.figsize': (8, 3.5)})
sns.set()
# Let pandas show up to 200 rows / 100 columns before truncating a display.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

In [None]:
# Load the raw table from the working directory; later cells read and modify `df`.
df = pd.read_csv('diabetic_data.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Count rows per `readmitted` value, then draw one bar per value.
readmitted_counts = (
    df.groupby(['readmitted'])
    .agg({'encounter_id': 'count'})
    .reset_index()
    .rename(columns={'encounter_id': '# records'})
)
sns.barplot(data=readmitted_counts, x='readmitted', y='# records', palette='Reds')

In [None]:
# Same count as the bar plot above, but on one row per patient_nbr
# (drop_duplicates keeps the first row it sees for each patient).
first_rows = df.drop_duplicates('patient_nbr')
(
    first_rows.groupby(['readmitted'])['encounter_id']
    .count()
    .reset_index(name='# records')
)

### Choosing Treatments

In [None]:
# Encode every medication column as a 0/1 indicator (1 = any value other than
# 'No'), then derive the number of distinct medications per row.
#
# Changes from the previous version of this cell:
#   * Columns are replaced with `df[cols] = ...` instead of `df.loc[:, cols] = ...`.
#     On pandas >= 2.0 the `.loc[:, cols]` form writes in place and keeps the
#     original object dtype, so the `.astype(int)` had no effect on the dtype.
#   * Both 'No' (raw value) and 0 (already encoded) count as "not prescribed",
#     so re-running the cell no longer flips every encoded 0 into a 1.
MED_COLUMNS = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone' , 'no_med']

# 'no_med' is a derived flag, not a drug; create it first (as before) so the
# column order of `df` is unchanged, and fill in its real value below.
df['no_med'] = 0
drug_columns = [col for col in MED_COLUMNS if col != 'no_med']

not_prescribed = df[drug_columns].isin(['No', 0])
df[drug_columns] = (~not_prescribed).astype(int)

# Number of distinct medications on the row ('no_med' never contributed to it).
df['n_d_meds'] = df[drug_columns].sum(axis=1).astype(int)
# 1 when the row has no medication at all, else 0.
df['no_med'] = (df['n_d_meds'] == 0).astype(int)
df

In [None]:
# How many rows there are for each value of n_d_meds.
s0 = df['n_d_meds'].value_counts()
# x = the distinct n_d_meds values, y = the number of rows with that value.
sns.barplot(x=s0.index, y=s0)

In [None]:
# Shape of the subset of rows where both metformin and insulin are > 0.
on_metformin = df['metformin'] > 0
on_insulin = df['insulin'] > 0
df[on_metformin & on_insulin].shape

In [None]:
# Column totals over MED_COLUMNS, ascending; only totals above 10 are plotted.
med_totals = df[MED_COLUMNS].sum(axis=0)
s1 = med_totals.sort_values()
s1 = s1[s1 > 10]
sns.barplot(x=s1, y=s1.index)

In [None]:
# Same totals, restricted to rows with exactly one medication (n_d_meds == 1);
# only totals above 20 are plotted.
single_med_rows = df[df['n_d_meds'] == 1]
s2 = single_med_rows[MED_COLUMNS].sum(axis=0).sort_values()
s2 = s2[s2 > 20]
sns.barplot(x=s2, y=s2.index)

In [None]:
# Totals of the four candidate medications under three row filters.
can_meds = ['glyburide','glipizide','metformin','insulin']
# NOTE: `df_no_med` is only an alias of `df`; no rows are removed here.
df_no_med = df
med_count = df_no_med['n_d_meds']
# The second subset uses `<= 1` although its row is labelled 'Exactly 1' below.
# NOTE(review): rows with n_d_meds == 0 should add nothing to these sums,
# assuming n_d_meds counts the can_meds columns too — confirm.
subsets = [
    df_no_med,
    df_no_med[med_count <= 1],
    df_no_med[med_count <= 2],
]
results = pd.DataFrame([subset[can_meds].sum(axis=0) for subset in subsets]).astype(int)
results.index = ['No constraints', 'Exactly 1', '2 or less than']
results

In [None]:
# Candidate-medication totals over rows with exactly one medication.
df.loc[df['n_d_meds'] == 1, can_meds].sum(axis=0)

In [None]:
# Per-patient mean of every medication column across that patient's rows.
patients = (
    df[['patient_nbr'] + MED_COLUMNS]
    .groupby('patient_nbr')
    .mean()
    .reset_index()
)
# A mean > 0 in both columns selects patients with a non-zero insulin value and
# a non-zero metformin value somewhere in their rows (not necessarily the same row).
has_both = (patients['insulin'] > 0) & (patients['metformin'] > 0)
patients_insulin_and_metformin = patients.loc[has_both, 'patient_nbr'].unique().tolist()
print(len(patients_insulin_and_metformin))

In [None]:
# Step 1: number of rows (encounter_id count) for each patient_nbr.
encounters_per_patient = (
    df.groupby('patient_nbr')
    .agg({'encounter_id': 'count'})
    .reset_index()
    .rename(columns={'encounter_id': '# encounters'})
)
# Step 2: number of distinct patients for each encounter count.
# NOTE: this reuses (overwrites) the name `s0` from an earlier cell.
s0 = (
    encounters_per_patient.groupby('# encounters')
    .agg({'patient_nbr': 'nunique'})
    .reset_index()
    .rename(columns={'patient_nbr': '# patients'})
)
# Only encounter counts below 8 are plotted.
s0 = s0[s0['# encounters'] < 8]
sns.barplot(x=s0['# encounters'], y=s0['# patients'], palette='Paired')

### Filtering The Data

In [None]:
# Filter the rows, derive the visit/procedure features and the `treatment`
# label, then one-hot encode the categorical columns into `df2`.
CATEGORICAL_COLUMNS = ['race', 'gender', 'age',  'max_glu_serum', 'A1Cresult', 'readmitted']
combination_drugs = ['glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
                     'metformin-rosiglitazone', 'metformin-pioglitazone']

# Keep a row only when all three conditions hold:
#   * at most two medications on the row,
#   * the patient is not in `patients_insulin_and_metformin`,
#   * none of the combination-drug indicator columns is set.
keep_row = (
    (df['n_d_meds'] <= 2)
    & ~df['patient_nbr'].isin(patients_insulin_and_metformin)
    & (df[combination_drugs].sum(axis=1) == 0)
)
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not trigger SettingWithCopyWarning.
df = df[keep_row].copy()

df['number_visits'] = df['number_outpatient'] + df['number_inpatient']
# NOTE: this overwrites `num_procedures` with lab + non-lab procedures.
# Re-running this cell without re-running the earlier cells adds
# `num_lab_procedures` a second time.
df['num_procedures'] = df['num_lab_procedures'] + df['num_procedures']

# Treatment label: exactly one of insulin / metformin set, otherwise 'other'.
# The medication columns are 0/1 indicators, so compare against 0 and 1.
df['treatment'] = 'other'
df.loc[(df['insulin'] == 0) & (df['metformin'] == 1), 'treatment'] = 'metformin'
df.loc[(df['insulin'] == 1) & (df['metformin'] == 0), 'treatment'] = 'insulin'

feature_columns = ['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'time_in_hospital', 'num_procedures',
   'num_medications', 'number_visits', 'number_emergency', 'number_diagnoses',
    'max_glu_serum', 'A1Cresult', 'readmitted', 'treatment'] + MED_COLUMNS
# Dummy columns are named '<column>:<value>', e.g. 'readmitted:<30'.
df2 = pd.get_dummies(df[feature_columns], prefix_sep=':', columns=CATEGORICAL_COLUMNS)

In [None]:
# List the columns of the one-hot encoded frame (dummies are named '<column>:<value>').
df2.columns

In [None]:
# Write the encoded frame, without the index, to the working directory.
# This runs before the later row filters on `df2`, so 'other' rows are included.
# NOTE(review): 'casual' looks like a typo of 'causal'; the Propensity section
# reads the file back under this exact name, so rename both together if at all.
df2.to_csv('casual_diabetes.csv', index=False)

In [None]:
# From here on only the insulin and metformin rows are analysed.
is_other = df2['treatment'] == 'other'
df2 = df2[~is_other]

In [None]:
# Per-treatment subsets of the encoded frame.
ins = df2[df2['treatment'] == 'insulin']
met = df2[df2['treatment'] == 'metformin']

# For each numeric covariate print, in this order, its correlation with:
#   1. the insulin indicator            (all rows of df2)
#   2. the 'readmitted:<30' indicator   (all rows of df2)
#   3. the 'readmitted:<30' indicator   (insulin rows only)
#   4. the 'readmitted:<30' indicator   (metformin rows only)
comparisons = [
    (df2, 'insulin'),
    (df2, 'readmitted:<30'),
    (ins, 'readmitted:<30'),
    (met, 'readmitted:<30'),
]
for col in ['time_in_hospital', 'num_procedures','num_medications', 'number_visits', 'number_emergency','number_diagnoses']:
    print(col)
    for frame, target in comparisons:
        print(frame[target].corr(frame[col]))

### First Results 

In [None]:
# Column means per treatment group; the cells below display slices of this table.
means = df2.groupby('treatment').mean()

In [None]:
# Per-treatment means of the ten age-bin dummies: 'age:[0-10)' ... 'age:[90-100)'.
age_columns = [f'age:[{low}-{low + 10})' for low in range(0, 100, 10)]
means[age_columns]

In [None]:
# Remove rows in the [0-10) age bin, then recompute the per-treatment means.
in_youngest_bin = df2['age:[0-10)'] != 0
df2 = df2[~in_youngest_bin]
means = df2.groupby('treatment').mean()

In [None]:
# Per-treatment means of the race and gender dummies.
race_columns = ['race:?', 'race:AfricanAmerican', 'race:Asian', 'race:Caucasian',
                'race:Hispanic', 'race:Other']
gender_columns = ['gender:Female', 'gender:Male']
means[race_columns + gender_columns]

In [None]:
# Per-treatment means of the numeric covariates.
numeric_columns = [
    'time_in_hospital',
    'num_procedures',
    'num_medications',
    'number_visits',
    'number_emergency',
    'number_diagnoses',
]
means[numeric_columns]

In [None]:
# Per-treatment means of the max_glu_serum and A1Cresult dummies.
glucose_columns = ['max_glu_serum:>200', 'max_glu_serum:>300',
                   'max_glu_serum:None', 'max_glu_serum:Norm']
a1c_columns = ['A1Cresult:>7', 'A1Cresult:>8', 'A1Cresult:None', 'A1Cresult:Norm']
means[glucose_columns + a1c_columns]

In [None]:
# Per-treatment means of the first nine medication indicators.
first_med_columns = [
    'metformin', 'repaglinide', 'nateglinide',
    'chlorpropamide', 'glimepiride', 'acetohexamide',
    'glipizide', 'glyburide', 'tolbutamide',
]
means[first_med_columns]

In [None]:
# Per-treatment means of the next nine medication indicators, ending with insulin.
second_med_columns = [
    'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'insulin',
]
means[second_med_columns]

In [None]:
# Per-treatment share of each readmission outcome (mean of the three dummies).
outcome_columns = ['readmitted:<30', 'readmitted:>30', 'readmitted:NO']
means[outcome_columns]

### Propensity

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Reload the exported frame and build the treatment indicator:
# True = insulin, False = metformin, with 'other' rows dropped first.
data = pd.read_csv('casual_diabetes.csv')
assigned_treatment = data.loc[data['treatment'] != 'other', 'treatment']
treatments = (assigned_treatment == 'insulin').to_numpy()

# Propensity scores are read from column '0' of an externally produced file.
# NOTE(review): the two arrays must have the same length and row order for the
# masking in the next cell to be meaningful; the shapes are printed to check.
scores_table = pd.read_csv('project_code/propensity_scores.csv')
propensity_scores = scores_table['0'].to_numpy()
print(treatments.shape, propensity_scores.shape)

In [None]:
# Overlaid, density-normalised histograms of the propensity score per treatment:
# semi-transparent blue for insulin, semi-transparent red for metformin.
shared_hist_options = dict(bins=20, density=True)
insulin_scores = propensity_scores[treatments==1]
metformin_scores = propensity_scores[treatments==0]
treat_plt = plt.hist(insulin_scores, fc=(0, 0, 1, 0.5), label='Insulin', **shared_hist_options)
cont_plt = plt.hist(metformin_scores, fc=(1, 0, 0, 0.5), label='Metformin', **shared_hist_options)
# Trailing semicolons suppress the text repr of the returned objects.
plt.legend();
plt.xlabel('propensity score');
plt.ylabel('density');