## Library and Datasets

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

In [2]:
births = pd.read_csv('../data/processed/totDF_dwnsmpl_clean.csv')

## Functions

In [4]:
# Mask a category in a feature 
#Example remove all 'U' for target feature
def remove_category(dataframe, column = str, category = 'U'):
    df = dataframe.loc[dataframe[column] != category]
    return df

# Make a dataframe suitable for plotting a stacked bar graph with percentage of total as values for each sub category
def make_pctdf(dataframe , group = str, target = 'admit_NICU',  columns_titles = ['Y','N']):
    df1 = births_nou.loc[(births[group] == 'Y')].groupby([group])[[target]].count()
    df2 = births_nou.loc[(births[group] == 'Y')].groupby([group])[target].value_counts().unstack()
    df2 = df2.reindex(columns=columns_titles)
    df3 = pd.merge(df1,df2, left_index = True, right_index = True)
    pct_df = pd.DataFrame(list(map(lambda x: df3[x]/df3[target] * 100, df3.columns[1:])))
    return pct_df.T

#Make a dataframe suitable for plotting a bar graph with counts of one subcategory of feature
#For example inlcude only observations marked 'Y', exclude other values
def make_cntdf(dataframe , group = str, filter_output = 'Y', target = 'admit_NICU',  columns_titles = ['Y','N']):
    df1 = births_nou.loc[(births[group] == filter_output)].groupby([group])[[target]].count()
    return df1

#simple stacked bar graph with percentage of total as values for each sub category, for ease of examination
def pct_bplot(dataframe, group = str, target = 'admit_NICU', columns_titles = ['Y','N'] ):
    df1 = dataframe.groupby([group])[[target]].count()
    df2 =  dataframe.groupby([group])[target].value_counts().unstack()
    df2 = df2.reindex(columns=columns_titles)
    df3 = pd.merge(df1,df2, left_index = True, right_index = True)
    pct_df = pd.DataFrame(list(map(lambda x: df3[x]/df3[target] * 100, df3.columns[1:])))
    pct_df = pct_df.T
    pct_df.plot(kind = 'bar', stacked = True, legend = False)

#plot time series for target column
def plot_gpYR(df,col,xlab,ylab,title,kind='line'):
    groupDF = df.groupby('birth_year')[col].value_counts().unstack()
    plt.figure(figsize=(20,20))
    groupDF.plot(kind=kind,logy=True, legend=True)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.title(title)
    return
#Value counts for target column, and value counts for target column when group be year
def val_check(data_frame, column_name = str):
    df = pd.DataFrame(data_frame)
    col = column_name
    print( "Value counts of %s \n" %(col), df[col].value_counts())
    print("Value counts of %s by year \n" %(col), df.groupby(['birth_year'])[col].value_counts())
    
#save plots, run within plotting cells
def saveplt(filename):
    path = './Saved_Visualizations/'
    file1 = path + filename + '.pdf'
    file2 = path + filename + '.png'
    plt.savefig(file1) #as pdf
    plt.savefig(file2) #as png
    
def pct_bplot2(dataframe, group = str, target = 'admit_NICU', columns_titles = ['Y','N'] ):
    df1 = dataframe.groupby([group])[target].count()
    df2 =  dataframe.groupby([group])[target].value_counts().unstack()
    df2 = df2.reindex(columns=columns_titles)
    df3 = pd.merge(df1,df2, left_index = True, right_index = True)
    pct_df = pd.DataFrame(list(map(lambda x: df3[x]/df3[target], df3.columns[1:])))
    pct_df = pct_df.T
    #pct_df.plot(kind = 'bar', stacked = True, legend = True)
    return pct_df
   

## Group Lists and Recoded Columns


In [8]:
#Columns recoded for better display
births['mother_bmi_recode'] = pd.cut(births['mothers_bmi'],[10.0,19.0, 25.0,30.0,35.0,40.0,90.0], 
                                     labels = ['u','h','o','ob1','ob2','ob3'])

births['mothers_age_groups']= pd.cut(births['mothers_age'], 4, labels = ['12-19', '20-29', '30-39','40-50'])

births['mothers_age_groups2']= pd.cut(births['mothers_age'], [0,14,19,24,29,34,39,44,49,100],
                                     labels = ['<15', '15-19', '20-24','25-29','30-34',
                                               '35-39','40-44','45-49','50-100'])
births_nou['bw_recode'] = pd.cut(births_nou['birth_weight_gm'],
                                   [0,499, 999,1499,1999,2499,2999,3499,3999,4499,4999,8200],
                                labels = ['<500', '1000', '1500', '2000','2500','3000',
                                          '3500','4000','4500', '5000', '5500+'])

#Lists of Column names and Full Names for plots of related columns
lstmom_health_risks_cnames = ['gest_diab','pre_preg_diab', 'gest_hypten', 'pre_preg_hypten',
                              'hypten_ecl', 'prev_preterm_birth']
lstmom_health_risks_fullnames =['Gestational Diabetes', "Pre-pregancy Diabetes",'Gestational Hypertension',
                                'Pre-pregnancy Hypertension', 'Hypertension Eclampsia', 'Previous Pre-term Birth']

lstmom_infections_cnames = ['gonorrhea','syphilis', 'chlamydia', 'hepB', 'hepC']
lstmom_infections_fullnames =['Gonorrhea', "Syphilis",'Chlamydia', 'Hepatis B', 'Hepatitis C']

lstdelivery_type_cnames = ['infertility_treatment','fertil_enhance', 'asst_repro_tech']
lstdelivery_type_fullnames =['Infertility Treatment', "Fertility Enhancement",'Assistant Reproductive Technology']

lstmom_delivery_complication_cnames = ['perineal_laceration', 'rupt_uterus','unplanned_hyster', 'admit_to_IC', 
                                   'induced_labor', 'aug_labor', 'steriods', 'antibiotics', 'chorioamnionitis']
lstmom_delivery_complications_fullnames =['Perineal Laceration', 'Rupture Uterus','Unplanned Hysterectomy', 
                                      'Mother Admitted to IC','Induced Labor', 'AugLabor', 'Steroids',
                                      'Antibiotics','Chorioamnionitis' ]

lst_infant_delivery_complication_cnames = ['assist_vent_immed','assist_vent_after6',
                                           'surfactant', 'antibiotics_for_newborn']
lst_infant_delivery_complication_fullnames = ['Assist Ventilation Immediately','Assist Ventilation > 6hr', 
                                              'Surfactant', 'Antibiotics for Newborn']

lstinfant_congenital_cnames = ['seizures','anencephaly', 'meningo_spina_bif', 'cyn_cong_heart_disease',
                                'cong_diaph_hernia', 'omphalocele', 'gastroschisis','limb_reduc_defect', 
                                'cleft_lip_or_palate', 'cleft_palate_only','hypospadias']
lstinfant_congeital_fullnames = ['Seizures','Anencephaly', 'Mening-Spina Bifida', 'CCHD','Diaphrapgm Hernia', 
                                    'Omphalocele', 'Gastroschisis','Limb reduced Defect', 'Cleft Lip or Palate', 
                                    'Cleft Palate Only','Hypospadias']
lstplurality_fullname = ['1','2','3','4','5+']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
