Aggregates records district-wise and plots the per-district mean

In [1]:
import pandas as pd
import numpy as np
import random as rnd

# Name of the column whose per-group mean is computed and plotted by
# aggregate_district and aggregate_state (both read this module-level name).
attribute= 'Age'
def aggregate_district(data):
    """Plot the per-district mean of ``attribute`` and return ``data`` unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'district_code'`` column and the column named by the
        module-level ``attribute``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in; it is not modified.
    """
    subset = data[[attribute, 'district_code']]
    # One row per district holding the mean of `attribute` for that district.
    district_wise = subset.groupby('district_code').mean()

    ax = district_wise.plot(kind='line', figsize=(15, 3))
    # One tick per district, labelled with the district code.
    # NOTE(review): assumes the line is drawn at positions 0..n-1 (non-numeric
    # district codes) -- confirm if district_code can be numeric.
    ax.set_xticks(range(0, len(district_wise.index)))
    ax.set_xticklabels(district_wise.index.tolist(), rotation='vertical')

    # DataFrame.var() returns a one-element Series; pick the column so that a
    # plain scalar is formatted instead of relying on implicit float(Series).
    variance = district_wise[attribute].var()
    ax.set_title('District wise distribution (variance = %.4f)' % variance)
    return data





Aggregates records state-wise and plots the per-state mean

In [2]:
def aggregate_state(data):
    """Plot the per-state mean of ``attribute`` and return ``data`` unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'state_code'`` column and the column named by the
        module-level ``attribute``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in; it is not modified.
    """
    subset = data[[attribute, 'state_code']]
    # One row per state holding the mean of `attribute` for that state.
    state_wise = subset.groupby('state_code').mean()

    ax = state_wise.plot(kind='line', figsize=(15, 3))
    # One tick per state, labelled with the state code.
    # NOTE(review): assumes the line is drawn at positions 0..n-1 (non-numeric
    # state codes) -- confirm if state_code can be numeric.
    ax.set_xticks(range(0, len(state_wise.index)))
    ax.set_xticklabels(state_wise.index.tolist(), rotation='vertical')

    # DataFrame.var() returns a one-element Series; pick the column so that a
    # plain scalar is formatted instead of relying on implicit float(Series).
    variance = state_wise[attribute].var()
    # The plot is grouped by state, so the title says so (it previously read
    # "District wise", copied from aggregate_district).
    ax.set_title('State wise distribution (variance = %.4f)' % variance)
    return data


Draws a sample of approximately 20000 rows, taking rows from each district in proportion to its size

In [3]:
def sampling(data_frame, sample_size=20000, random_state=None):
    """Draw a sample of about ``sample_size`` rows, proportional per district.

    Every distinct ``district_code`` contributes
    ``int(district_rows * sample_size / total_rows)`` rows drawn without
    replacement. Because each quota is truncated, the result may hold slightly
    fewer than ``sample_size`` rows.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Input rows; must contain a ``'district_code'`` column.
    sample_size : int, optional
        Target number of rows in the sample (default 20000).
    random_state : int or numpy.random.RandomState, optional
        Seed or generator for reproducible sampling. ``None`` (default) keeps
        pandas' default random source.

    Returns
    -------
    pandas.DataFrame
        Sampled rows with the same columns as ``data_frame`` and a fresh
        0..n-1 index. ``data_frame`` itself is not modified.
    """
    total_rows = data_frame.shape[0]

    # Turn an integer seed into one generator shared by all districts;
    # re-seeding with the same integer for every district would correlate draws.
    rng = random_state
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)

    sampled_parts = []
    for district in data_frame['district_code'].unique():
        district_rows = data_frame.loc[data_frame['district_code'] == district]
        district_size = district_rows.shape[0]
        # Proportional quota, capped at the district size so that a
        # sample_size larger than the input does not make .sample() raise.
        quota = min(int(district_size * sample_size / total_rows), district_size)
        if quota > 0:
            sampled_parts.append(district_rows.sample(n=quota, random_state=rng))

    if not sampled_parts:
        # Empty input, or every quota truncated to zero: empty frame, same columns.
        return data_frame.iloc[:0].copy()

    # Concatenate once instead of appending inside the loop: DataFrame.append
    # was removed in pandas 2.0 and re-copied the accumulated frame each time.
    return pd.concat(sampled_parts, ignore_index=True)


Feature creation

In [4]:

def feature_creation(data_frame):
    """Add a ``'BMI'`` column to ``data_frame`` in place and return the frame.

    BMI is ``Weight_in_kg`` divided by the square of ``Length_height_cm``
    converted from centimetres to metres.
    """
    height_m = data_frame['Length_height_cm'] / 100
    squared_height = height_m * height_m
    data_frame['BMI'] = data_frame['Weight_in_kg'] / squared_height
    return data_frame


Feature subset selection. Removes attributes that are highly correlated

In [5]:
def feature_subset(data_frame, threshold=0.9, verbose=True):
    """Drop columns that are highly correlated with an earlier column.

    The absolute correlation matrix of the numeric/boolean columns is computed
    and only its strict upper triangle is inspected. A column is dropped when
    its absolute correlation with any column to its left exceeds ``threshold``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to prune. It is modified in place.
    threshold : float, optional
        Absolute-correlation cut-off (default 0.9).
    verbose : bool, optional
        When true (default), print the upper-triangle correlation matrix.

    Returns
    -------
    tuple
        ``(data_frame, to_drop)``: the same frame object that was passed in
        and the list of column names that were removed.
    """
    # Restrict to numeric/bool columns explicitly: pandas >= 2.0 raises on
    # string columns instead of silently skipping them in corr().
    numeric_columns = data_frame.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_columns.corr().abs()

    # Keep the strict upper triangle so each pair is considered once and the
    # diagonal is ignored. The builtin bool replaces np.bool, which NumPy 1.24
    # removed.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    if verbose:
        print(upper)

    # NaN entries (masked cells) compare as False, so they never trigger a drop.
    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
    data_frame.drop(to_drop, axis=1, inplace=True)

    return data_frame, to_drop

In [13]:
# Add the BMI column to the sampled frame and display the result.
# NOTE(review): `sampled_df` is not assigned in any cell visible here --
# presumably it came from an earlier sampling(...) call; confirm this cell
# still works after Restart Kernel -> Run All.
feature_creation_df=feature_creation(sampled_df)
feature_creation_df


Unnamed: 0,state_code,district_code,rural_urban,stratum,PSU_ID,ahs_house_unit,house_hold_no,date_survey,test_salt_iodine,record_code_iodine,...,BP_systolic,BP_systolic_2_reading,BP_Diastolic,BP_Diastolic_2reading,Pulse_rate,Pulse_rate_2_reading,Diabetes_test,fasting_blood_glucose,fasting_blood_glucose_mg_dl,BMI
0,JHARKHAND,DHANBAD,Rural,population>=2000,1619324,90,4,12-12-14,30,More than or equal to 15 PPM,...,123.885869,122.520051,80.015751,79.010441,84.052381,83.334482,YES,Measured,93.881913,13.170756
1,JHARKHAND,DHANBAD,Urban,Urban,1615722,81,1,15-09-14,30,More than or equal to 15 PPM,...,113.000000,115.000000,69.000000,63.000000,80.000000,77.000000,YES,Measured,87.000000,20.870295
2,JHARKHAND,DHANBAD,Rural,population>=2000,1618831,52,3,16-12-14,33,More than or equal to 15 PPM,...,149.000000,145.000000,120.000000,115.000000,85.000000,80.000000,YES,Measured,95.000000,25.971701
3,JHARKHAND,DHANBAD,Urban,Urban,1615710,15,1,13-09-14,7,Less than 15 PPM,...,123.885869,122.520051,80.015751,79.010441,84.052381,83.334482,YES,Measured,93.881913,19.840923
4,JHARKHAND,DHANBAD,Urban,Urban,1615117,294,3,12-06-14,7,Less than 15 PPM,...,116.000000,118.000000,87.000000,88.000000,110.000000,112.000000,YES,Measured,99.000000,22.687082
5,JHARKHAND,DHANBAD,Urban,Urban,1616991,124,1,13-11-14,0,No Iodine,...,123.885869,122.520051,80.015751,79.010441,84.052381,83.334482,YES,Measured,93.881913,13.916291
6,JHARKHAND,DHANBAD,Rural,200<population<2000,1619412,33,1,24-12-14,0,No Iodine,...,140.000000,135.000000,80.000000,84.000000,77.000000,80.000000,YES,Measured,84.000000,16.700808
7,JHARKHAND,DHANBAD,Rural,population>=2000,1618467,127,2,12-10-14,30,More than or equal to 15 PPM,...,123.885869,122.520051,80.015751,79.010441,84.052381,83.334482,YES,Measured,93.881913,15.938751
8,JHARKHAND,DHANBAD,Urban,Urban,1615565,97,1,16-09-14,7,Less than 15 PPM,...,142.000000,135.000000,93.000000,83.000000,105.000000,106.000000,YES,Measured,96.000000,42.603309
9,JHARKHAND,DHANBAD,Rural,200<population<2000,1619387,6,2,23-12-14,15,More than or equal to 15 PPM,...,100.000000,110.000000,66.000000,70.000000,70.000000,73.000000,YES,Measured,90.000000,19.579303


In [14]:
# Remove highly correlated columns; `dropped` receives the removed column names.
# feature_subset drops columns in place and returns the same object, so
# feature_creation_df loses those columns as well.
feature_subset_df, dropped = feature_subset(feature_creation_df)
feature_subset_df


                             Age  date_of_birth  month_of_birth  \
Age                          NaN       0.266503        0.298070   
date_of_birth                NaN            NaN        0.422943   
month_of_birth               NaN            NaN             NaN   
year_of_birth                NaN            NaN             NaN   
Weight_in_kg                 NaN            NaN             NaN   
Length_height_cm             NaN            NaN             NaN   
Haemoglobin_level            NaN            NaN             NaN   
BP_systolic                  NaN            NaN             NaN   
BP_systolic_2_reading        NaN            NaN             NaN   
BP_Diastolic                 NaN            NaN             NaN   
BP_Diastolic_2reading        NaN            NaN             NaN   
Pulse_rate                   NaN            NaN             NaN   
Pulse_rate_2_reading         NaN            NaN             NaN   
fasting_blood_glucose_mg_dl  NaN            NaN             Na

Unnamed: 0,state_code,district_code,rural_urban,stratum,PSU_ID,ahs_house_unit,house_hold_no,date_survey,test_salt_iodine,record_code_iodine,...,Length_height_cm,Haemoglobin_test,Haemoglobin,Haemoglobin_level,BP_systolic,BP_Diastolic,Pulse_rate,Diabetes_test,fasting_blood_glucose,fasting_blood_glucose_mg_dl
0,JHARKHAND,DHANBAD,Rural,population>=2000,1619324,90,4,12-12-14,30,More than or equal to 15 PPM,...,103.100000,YES,Measured,8.700000,123.885869,80.015751,84.052381,YES,Measured,93.881913
1,JHARKHAND,DHANBAD,Urban,Urban,1615722,81,1,15-09-14,30,More than or equal to 15 PPM,...,155.399990,YES,Measured,13.300000,113.000000,69.000000,80.000000,YES,Measured,87.000000
2,JHARKHAND,DHANBAD,Rural,population>=2000,1618831,52,3,16-12-14,33,More than or equal to 15 PPM,...,158.200000,YES,Measured,8.700000,149.000000,120.000000,85.000000,YES,Measured,95.000000
3,JHARKHAND,DHANBAD,Urban,Urban,1615710,15,1,13-09-14,7,Less than 15 PPM,...,138.855232,YES,Measured,10.196373,123.885869,80.015751,84.052381,YES,Measured,93.881913
4,JHARKHAND,DHANBAD,Urban,Urban,1615117,294,3,12-06-14,7,Less than 15 PPM,...,148.899990,YES,Measured,12.700000,116.000000,87.000000,110.000000,YES,Measured,99.000000
5,JHARKHAND,DHANBAD,Urban,Urban,1616991,124,1,13-11-14,0,No Iodine,...,110.200000,YES,Measured,7.000000,123.885869,80.015751,84.052381,YES,Measured,93.881913
6,JHARKHAND,DHANBAD,Rural,200<population<2000,1619412,33,1,24-12-14,0,No Iodine,...,136.899990,YES,Measured,8.000000,140.000000,80.000000,77.000000,YES,Measured,84.000000
7,JHARKHAND,DHANBAD,Rural,population>=2000,1618467,127,2,12-10-14,30,More than or equal to 15 PPM,...,84.199997,YES,Measured,8.700000,123.885869,80.015751,84.052381,YES,Measured,93.881913
8,JHARKHAND,DHANBAD,Urban,Urban,1615565,97,1,16-09-14,7,Less than 15 PPM,...,140.500000,YES,Measured,9.200000,142.000000,93.000000,105.000000,YES,Measured,96.000000
9,JHARKHAND,DHANBAD,Rural,200<population<2000,1619387,6,2,23-12-14,15,More than or equal to 15 PPM,...,155.100010,YES,Refused,10.196373,100.000000,66.000000,70.000000,YES,Measured,90.000000


In [15]:
# Show the column names removed by the feature_subset call above.
dropped

[u'year_of_birth',
 u'BP_systolic_2_reading',
 u'BP_Diastolic_2reading',
 u'Pulse_rate_2_reading',
 'BMI']

In [8]:
def runMain():
    """Run sampling, BMI creation and correlation pruning for every state.

    For each state name, the cleaned CSV is read, passed through ``sampling``,
    ``feature_creation`` and ``feature_subset``, and the resulting frame is
    written to the pre-processed data directory without the index column.
    """
    state_names = ["Bihar", "Chhattisgarh", "Jharkhand", "MadhyaPradesh", "Odisha", "Uttarakhand", "UttarPradesh"]

    # Input files are named <prefix><state><suffix>; outputs reuse the suffix.
    input_prefix = "./../Data/cleanedData/cleanedmerged"
    csv_suffix = ".csv"

    for state_name in state_names:
        source_path = input_prefix + state_name + csv_suffix
        target_path = './../Data/PreProcessedData/preprocessed' + state_name + csv_suffix

        sampled_frame = sampling(pd.read_csv(source_path))
        # The list of dropped columns is not needed here, only the pruned frame.
        processed_frame, _dropped_columns = feature_subset(feature_creation(sampled_frame))
        processed_frame.to_csv(target_path, index=False)

In [9]:
# Process every state; feature_subset prints one correlation matrix per state,
# which produces the long text output below.
runMain()

                             identification_code       Age  date_of_birth  \
identification_code                          NaN  0.098877       0.023622   
Age                                          NaN       NaN       0.008162   
date_of_birth                                NaN       NaN            NaN   
month_of_birth                               NaN       NaN            NaN   
year_of_birth                                NaN       NaN            NaN   
Weight_in_kg                                 NaN       NaN            NaN   
Length_height_cm                             NaN       NaN            NaN   
Haemoglobin_level                            NaN       NaN            NaN   
BP_systolic                                  NaN       NaN            NaN   
BP_systolic_2_reading                        NaN       NaN            NaN   
BP_Diastolic                                 NaN       NaN            NaN   
BP_Diastolic_2reading                        NaN       NaN            NaN   

                             Age  date_of_birth  month_of_birth  \
Age                          NaN       0.040171        0.115545   
date_of_birth                NaN            NaN        0.320156   
month_of_birth               NaN            NaN             NaN   
year_of_birth                NaN            NaN             NaN   
Weight_in_kg                 NaN            NaN             NaN   
Length_height_cm             NaN            NaN             NaN   
Haemoglobin_level            NaN            NaN             NaN   
BP_systolic                  NaN            NaN             NaN   
BP_systolic_2_reading        NaN            NaN             NaN   
BP_Diastolic                 NaN            NaN             NaN   
BP_Diastolic_2reading        NaN            NaN             NaN   
Pulse_rate                   NaN            NaN             NaN   
Pulse_rate_2_reading         NaN            NaN             NaN   
fasting_blood_glucose_mg_dl  NaN            NaN             Na

                             identification_code       Age  date_of_birth  \
identification_code                          NaN  0.111154       0.088758   
Age                                          NaN       NaN       0.031948   
date_of_birth                                NaN       NaN            NaN   
month_of_birth                               NaN       NaN            NaN   
year_of_birth                                NaN       NaN            NaN   
Weight_in_kg                                 NaN       NaN            NaN   
Length_height_cm                             NaN       NaN            NaN   
Haemoglobin_level                            NaN       NaN            NaN   
BP_systolic                                  NaN       NaN            NaN   
BP_systolic_2_reading                        NaN       NaN            NaN   
BP_Diastolic                                 NaN       NaN            NaN   
BP_Diastolic_2reading                        NaN       NaN            NaN   

                             Age  date_of_birth  month_of_birth  \
Age                          NaN       0.316466        0.279346   
date_of_birth                NaN            NaN        0.381839   
month_of_birth               NaN            NaN             NaN   
year_of_birth                NaN            NaN             NaN   
Weight_in_kg                 NaN            NaN             NaN   
Length_height_cm             NaN            NaN             NaN   
Haemoglobin_level            NaN            NaN             NaN   
BP_systolic                  NaN            NaN             NaN   
BP_systolic_2_reading        NaN            NaN             NaN   
BP_Diastolic                 NaN            NaN             NaN   
BP_Diastolic_2reading        NaN            NaN             NaN   
Pulse_rate                   NaN            NaN             NaN   
Pulse_rate_2_reading         NaN            NaN             NaN   
fasting_blood_glucose_mg_dl  NaN            NaN             Na

                             Age  date_of_birth  month_of_birth  \
Age                          NaN       0.029048        0.102033   
date_of_birth                NaN            NaN        0.288526   
month_of_birth               NaN            NaN             NaN   
year_of_birth                NaN            NaN             NaN   
Weight_in_kg                 NaN            NaN             NaN   
Length_height_cm             NaN            NaN             NaN   
Haemoglobin_level            NaN            NaN             NaN   
BP_systolic                  NaN            NaN             NaN   
BP_systolic_2_reading        NaN            NaN             NaN   
BP_Diastolic                 NaN            NaN             NaN   
BP_Diastolic_2reading        NaN            NaN             NaN   
Pulse_rate                   NaN            NaN             NaN   
Pulse_rate_2_reading         NaN            NaN             NaN   
fasting_blood_glucose_mg_dl  NaN            NaN             Na