# Data Exploration - Part I: Summary Statistics

1. The type of each attribute is determined out of {Nominal, Ordinal, Interval, Ratio} and {Discrete, Continuous}. 
2. The appropriate statistical measure is chosen for that attribute.

Attribute set : Type set : Statistical Measure set
1. {district_code, rural_urban, stratum, PSU_ID, ahs_house_unit, house_hold_no, date_survey, record_code_iodine, sl_no, Sex, usual_residance, identification_code, Age_Code, Weight_measured, Length_height_measured, length_height_code, Haemoglobin_test, Haemoglobin, Diabetes_test, fasting_blood_glucose} : {Nominal, Discrete} : {Frequency, Mode, Chi-Square test}

2. {date_of_birth, month_of_birth, year_of_birth} : {Ordinal, Discrete} : {Range, Median, Percentile}

3. {Age, Pulse_rate, Pulse_rate_2_reading} : {Ratio, Discrete} : {Range, Percentile, Mean, Median, Covariance, Correlation}

4. {test_salt_iodine, Weight_in_kg, Length_height_cm, Haemoglobin_level, BP_systolic, BP_systolic_2_reading, BP_Diastolic, BP_Diastolic_2reading, fasting_blood_glucose_mg_dl} : {Ratio, Continuous} : {Range, Percentile, Mean, Median, Covariance, Correlation}

Importing the required libraries

In [37]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates
import numpy as np

Method to compute the summary statistics of nominal attributes 

In [38]:
def performSummaryStatisticsNominal (dataFrame):
    """Print the frequency table and the mode of every column.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame whose columns are all treated as nominal attributes.

    Returns
    -------
    None
        The statistics are written to standard output only.
    """
    print("\n\n\n\nSummary statistics for nominal attributes: ")
    for attribute in dataFrame.columns:
        print("\n\nAttribute Name: " + attribute)
        values = dataFrame[attribute]
        # One print call, newline-separated, yields the same four lines
        # (label, counts, label, mode) as four separate print calls.
        print("Frequency:", values.value_counts(), "Mode:", values.mode(), sep="\n")


Method to compute the summary statistics of ordinal attributes 

In [39]:
def performSummaryStatisticsOrdinal (dataFrame):
    """Print range, median and deciles of every column.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame whose columns are all treated as ordinal attributes. The frame
        is not modified.

    Returns
    -------
    None
        The statistics are written to standard output only.
    """
    deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    print("\n\n\n\nSummary statistics for ordinal attributes: ")
    for col in dataFrame.columns:
        values = dataFrame[col]
        print("\n\nAttribute Name: "+col)
        #Range
        print("Range: ")
        print("Min: "+str(values.min()))
        print("Max: "+str(values.max()))
        #Median
        print("Median: "+str(values.median()))
        #Percentile
        print("Percentile: ")
        # DataFrame.astype returns a new object, so the earlier bare
        # `dataFrame.astype({col: float})` cast nothing and only built a
        # throw-away copy of the whole frame. Cast just the series that is
        # handed to quantile(); min/max/median keep the column's own dtype.
        print(values.astype(float).quantile(deciles))



Method to compute the summary statistics of ratio attributes

In [40]:
def performSummaryStatisticsRatio (dataFrame): 
    """Print per-column statistics, then covariance and correlation matrices.

    For every column the minimum, maximum, mean (rounded to 6 decimals),
    median and deciles are printed. Afterwards the pairwise covariance and
    correlation of all columns are printed once for the whole frame.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame whose columns are all treated as ratio attributes. The frame is
        not modified.

    Returns
    -------
    None
        The statistics are written to standard output only.
    """
    deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    print("\n\n\n\nSummary statistics for ratio attributes: ")
    for col in dataFrame.columns:
        values = dataFrame[col]
        print("\n\nAttribute Name: "+col)
        #Range
        print("Range: ")
        print("Min: "+str(values.min()))
        print("Max: "+str(values.max()))
        #Mean
        print("Mean: "+str(round(values.mean(),6))) 
        #Median
        print("Median: "+str(values.median()))
        #Percentile
        print("Percentile: ")
        # DataFrame.astype returns a new object, so the earlier bare
        # `dataFrame.astype({col: float})` cast nothing and only built a
        # throw-away copy of the whole frame. Cast just the series that is
        # handed to quantile(); the other statistics keep the column's dtype.
        print(values.astype(float).quantile(deciles))
    #Covariance
    print('\n\nCovariance:')
    print(dataFrame.cov())
    #Correlation
    print("\n\nCorrelation")
    print(dataFrame.corr())



Method to compute the summary statistics of data

In [41]:
def performSummaryStatistics(filePath, nominalAttr, ordinalAttr, ratioAttr):
    """Load one CSV file and print summary statistics per attribute type.

    Parameters
    ----------
    filePath : str
        Path of the CSV file to read.
    nominalAttr, ordinalAttr, ratioAttr : list
        Column selections handed to ``pandas.read_csv(usecols=...)`` for the
        nominal, ordinal and ratio reports respectively.

    Returns
    -------
    None
    """
    # All three subsets are read before any report is printed, so a missing
    # column aborts the run before partial output is produced.
    nominalFrame, ordinalFrame, ratioFrame = [
        pd.read_csv(filePath, usecols=wantedColumns)
        for wantedColumns in (nominalAttr, ordinalAttr, ratioAttr)
    ]

    performSummaryStatisticsNominal(nominalFrame)
    performSummaryStatisticsOrdinal(ordinalFrame)
    performSummaryStatisticsRatio(ratioFrame)

Main function for summary statistics

In [42]:
def runMainSummaryStatistics():
    """Print the summary statistics of every state's cleaned data file.

    The file of a state is located at
    ``./../Data/cleanedData/cleanedmerged<State>.csv``.
    """
    # Attribute names grouped by measurement type
    nominalAttributes=["district_code", "rural_urban", "stratum", "PSU_ID", "ahs_house_unit", "house_hold_no", "date_survey", "record_code_iodine", "sl_no", "Sex", "usual_residance", "identification_code", "Age_Code", "Weight_measured", "Length_height_measured", "length_height_code", "Haemoglobin_test", "Haemoglobin", "Diabetes_test", "fasting_blood_glucose"]
    ordinalAttributes=["date_of_birth", "month_of_birth", "year_of_birth"]
    ratioAttributes=["Age", "Pulse_rate", "Pulse_rate_2_reading", "test_salt_iodine", "Weight_in_kg", "Length_height_cm", "Haemoglobin_level", "BP_systolic", "BP_systolic_2_reading", "BP_Diastolic", "BP_Diastolic_2reading", "fasting_blood_glucose_mg_dl"]

    # Pieces surrounding the state name in each data file path
    pathHead = "./../Data/cleanedData/cleanedmerged"
    pathTail = ".csv"

    # One report per state, in this fixed order
    for stateName in ("Bihar", "Chhattisgarh", "Jharkhand", "MadhyaPradesh", "Odisha", "Uttarakhand", "UttarPradesh"):
        print("\n\n\n\nSummary statistics for "+ stateName)
        stateFilePath = pathHead + stateName + pathTail
        performSummaryStatistics(stateFilePath, nominalAttributes, ordinalAttributes, ratioAttributes)

Runnable

In [43]:
# Run Part I: print the summary statistics for every state.
runMainSummaryStatistics()





Summary statistics for Bihar




Summary statistics for nominal attributes: 


Attribute Name: district_code
Frequency:
PURBA CHAMPARAN       6922
BEGUSARAI             5814
JEHANABAD             5146
NAWADA                5025
LAKHISARAI            4927
NALANDA               4792
BHAGALPUR             4588
BHOJPUR               4430
MUNGER                4227
MADHUBANI             4143
PATNA                 4119
KAIMUR (BHABUA)       4044
BANKA                 4008
SIWAN                 3757
DARBHANGA             3666
ROHTAS                3497
BUXAR                 3445
ARARIA                3430
SHEOHAR               3428
SITAMARHI             3415
AURANGABAD            3349
GAYA                  3172
SHEIKHPURA            3063
JAMUI                 2989
PASHCHIM CHAMPARAN    2984
SUPAUL                2944
KHAGARIA              2688
MADHEPURA             2656
KATIHAR               2400
KISHANGANJ            2389
PURNIA                2385
GOPALGANJ             2009
SAHARSA     

Year      127938
Months       100
Days           5
Name: Age_Code, dtype: int64
Mode:
0    Year
dtype: object


Attribute Name: Weight_measured
Frequency:
Measured    128003
Other           34
Refused          6
Name: Weight_measured, dtype: int64
Mode:
0    Measured
dtype: object


Attribute Name: Length_height_measured
Frequency:
Measured    127998
Other           35
Refused         10
Name: Length_height_measured, dtype: int64
Mode:
0    Measured
dtype: object


Attribute Name: length_height_code
Frequency:
Height    127967
Length        76
Name: length_height_code, dtype: int64
Mode:
0    Height
dtype: object


Attribute Name: Haemoglobin_test
Frequency:
YES    119985
NO       8058
Name: Haemoglobin_test, dtype: int64
Mode:
0    YES
dtype: object


Attribute Name: Haemoglobin
Frequency:
Measured    112894
Refused      15143
Other            6
Name: Haemoglobin, dtype: int64
Mode:
0    Measured
dtype: object


Attribute Name: Diabetes_test
Frequency:
YES    124558
NO       3485
Name





Summary statistics for nominal attributes: 


Attribute Name: district_code
Frequency:
KORIYA              7931
SURGUJA             6988
DHAMTARI            6596
BILASPUR            6371
JANJGIR - CHAMPA    6226
RAJNANDGAON         6013
MAHASAMUND          5999
KAWARDHA            5982
RAIPUR              5774
KANKER              5601
RAIGARH             5307
BASTAR              5049
KORBA               4849
JASHPUR             4742
DANTEWADA           4261
DURG                4175
Name: district_code, dtype: int64
Mode:
0    KORIYA
dtype: object


Attribute Name: rural_urban
Frequency:
Rural    73894
Urban    17970
Name: rural_urban, dtype: int64
Mode:
0    Rural
dtype: object


Attribute Name: stratum
Frequency:
200<population<2000    38660
population>=2000       35234
Urban                  17970
Name: stratum, dtype: int64
Mode:
0    200<population<2000
dtype: object


Attribute Name: PSU_ID
Frequency:
1443839    1
1456914    1
1514230    1
1512183    1
1526524    1
1528575    

0.1    67.00000
0.2    73.00000
0.3    77.10864
0.4    77.10864
0.5    77.10864
0.6    77.10864
0.7    77.10864
0.8    81.00000
0.9    87.00000
Name: BP_Diastolic, dtype: float64


Attribute Name: BP_Diastolic_2reading
Range: 
Min: 40.0
Max: 122.0
Mean: 77.720502
Median: 77.72168790574658
Percentile: 
0.1    69.000000
0.2    74.000000
0.3    77.721688
0.4    77.721688
0.5    77.721688
0.6    77.721688
0.7    77.721688
0.8    81.000000
0.9    87.000000
Name: BP_Diastolic_2reading, dtype: float64


Attribute Name: Pulse_rate
Range: 
Min: 40.0
Max: 140.0
Mean: 79.660495
Median: 79.66472309048649
Percentile: 
0.1    70.000000
0.2    75.000000
0.3    79.664723
0.4    79.664723
0.5    79.664723
0.6    79.664723
0.7    79.664723
0.8    82.000000
0.9    90.000000
Name: Pulse_rate, dtype: float64


Attribute Name: Pulse_rate_2_reading
Range: 
Min: 40.0
Max: 138.0
Mean: 80.309786
Median: 80.31317624428951
Percentile: 
0.1    71.000000
0.2    76.000000
0.3    80.313176
0.4    80.313176
0.5    80.

  exec(code_obj, self.user_global_ns, self.user_ns)






Summary statistics for nominal attributes: 


Attribute Name: district_code
Frequency:
RANCHI                 6292
BOKARO                 6175
PURBI SINGHBHUM        5874
LOHARDAGA              5692
DUMKA                  5424
PASHCHIMI SINGHBHUM    5080
GUMLA                  4780
DHANBAD                2427
HAZARIBAGH             1675
SAHIBGANJ              1484
CHATRA                 1475
KODARMA                1371
PAKAUR                 1291
GARHWA                 1242
DEOGHAR                 964
PALAMU                  826
GIRIDIH                 806
GODDA                   621
Name: district_code, dtype: int64
Mode:
0    RANCHI
dtype: object


Attribute Name: rural_urban
Frequency:
Rural    40117
Urban    13382
Name: rural_urban, dtype: int64
Mode:
0    Rural
dtype: object


Attribute Name: stratum
Frequency:
population>=2000       23106
200<population<2000    17011
Urban                  13382
Name: stratum, dtype: int64
Mode:
0    population>=2000
dtype: object


Attribute

0    YES
dtype: object


Attribute Name: fasting_blood_glucose
Frequency:
Measured    52917
Refused       342
Other         240
Name: fasting_blood_glucose, dtype: int64
Mode:
0    Measured
dtype: object




Summary statistics for ordinal attributes: 


Attribute Name: date_of_birth
Range: 
Min: 1.0
Max: 31.0
Median: 1.0
Percentile: 
0.1     1.0
0.2     1.0
0.3     1.0
0.4     1.0
0.5     1.0
0.6     4.0
0.7    10.0
0.8    15.0
0.9    22.0
Name: date_of_birth, dtype: float64


Attribute Name: month_of_birth
Range: 
Min: 1.0
Max: 12.0
Median: 3.0
Percentile: 
0.1     1.0
0.2     1.0
0.3     1.0
0.4     1.0
0.5     3.0
0.6     5.0
0.7     7.0
0.8     8.0
0.9    10.0
Name: month_of_birth, dtype: float64


Attribute Name: year_of_birth
Range: 
Min: 1903.0
Max: 2013.0
Median: 1993.0
Percentile: 
0.1    1960.0
0.2    1972.0
0.3    1981.0
0.4    1988.0
0.5    1993.0
0.6    1997.0
0.7    2001.0
0.8    2005.0
0.9    2009.0
Name: year_of_birth, dtype: float64




Summary statistics for ratio att

                             test_salt_iodine       Age  Weight_in_kg  \
test_salt_iodine                     1.000000 -0.060149     -0.008808   
Age                                 -0.060149  1.000000      0.327924   
Weight_in_kg                        -0.008808  0.327924      1.000000   
Length_height_cm                    -0.049536  0.532324      0.515925   
Haemoglobin_level                    0.007397  0.055602      0.106751   
BP_systolic                          0.002514  0.188563      0.067255   
BP_systolic_2_reading               -0.008993  0.190931      0.064497   
BP_Diastolic                        -0.005426  0.083533      0.062754   
BP_Diastolic_2reading               -0.020169  0.090864      0.068514   
Pulse_rate                          -0.012856 -0.020939     -0.031928   
Pulse_rate_2_reading                -0.027696 -0.020791     -0.029789   
fasting_blood_glucose_mg_dl          0.008825  0.107946      0.035361   

                             Length_height_cm  Hae

1.0        257
2.0        210
3.0        201
4.0        185
160.0      164
78.0       162
158.0      162
5.0        162
56.0       162
77.0       158
79.0       158
76.0       155
11.0       154
54.0       153
156.0      151
68.0       151
57.0       151
12.0       150
34.0       150
13.0       150
19.0       150
14.0       148
80.0       148
70.0       148
24.0       146
89.0       146
31.0       146
16.0       146
15.0       146
25.0       146
          ... 
10146.0      1
6367.0       1
54805.0      1
7983.0       1
10112.0      1
4920.0       1
4581.0       1
7975.0       1
9524.0       1
4078.0       1
6648.0       1
9256.0       1
8082.0       1
4922.0       1
5618.0       1
4815.0       1
5921.0       1
7070.0       1
5132.0       1
6204.0       1
6781.0       1
8084.0       1
8358.0       1
6885.0       1
9252.0       1
9042.0       1
4233.0       1
3414.0       1
4808.0       1
4152.0       1
Name: identification_code, Length: 7677, dtype: int64
Mode:
0    1.0
dtype: float64







Summary statistics for nominal attributes: 


Attribute Name: district_code
Frequency:
KORAPUT           7305
JAGATSINGHAPUR    6640
NABARANGAPUR      6576
SONAPUR           6510
PURI              5910
MALKANGIRI        5772
KALAHANDI         5665
RAYAGADA          5615
GANJAM            5412
GAJAPATI          5398
NUAPADA           4478
BAUDH             4428
NAYAGARH          4384
KANDHAMAL         4298
KHORDHA           3938
CUTTACK           3747
KENDRAPARA        2279
BHADRAK           2136
JHARSUGUDA        1979
SUNDARGARH        1909
DEBAGARH          1616
JAJAPUR           1348
BALANGIR          1344
MAYURBHANJ        1327
BALESHWAR         1320
KENDUJHAR         1315
SAMBALPUR         1269
DHENKANAL         1159
ANUGUL            1128
BARGARH           1114
Name: district_code, dtype: int64
Mode:
0    KORAPUT
dtype: object


Attribute Name: rural_urban
Frequency:
Rural    93136
Urban    14183
Name: rural_urban, dtype: int64
Mode:
0    Rural
dtype: object


Attribute Name: 

0.1     5.0
0.2     9.0
0.3    13.0
0.4    18.0
0.5    24.0
0.6    30.0
0.7    37.0
0.8    45.0
0.9    58.0
Name: Age, dtype: float64


Attribute Name: Weight_in_kg
Range: 
Min: 2.5999999
Max: 162.39999
Mean: 40.240195
Median: 41.69471919586711
Percentile: 
0.1    14.500000
0.2    22.500000
0.3    35.500000
0.4    41.400002
0.5    41.694719
0.6    43.900002
0.7    48.299999
0.8    52.799999
0.9    59.900002
Name: Weight_in_kg, dtype: float64


Attribute Name: Length_height_cm
Range: 
Min: 45.299999
Max: 199.89999
Mean: 141.398588
Median: 146.5
Percentile: 
0.1    103.300000
0.2    126.100000
0.3    143.200000
0.4    143.849247
0.5    146.500000
0.6    150.600010
0.7    154.300000
0.8    158.500000
0.9    163.399990
Name: Length_height_cm, dtype: float64


Attribute Name: Haemoglobin_level
Range: 
Min: 4.0
Max: 17.0
Mean: 10.552586
Median: 10.552585690157708
Percentile: 
0.1     8.100000
0.2     9.100000
0.3    10.000000
0.4    10.500000
0.5    10.552586
0.6    10.552586
0.7    11.00000





Summary statistics for nominal attributes: 


Attribute Name: district_code
Frequency:
GARHWAL              9943
HARDWAR              7437
TEHRI GARHWAL        5844
UTTARKASHI           5510
DEHRADUN             4146
CHAMPAWAT            3660
NAINITAL             3528
BAGESHWAR            3224
UDHAM SINGH NAGAR    3175
ALMORA               2236
PITHORAGARH          2088
Name: district_code, dtype: int64
Mode:
0    GARHWAL
dtype: object


Attribute Name: rural_urban
Frequency:
Rural    40452
Urban    10339
Name: rural_urban, dtype: int64
Mode:
0    Rural
dtype: object


Attribute Name: stratum
Frequency:
200<population<2000    23459
population>=2000       16993
Urban                  10339
Name: stratum, dtype: int64
Mode:
0    200<population<2000
dtype: object


Attribute Name: PSU_ID
Frequency:
2887678    1
2890262    1
2841086    1
2834941    1
2836988    1
2847227    1
2849274    1
2843129    1
2845176    1
2822647    1
2832882    1
2828784    1
2867693    1
2869740    1
2879979

Min: 50.0
Max: 300.0
Mean: 98.332745
Median: 98.33424452349749
Percentile: 
0.1     89.000000
0.2     94.000000
0.3     98.000000
0.4     98.334245
0.5     98.334245
0.6     98.334245
0.7     98.334245
0.8     99.000000
0.9    105.000000
Name: fasting_blood_glucose_mg_dl, dtype: float64


Covariance:
                             test_salt_iodine         Age  Weight_in_kg  \
test_salt_iodine                    60.989824    2.675025      6.159933   
Age                                  2.675025  409.917846    187.935760   
Weight_in_kg                         6.159933  187.935760    270.603367   
Length_height_cm                     5.460779  230.444649    296.611117   
Haemoglobin_level                    0.058790    3.551623      5.968660   
BP_systolic                          2.810358   57.633451     34.664469   
BP_systolic_2_reading                2.289998   53.618638     32.587554   
BP_Diastolic                         1.497918   21.825823     26.335254   
BP_Diastolic_2reading  





Summary statistics for nominal attributes: 


Attribute Name: district_code
Frequency:
SHAHJAHANPUR                  7591
FIROZABAD                     7513
BALRAMPUR                     7237
SHRAWASTI                     7232
BULANDSHAHR                   7024
KUSHINAGAR                    6815
ETAH                          6678
KAUSHAMBI                     6677
VARANASI                      6279
GHAZIPUR                      6278
SONBHADRA                     6177
BUDAUN                        6158
CHANDAULI                     5830
MUZAFFARNAGAR                 5817
KANNAUJ                       5649
RAMPUR                        5620
AGRA                          5614
FAIZABAD                      5559
ALLAHABAD                     5462
MATHURA                       5290
PRATAPGARH                    5259
DEORIA                        5159
JYOTIBA PHULE NAGAR           5074
GAUTAM BUDDHA NAGAR           4986
SANT KABIR NAGAR              4932
MIRZAPUR                      4885

Year      283211
Months       262
Days          22
y              3
Name: Age_Code, dtype: int64
Mode:
0    Year
dtype: object


Attribute Name: Weight_measured
Frequency:
Measured    281102
Refused       1881
Other          515
Name: Weight_measured, dtype: int64
Mode:
0    Measured
dtype: object


Attribute Name: Length_height_measured
Frequency:
Measured    280752
Refused       2042
Other          704
Name: Length_height_measured, dtype: int64
Mode:
0    Measured
dtype: object


Attribute Name: length_height_code
Frequency:
Height    283227
Length       271
Name: length_height_code, dtype: int64
Mode:
0    Height
dtype: object


Attribute Name: Haemoglobin_test
Frequency:
YES    260991
NO      22502
3           3
4           2
Name: Haemoglobin_test, dtype: int64
Mode:
0    YES
dtype: object


Attribute Name: Haemoglobin
Frequency:
Measured    240235
Refused      42881
Other          382
Name: Haemoglobin, dtype: int64
Mode:
0    Measured
dtype: object


Attribute Name: Diabetes_tes

# Data Exploration - Part II: Data Visualisation

Method to construct a count plot for categorical attributes

In [28]:
def makeCountPlot(dataFrame, attr):
    """Show a bar chart with the number of rows in each category of `attr`.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Data to plot.
    attr : str
        Name of the categorical column placed on the x axis.
    """
    sns.set(font_scale = 1)
    figure, axes = plt.subplots(figsize=(12, 12))
    sns.countplot(x=attr, data=dataFrame, palette="Set2", ax=axes)
    axes.set_title('Count of each category of '+attr+" in data")
    margins = dict(left=0.05, right=0.94, top=0.99, bottom=0.08, hspace=0.17, wspace=0.05)
    plt.subplots_adjust(**margins)
    # Vertical tick labels keep long category names from overlapping
    plt.xticks(rotation=90)
    plt.show()

Method to construct a cat plot

In [29]:
def makeCatPlot(dataFrame, groupByAttr, hueByAttr, colByAttr):
    """Show a grid of count plots, one panel per value of `colByAttr`.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Data to plot.
    groupByAttr : str
        Column whose categories are placed on the x axis.
    hueByAttr : str
        Column used to colour (split) the bars inside each category.
    colByAttr : str
        Column whose values define the panels of the grid.
    """
    sns.set(font_scale = 1)
    # catplot is figure-level: the panel size is set by height/aspect, so no
    # separate figure-size tuple is needed (the old unused `dims` was removed).
    g=sns.catplot(x=groupByAttr, hue=hueByAttr, col=colByAttr, data=dataFrame, kind="count", sharey=False, height=12, aspect=1, palette="Set2")
    plt.subplots_adjust(left=0.05, right=0.94, top=0.99, bottom=0.08, hspace=0.17, wspace=0.05)
    # Vertical tick labels keep long category names from overlapping
    g.set_xticklabels(rotation=90)
    plt.show()

Method to construct a violin plot

In [30]:
def makeViolinPlot(df, categoricalAttr, ratioAttrList, hueByAttr, hueWanted):
    """Show one violin plot per ratio attribute, grouped by a categorical one.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    categoricalAttr : str
        Column whose categories are placed on the x axis.
    ratioAttrList : list of str
        Columns plotted on the y axis; one figure is shown for each.
    hueByAttr : str
        Column used to split each violin by colour. Only read when
        `hueWanted` is "Yes".
    hueWanted : str
        "Yes" to split the violins by `hueByAttr`; any other value draws
        plain violins without a legend.
    """
    sns.set(style="whitegrid", font_scale = 1)
    useHue = hueWanted == "Yes"
    for ratioAttr in ratioAttrList:
        # violinplot is axes-level: `height`/`aspect` are figure-level options
        # it does not implement, so the 12x12 size is set on an explicit figure.
        fig, ax = plt.subplots(figsize=(12, 12))
        sns.violinplot(
            x=df[categoricalAttr],
            y=df[ratioAttr],
            hue=df[hueByAttr] if useHue else None,
            inner="stick",
            palette="Set2",
            ax=ax,
        )
        if useHue:
            # A legend only exists when hue groups were drawn; asking for one
            # otherwise just triggers a "no handles" warning.
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.subplots_adjust(left=0.05, right=0.94, top=0.99, bottom=0.08, hspace=0.17, wspace=0.05)
        plt.xticks(rotation=90)
        plt.show()
        # Release the figure so looping over many attributes does not pile up memory
        plt.close(fig)

Method to construct a pair plot

In [31]:
def makePairPlot(dataFrame, hueByAttr):
    """Show a pair plot of `dataFrame`, coloured by `hueByAttr`.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Data to plot.
    hueByAttr : str
        Column used to colour the points.
    """
    sns.set(font_scale = 1)
    sns.pairplot(data=dataFrame, hue=hueByAttr)
    margins = dict(left=0.05, right=0.94, top=0.99, bottom=0.08, hspace=0.17, wspace=0.05)
    plt.subplots_adjust(**margins)
    # pyplot's xticks acts on the current axes only
    plt.xticks(rotation=90)
    plt.show()

Method to construct a parallel coordinates graph

In [32]:
def makeParallelCoordinatesPlot(dataFrame): 
    """Show a parallel-coordinates plot of `dataFrame`, classed by 'Sex'.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Data to plot; it must contain a 'Sex' column, which selects the line
        colour of each row.
    """
    sns.set(font_scale = 1)
    classColours = ('deepskyblue','crimson')
    parallel_coordinates(dataFrame, class_column='Sex', color=classColours)
    margins = dict(left=0.05, right=0.94, top=0.99, bottom=0.08, hspace=0.17, wspace=5)
    plt.subplots_adjust(**margins)
    plt.xticks(rotation=90)
    plt.show()

Method to perform data visualisation

In [36]:
def performDataVisualisation(filePath):
    """Read one CSV file and show every plot of the visual exploration.

    Plots are shown in this order: count plots, cat plot, pair plot,
    parallel-coordinates plot, violin plots with hue, violin plots without hue.

    Parameters
    ----------
    filePath : str
        Path of the CSV file to read.
    """
    fullFrame = pd.read_csv(filePath)

    # (display label, source column) pairs for the pair and parallel-coordinates plots
    labelledColumns = [
        ("Age", "Age"),
        ("Haemoglobin", "Haemoglobin_level"),
        ("Pulse rate", "Pulse_rate"),
        ('Iodine level', "test_salt_iodine"),
        ("Weight", "Weight_in_kg"),
        ("Height", "Length_height_cm"),
        ("Systolic BP", "BP_systolic"),
        ("Diastolic BP", "BP_Diastolic"),
        ("Glucose", "fasting_blood_glucose_mg_dl"),
        ('Sex', 'Sex'),
    ]
    labelledFrame = pd.DataFrame({label: fullFrame[source] for label, source in labelledColumns})

    # Measurements drawn on the y axis of the violin plots
    violinAttrs = ["test_salt_iodine", "Haemoglobin_level", "Weight_in_kg", "Length_height_cm", "BP_systolic", "BP_Diastolic", "Pulse_rate", "fasting_blood_glucose_mg_dl"]

    # Count plots: rows per district, then rows per rural/urban area
    for countedAttr in ('district_code', 'rural_urban'):
        makeCountPlot(fullFrame, countedAttr)

    # Cat plot: district-wise frequency of sex, one panel per rural/urban area
    makeCatPlot(fullFrame, 'district_code', 'Sex', 'rural_urban')

    # Pair plot coloured by sex
    makePairPlot(labelledFrame, "Sex")

    # Parallel coordinates graph
    makeParallelCoordinatesPlot(labelledFrame)

    # Violin plots, first with hue ("Yes") and then without ("No"). In each
    # pass: district-wise violins (hue attribute Sex), then sex-wise violins
    # (hue attribute district). The hue attribute is not drawn in the "No" pass.
    for hueFlag in ("Yes", "No"):
        makeViolinPlot(fullFrame, 'district_code', violinAttrs, 'Sex', hueFlag)
        makeViolinPlot(fullFrame, 'Sex', violinAttrs, 'district_code', hueFlag)

Main function for data visualisation

In [34]:
def runMainDataVisualisation():
    """Show the full set of plots for every state's cleaned data file.

    The file of a state is located at
    ``./../Data/cleanedData/cleanedmerged<State>.csv``.
    """
    # Pieces surrounding the state name in each data file path
    pathHead = "./../Data/cleanedData/cleanedmerged"
    pathTail = ".csv"

    # One round of plots per state, in this fixed order
    for stateName in ("Bihar", "Chhattisgarh", "Jharkhand", "MadhyaPradesh", "Odisha", "Uttarakhand", "UttarPradesh"):
        print("Data Visualisation for "+ stateName)
        stateFilePath = pathHead + stateName + pathTail
        performDataVisualisation(stateFilePath)

Runnable

In [44]:
# Run Part II: show the plots for every state.
runMainDataVisualisation()