# Chronic Disease Dataset Exploration
### Do 

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [3]:
# Load the dataset from chronic.csv; header=0 takes the column names from the first row.
# No dtype mapping is passed here (an earlier draft passed dtype=types_dict, but no
# `types_dict` is defined in the notebook as shown), so pandas infers each column's type.
# DataValue is converted explicitly with pd.to_numeric in later cells.
df = pd.read_csv('chronic.csv', header=0)

In [4]:
df.columns

Index(['YearStart', 'YearEnd', 'LocationAbbr', 'LocationDesc', 'DataSource',
       'Topic', 'Question', 'Response', 'DataValueUnit', 'DataValueType',
       'DataValue', 'DataValueAlt', 'DataValueFootnoteSymbol',
       'DatavalueFootnote', 'LowConfidenceLimit', 'HighConfidenceLimit',
       'StratificationCategory1', 'Stratification1', 'StratificationCategory2',
       'Stratification2', 'StratificationCategory3', 'Stratification3',
       'GeoLocation', 'ResponseID', 'LocationID', 'TopicID', 'QuestionID',
       'DataValueTypeID', 'StratificationCategoryID1', 'StratificationID1',
       'StratificationCategoryID2', 'StratificationID2',
       'StratificationCategoryID3', 'StratificationID3'],
      dtype='object')

In [5]:
# Share of missing values per column: the mean of a boolean mask is the fraction of True.
df.isna().mean()

YearStart                    0.000000
YearEnd                      0.000000
LocationAbbr                 0.000000
LocationDesc                 0.000000
DataSource                   0.000000
Topic                        0.000000
Question                     0.000000
Response                     0.847633
DataValueUnit                0.084030
DataValueType                0.000000
DataValue                    0.279294
DataValueAlt                 0.325913
DataValueFootnoteSymbol      0.562613
DatavalueFootnote            0.562932
LowConfidenceLimit           0.401479
HighConfidenceLimit          0.401479
StratificationCategory1      0.000000
Stratification1              0.000000
StratificationCategory2      0.847633
Stratification2              0.847633
StratificationCategory3      0.847633
Stratification3              0.847633
GeoLocation                  0.006933
ResponseID                   0.847656
LocationID                   0.000000
TopicID                      0.000000
QuestionID  

In [6]:
# Remove Response and the second-/third-level stratification fields (names and IDs).
# Those visible in the missing-value summary are about 85% empty.
clean_df = df.drop(
    columns=[
        "Response",
        "ResponseID",
        "StratificationCategory2",
        "Stratification2",
        "StratificationCategory3",
        "Stratification3",
        "StratificationCategoryID2",
        "StratificationID2",
        "StratificationID3",
        "StratificationCategoryID3",
    ]
)

In [7]:
clean_df.head(1)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,...,HighConfidenceLimit,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
0,2015,2015,US,United States,APIS,Alcohol,Amount of alcohol excise tax by beverage type ...,$,US Dollars,,...,,Overall,Overall,,59,ALC,ALC8_0_2,USD,OVERALL,OVR


In [77]:
clean_df['Stratification1'].unique()

array(['Overall', 'Multiracial, non-Hispanic', 'Black, non-Hispanic',
       'Female', 'White, non-Hispanic', 'Male',
       'American Indian or Alaska Native', 'Hispanic',
       'Other, non-Hispanic', 'Asian or Pacific Islander',
       'Asian, non-Hispanic'], dtype=object)

In [8]:
clean_df[clean_df['Topic'] == 'Diabetes'].Question.unique()

array(['Foot examination among adults aged >= 18 years with diagnosed diabetes',
       'Mortality due to diabetes reported as any listed cause of death',
       'Mortality with diabetic ketoacidosis reported as any listed cause of death',
       'Influenza vaccination among noninstitutionalized adults aged 18-64 years with diagnosed diabetes',
       'Diabetes prevalence among women aged 18-44 years',
       'Adults with diagnosed diabetes aged >= 18 years who have taken a diabetes self-management course',
       'Prevalence of diagnosed diabetes among adults aged >= 18 years',
       'Dilated eye examination among adults aged >= 18 years with diagnosed diabetes',
       'Prevalence of depressive disorders among adults aged >= 18 years with diagnosed diabetes',
       'Glycosylated hemoglobin measurement among adults aged >= 18 years with diagnosed diabetes',
       'Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with diagnosed diabetes',
       'Hospitali

In [9]:
clean_df.Topic.value_counts(normalize=True)

Diabetes                                           0.153220
Chronic Obstructive Pulmonary Disease              0.151484
Cardiovascular Disease                             0.145823
Arthritis                                          0.080361
Overarching Conditions                             0.075737
Asthma                                             0.075543
Nutrition, Physical Activity, and Weight Status    0.064691
Alcohol                                            0.061283
Tobacco                                            0.056388
Older Adults                                       0.029439
Cancer                                             0.027132
Chronic Kidney Disease                             0.023849
Oral Health                                        0.021429
Mental Health                                      0.013902
Immunization                                       0.010044
Disability                                         0.005711
Reproductive Health                     

In [10]:
# Keep only the rows whose Topic is 'Diabetes' and renumber the index from 0.
diabetes = clean_df.loc[clean_df['Topic'].eq('Diabetes')].reset_index(drop=True)

In [11]:
diabetes.Question.unique()

array(['Foot examination among adults aged >= 18 years with diagnosed diabetes',
       'Mortality due to diabetes reported as any listed cause of death',
       'Mortality with diabetic ketoacidosis reported as any listed cause of death',
       'Influenza vaccination among noninstitutionalized adults aged 18-64 years with diagnosed diabetes',
       'Diabetes prevalence among women aged 18-44 years',
       'Adults with diagnosed diabetes aged >= 18 years who have taken a diabetes self-management course',
       'Prevalence of diagnosed diabetes among adults aged >= 18 years',
       'Dilated eye examination among adults aged >= 18 years with diagnosed diabetes',
       'Prevalence of depressive disorders among adults aged >= 18 years with diagnosed diabetes',
       'Glycosylated hemoglobin measurement among adults aged >= 18 years with diagnosed diabetes',
       'Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with diagnosed diabetes',
       'Hospitali

In [64]:
def chloro(df, topic, question, dvtype, dvunit, year, strat):
    """Plot a choropleth map of US states for one slice of the data.

    The frame is narrowed step by step (topic -> question -> data value
    type -> year -> stratification -> unit). Each argument is validated
    against the rows that survived the previous steps, so a failed check
    lists only the values that can actually be plotted for the choices
    made so far.

    Keyword Arguments:
    df       -- cleaned dataframe     (pd.DataFrame)
    topic    -- Topic value           (str)
    question -- Question value        (str)
    dvtype   -- DataValueType value   (str)
    dvunit   -- DataValueUnit value   (str)
    year     -- YearStart value       (int)
    strat    -- Stratification1 value (str)

    Returns:
    None. The figure is displayed with fig.show().

    Raises:
    AssertionError -- an argument has the wrong type, or its value does
                      not occur in the rows remaining at that step.
    """

    # argument types
    assert isinstance(df, pd.DataFrame), "df must be a pandas DataFrame"
    for arg_name, arg_value in (
        ('topic', topic),
        ('question', question),
        ('dvtype', dvtype),
        ('dvunit', dvunit),
        ('strat', strat),
    ):
        assert isinstance(arg_value, str), arg_name + " must be a str"
    # Values read back from an integer column are numpy integers, so accept
    # those as well as plain int. bool is an int subclass and is rejected.
    assert isinstance(year, (int, np.integer)) and not isinstance(year, bool), "year must be an int"

    # specify topic
    topics = df['Topic'].unique()
    assert topic in topics, "Invalid topic, choose one: " + str(topics)
    topic_df = df[df['Topic'] == topic]

    # specify question
    questions = topic_df['Question'].unique()
    assert question in questions, "Invalid question, choose one: " + str(questions)
    question_df = topic_df[topic_df['Question'] == question].reset_index(drop=True)

    # convert datavalue to numeric; anything unparseable becomes NaN
    question_df = question_df.assign(
        DataValue=pd.to_numeric(question_df['DataValue'], errors='coerce')
    )

    # keep rows that have a value and are not the national total ('US'),
    # DC, PR or GU
    excluded_locations = ['US', 'DC', 'PR', 'GU']
    map_data = question_df[
        question_df['DataValue'].notna()
        & ~question_df['LocationAbbr'].isin(excluded_locations)
    ]

    # specify map data value type
    dvtypes = map_data['DataValueType'].unique()
    assert dvtype in dvtypes, "Invalid dvtype, choose one: " + str(dvtypes)
    map_data = map_data[map_data['DataValueType'] == dvtype]

    # specify map data year (only years available for the chosen dvtype)
    years = map_data['YearStart'].unique()
    assert year in years, "Invalid year, choose one: " + str(years)
    map_data = map_data[map_data['YearStart'] == year]

    # specify map data strat (only strats available for that dvtype and year)
    strats = map_data['Stratification1'].unique()
    assert strat in strats, "Invalid strat, choose one: " + str(strats)
    map_data = map_data[map_data['Stratification1'] == strat]

    # specify map data unit; filter on it so values in a different unit are
    # never drawn under the dvunit label
    units = map_data['DataValueUnit'].unique()
    assert dvunit in units, "Invalid dvunit, choose one: " + str(units)
    map_data = map_data[map_data['DataValueUnit'] == dvunit]

    # plot choropleth map
    fig = px.choropleth(
        locations=map_data['LocationAbbr'],
        locationmode="USA-states",
        color=map_data['DataValue'],
        scope="usa",
        title="Choropleth of " + question.title(),
        labels={'color': dvunit},
    )
    fig.show()

In [65]:
chloro(clean_df, 'Alcohol', 'Chronic liver disease mortality', 'Age-adjusted Rate', 'cases per 100,000', 2014, 'Black, non-Hispanic')

In [63]:
chloro(clean_df, 'Diabetes', 'Mortality due to diabetes reported as any listed cause of death', 'Age-adjusted Rate', 'cases per 100,000', 2010, 'Overall')

In [51]:
# chloro takes seven arguments; this call omitted dvunit and raised a TypeError.
# NOTE(review): '%' is assumed to be the DataValueUnit for 'Prevalence' rows - TODO confirm
# against clean_df['DataValueUnit'] (a wrong unit fails chloro's dvunit check, which lists the valid ones).
chloro(clean_df, 'Diabetes', 'Prevalence of gestational diabetes', 'Prevalence', '%', 2010, 'Overall')

In [197]:
# Restrict the diabetes rows to the single mortality question and make DataValue
# numeric; entries that cannot be parsed become NaN.
mortality = (
    diabetes
    .loc[diabetes['Question'] == "Mortality due to diabetes reported as any listed cause of death"]
    .reset_index(drop=True)
    .assign(DataValue=lambda frame: pd.to_numeric(frame['DataValue'], errors='coerce'))
)

# Keep rows that have a value and whose location is neither 'US' nor 'DC'.
clean_mortality = mortality[
    mortality['DataValue'].notna()
    & ~mortality['LocationAbbr'].isin(['US', 'DC'])
]

In [201]:
# Slice to plot: age-adjusted rates for 2011 in the 'Overall' stratification.
map_data = clean_mortality[
    (clean_mortality['DataValueType'] == 'Age-adjusted Rate')
    & (clean_mortality['YearStart'] == 2011)
    & (clean_mortality['Stratification1'] == 'Overall')
]

In [202]:
# Choropleth of the filtered mortality slice, one colour per state abbreviation.
# The title and colour-bar label describe the filters applied to map_data in the
# cell above (Age-adjusted Rate, 2011, Overall); update them if those filters change.
fig = px.choropleth(
    locations=map_data['LocationAbbr'],
    locationmode="USA-states",
    color=map_data['DataValue'],
    scope="usa",
    title="Age-adjusted diabetes mortality rate by state, 2011 (Overall)",
    labels={'color': 'cases per 100,000'},
)
fig.show()

In [76]:
# Narrow the diabetes rows in four steps: question, then year, then data value
# type, then the two stratifications to compare. Each step keeps its own name.
df1 = diabetes.loc[diabetes['Question'].eq("Mortality due to diabetes reported as any listed cause of death")]
df2 = df1.loc[df1['YearStart'].eq(2010)]
df3 = df2.loc[df2['DataValueType'].eq('Age-adjusted Rate')]
df4 = df3.loc[df3['Stratification1'].isin(['Overall', 'White, non-Hispanic'])]

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,...,HighConfidenceLimit,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
485,2010,2010,ID,Idaho,NVSS,Diabetes,Mortality due to diabetes reported as any list...,"cases per 100,000",Age-adjusted Rate,69.3,...,73.5,Overall,Overall,"(43.682630005, -114.363730042)",16,DIA,DIA1_1,AGEADJRATE,OVERALL,OVR
639,2010,2010,MN,Minnesota,NVSS,Diabetes,Mortality due to diabetes reported as any list...,"cases per 100,000",Age-adjusted Rate,69.9,...,72.1,Overall,Overall,"(46.355648736, -94.794200503)",27,DIA,DIA1_1,AGEADJRATE,OVERALL,OVR
2106,2010,2010,KS,Kansas,NVSS,Diabetes,Mortality due to diabetes reported as any list...,"cases per 100,000",Age-adjusted Rate,61.5,...,64.3,Overall,Overall,"(38.3477403, -98.200781227)",20,DIA,DIA1_1,AGEADJRATE,OVERALL,OVR
2260,2010,2010,MS,Mississippi,NVSS,Diabetes,Mortality due to diabetes reported as any list...,"cases per 100,000",Age-adjusted Rate,102.6,...,106.2,Overall,Overall,"(32.745510099, -89.538030825)",28,DIA,DIA1_1,AGEADJRATE,OVERALL,OVR
2331,2010,2010,NV,Nevada,NVSS,Diabetes,Mortality due to diabetes reported as any list...,"cases per 100,000",Age-adjusted Rate,37.5,...,40.1,Overall,Overall,"(39.49324039, -117.071840564)",32,DIA,DIA1_1,AGEADJRATE,OVERALL,OVR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76461,2010,2010,NE,Nebraska,NVSS,Diabetes,Mortality due to diabetes reported as any list...,"cases per 100,000",Age-adjusted Rate,78.1,...,82.0,Race/Ethnicity,"White, non-Hispanic","(41.641040988001, -99.365720623)",31,DIA,DIA1_1,AGEADJRATE,RACE,WHT
76668,2010,2010,KY,Kentucky,NVSS,Diabetes,Mortality due to diabetes reported as any list...,"cases per 100,000",Age-adjusted Rate,92.4,...,95.2,Overall,Overall,"(37.645970271, -84.774971048)",21,DIA,DIA1_1,AGEADJRATE,OVERALL,OVR
76746,2010,2010,TX,Texas,NVSS,Diabetes,Mortality due to diabetes reported as any list...,"cases per 100,000",Age-adjusted Rate,76.3,...,77.7,Overall,Overall,"(31.827240407, -99.426770206)",48,DIA,DIA1_1,AGEADJRATE,OVERALL,OVR
76837,2010,2010,AZ,Arizona,NVSS,Diabetes,Mortality due to diabetes reported as any list...,"cases per 100,000",Age-adjusted Rate,45.9,...,47.6,Overall,Overall,"(34.86597028, -111.763811277)",4,DIA,DIA1_1,AGEADJRATE,OVERALL,OVR
