# Steps

SMART BRFSS Data Processing
- columns preprocessing
- select topic with on 'depression'
- deal with nan values for answer (1 - no%) >  yes, if applicable
- check and merge with the metropolitandivisioncode/cbsacode, maximize the match numbers


[Behavioral Risk Factors: Selected Metropolitan Area Risk Trends (SMART) MMSA Prevalence Data (2011 to Present)](https://data.cdc.gov/Behavioral-Risk-Factors/Behavioral-Risk-Factors-Selected-Metropolitan-Area/j32a-sa6u/about_data)

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
data_dir = "..\\data\\"

In [3]:
smart = pd.read_csv(f"{data_dir}Behavioral_Risk_Factors__Selected_Metropolitan_Area_Risk_Trends__SMART__MMSA_Prevalence_Data__2011_to_Present__20241004.csv")

In [4]:
smart.Topic.unique()

array(['Disability status', 'Hearing', 'Physical Activity Index',
       'HIV Test', 'Tetanus Shot', 'Cardiovascular Disease',
       'Overall Health', 'Health Care Coverage', 'Seatbelt Use',
       'Cholesterol Checked', 'Healthy Days', 'COPD', 'Asthma',
       'Veteran Status', 'PSA Test', 'Last Checkup', 'Diabetes',
       'Skin Cancer', 'Other Cancer', 'BMI Categories', 'Smoker Status',
       'Teeth Removed', 'All Teeth Removed', 'Flu Shot', 'Exercise',
       'Vegetable Consumption', 'Alcohol Consumption',
       'Fair or Poor Health', 'Kidney', 'Health Care Cost',
       'Current Smoker Status', 'Pneumonia Vaccination', 'Arthritis',
       'Blood Stool Test', 'Mammogram', 'Smokeless Tobacco', 'Depression',
       'High Blood Pressure', 'Vision', 'Binge Drinking',
       'Drink and Drive', 'USPSTF Recommendations', 'Heavy Drinking',
       'Personal Care Provider', 'Dental Visit', 'Under 65 Coverage',
       'Pap Test', 'Aerobic Activity', 'Cholesterol High',
       'Fruit Consum

## Data Preprocessing

In [5]:
# filter out depression topic only
smart_depr = smart[smart.Topic == 'Depression']
smart_depr.head(2)

Unnamed: 0,Year,Locationabbr,Locationdesc,Class,Topic,Question,Response,Break_Out,Break_Out_Category,Sample_Size,Data_value,Confidence_limit_Low,Confidence_limit_High,Display_order,Data_value_unit,Data_value_type,Data_Value_Footnote_Symbol,Data_Value_Footnote,DataSource,ClassId,TopicId,LocationID,BreakoutID,BreakOutCategoryID,QuestionID,RESPONSEID,GeoLocation
200,2011,15804,"Camden, NJ Metropolitan Division",Chronic Health Indicators,Depression,Ever told you that you have a form of depression?,Yes,Overall,Overall,253,12.52,10.34,14.7,50,%,Crude Prevalence,,,BRFSS,CLASS03,Topic17,15804,BO1,CAT1,ADDEPEV2,RESP046,"(39.8098807, -74.8232086)"
240,2020,41700,"San Antonio-New Braunfels, TX Metropolitan Sta...",Chronic Health Indicators,Depression,Ever told you that you have a form of depression?,No,Overall,Overall,516,81.42,77.17,85.67,10,%,Crude Prevalence,,,BRFSS,CLASS03,TOPIC17,41700,BO1,CAT1,ADDEPEV3,RESP054,"(29.4330549, -98.6069656)"


In [6]:
smart_depr.shape

(3432, 27)

In [7]:
smart_depr.groupby('Year')['Locationdesc'].nunique()


Year
2011    198
2012    187
2013    145
2014    132
2015    130
2016    143
2017    136
2018    134
2019    136
2020    118
2021    126
2022    131
Name: Locationdesc, dtype: int64

In [8]:
smart_depr.groupby('Year')['Locationdesc'].nunique()

Year
2011    198
2012    187
2013    145
2014    132
2015    130
2016    143
2017    136
2018    134
2019    136
2020    118
2021    126
2022    131
Name: Locationdesc, dtype: int64

In [9]:
len(smart_depr.Locationdesc.unique())

270

In [10]:
smart_depr.groupby('Year').size()

Year
2011    396
2012    374
2013    290
2014    264
2015    260
2016    286
2017    272
2018    268
2019    272
2020    236
2021    252
2022    262
dtype: int64

In [11]:
# check unique values of each column
for col, values in {col: smart_depr[col].unique() for col in smart_depr.columns}.items():
    print(f"Unique values in column '{col}': {values}")

Unique values in column 'Year': [2011 2020 2019 2012 2018 2013 2014 2015 2016 2017 2021 2022]
Unique values in column 'Locationabbr': [15804 41700 41420 40860 23104 45060 10100 16300 13740 31080 42644 38900
 43580 36740 36540 10420 29180 17820 24260 33100 12940 22220 36420 41180
 16980 30700 48620 46140 16620 48864 44060 39300 41980 37964 25060 26580
 36260 41060 28140 15380 19740 38060 45780 20260 26420 17460 33340 41620
 13820 19780 18580 17140 31140 35380 33460 12260 29620 35840 39580 24860
 18140 32820 26820 40380 16740 22020 11260 19124 26900 23540 28700 25180
 24340 16700 15540 10740 36084 39340 33874 15764 17900 40900 19430 37860
 39900 12060 40484 23060 40060 38860 27260 14454 30100 41540 10580 25540
 39660 40140 31740 23224 38300 27140 40340 34820 28940 14260 12420 30780
 35004 19660 34980 33660 12580 35614 13900 25720 16860 33500 10380 35740
 13140 30860 39100 35820 31540 47894 45300 49340 41940 45820 43620 45220
 46220 49660 47664 44140 47940 47980 47260 21340 19340 35084 35

In [12]:
def process_columns(df, selected_columns_to_keep):
    """
    This function counts unique values for each column in the dataframe (ignoring case for strings),
    memorizes the columns with only 1 unique value (case insensitive), excludes the selected columns,
    and prints the remaining columns with their original unique value(s).
    
    Parameters:
    - df: DataFrame to process
    - selected_columns_to_keep: list of column names to exclude from the final results
    
    Returns:
    - A list of columns that have only 1 unique value (case insensitive) and were not selected to keep.
    """
    
    columns_with_one_unique_value = []

    for col in df.columns:
        if df[col].dtype == 'object':
            unique_values_lower = df[col].str.lower().unique()
            unique_values_original = df[col].unique()
        else:
            unique_values_lower = unique_values_original = df[col].unique()

        if len(unique_values_lower) == 1:
            columns_with_one_unique_value.append((col, unique_values_original))

    columns_with_one_unique_value = [
        (col, values) for col, values in columns_with_one_unique_value if col not in selected_columns_to_keep
    ]

    for col, unique_values in columns_with_one_unique_value:
        print(f"Column '{col}' has only 1 unique value(s): {list(unique_values)}")

    return [col for col, _ in columns_with_one_unique_value]

In [13]:
columns_to_drop = process_columns(smart_depr, ['Data_value_unit', 'Data_value_type', 'Data_Value_Footnote'])
smart_depr_selected = smart_depr.drop(columns=columns_to_drop)
smart_depr_selected.head(3)

Column 'Class' has only 1 unique value(s): ['Chronic Health Indicators']
Column 'Topic' has only 1 unique value(s): ['Depression']
Column 'Question' has only 1 unique value(s): ['Ever told you that you have a form of depression?']
Column 'Break_Out' has only 1 unique value(s): ['Overall']
Column 'Break_Out_Category' has only 1 unique value(s): ['Overall']
Column 'DataSource' has only 1 unique value(s): ['BRFSS']
Column 'ClassId' has only 1 unique value(s): ['CLASS03']
Column 'TopicId' has only 1 unique value(s): ['Topic17', 'TOPIC17']
Column 'BreakoutID' has only 1 unique value(s): ['BO1']
Column 'BreakOutCategoryID' has only 1 unique value(s): ['CAT1']


Unnamed: 0,Year,Locationabbr,Locationdesc,Response,Sample_Size,Data_value,Confidence_limit_Low,Confidence_limit_High,Display_order,Data_value_unit,Data_value_type,Data_Value_Footnote_Symbol,Data_Value_Footnote,LocationID,QuestionID,RESPONSEID,GeoLocation
200,2011,15804,"Camden, NJ Metropolitan Division",Yes,253,12.52,10.34,14.7,50,%,Crude Prevalence,,,15804,ADDEPEV2,RESP046,"(39.8098807, -74.8232086)"
240,2020,41700,"San Antonio-New Braunfels, TX Metropolitan Sta...",No,516,81.42,77.17,85.67,10,%,Crude Prevalence,,,41700,ADDEPEV3,RESP054,"(29.4330549, -98.6069656)"
256,2011,41420,"Salem, OR Metropolitan Statistical Area",No,448,75.99,71.46,80.52,50,%,Crude Prevalence,,,41420,ADDEPEV2,RESP054,"(44.9033791, -122.9017427)"


### Only Select Yes Answers for Depression Topic

In [14]:
# Iterate through the rows where Response is 'Yes' and Data_value is NaN
for index, row in smart_depr_selected[smart_depr_selected['Response'] == 'Yes'].iterrows():
    location = row['Locationdesc']
    year = row['Year']
    
    # Find the corresponding 'No' response for the same location and year
    no_value = smart_depr_selected[(smart_depr_selected['Locationdesc'] == location) & 
                                   (smart_depr_selected['Year'] == year) & 
                                   (smart_depr_selected['Response'] == 'No')]['Data_value'].values
    
    # If a 'No' response exists and Yes is NaN, replace Yes with 100 - No
    if len(no_value) > 0 and pd.isna(row['Data_value']):
        smart_depr_selected.at[index, 'Data_value'] = 100 - no_value[0]

smart_depr_selected_yes = smart_depr_selected[smart_depr_selected.Response == 'Yes']
print('smart_depr shape:', smart_depr.shape)
print('smart_depr_selected shape:', smart_depr_selected.shape)
print('smart_depr_selected_yes shape:', smart_depr_selected_yes.shape)
print('smart_depr_selected_yes max year:', smart_depr_selected_yes .Year.max(), 'smart_depr_selected_yes min year:', smart_depr_selected_yes .Year.min())

smart_depr shape: (3432, 27)
smart_depr_selected shape: (3432, 17)
smart_depr_selected_yes shape: (1716, 17)
smart_depr_selected_yes max year: 2022 smart_depr_selected_yes min year: 2011


In [15]:
smart_depr_selected_yes.nlargest(5, 'Data_value')

Unnamed: 0,Year,Locationabbr,Locationdesc,Response,Sample_Size,Data_value,Confidence_limit_Low,Confidence_limit_High,Display_order,Data_value_unit,Data_value_type,Data_Value_Footnote_Symbol,Data_Value_Footnote,LocationID,QuestionID,RESPONSEID,GeoLocation
188495,2022,28700,"Kingsport-Bristol-Bristol, TN-VA Metropolitan ...",Yes,183,36.85,31.19,42.51,2,%,Crude Prevalence,,,28700,ADDEPEV3,RESP046,"(36.6041558, -82.4401118)"
184666,2022,16860,"Chattanooga, TN-GA Metropolitan Statistical Area",Yes,176,33.28,28.3,38.26,2,%,Crude Prevalence,,,16860,ADDEPEV3,RESP046,"(35.0489417, -85.3611582)"
188615,2022,28940,"Knoxville, TN Metropolitan Statistical Area",Yes,188,32.23,27.6,36.86,2,%,Crude Prevalence,,,28940,ADDEPEV3,RESP046,"(35.9294445, -84.0154928)"
14017,2019,13740,"Billings, MT Metropolitan Statistical Area",Yes,238,31.04,27.41,34.67,2,%,Crude Prevalence,,,13740,ADDEPEV3,RESP046,"(45.6371402, -108.8323611)"
4719,2019,28700,"Kingsport-Bristol-Bristol, TN-VA Metropolitan ...",Yes,168,30.62,25.82,35.42,2,%,Crude Prevalence,,,28700,ADDEPEV3,RESP046,"(36.6041611, -82.4401446)"


In [16]:
smart_depr_selected_yes.nsmallest(5, 'Data_value')

Unnamed: 0,Year,Locationabbr,Locationdesc,Response,Sample_Size,Data_value,Confidence_limit_Low,Confidence_limit_High,Display_order,Data_value_unit,Data_value_type,Data_Value_Footnote_Symbol,Data_Value_Footnote,LocationID,QuestionID,RESPONSEID,GeoLocation
79101,2013,41940,"San Jose-Sunnyvale-Santa Clara, CA Metropolita...",Yes,66,7.77,5.52,10.02,49,%,Crude Prevalence,,,41940,ADDEPEV2,RESP046,"(36.9084669, -121.3713591)"
139943,2016,35004,"Nassau County-Suffolk County, NY Metropolitan ...",Yes,101,8.6,6.31,10.89,1,%,Crude Prevalence,,,35004,ADDEPEV2,RESP046,"(40.9057199, -72.8348299)"
58436,2011,42044,"Santa Ana-Anaheim-Irvine, CA Metropolitan Divi...",Yes,121,9.09,6.82,11.36,50,%,Crude Prevalence,,,42044,ADDEPEV2,RESP046,"(33.6756824, -117.7771947)"
76858,2012,11244,"Los Angeles-Long Beach-Anaheim, CA Metropolita...",Yes,113,9.2,6.93,11.47,41,%,Crude Prevalence,,,11244,ADDEPEV2,RESP046,"(34.065193, -118.244222)"
56755,2011,26180,"Honolulu, HI Metropolitan Statistical Area",Yes,389,9.58,8.21,10.95,50,%,Crude Prevalence,,,26180,ADDEPEV2,RESP046,"(27.606, -105.718)"


## Merge with fips

SMART merge with cbsa code: [cbsa-csa-fips-county-crosswalk](https://data.nber.org/cbsa-csa-fips-county-crosswalk/cbsa2fipsxw.csv)

In [17]:
smart_depr_selected_yes.head(3)

Unnamed: 0,Year,Locationabbr,Locationdesc,Response,Sample_Size,Data_value,Confidence_limit_Low,Confidence_limit_High,Display_order,Data_value_unit,Data_value_type,Data_Value_Footnote_Symbol,Data_Value_Footnote,LocationID,QuestionID,RESPONSEID,GeoLocation
200,2011,15804,"Camden, NJ Metropolitan Division",Yes,253,12.52,10.34,14.7,50,%,Crude Prevalence,,,15804,ADDEPEV2,RESP046,"(39.8098807, -74.8232086)"
283,2011,40860,"Rutland, VT Micropolitan Statistical Area",Yes,153,21.61,17.44,25.78,49,%,Crude Prevalence,,,40860,ADDEPEV2,RESP046,"(43.5808351, -73.0381951)"
328,2020,23104,"Fort Worth-Arlington, TX Metropolitan Division",Yes,120,18.22,14.61,21.83,2,%,Crude Prevalence,,,23104,ADDEPEV3,RESP046,"(32.8073735, -97.537101)"


In [18]:
cbsa = pd.read_csv(f"{data_dir}cbsa2fipsxw.csv")
cbsa.head(5)

Unnamed: 0,cbsacode,metropolitandivisioncode,csacode,cbsatitle,metropolitanmicropolitanstatis,metropolitandivisiontitle,csatitle,countycountyequivalent,statename,fipsstatecode,fipscountycode,centraloutlyingcounty
0,33860,,388.0,"Montgomery, AL",Metropolitan Statistical Area,,"Montgomery-Selma, AL",Autauga County,Alabama,1,1,Central
1,19300,,380.0,"Daphne-Fairhope-Foley, AL",Metropolitan Statistical Area,,"Mobile-Daphne-Fairhope, AL",Baldwin County,Alabama,1,3,Central
2,21640,,,"Eufaula, AL-GA",Micropolitan Statistical Area,,,Barbour County,Alabama,1,5,Central
3,13820,,142.0,"Birmingham, AL",Metropolitan Statistical Area,,"Birmingham-Cullman-Talladega, AL",Bibb County,Alabama,1,7,Outlying
4,13820,,142.0,"Birmingham, AL",Metropolitan Statistical Area,,"Birmingham-Cullman-Talladega, AL",Blount County,Alabama,1,9,Outlying


### Deal with NaN in metropolitandivisioncode

In [19]:
smart_depr_selected_yes.Locationdesc.nunique(), smart_depr_selected_yes.Locationabbr.nunique(), cbsa.shape

(270, 268, (1915, 12))

In [20]:
cbsa['metropolitandivisioncode'].isna().sum()

1776

In [21]:
cbsa['metropolitandivisioncode'] = cbsa['metropolitandivisioncode'].fillna(cbsa['cbsacode']) # brutal force
cbsa.head(3) 

Unnamed: 0,cbsacode,metropolitandivisioncode,csacode,cbsatitle,metropolitanmicropolitanstatis,metropolitandivisiontitle,csatitle,countycountyequivalent,statename,fipsstatecode,fipscountycode,centraloutlyingcounty
0,33860,33860.0,388.0,"Montgomery, AL",Metropolitan Statistical Area,,"Montgomery-Selma, AL",Autauga County,Alabama,1,1,Central
1,19300,19300.0,380.0,"Daphne-Fairhope-Foley, AL",Metropolitan Statistical Area,,"Mobile-Daphne-Fairhope, AL",Baldwin County,Alabama,1,3,Central
2,21640,21640.0,,"Eufaula, AL-GA",Micropolitan Statistical Area,,,Barbour County,Alabama,1,5,Central


In [22]:

cbsa = cbsa[["cbsacode", "metropolitandivisioncode","csacode", "cbsatitle", "metropolitanmicropolitanstatis"]]

In [23]:
smart_depr_selected_yes.shape, cbsa.shape

((1716, 17), (1915, 5))

In [24]:
smart_depr_selected_yes.Locationabbr.dtypes

dtype('int64')

In [25]:
inner_merge = pd.merge(
    smart_depr_selected_yes, cbsa,
    left_on='Locationabbr', right_on='metropolitandivisioncode',
    how='inner'
)

unmatched_rows = smart_depr_selected_yes[
    ~smart_depr_selected_yes['Locationabbr'].isin(inner_merge['Locationabbr'])
]

print("Unmatched rows after primary inner merge:")
print(len(unmatched_rows))

if not unmatched_rows.empty:
    alternative_merge = pd.merge(
        unmatched_rows, cbsa,
        left_on='Locationabbr', right_on='cbsacode', # alternative column # brutal merge
        how='inner'
    )

    final_result = pd.concat([inner_merge, alternative_merge]).drop_duplicates()
else:
    final_result = inner_merge


still_unmatched = smart_depr_selected_yes[
    ~smart_depr_selected_yes['Locationabbr'].isin(final_result['Locationabbr'])
]

print("Still-unmatched rows after both merges:")
print(len(still_unmatched))


Unmatched rows after primary inner merge:
131
Still-unmatched rows after both merges:
74


In [26]:
final_result

Unnamed: 0,Year,Locationabbr,Locationdesc,Response,Sample_Size,Data_value,Confidence_limit_Low,Confidence_limit_High,Display_order,Data_value_unit,Data_value_type,Data_Value_Footnote_Symbol,Data_Value_Footnote,LocationID,QuestionID,RESPONSEID,GeoLocation,cbsacode,metropolitandivisioncode,csacode,cbsatitle,metropolitanmicropolitanstatis
0,2011,15804,"Camden, NJ Metropolitan Division",Yes,253,12.52,10.34,14.70,50,%,Crude Prevalence,,,15804,ADDEPEV2,RESP046,"(39.8098807, -74.8232086)",37980,15804.0,428.0,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",Metropolitan Statistical Area
3,2020,15804,"Camden, NJ Metropolitan Division",Yes,338,18.15,15.82,20.48,2,%,Crude Prevalence,,,15804,ADDEPEV3,RESP046,"(39.8097724, -74.8232021)",37980,15804.0,428.0,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",Metropolitan Statistical Area
6,2013,15804,"Camden, NJ Metropolitan Division",Yes,346,17.91,15.58,20.24,49,%,Crude Prevalence,,,15804,ADDEPEV2,RESP046,"(39.8098807, -74.8232086)",37980,15804.0,428.0,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",Metropolitan Statistical Area
9,2012,15804,"Camden, NJ Metropolitan Division",Yes,365,16.88,14.67,19.09,41,%,Crude Prevalence,,,15804,ADDEPEV2,RESP046,"(39.8098807, -74.8232086)",37980,15804.0,428.0,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",Metropolitan Statistical Area
12,2014,15804,"Camden, NJ Metropolitan Division",Yes,271,14.63,12.30,16.96,42,%,Crude Prevalence,,,15804,ADDEPEV2,RESP046,"(39.8098807, -74.8232086)",37980,15804.0,428.0,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",Metropolitan Statistical Area
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,2022,45300,"Tampa-St. Petersburg-Clearwater, FL Metropolit...",Yes,203,20.28,16.71,23.85,2,%,Crude Prevalence,,,45300,ADDEPEV3,RESP046,"(28.1259019, -82.4652824)",45300,45294.0,,"Tampa-St. Petersburg-Clearwater, FL",Metropolitan Statistical Area
600,2022,45300,"Tampa-St. Petersburg-Clearwater, FL Metropolit...",Yes,203,20.28,16.71,23.85,2,%,Crude Prevalence,,,45300,ADDEPEV3,RESP046,"(28.1259019, -82.4652824)",45300,41304.0,,"Tampa-St. Petersburg-Clearwater, FL",Metropolitan Statistical Area
601,2011,41860,"San Francisco-Oakland-Fremont, CA Metropolitan...",Yes,300,12.38,10.42,14.34,50,%,Crude Prevalence,,,41860,ADDEPEV2,RESP046,"(37.7737135, -122.2744185)",41860,36084.0,488.0,"San Francisco-Oakland-Fremont, CA",Metropolitan Statistical Area
603,2011,41860,"San Francisco-Oakland-Fremont, CA Metropolitan...",Yes,300,12.38,10.42,14.34,50,%,Crude Prevalence,,,41860,ADDEPEV2,RESP046,"(37.7737135, -122.2744185)",41860,42034.0,488.0,"San Francisco-Oakland-Fremont, CA",Metropolitan Statistical Area


# Save the Final CSV

In [27]:
final_result.to_csv(f"{data_dir}20241025smart_with_cbsa.csv", index=False)

# Additional url might be helpful:

https://www2.census.gov/programs-surveys/cbp/technical-documentation/reference/metro-area-geography-reference/msa_county_reference22.txt