# Part 1: Extract
Extract the data from two collections in a Mongo Database.


In [1]:
#Import the dataset with mongoimport --type csv -d project_two -c npao --headerline --drop Nutrition__Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System.csv
    
    
#Import the dataset with mongoimport --type csv -d project_two -c nchs --headerline --drop NCHS_-_Death_rates_and_life_expectancy_at_birth.csv


In [2]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint
import pandas as pd

In [3]:
# Create an instance of MongoClient
mongo = MongoClient(port=27017)

# assign the met database to a variable name
db = mongo['project_two']

In [4]:
# review the collections in our new database
print(db.list_collection_names())

['nchs', 'npao']


In [5]:
# assign the collections to a variable
nchs = db['nchs']

npao = db['npao']

In [6]:
# Retrieve and print the data in nchs
for document in nchs.find():
    print(document)

{'_id': ObjectId('64f16338a441037430f7b1bf'), 'Year': 1902, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average Life Expectancy (Years)': 51.5, 'Age-adjusted Death Rate': 2301.3}
{'_id': ObjectId('64f16338a441037430f7b1c0'), 'Year': 1900, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average Life Expectancy (Years)': 47.3, 'Age-adjusted Death Rate': 2518.0}
{'_id': ObjectId('64f16338a441037430f7b1c1'), 'Year': 1908, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average Life Expectancy (Years)': 51.1, 'Age-adjusted Death Rate': 2298.9}
{'_id': ObjectId('64f16338a441037430f7b1c2'), 'Year': 1910, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average Life Expectancy (Years)': 50.0, 'Age-adjusted Death Rate': 2317.2}
{'_id': ObjectId('64f16338a441037430f7b1c3'), 'Year': 1909, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average Life Expectancy (Years)': 52.1, 'Age-adjusted Death Rate': 2249.2}
{'_id': ObjectId('64f16338a441037430f7b1c4'), 'Year': 1911, 'Race': 'All Races', 'Sex': 'Both Sexes', 'Average

In [7]:
sample_document = nchs.find_one()
sample_document

{'_id': ObjectId('64f16338a441037430f7b1bf'),
 'Year': 1902,
 'Race': 'All Races',
 'Sex': 'Both Sexes',
 'Average Life Expectancy (Years)': 51.5,
 'Age-adjusted Death Rate': 2301.3}

In [8]:
# Retrieve and print the data in npao
for document in npao.find():
    print(document)

{'_id': ObjectId('64f163a6182b38295adad2d9'), 'YearStart': 2020, 'YearEnd': 2020, 'LocationAbbr': 'US', 'LocationDesc': 'National', 'Datasource': 'Behavioral Risk Factor Surveillance System', 'Class': 'Physical Activity', 'Topic': 'Physical Activity - Behavior', 'Question': 'Percent of adults who engage in no leisure-time physical activity', 'Data_Value_Unit': '', 'Data_Value_Type': 'Value', 'Data_Value': 30.6, 'Data_Value_Alt': 30.6, 'Data_Value_Footnote_Symbol': '', 'Data_Value_Footnote': '', 'Low_Confidence_Limit': 29.4, 'High_Confidence_Limit': 31.8, 'Sample_Size': 31255, 'Total': '', 'Age(years)': '', 'Education': '', 'Gender': '', 'Income': '', 'Race/Ethnicity': 'Hispanic', 'GeoLocation': '', 'ClassID': 'PA', 'TopicID': 'PA1', 'QuestionID': 'Q047', 'DataValueTypeID': 'VALUE', 'LocationID': 59, 'StratificationCategory1': 'Race/Ethnicity', 'Stratification1': 'Hispanic', 'StratificationCategoryId1': 'RACE', 'StratificationID1': 'RACEHIS'}
{'_id': ObjectId('64f163a6182b38295adad2da')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Part 2: Transform the Data


In [9]:
pipeline = [
    {
        "$match": {
            "Race": "All Races"
        }
    },
    {
        "$group": {
            "_id": "$Year",
            "max_life_expectancy": {"$max": "$Average Life Expectancy (Years)"}
        }
    },
    {
        "$sort": {
            "max_life_expectancy": -1  # Sort in descending order (highest first)
        }
    },
    {
        "$limit": 1
    }
]

# Execute the aggregation pipeline
result = list(nchs.aggregate(pipeline))

# Check if any result was found
if result:
    highest_life_expectancy_year = result[0]["_id"]
    highest_life_expectancy_value = result[0]["max_life_expectancy"]
    print(f"The year with the highest life expectancy for 'All Races' is {highest_life_expectancy_year} with an average life expectancy of {highest_life_expectancy_value} years.")

The year with the highest life expectancy for 'All Races' is 2014 with an average life expectancy of 81.3 years.


In [10]:
# Define the field to c81.3 years.heck for empty values
field_to_check = "Average Life Expectancy (Years)"

# Define the aggregation pipeline to filter out documents with empty data
pipeline = [
    {
        "$match": {
            field_to_check: {"$exists": True, "$ne": None, "$ne": ""}
        }
    },
    {
        "$match": {
            "Race": "White"
        }
    },
    {
        "$group": {
            "_id": "$Year",
            "max_life_expectancy": {"$max": "$Average Life Expectancy (Years)"}
        }
    },
    {
        "$sort": {
            "max_life_expectancy": -1  # Sort in descending order (highest first)
        }
    },
    {
        "$limit": 1
    }
]

# Execute the aggregation pipeline
result = list(nchs.aggregate(pipeline))

# Check if any result was found
if result:
    highest_life_expectancy_year = result[0]["_id"]
    highest_life_expectancy_value = result[0]["max_life_expectancy"]
    print(f"The year with the highest life expectancy for 'White' is {highest_life_expectancy_year} with an average life expectancy of {highest_life_expectancy_value} years.")

The year with the highest life expectancy for 'White' is 2013 with an average life expectancy of 81.4 years.


In [11]:
# Define the aggregation pipeline to filter out documents with empty data
pipeline = [
    {
        "$match": {
            field_to_check: {"$exists": True, "$ne": None, "$ne": ""}
        }
    },
    {
        "$match": {
            "Race": "Black"
        }
    },
    {
        "$group": {
            "_id": "$Year",
            "max_life_expectancy": {"$max": "$Average Life Expectancy (Years)"}
        }
    },
    {
        "$sort": {
            "max_life_expectancy": -1  # Sort in descending order (highest first)
        }
    },
    {
        "$limit": 1
    }
]

# Execute the aggregation pipeline
result = list(nchs.aggregate(pipeline))

# Check if any result was found
if result:
    highest_life_expectancy_year = result[0]["_id"]
    highest_life_expectancy_value = result[0]["max_life_expectancy"]
    print(f"The year with the highest life expectancy for 'Black' is {highest_life_expectancy_year} with an average life expectancy of {highest_life_expectancy_value} years.")

The year with the highest life expectancy for 'Black' is 2017 with an average life expectancy of 78.5 years.


In [12]:
#Create dicionary to store questions
question_dict = {}


for document in npao.find({}, {"QuestionID": 1, "Question": 1, "_id": 0}):
    question_id = document.get('QuestionID')
    question = document.get('Question')
    
    # Check if the QuestionID is already in the dictionary
    if question_id not in question_dict:
        question_dict[question_id] = question


print(question_dict)

{'Q047': 'Percent of adults who engage in no leisure-time physical activity', 'Q036': 'Percent of adults aged 18 years and older who have obesity', 'Q037': 'Percent of adults aged 18 years and older who have an overweight classification', 'Q043': 'Percent of adults who achieve at least 150 minutes a week of moderate-intensity aerobic physical activity or 75 minutes a week of vigorous-intensity aerobic activity (or an equivalent combination)', 'Q045': 'Percent of adults who achieve at least 300 minutes a week of moderate-intensity aerobic physical activity or 150 minutes a week of vigorous-intensity aerobic activity (or an equivalent combination)', 'Q046': 'Percent of adults who engage in muscle-strengthening activities on 2 or more days a week', 'Q018': 'Percent of adults who report consuming fruit less than one time daily', 'Q044': 'Percent of adults who achieve at least 150 minutes a week of moderate-intensity aerobic physical activity or 75 minutes a week of vigorous-intensity aerob

In [13]:
#convert dictironary into df

question_dict = {
    'QuestionID': ['Q047', 'Q036', 'Q037', 'Q045', 'Q044', 'Q043', 'Q046', 'Q018', 'Q019'],
    'Question': [
        'Percent of adults who engage in no leisure-time physical activity',
        'Percent of adults aged 18 years and older who have obesity',
        'Percent of adults aged 18 years and older who have an overweight classification',
        'Percent of adults who achieve at least 300 minutes a week of moderate-intensity aerobic physical activity or 150 minutes a week of vigorous-intensity aerobic activity (or an equivalent combination)',
        'Percent of adults who achieve at least 150 minutes a week of moderate-intensity aerobic physical activity or 75 minutes a week of vigorous-intensity aerobic physical activity and engage in muscle-strengthening activities on 2 or more days a week',
        'Percent of adults who achieve at least 150 minutes a week of moderate-intensity aerobic physical activity or 75 minutes a week of vigorous-intensity aerobic activity (or an equivalent combination)',
        'Percent of adults who engage in muscle-strengthening activities on 2 or more days a week',
        'Percent of adults who report consuming fruit less than one time daily',
        'Percent of adults who report consuming vegetables less than one time daily'
    ]
}

# Create a DataFrame from the dictionary
questions_df = pd.DataFrame(question_dict)


# Display the DataFrame
questions_df

Unnamed: 0,QuestionID,Question
0,Q047,Percent of adults who engage in no leisure-tim...
1,Q036,Percent of adults aged 18 years and older who ...
2,Q037,Percent of adults aged 18 years and older who ...
3,Q045,Percent of adults who achieve at least 300 min...
4,Q044,Percent of adults who achieve at least 150 min...
5,Q043,Percent of adults who achieve at least 150 min...
6,Q046,Percent of adults who engage in muscle-strengt...
7,Q018,Percent of adults who report consuming fruit l...
8,Q019,Percent of adults who report consuming vegetab...


In [14]:
column_names = npao.find_one().keys()

# Print or manipulate the column names as needed
print(column_names)

dict_keys(['_id', 'YearStart', 'YearEnd', 'LocationAbbr', 'LocationDesc', 'Datasource', 'Class', 'Topic', 'Question', 'Data_Value_Unit', 'Data_Value_Type', 'Data_Value', 'Data_Value_Alt', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size', 'Total', 'Age(years)', 'Education', 'Gender', 'Income', 'Race/Ethnicity', 'GeoLocation', 'ClassID', 'TopicID', 'QuestionID', 'DataValueTypeID', 'LocationID', 'StratificationCategory1', 'Stratification1', 'StratificationCategoryId1', 'StratificationID1'])


In [15]:
data_list = []

# Iterate through the MongoDB cursor and append each document to the list
for document in npao.find():
    data_list.append(document)

# Convert the list of documents into a pandas DataFrame
npao_df = pd.DataFrame(data_list)

In [16]:
npao_df

Unnamed: 0,_id,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,64f163a6182b38295adad2d9,2020,2020,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,,PA,PA1,Q047,VALUE,59,Race/Ethnicity,Hispanic,RACE,RACEHIS
1,64f163a6182b38295adad2da,2015,2015,PR,Puerto Rico,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(18.220833, -66.590149)",PA,PA1,Q047,VALUE,72,Income,"$25,000 - $34,999",INC,INC2535
2,64f163a6182b38295adad2db,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,"(32.840571122, -86.631860762)",OWS,OWS1,Q036,VALUE,1,Age (years),25 - 34,AGEYR,AGEYR2534
3,64f163a6182b38295adad2dc,2015,2015,GU,Guam,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(13.444304, 144.793731)",PA,PA1,Q047,VALUE,66,Education,High school graduate,EDU,EDUHSGRAD
4,64f163a6182b38295adad2dd,2015,2015,RI,Rhode Island,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,"(41.708280193, -71.522470314)",OWS,OWS1,Q037,VALUE,44,Race/Ethnicity,Hispanic,RACE,RACEHIS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88624,64f163aa182b38295adc2d09,2021,2021,ND,North Dakota,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming vegetab...,,...,"(47.47531977900047, -100.11842104899966)",FV,FV1,Q019,VALUE,38,Gender,Male,GEN,MALE
88625,64f163aa182b38295adc2d0a,2021,2021,PR,Puerto Rico,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(18.220833, -66.590149)",PA,PA1,Q047,VALUE,72,Income,"$35,000 - $49,999",INC,INC3550
88626,64f163aa182b38295adc2d0b,2021,2021,UT,Utah,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming fruit l...,,...,"(39.360700171000474, -111.58713063499971)",FV,FV1,Q018,VALUE,49,Income,"$50,000 - $74,999",INC,INC5075
88627,64f163aa182b38295adc2d0c,2021,2021,WI,Wisconsin,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(44.39319117400049, -89.81637074199966)",PA,PA1,Q047,VALUE,55,Income,"$75,000 or greater",INC,INC75PLUS


In [17]:
#Review data type on 'YearEnd'
for x in npao_df['YearEnd']:
    
    print(type(x))

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class

In [18]:
#Filter all the data after 2010
npao_byyear= npao_df.query("YearEnd > 2010")
npao_byyear

Unnamed: 0,_id,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,64f163a6182b38295adad2d9,2020,2020,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,,PA,PA1,Q047,VALUE,59,Race/Ethnicity,Hispanic,RACE,RACEHIS
1,64f163a6182b38295adad2da,2015,2015,PR,Puerto Rico,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(18.220833, -66.590149)",PA,PA1,Q047,VALUE,72,Income,"$25,000 - $34,999",INC,INC2535
2,64f163a6182b38295adad2db,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,"(32.840571122, -86.631860762)",OWS,OWS1,Q036,VALUE,1,Age (years),25 - 34,AGEYR,AGEYR2534
3,64f163a6182b38295adad2dc,2015,2015,GU,Guam,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(13.444304, 144.793731)",PA,PA1,Q047,VALUE,66,Education,High school graduate,EDU,EDUHSGRAD
4,64f163a6182b38295adad2dd,2015,2015,RI,Rhode Island,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,"(41.708280193, -71.522470314)",OWS,OWS1,Q037,VALUE,44,Race/Ethnicity,Hispanic,RACE,RACEHIS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88624,64f163aa182b38295adc2d09,2021,2021,ND,North Dakota,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming vegetab...,,...,"(47.47531977900047, -100.11842104899966)",FV,FV1,Q019,VALUE,38,Gender,Male,GEN,MALE
88625,64f163aa182b38295adc2d0a,2021,2021,PR,Puerto Rico,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(18.220833, -66.590149)",PA,PA1,Q047,VALUE,72,Income,"$35,000 - $49,999",INC,INC3550
88626,64f163aa182b38295adc2d0b,2021,2021,UT,Utah,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming fruit l...,,...,"(39.360700171000474, -111.58713063499971)",FV,FV1,Q018,VALUE,49,Income,"$50,000 - $74,999",INC,INC5075
88627,64f163aa182b38295adc2d0c,2021,2021,WI,Wisconsin,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(44.39319117400049, -89.81637074199966)",PA,PA1,Q047,VALUE,55,Income,"$75,000 or greater",INC,INC75PLUS


In [19]:
dropped_years = npao_byyear[npao_byyear['Age(years)'] != '']

dropped_years

Unnamed: 0,_id,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
2,64f163a6182b38295adad2db,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,"(32.840571122, -86.631860762)",OWS,OWS1,Q036,VALUE,1,Age (years),25 - 34,AGEYR,AGEYR2534
17,64f163a6182b38295adad2ea,2015,2015,PR,Puerto Rico,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,"(18.220833, -66.590149)",OWS,OWS1,Q037,VALUE,72,Age (years),55 - 64,AGEYR,AGEYR5564
18,64f163a6182b38295adad2eb,2015,2015,GU,Guam,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,"(13.444304, 144.793731)",OWS,OWS1,Q036,VALUE,66,Age (years),55 - 64,AGEYR,AGEYR5564
25,64f163a6182b38295adad2f2,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,,PA,PA1,Q047,VALUE,59,Age (years),18 - 24,AGEYR,AGEYR1824
29,64f163a6182b38295adad2f6,2015,2015,PR,Puerto Rico,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(18.220833, -66.590149)",PA,PA1,Q047,VALUE,72,Age (years),45 - 54,AGEYR,AGEYR4554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88609,64f163aa182b38295adc2cfa,2021,2021,US,National,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,,OWS,OWS1,Q037,VALUE,59,Age (years),45 - 54,AGEYR,AGEYR4554
88610,64f163aa182b38295adc2cfb,2021,2021,LA,Louisiana,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,"(31.31266064400046, -92.44568007099969)",OWS,OWS1,Q037,VALUE,22,Age (years),55 - 64,AGEYR,AGEYR5564
88617,64f163aa182b38295adc2d02,2021,2021,SC,South Carolina,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming fruit l...,,...,"(33.998821303000454, -81.04537120699968)",FV,FV1,Q018,VALUE,45,Age (years),35 - 44,AGEYR,AGEYR3544
88621,64f163aa182b38295adc2d06,2021,2021,HI,Hawaii,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming fruit l...,,...,"(21.304850435000446, -157.85774940299973)",FV,FV1,Q018,VALUE,15,Age (years),18 - 24,AGEYR,AGEYR1824


In [20]:
#Additional inquiries with both datasets:
# Print out the unique locations in NPAO collection
print(f'The number of unique locations: {len(dropped_years["LocationAbbr"].unique())}')

dropped_years["LocationAbbr"].unique()

#This data set includes only data from the United States(including Guam and Puerto Rico).

The number of unique locations: 55


array(['AL', 'PR', 'GU', 'US', 'VA', 'WA', 'SD', 'WY', 'ID', 'VI', 'DC',
       'MN', 'CO', 'KS', 'SC', 'PA', 'NE', 'TX', 'IL', 'NH', 'MI', 'NM',
       'UT', 'MD', 'MS', 'AZ', 'IN', 'NY', 'HI', 'FL', 'OH', 'IA', 'KY',
       'MO', 'CT', 'NC', 'WI', 'MT', 'RI', 'NJ', 'NV', 'MA', 'AR', 'ND',
       'ME', 'AK', 'DE', 'VT', 'WV', 'GA', 'OR', 'TN', 'OK', 'LA', 'CA'],
      dtype=object)

In [21]:
#Checking the "Data_Value" column for data type 

for x in dropped_years['Data_Value']:
    
    print(type(x))

<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class '

In [22]:
#Dropping 'Data_value' that has 0 value (No sure if it happened)

dropped_years['Data_Value'] = pd.to_numeric(dropped_years['Data_Value'], errors='coerce')

dropped_years.dropna(subset=['Data_Value'], inplace = True)

dropped_years['Data_Value'].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dropped_years['Data_Value'] = pd.to_numeric(dropped_years['Data_Value'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dropped_years.dropna(subset=['Data_Value'], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dropped_years['Data_Value'].fillna(0, inplace=True)


In [23]:
dropped_years
    
dropped_years.to_csv(' dropped_years.csv', index=False)

In [24]:
grouped_data =dropped_years.groupby('YearEnd')['Data_Value'].mean()

grouped_data

YearEnd
2011    31.568178
2012    28.415491
2013    31.745993
2014    28.834877
2015    32.024780
2016    29.187475
2017    31.046228
2018    29.679527
2019    32.682739
2020    29.631790
2021    30.609321
Name: Data_Value, dtype: float64

In [25]:
#Find the results by 'QuestionID'
results2= dropped_years.groupby(['QuestionID','YearEnd','Age(years)'])['Data_Value'].mean()
print(results2)


QuestionID  YearEnd  Age(years) 
Q018        2017     18 - 24        40.638889
                     25 - 34        37.672222
                     35 - 44        36.716667
                     45 - 54        39.318519
                     55 - 64        37.603704
                                      ...    
Q047        2021     25 - 34        20.005556
                     35 - 44        20.664815
                     45 - 54        23.846296
                     55 - 64        27.492593
                     65 or older    31.590741
Name: Data_Value, Length: 354, dtype: float64


In [26]:
year_results=  results2.reset_index()
year_results

Unnamed: 0,QuestionID,YearEnd,Age(years),Data_Value
0,Q018,2017,18 - 24,40.638889
1,Q018,2017,25 - 34,37.672222
2,Q018,2017,35 - 44,36.716667
3,Q018,2017,45 - 54,39.318519
4,Q018,2017,55 - 64,37.603704
...,...,...,...,...
349,Q047,2021,25 - 34,20.005556
350,Q047,2021,35 - 44,20.664815
351,Q047,2021,45 - 54,23.846296
352,Q047,2021,55 - 64,27.492593


In [27]:
#Join DataFrames

combined_df=year_results.merge(questions_df,on='QuestionID',how='left')

#Rename 'YearEnd' to 'Year':
combined_df = combined_df.rename(columns={'YearEnd': 'Year'})


combined_df

Unnamed: 0,QuestionID,Year,Age(years),Data_Value,Question
0,Q018,2017,18 - 24,40.638889,Percent of adults who report consuming fruit l...
1,Q018,2017,25 - 34,37.672222,Percent of adults who report consuming fruit l...
2,Q018,2017,35 - 44,36.716667,Percent of adults who report consuming fruit l...
3,Q018,2017,45 - 54,39.318519,Percent of adults who report consuming fruit l...
4,Q018,2017,55 - 64,37.603704,Percent of adults who report consuming fruit l...
...,...,...,...,...,...
349,Q047,2021,25 - 34,20.005556,Percent of adults who engage in no leisure-tim...
350,Q047,2021,35 - 44,20.664815,Percent of adults who engage in no leisure-tim...
351,Q047,2021,45 - 54,23.846296,Percent of adults who engage in no leisure-tim...
352,Q047,2021,55 - 64,27.492593,Percent of adults who engage in no leisure-tim...


In [28]:
#Switching over to the NCHS Dataset
#Narrow the NCHS data to only include data from the year 2010 or later.
# Query
query = {'Year': {'$gte': 2010}}
fields = {'Race': 1, 'Year': 1, 'Sex': 1, 'Average Life Expectancy (Years)': 1, 'Age-adjusted Death Rate': 1}
sort = [('Year', -1)]
#limit = 5

# Cast the results as a list and save them to a variable
results = list(nchs.find(query, fields).sort(sort))
#results = list(nchs.find(query, fields).sort(sort).limit(limit))

# Pretty print the results
pprint(results)

[{'Age-adjusted Death Rate': 723.6,
  'Average Life Expectancy (Years)': 78.7,
  'Race': 'All Races',
  'Sex': 'Both Sexes',
  'Year': 2018,
  '_id': ObjectId('64f16338a441037430f7b234')},
 {'Age-adjusted Death Rate': 611.3,
  'Average Life Expectancy (Years)': 81.2,
  'Race': 'All Races',
  'Sex': 'Female',
  'Year': 2018,
  '_id': ObjectId('64f16338a441037430f7b2b5')},
 {'Age-adjusted Death Rate': 855.5,
  'Average Life Expectancy (Years)': 76.2,
  'Race': 'All Races',
  'Sex': 'Male',
  'Year': 2018,
  '_id': ObjectId('64f16338a441037430f7b31d')},
 {'Age-adjusted Death Rate': 852.9,
  'Average Life Expectancy (Years)': '',
  'Race': 'Black',
  'Sex': 'Both Sexes',
  'Year': 2018,
  '_id': ObjectId('64f16338a441037430f7b399')},
 {'Age-adjusted Death Rate': 702.6,
  'Average Life Expectancy (Years)': '',
  'Race': 'Black',
  'Sex': 'Female',
  'Year': 2018,
  '_id': ObjectId('64f16338a441037430f7b432')},
 {'Age-adjusted Death Rate': 1051.5,
  'Average Life Expectancy (Years)': '',
  '

In [29]:
#Load the 2010 or later data into a Dataframe
aggregated_df = pd.json_normalize(results)
aggregated_df


Unnamed: 0,_id,Year,Race,Sex,Average Life Expectancy (Years),Age-adjusted Death Rate
0,64f16338a441037430f7b234,2018,All Races,Both Sexes,78.7,723.6
1,64f16338a441037430f7b2b5,2018,All Races,Female,81.2,611.3
2,64f16338a441037430f7b31d,2018,All Races,Male,76.2,855.5
3,64f16338a441037430f7b399,2018,Black,Both Sexes,,852.9
4,64f16338a441037430f7b432,2018,Black,Female,,702.6
...,...,...,...,...,...,...
76,64f16338a441037430f7b403,2010,Black,Female,78.0,752.5
77,64f16338a441037430f7b479,2010,Black,Male,71.8,1104.0
78,64f16338a441037430f7b4f1,2010,White,Both Sexes,78.9,741.8
79,64f16338a441037430f7b56e,2010,White,Female,81.3,630.8


In [116]:
#Drop rows with empty value spaces.
dropped_aggregated_nchs = aggregated_df[aggregated_df['Average Life Expectancy (Years)'] != '']

dropped_aggregated_nchs

Unnamed: 0,_id,Year,Race,Sex,Average Life Expectancy (Years),Age-adjusted Death Rate
0,64f16338a441037430f7b234,2018,All Races,Both Sexes,78.7,723.6
1,64f16338a441037430f7b2b5,2018,All Races,Female,81.2,611.3
2,64f16338a441037430f7b31d,2018,All Races,Male,76.2,855.5
9,64f16338a441037430f7b233,2017,All Races,Both Sexes,78.6,731.9
10,64f16338a441037430f7b2b1,2017,All Races,Female,81.1,619.7
...,...,...,...,...,...,...
76,64f16338a441037430f7b403,2010,Black,Female,78.0,752.5
77,64f16338a441037430f7b479,2010,Black,Male,71.8,1104.0
78,64f16338a441037430f7b4f1,2010,White,Both Sexes,78.9,741.8
79,64f16338a441037430f7b56e,2010,White,Female,81.3,630.8


In [31]:
#npao_mergeable_df = pd.json_normalize(results)
#npao_mergeable_df = npao_mergeable_df.drop("_id", axis='columns')
#npao_mergeable_df = npao_mergeable_df.groupby('Question')

#npao_mergeable_df = npao_mergeable_df.groupby(['YearEnd'])['Data_Value'].mean()
#npao_mergeable_df = npao_mergeable_df.rename(columns={'YearEnd': 'Year'})

#npao_mergeable_df = npao_mergeable_df.reset_index()
#npao_df
#npao_mergeable_df

In [32]:
#Create a dataframe from NCHS Collection. Include only "All Races" and "Both Sexes"
#dropped_aggregated = dropped_aggregated.drop("_id", axis='columns')
#dropped_aggregated = dropped_aggregated[dropped_aggregated.columns.drop(list(dropped_aggregated.filter(regex='Black')))]

# create a Boolean mask for the rows to remove
mask = dropped_aggregated_nchs['Race'] == 'All Races'
mask_2 = dropped_aggregated_nchs['Sex'] == 'Both Sexes'

# include only the rows that contain 'All Races' and 'Both Sexes'
dropped_aggregated_nchs = dropped_aggregated_nchs[mask]
nchs_df_allgroups = dropped_aggregated_nchs[mask_2]

# print the resulting DataFrame

nchs_df_allgroups

  nchs_df_allgroups = dropped_aggregated_nchs[mask_2]


Unnamed: 0,_id,Year,Race,Sex,Average Life Expectancy (Years),Age-adjusted Death Rate
0,64f16338a441037430f7b234,2018,All Races,Both Sexes,78.7,723.6
9,64f16338a441037430f7b233,2017,All Races,Both Sexes,78.6,731.9
18,64f16338a441037430f7b232,2016,All Races,Both Sexes,78.7,728.8
27,64f16338a441037430f7b231,2015,All Races,Both Sexes,78.7,733.1
36,64f16338a441037430f7b230,2014,All Races,Both Sexes,78.9,724.6
45,64f16338a441037430f7b22f,2013,All Races,Both Sexes,78.8,731.9
54,64f16338a441037430f7b22e,2012,All Races,Both Sexes,78.8,732.8
63,64f16338a441037430f7b240,2011,All Races,Both Sexes,78.7,741.3
72,64f16338a441037430f7b22d,2010,All Races,Both Sexes,78.7,747.0


In [33]:
#This data frame includes data from all races and sexes:

simple_compared_df = pd.merge(nchs_df_allgroups, combined_df, on='Year', how='inner')

#Drop the "ID" column, since it doesn't have any utility in analysis.

simple_compared_df = simple_compared_df.drop("_id", axis='columns')

simple_compared_df

Unnamed: 0,Year,Race,Sex,Average Life Expectancy (Years),Age-adjusted Death Rate,QuestionID,Age(years),Data_Value,Question
0,2018,All Races,Both Sexes,78.7,723.6,Q036,18 - 24,18.594444,Percent of adults aged 18 years and older who ...
1,2018,All Races,Both Sexes,78.7,723.6,Q036,25 - 34,30.350000,Percent of adults aged 18 years and older who ...
2,2018,All Races,Both Sexes,78.7,723.6,Q036,35 - 44,35.077778,Percent of adults aged 18 years and older who ...
3,2018,All Races,Both Sexes,78.7,723.6,Q036,45 - 54,37.701852,Percent of adults aged 18 years and older who ...
4,2018,All Races,Both Sexes,78.7,723.6,Q036,55 - 64,35.327778,Percent of adults aged 18 years and older who ...
...,...,...,...,...,...,...,...,...,...
247,2011,All Races,Both Sexes,78.7,741.3,Q047,25 - 34,22.048077,Percent of adults who engage in no leisure-tim...
248,2011,All Races,Both Sexes,78.7,741.3,Q047,35 - 44,24.169231,Percent of adults who engage in no leisure-tim...
249,2011,All Races,Both Sexes,78.7,741.3,Q047,45 - 54,26.625000,Percent of adults who engage in no leisure-tim...
250,2011,All Races,Both Sexes,78.7,741.3,Q047,55 - 64,28.721154,Percent of adults who engage in no leisure-tim...


In [98]:
dropped_ethnicities = npao_byyear[npao_byyear['Race/Ethnicity'] != '']

dropped_ethnicities

Unnamed: 0,_id,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,64f163a6182b38295adad2d9,2020,2020,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,,PA,PA1,Q047,VALUE,59,Race/Ethnicity,Hispanic,RACE,RACEHIS
4,64f163a6182b38295adad2dd,2015,2015,RI,Rhode Island,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,"(41.708280193, -71.522470314)",OWS,OWS1,Q037,VALUE,44,Race/Ethnicity,Hispanic,RACE,RACEHIS
5,64f163a6182b38295adad2de,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,,OWS,OWS1,Q036,VALUE,59,Race/Ethnicity,American Indian/Alaska Native,RACE,RACENAA
7,64f163a6182b38295adad2e0,2020,2020,DE,Delaware,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(39.008830667000495, -75.57774116799965)",PA,PA1,Q047,VALUE,10,Race/Ethnicity,Asian,RACE,RACEASN
8,64f163a6182b38295adad2e1,2015,2015,PR,Puerto Rico,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who achieve at least 150 min...,,...,"(18.220833, -66.590149)",PA,PA1,Q043,VALUE,72,Race/Ethnicity,Non-Hispanic White,RACE,RACEWHT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88605,64f163aa182b38295adc2cf6,2021,2021,ID,Idaho,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming fruit l...,,...,"(43.682630005000476, -114.3637300419997)",FV,FV1,Q018,VALUE,16,Race/Ethnicity,Hispanic,RACE,RACEHIS
88606,64f163aa182b38295adc2cf7,2021,2021,WV,West Virginia,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,...,"(38.66551020200046, -80.71264013499967)",OWS,OWS1,Q036,VALUE,54,Race/Ethnicity,American Indian/Alaska Native,RACE,RACENAA
88614,64f163aa182b38295adc2cff,2021,2021,AL,Alabama,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming vegetab...,,...,"(32.84057112200048, -86.63186076199969)",FV,FV1,Q019,VALUE,1,Race/Ethnicity,Non-Hispanic White,RACE,RACEWHT
88616,64f163aa182b38295adc2d01,2021,2021,NJ,New Jersey,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,...,"(40.13057004800049, -74.27369128799967)",PA,PA1,Q047,VALUE,34,Race/Ethnicity,Hispanic,RACE,RACEHIS


In [101]:
dropped_ethnicities['Data_Value'] = pd.to_numeric(dropped_ethnicities['Data_Value'], errors='coerce')

dropped_ethnicities.dropna(subset=['Data_Value'], inplace = True)

dropped_ethnicities['Data_Value'].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dropped_ethnicities['Data_Value'] = pd.to_numeric(dropped_ethnicities['Data_Value'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dropped_ethnicities.dropna(subset=['Data_Value'], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dropped_ethnicities['Data_Value'].fillna(0, inplace=True)


In [102]:
results3= dropped_ethnicities.groupby(['QuestionID','YearEnd','Race/Ethnicity'])['Data_Value'].mean()
print(results3)

QuestionID  YearEnd  Race/Ethnicity               
Q018        2017     2 or more races                  38.690385
                     American Indian/Alaska Native    39.770000
                     Asian                            32.527778
                     Hawaiian/Pacific Islander        39.633333
                     Hispanic                         32.833333
                                                        ...    
Q047        2021     Hawaiian/Pacific Islander        24.150000
                     Hispanic                         29.133962
                     Non-Hispanic Black               28.014894
                     Non-Hispanic White               21.701887
                     Other                            26.626923
Name: Data_Value, Length: 472, dtype: float64


In [103]:
ethnicity_results=  results3.reset_index()
ethnicity_results

Unnamed: 0,QuestionID,YearEnd,Race/Ethnicity,Data_Value
0,Q018,2017,2 or more races,38.690385
1,Q018,2017,American Indian/Alaska Native,39.770000
2,Q018,2017,Asian,32.527778
3,Q018,2017,Hawaiian/Pacific Islander,39.633333
4,Q018,2017,Hispanic,32.833333
...,...,...,...,...
467,Q047,2021,Hawaiian/Pacific Islander,24.150000
468,Q047,2021,Hispanic,29.133962
469,Q047,2021,Non-Hispanic Black,28.014894
470,Q047,2021,Non-Hispanic White,21.701887


In [104]:
#Join DataFrames

combined_df2=ethnicity_results.merge(questions_df,on='QuestionID',how='left')

#Rename 'YearEnd' to 'Year':
combined_df2 = combined_df2.rename(columns={'YearEnd': 'Year'})


combined_df2

Unnamed: 0,QuestionID,Year,Race/Ethnicity,Data_Value,Question
0,Q018,2017,2 or more races,38.690385,Percent of adults who report consuming fruit l...
1,Q018,2017,American Indian/Alaska Native,39.770000,Percent of adults who report consuming fruit l...
2,Q018,2017,Asian,32.527778,Percent of adults who report consuming fruit l...
3,Q018,2017,Hawaiian/Pacific Islander,39.633333,Percent of adults who report consuming fruit l...
4,Q018,2017,Hispanic,32.833333,Percent of adults who report consuming fruit l...
...,...,...,...,...,...
467,Q047,2021,Hawaiian/Pacific Islander,24.150000,Percent of adults who engage in no leisure-tim...
468,Q047,2021,Hispanic,29.133962,Percent of adults who engage in no leisure-tim...
469,Q047,2021,Non-Hispanic Black,28.014894,Percent of adults who engage in no leisure-tim...
470,Q047,2021,Non-Hispanic White,21.701887,Percent of adults who engage in no leisure-tim...


In [105]:
#Find the number of names of ethnic categories in the NPAO dataset:

print(f'The number of unique ethnic groups in the NPAO Dataset: {len(combined_df2["Race/Ethnicity"].unique())}')

combined_df2["Race/Ethnicity"].unique()

The number of unique ethnic groups in the NPAO Dataset: 8


array(['2 or more races', 'American Indian/Alaska Native', 'Asian',
       'Hawaiian/Pacific Islander', 'Hispanic', 'Non-Hispanic Black',
       'Non-Hispanic White', 'Other'], dtype=object)

In [131]:
#Find the number of names of ethnic categories in the NCHS dataset:

print(f'The number of unique ethnic groups in the NCHS Dataset: {len(dropped_aggregated_nchs["Race"].unique())}')

dropped_aggregated_nchs["Race"].unique()

The number of unique ethnic groups in the NCHS Dataset: 3


array(['All Races', 'Black', 'White'], dtype=object)

In [106]:
#Replace cells in NPAO dataset to narrow the number of ethnic groups, so the data is comparable with the NCHS dataset:

#Replace ethnic groups to match groups in NCHS data:
combined_df2.replace(['Non-Hispanic White'], 'White', inplace=True)
combined_df2.replace(['Non-Hispanic Black'], 'Black', inplace=True)

#dropped_races.replace(to_replace="Non-Hispanic White",
#           value="White")
#dropped_races.replace(to_replace="Non-Hispanic Black",
#           value="Black")

combined_df2.head(10)

Unnamed: 0,QuestionID,Year,Race/Ethnicity,Data_Value,Question
0,Q018,2017,2 or more races,38.690385,Percent of adults who report consuming fruit l...
1,Q018,2017,American Indian/Alaska Native,39.77,Percent of adults who report consuming fruit l...
2,Q018,2017,Asian,32.527778,Percent of adults who report consuming fruit l...
3,Q018,2017,Hawaiian/Pacific Islander,39.633333,Percent of adults who report consuming fruit l...
4,Q018,2017,Hispanic,32.833333,Percent of adults who report consuming fruit l...
5,Q018,2017,Black,40.639535,Percent of adults who report consuming fruit l...
6,Q018,2017,White,36.269811,Percent of adults who report consuming fruit l...
7,Q018,2017,Other,27.383333,Percent of adults who report consuming fruit l...
8,Q018,2019,2 or more races,40.034694,Percent of adults who report consuming fruit l...
9,Q018,2019,American Indian/Alaska Native,41.859375,Percent of adults who report consuming fruit l...


In [108]:
#Include only rows with "White" and "Black" listed as Race. This will make a table able to be joined with NCHS data.

white_black_df = combined_df2.loc[(combined_df2['Race/Ethnicity'] == 'White') | (combined_df2['Race/Ethnicity'] == 'Black')]

#Rename 'Race/Ethnicity' to 'Race'
white_black_df = white_black_df.rename(columns={'Race/Ethnicity': 'Race'})

white_black_df.head()

Unnamed: 0,QuestionID,Year,Race,Data_Value,Question
5,Q018,2017,Black,40.639535,Percent of adults who report consuming fruit l...
6,Q018,2017,White,36.269811,Percent of adults who report consuming fruit l...
13,Q018,2019,Black,43.772093,Percent of adults who report consuming fruit l...
14,Q018,2019,White,39.484615,Percent of adults who report consuming fruit l...
21,Q018,2021,Black,40.048889,Percent of adults who report consuming fruit l...


In [123]:
#Back to the NCHS Data, drop columns we're not worried about, such as 'ID':
df4 = dropped_aggregated_nchs.drop("_id", axis='columns')
df5 = df4.drop("Sex", axis='columns')

In [124]:
race_compared_df = pd.merge(df5, white_black_df, on=['Year', 'Race'], how='outer')
race_compared_df

Unnamed: 0,Year,Race,Average Life Expectancy (Years),Age-adjusted Death Rate,QuestionID,Data_Value,Question
0,2018,All Races,78.7,723.6,,,
1,2018,All Races,81.2,611.3,,,
2,2018,All Races,76.2,855.5,,,
3,2017,All Races,78.6,731.9,,,
4,2017,All Races,81.1,619.7,,,
...,...,...,...,...,...,...,...
302,2020,Black,,,Q037,31.709302,Percent of adults aged 18 years and older who ...
303,2020,Black,,,Q047,27.970455,Percent of adults who engage in no leisure-tim...
304,2020,White,,,Q036,30.307547,Percent of adults aged 18 years and older who ...
305,2020,White,,,Q037,35.403774,Percent of adults aged 18 years and older who ...


In [127]:
race_compared_df = race_compared_df.dropna()

In [128]:
race_compared_df

Unnamed: 0,Year,Race,Average Life Expectancy (Years),Age-adjusted Death Rate,QuestionID,Data_Value,Question
6,2017,Black,75.3,854.1,Q018,40.639535,Percent of adults who report consuming fruit l...
7,2017,Black,75.3,854.1,Q019,25.709302,Percent of adults who report consuming vegetab...
8,2017,Black,75.3,854.1,Q036,37.774419,Percent of adults aged 18 years and older who ...
9,2017,Black,75.3,854.1,Q037,33.718605,Percent of adults aged 18 years and older who ...
10,2017,Black,75.3,854.1,Q043,43.825581,Percent of adults who achieve at least 150 min...
...,...,...,...,...,...,...,...
253,2011,White,76.6,870.2,Q043,53.463462,Percent of adults who achieve at least 150 min...
254,2011,White,76.6,870.2,Q044,20.567308,Percent of adults who achieve at least 150 min...
255,2011,White,76.6,870.2,Q045,33.153846,Percent of adults who achieve at least 300 min...
256,2011,White,76.6,870.2,Q046,28.957692,Percent of adults who engage in muscle-strengt...


# Part 3: Load
Export key dataframes as CSV files.

In [38]:
combined_df.to_csv('combined_results.csv', index=False)

In [39]:
simple_compared_df.to_csv('compared_results_all_sexes_all_ages.csv', index=False)

In [129]:
race_compared_df.to_csv('compared_results_by_race.csv', index=False)