# Physical Activity Analysis Across Different Age Groups in the US
  - Your analysis here

In [3]:
# Import dependencies (standard library first, then third-party).
from pathlib import Path

# pyplot is the plotting interface conventionally aliased as `plt`;
# importing the top-level `matplotlib` package under that alias would make
# calls such as plt.plot()/plt.subplots() fail with AttributeError.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc

# Load the dataset from the CSV file (path is relative to the working directory).
file = Path('data/Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System.csv')
data_df = pd.read_csv(file)

# Preview the first rows of the dataframe.
data_df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.840571122, -86.631860762)",OWS,OWS1,Q036,VALUE,1,Age (years),25 - 34,AGEYR,AGEYR2534
1,2015,2015,GU,Guam,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(13.444304, 144.793731)",OWS,OWS1,Q036,VALUE,66,Age (years),55 - 64,AGEYR,AGEYR5564
2,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Age (years),18 - 24,AGEYR,AGEYR1824
3,2015,2015,PR,Puerto Rico,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(18.220833, -66.590149)",OWS,OWS1,Q037,VALUE,72,Age (years),55 - 64,AGEYR,AGEYR5564
4,2015,2015,PR,Puerto Rico,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(18.220833, -66.590149)",PA,PA1,Q047,VALUE,72,Age (years),45 - 54,AGEYR,AGEYR4554


# Dataset cleaning

In [4]:
# Check for missing data: count() returns the number of non-null entries in
# each column, so any column whose count is below the total row count
# contains missing values.

data_df.count()

YearStart                     18024
YearEnd                       18024
LocationAbbr                  18024
LocationDesc                  18024
Datasource                    18024
Class                         18024
Topic                         18024
Question                      18024
Data_Value_Unit                   0
Data_Value_Type               18024
Data_Value                    17946
Data_Value_Alt                17946
Data_Value_Footnote_Symbol       78
Data_Value_Footnote              78
Low_Confidence_Limit          17946
High_Confidence_Limit         17946
Sample_Size                   17946
Total                             0
Age(years)                    18024
Education                         0
Gender                            0
Income                            0
Race/Ethnicity                    0
GeoLocation                   17688
ClassID                       18024
TopicID                       18024
QuestionID                    18024
DataValueTypeID             

In [5]:
# Check the data type of each column; info() also prints the non-null count
# per column alongside its dtype.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18024 entries, 0 to 18023
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   YearStart                   18024 non-null  int64  
 1   YearEnd                     18024 non-null  int64  
 2   LocationAbbr                18024 non-null  object 
 3   LocationDesc                18024 non-null  object 
 4   Datasource                  18024 non-null  object 
 5   Class                       18024 non-null  object 
 6   Topic                       18024 non-null  object 
 7   Question                    18024 non-null  object 
 8   Data_Value_Unit             0 non-null      float64
 9   Data_Value_Type             18024 non-null  object 
 10  Data_Value                  17946 non-null  float64
 11  Data_Value_Alt              17946 non-null  float64
 12  Data_Value_Footnote_Symbol  78 non-null     object 
 13  Data_Value_Footnote         78 

In [6]:
# Get all column names exactly as they appear in the dataframe
# (exact spelling, including any stray whitespace, matters when selecting by name).
data_df.columns

Index(['YearStart', 'YearEnd', 'LocationAbbr', 'LocationDesc', 'Datasource',
       'Class', 'Topic', 'Question', 'Data_Value_Unit', 'Data_Value_Type',
       'Data_Value', 'Data_Value_Alt', 'Data_Value_Footnote_Symbol',
       'Data_Value_Footnote', 'Low_Confidence_Limit', 'High_Confidence_Limit ',
       'Sample_Size', 'Total', 'Age(years)', 'Education', 'Gender', 'Income',
       'Race/Ethnicity', 'GeoLocation', 'ClassID', 'TopicID', 'QuestionID',
       'DataValueTypeID', 'LocationID', 'StratificationCategory1',
       'Stratification1', 'StratificationCategoryId1', 'StratificationID1'],
      dtype='object')

In [8]:
# Reduce the dataset to the columns relevant for this analysis.
# NOTE: 'High_Confidence_Limit ' has a trailing space in the loaded dataframe's
# column names, so it is spelled with that space here to match exactly.
relevant_columns = [
    'YearStart',
    'YearEnd',
    'LocationAbbr',
    'LocationDesc',
    'Class',
    'Question',
    'Data_Value',
    'Low_Confidence_Limit',
    'High_Confidence_Limit ',
    'Sample_Size',
    'Age(years)',
]

# .copy() makes the subset an independent DataFrame, so later cleaning steps
# that assign to it do not raise SettingWithCopyWarning or touch data_df.
reduced_data_df = data_df[relevant_columns].copy()
reduced_data_df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Class,Question,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Age(years)
0,2011,2011,AL,Alabama,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,35.2,30.7,40.0,598.0,25 - 34
1,2015,2015,GU,Guam,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,33.7,25.1,43.4,285.0,55 - 64
2,2011,2011,US,National,Physical Activity,Percent of adults who engage in no leisure-tim...,16.9,16.0,17.8,20923.0,18 - 24
3,2015,2015,PR,Puerto Rico,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,42.0,38.4,45.7,997.0,55 - 64
4,2015,2015,PR,Puerto Rico,Physical Activity,Percent of adults who engage in no leisure-tim...,50.4,46.4,54.3,862.0,45 - 54
