# County Level Summary  

The purpose of this notebook is to create county level summaries of the fitnessgram and academic test scores data.

## Load Data

In [23]:
#Import Packages
import os
import pandas as pd
from IPython.display import display
pd.options.display.max_rows = 101


#*******************************************************************************
#*******************************************************************************
#Set these file paths for your own local machine before running
#*******************************************************************************
#*******************************************************************************

#Set file path containing fitnessgram data
datapath = "/Users/nwchen24/Desktop/UC_Berkeley/w209_Data_Viz/final_proj_repo2/Combined_Data/"

fitnessgram_datapath = "/Users/nwchen24/Desktop/UC_Berkeley/w209_Data_Viz/final_project_data/Fitnessgram_Results"

combined_outfile = 'Comb_County_lvl_Fitnessgram_Academic_2014_2016_1.1.csv'

In [15]:
combined_data = pd.read_csv(datapath + 'Comb_Fitnessgram_Academic_2014_2016_1.1.csv')
combined_data = combined_data.drop(['Unnamed: 0', 'NoHFZ9_aerobic', 'NoHFZ9_bodycomp'], axis = 1)

In [16]:
combined_data.columns

Index([u'County_Name', u'School_Name', u'Subgroup', u'NoStud9_aerobic',
       u'Perc9HFZ_aerobic', u'Perc9NI_aerobic', u'Perc9NI_HR_aerobic',
       u'NoStud9_bodycomp', u'Perc9HFZ_bodycomp', u'Perc9NI_bodycomp',
       u'Perc9NI_HR_bodycomp', u'Mean_Academic_Test_Score',
       u'Students_Tested_Academic',
       u'Total_Tested_At_Subgroup_Level_Academic'],
      dtype='object')

### Get Fitness Data on County Level

In [29]:
#Get county level results for fitnessgram
#First, get average of percentage of students in each zone for aerobic and bodycomp
combined_data_fitness = combined_data.drop(['Mean_Academic_Test_Score', 'Students_Tested_Academic', 'Total_Tested_At_Subgroup_Level_Academic'], axis = 1)

combined_data_fitness['fitness_comb_HFZ_pct'] = (combined_data_fitness['Perc9HFZ_aerobic'] + combined_data_fitness['Perc9HFZ_bodycomp']) / 2
combined_data_fitness = combined_data_fitness.drop(['Perc9HFZ_aerobic', 'Perc9HFZ_bodycomp'], axis = 1)

combined_data_fitness['fitness_comb_NI_pct'] = (combined_data_fitness['Perc9NI_aerobic'] + combined_data_fitness['Perc9NI_bodycomp']) / 2
combined_data_fitness = combined_data_fitness.drop(['Perc9NI_aerobic', 'Perc9NI_bodycomp'], axis = 1)

combined_data_fitness['fitness_comb_NI_HR_pct'] = (combined_data_fitness['Perc9NI_HR_aerobic'] + combined_data_fitness['Perc9NI_HR_bodycomp']) / 2
combined_data_fitness = combined_data_fitness.drop(['Perc9NI_HR_aerobic', 'Perc9NI_HR_bodycomp'], axis = 1)

combined_data_fitness['fitness_number_students'] = combined_data_fitness['NoStud9_aerobic']
combined_data_fitness = combined_data_fitness.drop(['NoStud9_aerobic', 'NoStud9_bodycomp'], axis = 1)

#Second, calculate number of students at each school in each zone
combined_data_fitness['fitness_num_HFZ'] = combined_data_fitness['fitness_comb_HFZ_pct'] / 100. * combined_data_fitness['fitness_number_students']
combined_data_fitness = combined_data_fitness.drop(['fitness_comb_HFZ_pct'], axis = 1)

combined_data_fitness['fitness_num_NI'] = combined_data_fitness['fitness_comb_NI_pct'] / 100. * combined_data_fitness['fitness_number_students']
combined_data_fitness = combined_data_fitness.drop(['fitness_comb_NI_pct'], axis = 1)

combined_data_fitness['fitness_num_NI_HR'] = combined_data_fitness['fitness_comb_NI_HR_pct'] / 100. * combined_data_fitness['fitness_number_students']
combined_data_fitness = combined_data_fitness.drop(['fitness_comb_NI_HR_pct'], axis = 1)

#Third, sum by county and subgroup
#combined_data_fitness = combined_data_fitness.drop(['District_Code', 'School_Code'], axis = 1)

combined_data_fitness_county_grp = combined_data_fitness.groupby(['County_Name', 'Subgroup'], as_index=False).sum()

#Fourth, calculate percentage of students in each count in each zone
combined_data_fitness_county_grp['fitness_pct_HFZ'] = combined_data_fitness_county_grp['fitness_num_HFZ'] / combined_data_fitness_county_grp['fitness_number_students']
combined_data_fitness_county_grp['fitness_pct_NI'] = combined_data_fitness_county_grp['fitness_num_NI'] / combined_data_fitness_county_grp['fitness_number_students']
combined_data_fitness_county_grp['fitness_pct_NI_HR'] = combined_data_fitness_county_grp['fitness_num_NI_HR'] / combined_data_fitness_county_grp['fitness_number_students']

combined_data_fitness_county_grp = combined_data_fitness_county_grp.drop(['fitness_num_HFZ', 'fitness_num_NI', 'fitness_num_NI_HR', 'fitness_number_students'], axis = 1)
combined_data_fitness_county_grp.head(25)

#Fifth, reshape long to wide
combined_data_fitness_county = pd.pivot_table(combined_data_fitness_county_grp, values = ['fitness_pct_HFZ', 'fitness_pct_NI', 'fitness_pct_NI_HR'], index = ['County_Name'], columns = ['Subgroup'])

combined_data_fitness_county.columns = ['_'.join(col).strip() for col in combined_data_fitness_county.columns.values]

combined_data_fitness_county = combined_data_fitness_county.reset_index()

#combine the unhealthy fitness zones into one column
combined_data_fitness_county['fitness_pct_unhealthy_comb_ALL'] = combined_data_fitness_county['fitness_pct_NI_All'] + combined_data_fitness_county['fitness_pct_NI_HR_All']
combined_data_fitness_county['fitness_pct_unhealthy_comb_Economic_disadv'] = combined_data_fitness_county['fitness_pct_NI_Economic_disadv'] + combined_data_fitness_county['fitness_pct_NI_HR_Economic_disadv']
combined_data_fitness_county['fitness_pct_unhealthy_comb_NOT_economic_disadv'] = combined_data_fitness_county['fitness_pct_NI_NOT_economic_disadv'] + combined_data_fitness_county['fitness_pct_NI_HR_NOT_economic_disadv']

combined_data_fitness_county = combined_data_fitness_county.drop(['fitness_pct_NI_All', 'fitness_pct_NI_HR_All', 'fitness_pct_NI_Economic_disadv', 'fitness_pct_NI_HR_Economic_disadv', 'fitness_pct_NI_NOT_economic_disadv', 'fitness_pct_NI_HR_NOT_economic_disadv'], axis = 1)

combined_data_fitness_county.dtypes


County_Name                                        object
fitness_pct_HFZ_All                               float64
fitness_pct_HFZ_Economic_disadv                   float64
fitness_pct_HFZ_NOT_economic_disadv               float64
fitness_pct_unhealthy_comb_ALL                    float64
fitness_pct_unhealthy_comb_Economic_disadv        float64
fitness_pct_unhealthy_comb_NOT_economic_disadv    float64
dtype: object

### Get Academic Data on County Level

In [30]:
#Get county level results for fitnessgram
#First, get number of student points in each school
combined_data_academic = combined_data[['County_Name', 'Subgroup', 'Mean_Academic_Test_Score', 'Students_Tested_Academic', 'Total_Tested_At_Subgroup_Level_Academic']]

combined_data_academic['student_points'] = combined_data_academic['Mean_Academic_Test_Score'] * combined_data_academic['Total_Tested_At_Subgroup_Level_Academic']
combined_data_academic = combined_data_academic.drop(['Mean_Academic_Test_Score', 'Students_Tested_Academic'], axis = 1)

#sum by county and subgroup
combined_data_academic_county_group = combined_data_academic.groupby(['County_Name', 'Subgroup'], as_index=False).sum()

#get weighted average score at county level
combined_data_academic_county_group['academic_wght_avg_score'] = combined_data_academic_county_group['student_points'] / combined_data_academic_county_group['Total_Tested_At_Subgroup_Level_Academic']
combined_data_academic_county_group = combined_data_academic_county_group.drop(['Total_Tested_At_Subgroup_Level_Academic', 'student_points'], axis = 1)

#reshape long to wide
combined_data_academic_county = pd.pivot_table(combined_data_academic_county_group, values = ['academic_wght_avg_score'], index = ['County_Name'], columns = ['Subgroup'])

combined_data_academic_county.columns = ['_'.join(col).strip() for col in combined_data_academic_county.columns.values]

combined_data_academic_county = combined_data_academic_county.reset_index()


combined_data_academic_county.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,County_Name,academic_wght_avg_score_All,academic_wght_avg_score_Economic_disadv,academic_wght_avg_score_NOT_economic_disadv
0,Alameda,355.271326,325.452097,384.876137
1,Alpine,,,
2,Amador,269.833396,188.150312,332.966741
3,Butte,345.173664,334.487911,353.390633
4,Calaveras,355.557867,337.673849,366.480685


### Merge Academic and Fitness

In [31]:
County_level_combined_data = pd.merge(left = combined_data_academic_county, right = combined_data_fitness_county, on = 'County_Name')

County_level_combined_data.head()

#County_level_combined_data.shape

Unnamed: 0,County_Name,academic_wght_avg_score_All,academic_wght_avg_score_Economic_disadv,academic_wght_avg_score_NOT_economic_disadv,fitness_pct_HFZ_All,fitness_pct_HFZ_Economic_disadv,fitness_pct_HFZ_NOT_economic_disadv,fitness_pct_unhealthy_comb_ALL,fitness_pct_unhealthy_comb_Economic_disadv,fitness_pct_unhealthy_comb_NOT_economic_disadv
0,Alameda,355.271326,325.452097,384.876137,0.642583,0.520962,0.73462,0.355606,0.442271,0.231707
1,Alpine,,,,,,,,,
2,Amador,269.833396,188.150312,332.966741,0.66849,0.587222,0.702264,0.298901,0.331786,0.265178
3,Butte,345.173664,334.487911,353.390633,0.657105,0.587467,0.743108,0.324662,0.388525,0.226653
4,Calaveras,355.557867,337.673849,366.480685,0.644996,0.602104,0.685488,0.349932,0.37537,0.314512


### Export

In [32]:
County_level_combined_data.to_csv(datapath + combined_outfile)