# County Level Summary  

The purpose of this notebook is to create county level summaries of the fitnessgram and academic test scores data.

## Load Data

In [62]:
#Import Packages
import os
import pandas as pd
from IPython.display import display
pd.options.display.max_rows = 101


#*******************************************************************************
#*******************************************************************************
#Set these file paths for your own local machine before running
#*******************************************************************************
#*******************************************************************************

#Set file path containing fitnessgram data
datapath = "/Users/nwchen24/Desktop/UC_Berkeley/w209_Data_Viz/final_proj_repo2/Combined_Data/"

fitnessgram_datapath = "/Users/nwchen24/Desktop/UC_Berkeley/w209_Data_Viz/final_project_data/Fitnessgram_Results"

combined_outfile = 'Comb_County_lvl_Fitnessgram_Academic_2014_2016_1.1.csv'

In [63]:
combined_data = pd.read_csv(datapath + 'Comb_Fitnessgram_Academic_2014_2016_1.1.csv')
combined_data = combined_data.drop(['Unnamed: 0'], axis = 1)

In [64]:
combined_data.dtypes

County_Name                                 object
School_Name                                 object
Subgroup                                    object
Number_Students_Total                      float64
Number_Students_Healthy                    float64
Number_Students_Unhealthy                  float64
Mean_Academic_Test_Score                   float64
Students_Tested_Academic                   float64
Total_Tested_At_Subgroup_Level_Academic    float64
dtype: object

### Get Fitness Data on County Level

In [65]:
#Get county level results for fitnessgram
#First, get average of percentage of students in each zone for aerobic and bodycomp
combined_data_fitness = combined_data.drop(['Mean_Academic_Test_Score', 'Students_Tested_Academic', 'Total_Tested_At_Subgroup_Level_Academic'], axis = 1)

#Get county level results for fitnessgram
combined_data_fitness_county_grp = combined_data_fitness.groupby(['County_Name', 'Subgroup'], as_index=False).sum()

#Fourth, calculate percentage of students in each count in each zone
combined_data_fitness_county_grp['fitness_pct_Healthy'] = combined_data_fitness_county_grp['Number_Students_Healthy'] / combined_data_fitness_county_grp['Number_Students_Total']
combined_data_fitness_county_grp['fitness_pct_Unhealthy'] = combined_data_fitness_county_grp['Number_Students_Unhealthy'] / combined_data_fitness_county_grp['Number_Students_Total']

combined_data_fitness_county_grp = combined_data_fitness_county_grp.drop(['Number_Students_Healthy', 'Number_Students_Unhealthy', 'Number_Students_Total'], axis = 1)

#Fifth, reshape long to wide
combined_data_fitness_county = pd.pivot_table(combined_data_fitness_county_grp, values = ['fitness_pct_Healthy', 'fitness_pct_Unhealthy'], index = ['County_Name'], columns = ['Subgroup'])

#rename columns
combined_data_fitness_county.columns = ['_'.join(col).strip() for col in combined_data_fitness_county.columns.values]

combined_data_fitness_county = combined_data_fitness_county.reset_index()

print combined_data_fitness_county.dtypes
print combined_data_fitness_county.shape
combined_data_fitness_county.head()

County_Name                                   object
fitness_pct_Healthy_All                      float64
fitness_pct_Healthy_Economic_disadv          float64
fitness_pct_Healthy_NOT_economic_disadv      float64
fitness_pct_Unhealthy_All                    float64
fitness_pct_Unhealthy_Economic_disadv        float64
fitness_pct_Unhealthy_NOT_economic_disadv    float64
dtype: object
(58, 7)


Unnamed: 0,County_Name,fitness_pct_Healthy_All,fitness_pct_Healthy_Economic_disadv,fitness_pct_Healthy_NOT_economic_disadv,fitness_pct_Unhealthy_All,fitness_pct_Unhealthy_Economic_disadv,fitness_pct_Unhealthy_NOT_economic_disadv
0,Alameda,0.65363,0.532795,0.75008,0.344524,0.462387,0.24155
1,Alpine,,,,,,
2,Amador,0.670318,0.591973,0.705993,0.296824,0.337793,0.26779
3,Butte,0.658652,0.59437,0.742945,0.325426,0.379072,0.230375
4,Calaveras,0.641667,0.604538,0.683206,0.343841,0.367909,0.311705


### Get Academic Data on County Level

In [66]:
#Get county level results for fitnessgram
#First, get number of student points in each school
combined_data_academic = combined_data[['County_Name', 'Subgroup', 'Mean_Academic_Test_Score', 'Students_Tested_Academic', 'Total_Tested_At_Subgroup_Level_Academic']]

combined_data_academic['student_points'] = combined_data_academic['Mean_Academic_Test_Score'] * combined_data_academic['Total_Tested_At_Subgroup_Level_Academic']
combined_data_academic = combined_data_academic.drop(['Mean_Academic_Test_Score', 'Students_Tested_Academic'], axis = 1)

#sum by county and subgroup
combined_data_academic_county_group = combined_data_academic.groupby(['County_Name', 'Subgroup'], as_index=False).sum()

#get weighted average score at county level
combined_data_academic_county_group['academic_wght_avg_score'] = combined_data_academic_county_group['student_points'] / combined_data_academic_county_group['Total_Tested_At_Subgroup_Level_Academic']
combined_data_academic_county_group = combined_data_academic_county_group.drop(['Total_Tested_At_Subgroup_Level_Academic', 'student_points'], axis = 1)

#reshape long to wide
combined_data_academic_county = pd.pivot_table(combined_data_academic_county_group, values = ['academic_wght_avg_score'], index = ['County_Name'], columns = ['Subgroup'])

combined_data_academic_county.columns = ['_'.join(col).strip() for col in combined_data_academic_county.columns.values]

combined_data_academic_county = combined_data_academic_county.reset_index()


combined_data_academic_county.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,County_Name,academic_wght_avg_score_All,academic_wght_avg_score_Economic_disadv,academic_wght_avg_score_NOT_economic_disadv
0,Alameda,355.271326,325.452097,384.876137
1,Alpine,,,
2,Amador,269.833396,188.150312,332.966741
3,Butte,345.173664,334.487911,353.390633
4,Calaveras,355.557867,337.673849,366.480685


### Merge Academic and Fitness

In [67]:
County_level_combined_data = pd.merge(left = combined_data_academic_county, right = combined_data_fitness_county, on = 'County_Name')

print County_level_combined_data.shape

County_level_combined_data.head()

(58, 10)


Unnamed: 0,County_Name,academic_wght_avg_score_All,academic_wght_avg_score_Economic_disadv,academic_wght_avg_score_NOT_economic_disadv,fitness_pct_Healthy_All,fitness_pct_Healthy_Economic_disadv,fitness_pct_Healthy_NOT_economic_disadv,fitness_pct_Unhealthy_All,fitness_pct_Unhealthy_Economic_disadv,fitness_pct_Unhealthy_NOT_economic_disadv
0,Alameda,355.271326,325.452097,384.876137,0.65363,0.532795,0.75008,0.344524,0.462387,0.24155
1,Alpine,,,,,,,,,
2,Amador,269.833396,188.150312,332.966741,0.670318,0.591973,0.705993,0.296824,0.337793,0.26779
3,Butte,345.173664,334.487911,353.390633,0.658652,0.59437,0.742945,0.325426,0.379072,0.230375
4,Calaveras,355.557867,337.673849,366.480685,0.641667,0.604538,0.683206,0.343841,0.367909,0.311705


### Export

In [61]:
County_level_combined_data.to_csv(datapath + combined_outfile)