# County Level Summary  

The purpose of this notebook is to create county level summaries of the fitnessgram and academic test scores data.

## Load Data

In [9]:
#Import Packages
import os
import pandas as pd
from IPython.display import display
pd.options.display.max_rows = 101


#*******************************************************************************
#*******************************************************************************
#Set these file paths for your own local machine before running
#*******************************************************************************
#*******************************************************************************

#Set file path containing fitnessgram data
datapath = "/Users/nwchen24/Desktop/UC_Berkeley/w209_Data_Viz/final_proj_repo2/Combined_Data/"

fitnessgram_datapath = "/Users/nwchen24/Desktop/UC_Berkeley/w209_Data_Viz/final_project_data/Fitnessgram_Results"

combined_outfile = 'Comb_County_lvl_Fitnessgram_Academic_2014_2016.csv'

In [2]:
combined_data = pd.read_csv(datapath + 'Comb_Fitnessgram_Academic_2014_2016.csv')
combined_data = combined_data.drop(['Unnamed: 0', 'NoHFZ9_aerobic', 'NoHFZ9_bodycomp'], axis = 1)

### Get Fitness Data on County Level

In [4]:
#Get county level results for fitnessgram
#First, get average of percentage of students in each zone for aerobic and bodycomp
combined_data_fitness = combined_data.drop(['Mean_Academic_Test_Score', 'Students_Tested_Academic', 'Total_Tested_At_Subgroup_Level_Academic'], axis = 1)

combined_data_fitness['fitness_comb_HFZ_pct'] = (combined_data_fitness['Perc9HFZ_aerobic'] + combined_data_fitness['Perc9HFZ_bodycomp']) / 2
combined_data_fitness = combined_data_fitness.drop(['Perc9HFZ_aerobic', 'Perc9HFZ_bodycomp'], axis = 1)

combined_data_fitness['fitness_comb_NI_pct'] = (combined_data_fitness['Perc9NI_aerobic'] + combined_data_fitness['Perc9NI_bodycomp']) / 2
combined_data_fitness = combined_data_fitness.drop(['Perc9NI_aerobic', 'Perc9NI_bodycomp'], axis = 1)

combined_data_fitness['fitness_comb_NI_HR_pct'] = (combined_data_fitness['Perc9NI_HR_aerobic'] + combined_data_fitness['Perc9NI_HR_bodycomp']) / 2
combined_data_fitness = combined_data_fitness.drop(['Perc9NI_HR_aerobic', 'Perc9NI_HR_bodycomp'], axis = 1)

combined_data_fitness['fitness_number_students'] = combined_data_fitness['NoStud9_aerobic']
combined_data_fitness = combined_data_fitness.drop(['NoStud9_aerobic', 'NoStud9_bodycomp'], axis = 1)


combined_data_fitness.head()

#Second, calculate number of students at each school in each zone
combined_data_fitness['fitness_num_HFZ'] = combined_data_fitness['fitness_comb_HFZ_pct'] / 100. * combined_data_fitness['fitness_number_students']
combined_data_fitness = combined_data_fitness.drop(['fitness_comb_HFZ_pct'], axis = 1)

combined_data_fitness['fitness_num_NI'] = combined_data_fitness['fitness_comb_NI_pct'] / 100. * combined_data_fitness['fitness_number_students']
combined_data_fitness = combined_data_fitness.drop(['fitness_comb_NI_pct'], axis = 1)

combined_data_fitness['fitness_num_NI_HR'] = combined_data_fitness['fitness_comb_NI_HR_pct'] / 100. * combined_data_fitness['fitness_number_students']
combined_data_fitness = combined_data_fitness.drop(['fitness_comb_NI_HR_pct'], axis = 1)

combined_data_fitness.head()

#Third, sum by county and subgroup
combined_data_fitness = combined_data_fitness.drop(['District_Code', 'School_Code'], axis = 1)

combined_data_fitness_county_grp = combined_data_fitness.groupby(['County_Code', 'Subgroup'], as_index=False).sum()

#Fourth, calculate percentage of students in each count in each zone
combined_data_fitness_county_grp['fitness_pct_HFZ'] = combined_data_fitness_county_grp['fitness_num_HFZ'] / combined_data_fitness_county_grp['fitness_number_students']
combined_data_fitness_county_grp['fitness_pct_NI'] = combined_data_fitness_county_grp['fitness_num_NI'] / combined_data_fitness_county_grp['fitness_number_students']
combined_data_fitness_county_grp['fitness_pct_NI_HR'] = combined_data_fitness_county_grp['fitness_num_NI_HR'] / combined_data_fitness_county_grp['fitness_number_students']

combined_data_fitness_county_grp = combined_data_fitness_county_grp.drop(['fitness_num_HFZ', 'fitness_num_NI', 'fitness_num_NI_HR', 'fitness_number_students'], axis = 1)
combined_data_fitness_county_grp.head(25)

#Fifth, reshape long to wide
combined_data_fitness_county = pd.pivot_table(combined_data_fitness_county_grp, values = ['fitness_pct_HFZ', 'fitness_pct_NI', 'fitness_pct_NI_HR'], index = ['County_Code'], columns = ['Subgroup'])

combined_data_fitness_county.columns = ['_'.join(col).strip() for col in combined_data_fitness_county.columns.values]

combined_data_fitness_county = combined_data_fitness_county.reset_index()


### Get Academic Data on County Level

In [5]:
#Get county level results for fitnessgram
#First, get number of student points in each school
combined_data_academic = combined_data[['County_Code', 'Subgroup', 'Mean_Academic_Test_Score', 'Students_Tested_Academic', 'Total_Tested_At_Subgroup_Level_Academic']]

combined_data_academic['student_points'] = combined_data_academic['Mean_Academic_Test_Score'] * combined_data_academic['Total_Tested_At_Subgroup_Level_Academic']
combined_data_academic = combined_data_academic.drop(['Mean_Academic_Test_Score', 'Students_Tested_Academic'], axis = 1)

#sum by county and subgroup
combined_data_academic_county_group = combined_data_academic.groupby(['County_Code', 'Subgroup'], as_index=False).sum()

#get weighted average score at county level
combined_data_academic_county_group['academic_wght_avg_score'] = combined_data_academic_county_group['student_points'] / combined_data_academic_county_group['Total_Tested_At_Subgroup_Level_Academic']
combined_data_academic_county_group = combined_data_academic_county_group.drop(['Total_Tested_At_Subgroup_Level_Academic', 'student_points'], axis = 1)

#reshape long to wide
combined_data_academic_county = pd.pivot_table(combined_data_academic_county_group, values = ['academic_wght_avg_score'], index = ['County_Code'], columns = ['Subgroup'])

combined_data_academic_county.columns = ['_'.join(col).strip() for col in combined_data_academic_county.columns.values]

combined_data_academic_county = combined_data_academic_county.reset_index()


combined_data_academic_county.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,County_Code,academic_wght_avg_score_All,academic_wght_avg_score_Economic_disadv,academic_wght_avg_score_NOT_economic_disadv
0,1.0,353.261754,326.202292,381.387079
1,2.0,,,
2,3.0,279.356928,202.842149,335.766349
3,4.0,343.621098,329.966129,355.704252
4,5.0,353.115935,335.224607,362.544265


### Merge Academic and Fitness

In [6]:
County_level_combined_data = pd.merge(left = combined_data_academic_county, right = combined_data_fitness_county, on = 'County_Code')

County_level_combined_data.head()

County_level_combined_data.shape

(58, 13)

### Merge in County Names

In [11]:
Entities_files_list = []

#Walk the data directory and get all filepaths
for root, dirs, files in os.walk(fitnessgram_datapath):
    for filename in files:
        #Get full list of filepaths to the physical fitness test files
        if filename.endswith('.txt'):    
            #Get full list of filepaths to the entities files        
            if filename[:8] == "Entities":
                Entities_files_list.append(fitnessgram_datapath + "/PFT_" + filename[8:13] + "/" + filename)
            if filename[8:16] == "Entities":
                Entities_files_list.append(fitnessgram_datapath + "/PFT_" + str(int(filename[:4])+1) + "/" + filename)

#read entities files
physfit_entities_df = pd.DataFrame()

entities_2014 = pd.read_table(Entities_files_list[0], delimiter = "\t")
entities_2014['Year'] = 2014
#Note there are some bad lines in the entities file. There are not that many, so we will just skip them
entities_2015 = pd.read_csv(Entities_files_list[1], error_bad_lines = False)
entities_2015['Year'] = 2015
entities_2016 = pd.read_csv(Entities_files_list[2], error_bad_lines = False)
entities_2016['Year'] = 2016

#combine all years
physfit_entities_df = physfit_entities_df.append(entities_2014).append(entities_2015).append(entities_2016)

#Standardize Columns
physfit_entities_coldict = {}
physfit_entities_coldict['scode'] = 'School_Code'
physfit_entities_coldict['ccode'] = 'County_Code'
physfit_entities_coldict['dcode'] = 'District_Code'
physfit_entities_coldict['chrtnum'] = 'Charter_Number'
physfit_entities_coldict['Year'] = 'Year'

physfit_entities_df = physfit_entities_df.rename(columns = physfit_entities_coldict)

#Keep only county code and county name then remove duplicates
physfit_entities_df_counties = physfit_entities_df[['County_Code', 'County', 'Year']].drop_duplicates().reset_index().drop(['index'], axis = 1)

physfit_entities_df_counties.shape



Skipping line 154: expected 8 fields, saw 9
Skipping line 285: expected 8 fields, saw 9
Skipping line 287: expected 8 fields, saw 9
Skipping line 301: expected 8 fields, saw 9
Skipping line 302: expected 8 fields, saw 10
Skipping line 1961: expected 8 fields, saw 9
Skipping line 1964: expected 8 fields, saw 9
Skipping line 1966: expected 8 fields, saw 9
Skipping line 1967: expected 8 fields, saw 9
Skipping line 1968: expected 8 fields, saw 9
Skipping line 2468: expected 8 fields, saw 9
Skipping line 2469: expected 8 fields, saw 10
Skipping line 2486: expected 8 fields, saw 9
Skipping line 2570: expected 8 fields, saw 9
Skipping line 2649: expected 8 fields, saw 10
Skipping line 2659: expected 8 fields, saw 10
Skipping line 2682: expected 8 fields, saw 9
Skipping line 2683: expected 8 fields, saw 10
Skipping line 2686: expected 8 fields, saw 10
Skipping line 2687: expected 8 fields, saw 9
Skipping line 2694: expected 8 fields, saw 11
Skipping line 2753: expected 8 fields, saw 11
Skippin

(180, 3)

In [81]:
County_level_combined_data = County_level_combined_data.merge(physfit_entities_df_counties, on = ['County_Code'], how = 'inner')

### Export

In [88]:
County_level_combined_data.to_csv(datapath + combined_outfile)

In [12]:
physfit_entities_df_counties

Unnamed: 0,County_Code,County,Year
0,1,Alameda,2014
1,2,Alpine,2014
2,3,Amador,2014
3,4,Butte,2014
4,5,Calaveras,2014
5,6,Colusa,2014
6,7,Contra Costa,2014
7,8,Del Norte,2014
8,9,El Dorado,2014
9,10,Fresno,2014
