# Key Variable Selection from Data Sets

## Date: 1/19/22

In [41]:
import pandas as pd
import numpy as np

### Broadband

In [42]:
file = open("Initial Clean Data/broadband_clean.csv","r")
raw_broadband_data = pd.read_csv("Initial Clean Data/broadband_clean.csv")

broadband_2018 = raw_broadband_data.loc[raw_broadband_data['year'] == 2018].sort_values(by = "county", ascending=True)

broadband_2018

Unnamed: 0,county,year,id,cfips,state,broadband_pct
43908,Abbeville County,2018,0500000US45001,45001,South Carolina,0.657
21070,Acadia Parish,2018,0500000US22001,22001,Louisiana,0.709
53427,Accomack County,2018,0500000US51001,51001,Virginia,0.638
10392,Ada County,2018,0500000US16001,16001,Idaho,0.812
18790,Adair County,2018,0500000US21001,21001,Kentucky,0.666
...,...,...,...,...,...,...
2070,Yuma County,2018,0500000US04027,4027,Arizona,0.763
5775,Yuma County,2018,0500000US08125,8125,Colorado,0.733
52572,Zapata County,2018,0500000US48505,48505,Texas,0.541
52591,Zavala County,2018,0500000US48507,48507,Texas,0.483


### School Enrollment

In [43]:
file = open("Initial Clean Data/school_enrollment_clean.csv","r")
school_data = pd.read_csv("Initial Clean Data/school_enrollment_clean.csv")

school_data 

Unnamed: 0,id,Total Pop Enrolled,Percent Total Pop Enrolled,Margin Total Pop,County,State
0,0500000US01003,35912,79.0,1447,Baldwin County,Alabama
1,0500000US01015,17680,67.7,703,Calhoun County,Alabama
2,0500000US01043,13985,78.8,760,Cullman County,Alabama
3,0500000US01049,13367,86.8,617,DeKalb County,Alabama
4,0500000US01051,13077,72.0,544,Elmore County,Alabama
...,...,...,...,...,...,...
833,0500000US72113,19965,60.8,956,Ponce Municipio,Puerto Rico
834,0500000US72127,44217,56.6,1358,San Juan Municipio,Puerto Rico
835,0500000US72135,12180,70.4,1268,Toa Alta Municipio,Puerto Rico
836,0500000US72137,11050,63.9,541,Toa Baja Municipio,Puerto Rico


### Employment

In [44]:
with open('Initial Clean Data/clean_employment_data.csv') as f:
    raw_clean_employment =  pd.read_csv(f, delimiter=',')

In [45]:
### Selecting overall popluation vars

overall_pop_employ_data = raw_clean_employment[["GEO_ID", "county", "state","S2301_C01_001E","S2301_C03_001E","S2301_C04_001E"]]

### Creating a dictionary for the selected variables to be renamed

overall_pop_employ_dict = {"S2301_C01_001E":"est_total_pop_16_over",
               "S2301_C03_001E":"est_emp_pop_ratio_16_over",
               "S2301_C04_001E":"est_unemp_pop_ratio_16_over"
                          }

overall_pop_employ_data.rename(columns=overall_pop_employ_dict, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


### Income

In [46]:
with open('Initial Clean Data/clean_income_data.csv') as f:
    raw_clean_income =  pd.read_csv(f, delimiter=',')

In [47]:
### Selecting overall income vars

overall_income_data = raw_clean_income[["GEO_ID", "county", "state", "S1901_C01_001E", "S1901_C01_012E", "S1901_C02_001E", "S1901_C02_012E"]]

overall_income_dict = {"S1901_C01_001E":"est_total_households",
               "S1901_C01_012E":"est_med_income_households",
               "S1901_C02_001E":"est_total_families",
                "S1901_C02_012E":"est_med_income_families"
                          }

overall_income_data.rename(columns=overall_income_dict, inplace=True)

## Merging Dataframes

In [48]:
#Making column names consistent in snake case and lower 
school_data.columns = school_data.columns.str.lower().str.replace(" ", "_")
overall_pop_employ_data.columns = overall_pop_employ_data.columns.str.lower()
overall_income_data.columns = overall_income_data.columns.str.lower()

In [52]:
set1 = school_data.merge(broadband_2018, left_on='id', right_on='id')
set1 = set1.rename(columns = {"county_x":"county", "state_x":"state"})
set1 = set1.drop(columns = ["cfips", "county_y", "state_y"])

set2 = overall_pop_employ_data.merge(overall_income_data, left_on='geo_id', right_on='geo_id')
set2 = set2.rename(columns = {"county_x":"county", "state_x":"state"})
set2 = set2.drop(columns = ["county_y", "state_y"])

set3 = set1.merge(set2, left_on='id', right_on='geo_id')
set3 = set3.rename(columns = {"county_x":"county", "state_x":"state"})
set3 = set3.drop(columns = ["county_y", "state_y"])

set3

Unnamed: 0,id,total_pop_enrolled,percent_total_pop_enrolled,margin_total_pop,county,state,year,broadband_pct,geo_id,est_total_pop_16_over,est_emp_pop_ratio_16_over,est_unemp_pop_ratio_16_over,est_total_households,est_med_income_households,est_total_families,est_med_income_families
0,0500000US01003,35912,79.0,1447,Baldwin County,Alabama,2018,0.781,0500000US01003,176222.0,58.3,2.4,83501,56813,55354,71051
1,0500000US01015,17680,67.7,703,Calhoun County,Alabama,2018,0.730,0500000US01015,93284.0,53.0,8.1,44264,45818,30110,60889
2,0500000US01043,13985,78.8,760,Cullman County,Alabama,2018,0.699,0500000US01043,66680.0,53.8,4.3,30323,44612,21587,54852
3,0500000US01049,13367,86.8,617,DeKalb County,Alabama,2018,0.670,0500000US01049,57239.0,51.7,3.2,26462,36998,19148,50728
4,0500000US01051,13077,72.0,544,Elmore County,Alabama,2018,0.796,0500000US01051,65407.0,52.4,5.3,30155,60796,22103,66541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
821,0500000US55133,66577,69.8,1072,Waukesha County,Wisconsin,2018,0.888,0500000US55133,327449.0,65.3,2.3,158368,86968,110134,108847
822,0500000US55139,25205,61.7,856,Winnebago County,Wisconsin,2018,0.816,0500000US55139,140227.0,62.8,3.8,71332,56589,41338,74110
823,0500000US55141,11256,76.1,557,Wood County,Wisconsin,2018,0.784,0500000US55141,59152.0,60.6,2.6,32274,55273,19775,67833
824,0500000US56021,16743,72.8,724,Laramie County,Wyoming,2018,0.839,0500000US56021,78343.0,57.4,4.3,39678,64306,25836,82617


In [None]:
set1 = school_data.merge(broadband_2018, left_on='id', right_on='id',

          suffixes=('_sch', '_brb'))

In [None]:
#Merge broadband and school data
sch_bb = pd.merge(eighteendata, school_data, on = "id")
merged_data = merged_data.sort_values(by = ["state_x", "county_x"], ascending = True)
merged_data = merged_data.rename(columns = {"county_x":"county", "state_x":"state"})
merged_data = merged_data.drop(columns = ["cfips", "county_y", "state_y"])
merged_data