In this notebook we clean Florida census data to have the population as a demogrpahic feature for our machine learning model.

# 1. Loading Data

In [1]:
#importing useful libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#loading the data
data = pd.read_excel('2010Census_RedistrictingFile_Table1.xls')

In [3]:
data.head(10)

Unnamed: 0,Florida Legislative Office of,Unnamed: 1,Unnamed: 2,Unnamed: 3,Census 2010
0,Economic and Demographic Research,,,,Redistricting Data (Public Law 94-171) Summary...
1,Phone: 850.487.1402,,,,Generated on 3/17/2011
2,Web: http://edr.state.fl.us,,,,
3,Table 1,,,,
4,Total Population: 2000 and 2010,,,,
5,Incorporated Cities by County in Florida,,,,
6,,,,,
7,,Total Population,,"Change, 2000-2010",
8,State / County / City,"April 1, 2000 (2)",2010-04-01 00:00:00,Number,Percent
9,Florida,15982824,18801310,2818486,0.176345


In [4]:
data.drop([x for x in range(9)], axis=0, inplace=True) #removing the first 8 redundant rows

In [5]:
data.head()

Unnamed: 0,Florida Legislative Office of,Unnamed: 1,Unnamed: 2,Unnamed: 3,Census 2010
9,Florida,15982824,18801310,2818486,0.176345
10,Alachua County,217955,247336,29381,0.134803
11,Alachua,6098,9059,2961,0.485569
12,Archer,1289,1118,-171,-0.132661
13,Gainesville,95447,124354,28907,0.302859


In [6]:
data.columns = ['Region','2000','2010','Change','Percent']

In [7]:
data.head()

Unnamed: 0,Region,2000,2010,Change,Percent
9,Florida,15982824,18801310,2818486,0.176345
10,Alachua County,217955,247336,29381,0.134803
11,Alachua,6098,9059,2961,0.485569
12,Archer,1289,1118,-171,-0.132661
13,Gainesville,95447,124354,28907,0.302859


# 2. Creating features

At this stage we want to extract the County populations to use them as a feature. Therefore, we need to understand which rows are counties.

In [8]:
def filt(x):#defining a filter to choose the rows with counties
    return 'County' in x #if the string x contains 'County', then the result is true
data['is County'] = data['Region'].apply(filt) #is County = True means the corresponding Region is a county

In [9]:
data.head()

Unnamed: 0,Region,2000,2010,Change,Percent,is County
9,Florida,15982824,18801310,2818486,0.176345,False
10,Alachua County,217955,247336,29381,0.134803,True
11,Alachua,6098,9059,2961,0.485569,False
12,Archer,1289,1118,-171,-0.132661,False
13,Gainesville,95447,124354,28907,0.302859,False


In [10]:
data = data[data['is County']] #choosing the subdataframe with county rows

In [11]:
data.head()

Unnamed: 0,Region,2000,2010,Change,Percent,is County
10,Alachua County,217955,247336,29381,0.134803,True
20,Baker County,22259,27115,4856,0.218159,True
23,Bay County,148217,168852,20635,0.139222,True
32,Bradford County,26088,28520,2432,0.0932229,True
37,Brevard County,476230,543376,67146,0.140995,True


At some point we will merge this dataframe with Florida Health Department's beach water test data based on the county column. Therefore, we turn the county names here in to the form they appear in Florida Health Department's beach water test data.

In [12]:
data['County'] = data['Region'].apply(lambda x: x[:-7]) #creating a column with the county name
#Bay County ---> Bay

In [13]:
#Here is the list of counties in the current data frame
data.County.unique()

array(['Alachua', 'Baker', 'Bay', 'Bradford', 'Brevard', 'Broward',
       'Calhoun', 'Charlotte', 'Citrus', 'Clay', 'Collier', 'Columbia',
       'DeSoto', 'Dixie', 'Duval', 'Escambia', 'Flagler', 'Franklin',
       'Gadsden', 'Gilchrist', 'Glades', 'Gulf', 'Hamilton', 'Hardee',
       'Hendry', 'Hernando', 'Highlands', 'Hillsborough', 'Holmes',
       'Indian River', 'Jackson', 'Jefferson', 'Lafayette', 'Lake', 'Lee',
       'Leon', 'Levy', 'Liberty', 'Madison', 'Manatee', 'Marion',
       'Martin', 'Miami-Dade', 'Monroe', 'Nassau', 'Okaloosa',
       'Okeechobee', 'Orange', 'Osceola', 'Palm Beach', 'Pasco',
       'Pinellas', 'Polk', 'Putnam', 'St. Johns', 'St. Lucie',
       'Santa Rosa', 'Sarasota', 'Seminole', 'Sumter', 'Suwannee',
       'Taylor', 'Union', 'Volusia', 'Wakulla', 'Walton', 'Washington'],
      dtype=object)

In [14]:
#Here is the list of coastal counties in the current dataframe
coastal = ['Bay', 'Brevard', 'Broward', 'Charlotte', 'Citrus', 'Collier',
       'Miami-Dade', 'Dixie', 'Duval', 'Escambia', 'Flagler', 'Franklin',
       'Gulf', 'Hernando', 'Hillsborough', 'Indian River', 'Lee', 'Levy',
       'Manatee', 'Martin', 'Monroe', 'Nassau', 'Okaloosa', 'Palm Beach',
       'Pasco', 'Pinellas', 'Santa Rosa', 'Sarasota', 'St. Johns',
       'St. Lucie', 'Taylor', 'Volusia', 'Wakulla', 'Walton']

In [15]:
#Here is the list of coastal counties in the Florida Health Department's data
county_names = ['Bay', 'Brevard', 'Broward', 'Charlotte', 'Citrus', 'Collier',
       'Dixie', 'Duval', 'Escambia', 'Flagler', 'Franklin', 'Gulf',
       'Hernando', 'Hillsborough', 'Indian River', 'Lee', 'Levy',
       'Manatee', 'Martin', 'Dade', 'Monroe', 'Nassau', 'Okaloosa',
       'Palm Beach', 'Pasco', 'Pinellas', 'St Johns', 'St Lucie',
       'Santa Rosa', 'Sarasota', 'Taylor', 'Volusia', 'Wakulla', 'Walton']

In [16]:
data.head()

Unnamed: 0,Region,2000,2010,Change,Percent,is County,County
10,Alachua County,217955,247336,29381,0.134803,True,Alachua
20,Baker County,22259,27115,4856,0.218159,True,Baker
23,Bay County,148217,168852,20635,0.139222,True,Bay
32,Bradford County,26088,28520,2432,0.0932229,True,Bradford
37,Brevard County,476230,543376,67146,0.140995,True,Brevard


In [17]:
def fil(x):#defining a filter to choose coastal counties
    return x in coastal

data['is coastal'] = data['County'].apply(fil) #defining a column with boolean
#True means coastal county, False means not a coastal county

In [18]:
data.head()

Unnamed: 0,Region,2000,2010,Change,Percent,is County,County,is coastal
10,Alachua County,217955,247336,29381,0.134803,True,Alachua,False
20,Baker County,22259,27115,4856,0.218159,True,Baker,False
23,Bay County,148217,168852,20635,0.139222,True,Bay,True
32,Bradford County,26088,28520,2432,0.0932229,True,Bradford,False
37,Brevard County,476230,543376,67146,0.140995,True,Brevard,True


In [19]:
new = data[data['is coastal']] #slicing the subdataframe with coastal county entries

In [20]:
new.columns

Index(['Region', '2000', '2010', 'Change', 'Percent', 'is County', 'County',
       'is coastal'],
      dtype='object')

In [21]:
new.County.unique()

array(['Bay', 'Brevard', 'Broward', 'Charlotte', 'Citrus', 'Collier',
       'Dixie', 'Duval', 'Escambia', 'Flagler', 'Franklin', 'Gulf',
       'Hernando', 'Hillsborough', 'Indian River', 'Lee', 'Levy',
       'Manatee', 'Martin', 'Miami-Dade', 'Monroe', 'Nassau', 'Okaloosa',
       'Palm Beach', 'Pasco', 'Pinellas', 'St. Johns', 'St. Lucie',
       'Santa Rosa', 'Sarasota', 'Taylor', 'Volusia', 'Wakulla', 'Walton'],
      dtype=object)

In [22]:
new['County'] = county_names #transforming the county names to the format in Florida Health Department's beach test data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
new

Unnamed: 0,Region,2000,2010,Change,Percent,is County,County,is coastal
23,Bay County,148217,168852,20635,0.139222,True,Bay,True
37,Brevard County,476230,543376,67146,0.140995,True,Brevard,True
54,Broward County,1623018,1748066,125048,0.0770466,True,Broward,True
89,Charlotte County,141627,159978,18351,0.129573,True,Charlotte,True
91,Citrus County,118085,141236,23151,0.196054,True,Citrus,True
99,Collier County,251377,321520,70143,0.279035,True,Collier,True
108,Dixie County,13827,16422,2595,0.187676,True,Dixie,True
111,Duval County,778879,864263,85384,0.109624,True,Duval,True
117,Escambia County,294410,297619,3209,0.0108998,True,Escambia,True
120,Flagler County,49832,95696,45864,0.920372,True,Flagler,True


In [24]:
for year in range(2000,2021):
    new[str(year)] = new['2000'] + 0.1 * new['Change'] * (year-2000) 
#linearly increasing the population between 2000--->2010--->2020

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [25]:
new.head()

Unnamed: 0,Region,2000,2010,Change,Percent,is County,County,is coastal,2001,2002,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
23,Bay County,148217.0,168852.0,20635,0.139222,True,Bay,True,150280.0,152344.0,...,170916.0,172979.0,175042.0,177106.0,179170.0,181233.0,183296.0,185360.0,187424.0,189487.0
37,Brevard County,476230.0,543376.0,67146,0.140995,True,Brevard,True,482945.0,489659.0,...,550091.0,556805.0,563520.0,570234.0,576949.0,583664.0,590378.0,597093.0,603807.0,610522.0
54,Broward County,1623020.0,1748070.0,125048,0.0770466,True,Broward,True,1635520.0,1648030.0,...,1760570.0,1773080.0,1785580.0,1798090.0,1810590.0,1823090.0,1835600.0,1848100.0,1860610.0,1873110.0
89,Charlotte County,141627.0,159978.0,18351,0.129573,True,Charlotte,True,143462.0,145297.0,...,161813.0,163648.0,165483.0,167318.0,169154.0,170989.0,172824.0,174659.0,176494.0,178329.0
91,Citrus County,118085.0,141236.0,23151,0.196054,True,Citrus,True,120400.0,122715.0,...,143551.0,145866.0,148181.0,150496.0,152812.0,155127.0,157442.0,159757.0,162072.0,164387.0


In [26]:
new.columns

Index(['Region', '2000', '2010', 'Change', 'Percent', 'is County', 'County',
       'is coastal', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020'],
      dtype='object')

In [27]:
population = new.drop(['Region', 'Change', 'Percent', 'is County',
       'is coastal'], axis = 1)

In [28]:
population = population[['County', '2000', '2010', '2001', '2002', '2003', '2004', '2005','2006',
       '2007', '2008', '2009', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018','2019', '2020']]

In [29]:
population.head()

Unnamed: 0,County,2000,2010,2001,2002,2003,2004,2005,2006,2007,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
23,Bay,148217.0,168852.0,150280.0,152344.0,154408.0,156471.0,158534.0,160598.0,162662.0,...,170916.0,172979.0,175042.0,177106.0,179170.0,181233.0,183296.0,185360.0,187424.0,189487.0
37,Brevard,476230.0,543376.0,482945.0,489659.0,496374.0,503088.0,509803.0,516518.0,523232.0,...,550091.0,556805.0,563520.0,570234.0,576949.0,583664.0,590378.0,597093.0,603807.0,610522.0
54,Broward,1623020.0,1748070.0,1635520.0,1648030.0,1660530.0,1673040.0,1685540.0,1698050.0,1710550.0,...,1760570.0,1773080.0,1785580.0,1798090.0,1810590.0,1823090.0,1835600.0,1848100.0,1860610.0,1873110.0
89,Charlotte,141627.0,159978.0,143462.0,145297.0,147132.0,148967.0,150802.0,152638.0,154473.0,...,161813.0,163648.0,165483.0,167318.0,169154.0,170989.0,172824.0,174659.0,176494.0,178329.0
91,Citrus,118085.0,141236.0,120400.0,122715.0,125030.0,127345.0,129660.0,131976.0,134291.0,...,143551.0,145866.0,148181.0,150496.0,152812.0,155127.0,157442.0,159757.0,162072.0,164387.0


Finally we obtained the population of each coastal county on a given year between 2000-2020.

In [30]:
popu_melt = pd.melt(frame=population,id_vars='County') #turning the year data into a column
popu_melt.columns = ['County', 'Year', 'Population'] #defining columns
popu_melt.sort_values(by='County')

Unnamed: 0,County,Year,Population
0,Bay,2000,148217
68,Bay,2001,150280
646,Bay,2019,187424
102,Bay,2002,152344
612,Bay,2018,185360
136,Bay,2003,154408
578,Bay,2017,183296
170,Bay,2004,156471
544,Bay,2016,181233
204,Bay,2005,158534


In [31]:
popu_melt.County.unique() #looks like we have the right county names.

array(['Bay', 'Brevard', 'Broward', 'Charlotte', 'Citrus', 'Collier',
       'Dixie', 'Duval', 'Escambia', 'Flagler', 'Franklin', 'Gulf',
       'Hernando', 'Hillsborough', 'Indian River', 'Lee', 'Levy',
       'Manatee', 'Martin', 'Dade', 'Monroe', 'Nassau', 'Okaloosa',
       'Palm Beach', 'Pasco', 'Pinellas', 'St Johns', 'St Lucie',
       'Santa Rosa', 'Sarasota', 'Taylor', 'Volusia', 'Wakulla', 'Walton'],
      dtype=object)

In [32]:
popu_melt.to_csv('YearlyCountyPopulation.csv', encoding='utf-8', index=False)