# Data Pre-processing

__Running this notebook will produce a json file with the census tracts that are 50 miles away from each other as well as a csv file containing the number of universities accessible by each census tract and whether or not it is an education desert__

---
## Data Mining
__In this section we will read in census tract data, representing each cenus tract as a graph node with location of node at the centroid of the polygon__

In [1]:
# Library Imports
import fiona 
import rtree
import shapely
import geopandas as gpd
import pandas as pd
import numpy as np
import subprocess
import os
import requests
from bs4 import BeautifulSoup
import seaborn as sns
sns.set(style="ticks")

# default dictionary (a dictionary with a default value if a key doesn't exist)
from collections import defaultdict

# To unzip file
import zipfile

# To have progress bar
from tqdm import tqdm

# plotting libraries
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-paper')
%matplotlib inline

# Helper function to create a new folder
def mkdir(path):
    try: 
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise
        else:
            print("(%s) already exists" % (path))

In [2]:
with fiona.Env():
    ds = fiona.open('./datasets/census_tracts/Alabama/')
    print(ds.crs)

{'init': 'epsg:4269'}


---
## Datasets

In [3]:
# Census tracts shapefiles url
ct_shape_url = 'https://www.census.gov/geo/maps-data/data/cbf/cbf_tracts.html'

# Census tracts data url from 2012 - 2017
ct_file_name = 'acs_5_year_estimates_census_tracts.csv'
ct_data_url = 'https://www.dropbox.com/s/ni28x7mw6uh00dg/' + ct_file_name + '.zip?dl=1'

# American University Data
au_file_name = 'IPEDS_data.xlsx'
au_data_url = 'https://public.tableau.com/s/sites/default/files/media/Resources/' + au_file_name

# Directory of datasets
DATASETS_PATH = 'datasets/'

# Directory of census tract shapefile data
CENSUS_TRACTS_PATH = DATASETS_PATH + 'census_tracts/'

# Make the directory for the census tracts shapefiles data
mkdir(DATASETS_PATH)

# Remove any old data for census tracts shapefiles
# subprocess.call(['rm', '-rf', CENSUS_TRACTS_PATH])

(datasets/) already exists


### Census Tract Data

__Census Tracts have a population of around ${2,500}$ - ${8,000}$ people__

In [4]:
# Download data 
if not os.path.isfile(DATASETS_PATH + ct_file_name):
    
    os.system('!wget --directory-prefix={} -Nq {}'.format(DATASETS_PATH, ct_data_url))
    
    # Unzipping the file
    zip_ref = zipfile.ZipFile(DATASETS_PATH + ct_file_name + '.zip', 'r')
    zip_ref.extractall(DATASETS_PATH + ct_file_name + '/')
    zip_ref.close()
    
    # Remove the old census tract .zip shapefile
    subprocess.call(['rm', '-rf', DATASETS_PATH + ct_file_name + '.zip'])

In [5]:
# Let's take a look at the census tract data
census_tracts = pd.read_csv(DATASETS_PATH + ct_file_name, encoding='ISO-8859-1', low_memory=False)
census_tracts.head()

Unnamed: 0,FIPS,Geographic Identifier,Name of Area,Qualifying Name,State/U.S.-Abbreviation (USPS),Summary Level,Geographic Component,File Identification,Logical Record Number,US,...,Civilian Population 16 Years and Over for Whom Poverty Status Is Determined: Income in the Past 12 Months Below Poverty Level: Male: in Labor Force: Employed,Civilian Population 16 Years and Over for Whom Poverty Status Is Determined: Income in the Past 12 Months Below Poverty Level: Male: in Labor Force: Unemployed,Civilian Population 16 Years and Over for Whom Poverty Status Is Determined: Income in the Past 12 Months Below Poverty Level: Male: Not in Labor Force,Civilian Population 16 Years and Over for Whom Poverty Status Is Determined: Income in the Past 12 Months At or Above Poverty Level,Civilian Population 16 Years and Over for Whom Poverty Status Is Determined: Income in the Past 12 Months At or Above Poverty Level: Male: in Labor Force,Civilian Population 16 Years and Over for Whom Poverty Status Is Determined: Income in the Past 12 Months At or Above Poverty Level: Male: in Labor Force: Employed,Civilian Population 16 Years and Over for Whom Poverty Status Is Determined: Income in the Past 12 Months At or Above Poverty Level: Male: in Labor Force: Unemployed,Civilian Population 16 Years and Over for Whom Poverty Status Is Determined: Income in the Past 12 Months At or Above Poverty Level: Male: Not in Labor Force,Households.1,Households with Housing Costs more than 30% of Income
0,Geo_FIPS,Geo_GEOID,Geo_NAME,Geo_QName,Geo_STUSAB,Geo_SUMLEV,Geo_GEOCOMP,Geo_FILEID,Geo_LOGRECNO,Geo_US,...,SE_T254_004,SE_T254_005,SE_T254_006,SE_T254_007,SE_T254_008,SE_T254_009,SE_T254_010,SE_T254_011,SE_T255_001,SE_T255_002
1,01001020100,14000US01001020100,"Census Tract 201, Autauga County, Alabama","Census Tract 201, Autauga County, Alabama",al,140,00,ACSSF,0001766,,...,36,7,80,1360,880,845,35,480,754,144
2,01001020200,14000US01001020200,"Census Tract 202, Autauga County, Alabama","Census Tract 202, Autauga County, Alabama",al,140,00,ACSSF,0001767,,...,59,0,204,1230,823,793,30,407,783,218
3,01001020300,14000US01001020300,"Census Tract 203, Autauga County, Alabama","Census Tract 203, Autauga County, Alabama",al,140,00,ACSSF,0001768,,...,61,3,305,2291,1491,1421,70,800,1279,357
4,01001020400,14000US01001020400,"Census Tract 204, Autauga County, Alabama","Census Tract 204, Autauga County, Alabama",al,140,00,ACSSF,0001769,,...,16,0,66,3241,1953,1833,120,1288,1749,361


### American University Data

In [6]:
# Download data 
if not os.path.isfile(DATASETS_PATH + au_file_name):
    
    os.system('!wget --directory-prefix={} -Nq {}'.format(DATASETS_PATH, au_data_url))

In [7]:
# Let's take a look at the american university data
universities = pd.read_excel(DATASETS_PATH + au_file_name, index_col='ID number')
universities.head()

Unnamed: 0_level_0,Name,year,ZIP code,Highest degree offered,County name,Longitude location of institution,Latitude location of institution,Religious affiliation,Offers Less than one year certificate,Offers One but less than two years certificate,...,Percent of freshmen receiving federal grant aid,Percent of freshmen receiving Pell grants,Percent of freshmen receiving other federal grant aid,Percent of freshmen receiving state/local grant aid,Percent of freshmen receiving institutional grant aid,Percent of freshmen receiving student loan aid,Percent of freshmen receiving federal student loans,Percent of freshmen receiving other loan aid,Endowment assets (year end) per FTE enrollment (GASB),Endowment assets (year end) per FTE enrollment (FASB)
ID number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,Alabama A & M University,2013,35762,Doctor's degree - research/scholarship,Madison County,-86.568502,34.783368,Not applicable,Implied no,Implied no,...,81.0,81.0,7.0,1.0,32.0,89.0,89.0,1.0,,
100663,University of Alabama at Birmingham,2013,35294-0110,Doctor's degree - research/scholarship and pro...,Jefferson County,-86.80917,33.50223,Not applicable,Implied no,Yes,...,36.0,36.0,10.0,0.0,60.0,56.0,55.0,5.0,24136.0,
100690,Amridge University,2013,36117-3553,Doctor's degree - research/scholarship and pro...,Montgomery County,-86.17401,32.362609,Churches of Christ,Implied no,Implied no,...,90.0,90.0,0.0,40.0,90.0,100.0,100.0,0.0,,302.0
100706,University of Alabama in Huntsville,2013,35899,Doctor's degree - research/scholarship and pro...,Madison County,-86.63842,34.722818,Not applicable,Yes,Implied no,...,31.0,31.0,4.0,1.0,63.0,46.0,46.0,3.0,11502.0,
100724,Alabama State University,2013,36104-0271,Doctor's degree - research/scholarship and pro...,Montgomery County,-86.295677,32.364317,Not applicable,Implied no,Implied no,...,76.0,76.0,13.0,11.0,34.0,81.0,81.0,0.0,13202.0,


In [8]:
len(universities)

1534

### University Points Dictionary
__Let's store the IDs and Location (Lat, Long) of the universities as Shapely Point objects in dictionary__

In [9]:
for idx, id_num in enumerate(universities.index):
    print(idx)
    print(id_num)
    break

0
100654


In [10]:
from shapely.geometry import shape

# List of tuples to store all the university
# locations as Shapely Points
uni_list = []

for idx, id_num in enumerate(universities.index):
    uni_list.append((id_num, shape({"type": "Point",
                                    "coordinates": (universities.loc[id_num, 'Longitude location of institution'],
                                                    universities.loc[id_num, 'Latitude location of institution'])})))

for x in uni_list:
    print(x)
    break

(100654, <shapely.geometry.point.Point object at 0x1a2b8c4cf8>)


### University RTree Index
__Let's store the IDs and Location (Lat, Long) of the universities in an RTree Index__

In [11]:
# Initialize rtree spatial index
uni_index = rtree.index.Index()

# Iterate over all american universities
for idx, (id_num, uni) in enumerate(uni_list):
    
    # add coordinates of univeristy location and store id along with it
    uni_index.insert(idx, coordinates=uni.bounds, obj=id_num)

    # we can now query this datastructure with a point and it will tell us which 
    # polygon it lies in

### Census Tract Shape Files

In [12]:
# Make request to get the webpage
r = requests.get(ct_shape_url)
soup = BeautifulSoup(r.content, "html.parser")

# Get the download links from the dropdown <option> tag
locations = soup.find('select',
                      {'name':'Location',
                       'id':'ct2017m'}).findChildren('option' , recursive=False)[1:]

# Put all the states and the urls for their shape files in a dictionary
state_urls = {location.text.strip() : location.attrs['value'] for location in locations}

In [13]:
# Download data 
if not os.path.isdir(CENSUS_TRACTS_PATH):
    
    # Make the directory for the census tracts shapefiles data
    mkdir(CENSUS_TRACTS_PATH)
    
    for state, state_url in state_urls.items():
        os.system('wget --directory-prefix={} -Nq {}'.format(CENSUS_TRACTS_PATH, state_url))
        
    # Storing the shape file names
    census_tract_shapefiles = []
    for state, state_url in state_urls.items():

        # Extracting the name of shapefile
        shapefile = state_url[state_url.rindex('/') + 1:]
        census_tract_shapefiles.append(shapefile)

        # Renaming the file
        os.rename(CENSUS_TRACTS_PATH + shapefile, CENSUS_TRACTS_PATH + state + '.zip')

        # Unzipping the file
        zip_ref = zipfile.ZipFile(CENSUS_TRACTS_PATH + state + '.zip', 'r')
        zip_ref.extractall(CENSUS_TRACTS_PATH + state + '/')
        zip_ref.close()

        # Remove the old census tract .zip shapefile
        subprocess.call(['rm', '-rf', CENSUS_TRACTS_PATH + state + '.zip'])

### Census Tract Centroid Points Dictionary
__Let's store the geoIDs and Location (Lat, Long) of the census tract centroids as Shapely Point objects in dictionary__

### Census Tract RTree Index
__Let's store the geoIDs and Location (Lat, Long) of the universities in an RTree Index__

In [14]:
# Shapely library to help with calculation
# of the representative centroid position
from shapely.geometry import MultiPoint

# ----------------------------
# Let's calculate and store each of 
# the centroids of each
# census tract in a list of tuples
tract_centroids = []

# ----------------------------
# Initialize rtree spatial index
tract_index = rtree.index.Index()
    
for subdir, dirs, files in list(os.walk(CENSUS_TRACTS_PATH))[1:]:
    
    # Opening the shapefile
    state_shapes = fiona.open(subdir, 'r')
#     state_shapes = gpd.read_file(subdir)
    
    # Looping through each census tract in each state and 
    # making key: geoid, value: centroid position
    # of (longitude, latitude) as well as building the 
    # RTree index for census tracts
    for idx, census_tract in enumerate(state_shapes):
        
        geoid = census_tract['properties']['GEOID']
        geometry = np.array(census_tract['geometry']['coordinates'])
        points = None
        
        # Some of the geometries are in a 2d and some in 3d array
        if len([True for lat_long in geometry[0] if len(lat_long) != 2]) > 0:
            
            # Create the Multipoint object to find centroid
            points = MultiPoint(geometry[0][0])
            
        else:
            
            # Create the Multipoint object to find centroid
            points = MultiPoint(geometry[0])
        
        # ----------------------------
        # Create Census tract Polygon
        tract = shapely.geometry.shape(census_tract['geometry'])

        # add bounding box of census tract and store geoid along with it
        tract_index.insert(idx, coordinates=tract.bounds, obj=geoid)

        # we can now query this datastructure with a point and it will tell us which 
        # polygon it lies in
        
        # ----------------------------
        # Create Census tract Centroid
        # True centroid, not necessarily an existing point
        centroid_pt = points.centroid
        
        # A represenative point, not centroid,
        # that is guarnateed to be with the geometry
        tract_centroids.append((geoid, points.representative_point()))

In [15]:
# Number of census tracts
len(tract_centroids)

73874

In [16]:
for x in tract_centroids:
    print(x)
    break

('29001950900', <shapely.geometry.point.Point object at 0x1a2b7c3a20>)


__TODO: Problem with shape of the geometry, I thought it was supposed to be a list of size=2 tuples.__

__UPDATE: I reduced the 3D and 2D arrays to 1D, hopefully I didn't miss information__

### Census Tract Graph

__We will loop through each census tract and for each census tract, compute it's distance to all other ${n - 1}$ census tracts, storing it in a graph as the edge weights for each census tract node - Essentially creating a ${K_{73,874}}$ Complete Graph__

In [17]:
# Haversine formula to calculate 
# geographical distance between 2 pairs of
# latitude, longitude coordinates
from haversine import haversine

# The Census Tract Graph where each node represents
# a census tract and each edge exists if and only if
# the two census tracts are within 50 miles of each
# other
class CT_Graph:
    
    # The Nodes (Census Tracts)
    class Node:
        def __init__(self, geo_id, coordinates):
            self._geo_id = geo_id
            self._coordinates = coordinates
        
        # US Census Tract Geo ID
        @property
        def geo_id(self):
            return self._geo_id
        
        @geo_id.setter
        def geo_id(self, value):
            self._geo_id = value
        
        # Census tract centroid location 
        # Type: Point
        # Format: (latitude, longitude)
        @property
        def coordinates(self):
            return self._coordinates
        
        @coordinates.setter
        def coordinates(self, value):
            self._coordinates = value
        
    # The Edges (Connections from centroids to centroids)
    class Edge:
        def __init__(self, node_pair, distance):
            self._node_pair = node_pair
            
            # Storing Haversine Distance in miles
            self._distance = distance

        # Tuple of 2 Nodes
        @property
        def node_pair(self):
            return self._node_pair
        
        @node_pair.setter
        def node_pair(self, value):
            self._node_pair = value
        
        # Edge Weight / Haversine Distance
        # between the pairs of lat longs
        @property
        def distance(self):
            return self._distance
        
        @distance.setter
        def distance(self, value):
            self._distance = value
        
    def __init__(self, census_tracts_centroids):
        self._threshold_distance = 50
        self._nodes = []
        self._edges = []
        
        # Creating all my nodes O(n)
        for geo_id, coordinates in census_tracts_centroids.items():
            self._nodes.append(CT_Graph.Node(geo_id, coordinates))
         
        # Creating all my edges O(n^2)
        for idx, node1 in tqdm(enumerate(self._nodes)):
            
            for node2 in self._nodes[idx + 1:]:
                
                # Calculating Haversine Distance
                distance = haversine(*[(node.coordinates.x, node.coordinates.y) for node in (node1, node2)], unit='mi')
                
                # Add the Edge only if Haversine Distance is less than 50 miles
                if distance <= self._threshold_distance:
                    self._edges.append(CT_Graph.Edge((node1, node2), distance))
        
    # List of Census Tract nodes
    @property
    def nodes(self):
        return self._nodes

    @nodes.setter
    def nodes(self, value):
        self._nodes = value

    # List of Census Tract edges
    @property
    def edges(self):
        return self._edges

    @edges.setter
    def edges(self, value):
        self._edges = value

In [18]:
# Initializing Census Tract Graph (THIS WILL TAKE SUPER LONG TO RUN)
# ct_graph = CT_Graph(tract_centroids) 

### Saving Data

__Courtesy of Caleb Robinson:__

This article is a very good discussion of what coordinate reference systems (CRSs) are and why we need them, you should probably read it as a starting point: https://docs.qgis.org/testing/en/docs/gentle_gis_introduction/coordinate_reference_systems.html

 

- Every piece of geographic data will have a CRS. Aaron showed how to see the CRS of shapefiles in Tuesday's lecture, so you can see that notebook for an example of how to find the CRS that your data is in.

- Once you have the CRS of your data, then you can look it up on epsg.io to see what units it is in. If you are using Census Tract shapefiles from TIGER then your data will most likely be in EPSG:4269, which has the units of degrees.

- The conversion rate from degrees --> km or degrees --> miles changes based on the latitude. For example, 1 degree of latitude is ~68.71 miles at the equator but it is ~68.99 miles at 40 degrees N. Similarly, a degree of longitude is 69.17 at the equator but just 53.06 miles at 40 degrees N.

- This means you need to convert your data into an equal area coordinate system that has been specifically designed to represent distances equally. This is of course impossible to do for the entire globe, which is why these coordinate systems are local to different areas (and why big states like California can have many different coordinate systems, different coordinate systems will let you represent distances with different degrees of precision).

- One such equal area CRS is the U.S. National Atlas Equal Area Projection - EPSG:2163. You should probably use this.

- If you need a realllly high degree of accuracy (for example in the land cover mapping project that I presented - 1m differences matter) then read about the US State Plane Coordinate System - https://en.wikipedia.org/wiki/State_Plane_Coordinate_System. Here, clever people have defined a bunch of coordinate systems (like, over 100) that tile the US and give high degrees of accuracy. This is hard to use for US wide projects.

- See the internet/notebook from Tuesday/office hours for how to actually convert your data into EPSG:2163.

In [19]:
# Setting our coordinate system for the tracts 
# and universities to EPSG:2163 https://epsg.io/2163
tract_gdf = gpd.GeoDataFrame(geometry=[tract_pt for geoid, tract_pt in tract_centroids])
tract_gdf.crs = {'init': 'epsg:4269'}
tract_gdf.to_crs({'init': 'epsg:2163'})

uni_gdf = gpd.GeoDataFrame(geometry=[uni_pt for uniid, uni_pt in uni_list])
uni_gdf.crs = {'init': 'epsg:4269'}
uni_gdf.to_crs({'init': 'epsg:2163'});

__New Census Tract Equal Area CRS__

In [20]:
tract_gdf.head()

Unnamed: 0,geometry
0,POINT (-92.577102 40.196419)
1,POINT (-91.487574 39.320619)
2,POINT (-93.75334099999999 36.748141)
3,POINT (-92.38100999999999 38.93771)
4,POINT (-94.825532 39.780538)


__New University Equal Area CRS__

In [21]:
uni_gdf.head()

Unnamed: 0,geometry
0,POINT (-86.568502 34.783368)
1,POINT (-86.80916999999999 33.50223)
2,POINT (-86.17401 32.362609)
3,POINT (-86.63842 34.722818)
4,POINT (-86.295677 32.364317)


__Finding out which of the census tracts are 50 miles (80467.2 metres) away from each other__

In [25]:
from tqdm import tqdm
import json

# miles_50_in_m = 80467.2
miles_50_in_m = 1

# Dictionary of Key: GeoID
# Value: List(GeoID)
buffered_tract_per_tract = defaultdict(list)

for idx, (geoid, tract_pt) in enumerate(tract_centroids):
    
    # Buffered circle around the census tract 
    # denoting a 50 mile radius polygon
    buffered_centroid = tract_gdf.iloc[idx, 0].buffer(miles_50_in_m)
    
    # get iterable (generator) of indices whose circle overlaps with each census tract centroid
    for overlapping_ind in tract_index.intersection(buffered_centroid.bounds):
        
        # check if the polygon is not the census tract itself and 
        # is actually INTERSECTS the buffered centroid (somewhat expensive)
        if tract_centroids[overlapping_ind][0] != geoid and tract_centroids[overlapping_ind][1].intersects(buffered_centroid):
            
            # add the current geoID of the overlapping census tract into list of census tracts within 50 miles
            buffered_tract_per_tract[geoid].append(tract_centroids[overlapping_ind][0])
            


1893it [00:38, 60.94it/s][A

In [26]:
# Let's take a peak
for x in buffered_tract_per_tract.items():
    print(x)
    break

('29001950900', ['29041470200', '29041470300', '29041470100', '29115490500', '29115490200', '29115490300', '29115490100', '29115490400', '29079960100', '29211480200', '29211480100', '29211480300', '29171960100', '29171960200', '29137960100', '29137960200', '29137960300', '29205450300', '29205450200', '29205450100', '29111970400', '29111970300', '29111970100', '29111970200', '29103960100', '29103960200', '29045950300', '29199480200', '29045950200', '29199480100', '29045950100', '29001950300', '29001951000', '29001950400', '29001950500', '29121960200', '29001950200', '29175490200', '29197470200', '29175490400', '29089960100', '29175490500', '29175490300', '29121960400', '29121960500', '29197470100', '29001950100', '29175490600', '29175490100', '29121960300', '29121960100', '29137960300'])


In [27]:
# Store centroid 50 mile dict to file
with open('./datasets/ct_50_miles.json', 'w') as fp:
    json.dump(buffered_tract_per_tract, fp)

__Finding out which of the census tracts are education deserts or not__

In [28]:
# Dictionary of Keys: geoId
# Value: Number of universities accessible
tract_dict = {geoid: 0 for geoid, _ in tract_centroids}

for idx, (uniid, uni_pt) in tqdm(enumerate(uni_list)):
    
    # Buffered Circle around the university
    # denoting a 50 mile radius polygon
    buffered_uni = uni_gdf.iloc[idx, 0].buffer(miles_50_in_m)
    
    # get iterable (generator) of indices whose circle overlaps with each census tract centroid
    for overlapping_ind in tract_index.intersection(buffered_uni.bounds):
        
        # check if the polygon is not the census tract itself and 
        # is actually INTERSECTS the buffered centroid (somewhat expensive)
        if tract_centroids[overlapping_ind][1].intersects(buffered_uni):
            
            tract_dict[tract_centroids[overlapping_ind][0]] += 1



0it [00:00, ?it/s][A[A

54it [00:00, 529.68it/s][A[A

70it [00:00, 296.33it/s][A[A

86it [00:00, 170.05it/s][A[A

100it [00:00, 145.89it/s][A[A

113it [00:00, 121.25it/s][A[A

125it [00:00, 106.41it/s][A[A

137it [00:00, 105.65it/s][A[A

151it [00:01, 113.69it/s][A[A

168it [00:01, 121.81it/s][A[A

181it [00:01, 107.22it/s][A[A

193it [00:01, 104.84it/s][A[A

216it [00:01, 124.97it/s][A[A

236it [00:01, 140.36it/s][A[A

261it [00:01, 160.85it/s][A[A

287it [00:01, 179.30it/s][A[A

308it [00:02, 116.68it/s][A[A

325it [00:02, 89.14it/s] [A[A

339it [00:02, 74.10it/s][A[A

361it [00:02, 90.69it/s][A[A

378it [00:02, 102.01it/s][A[A

416it [00:03, 130.18it/s][A[A

445it [00:03, 155.47it/s][A[A

494it [00:03, 195.50it/s][A[A

525it [00:03, 161.03it/s][A[A

550it [00:03, 154.24it/s][A[A

572it [00:03, 153.34it/s][A[A

592it [00:03, 159.11it/s][A[A

612it [00:04, 167.98it/s][A[A

643it [00:04, 194.37it/s][A[A

675it [00:04, 218.21

In [29]:
# Creating Pandas Dataframe from dictionary
education_deserts = pd.DataFrame.from_dict(tract_dict, orient='index', columns=['Number of Accessible Universities'])
education_deserts['Education Desert'] = 0
education_deserts.loc[education_deserts['Number of Accessible Universities'] == 0, 'Education Desert'] = 1
education_deserts.head()

Unnamed: 0,Number of Accessible Universities,Education Desert
29001950900,1,0
29007950100,8,0
29009960100,12,0
29019001201,9,0
29021000600,25,0


In [30]:
# Write to csv file
education_deserts.to_csv(r'./datasets/education_deserts.csv')