In [32]:
import pandas as pd
import geopandas as gpd
import numpy as np
import re

In [33]:
# read in csv file
all_cases = gpd.read_file('/Users/leahwallihan/Durham_school_planning/Res_Development')

In [34]:
# types = A_TYPE codes we want to keep
types = ['PL_MINSP', 'PL_SSP_SM', 'PL_SSM_SM2', 'PL_CPAA', 'PL_MINPP', 'PL_MAJSP', 'PL_MAJSUP', 'PL_PPA', 'PL_MAJPP'] 
# filter that only keeps cases of specified types
filter_cases_type = all_cases[all_cases['A_TYPE'].isin(types)]
# status = A_STATUS codes we want to keep
status = all_cases['A_STATUS'].unique()
status = status[~np.isin(status, ['WITH', 'VOID','DEN','DISAP','EXP'])]
# filter that only keeps cases of specified status
filter_cases_status = filter_cases_type[filter_cases_type['A_STATUS'].isin(status)]

In [35]:
keywords = ['home', 'family', 'residen', 'mixed', 'mized', 'duplex', 'apartment', ' housing', 'condo', 'dwelling', 'tenant', 'affordable', 'units', 'townhouse']
pattern = '|'.join(keywords)
filtered1 = filter_cases_status[filter_cases_status['A_DESCRIPT'].str.contains(pattern, case=False, na=False)]

#use not pattern to avoid these words
keywords_avoid = ['addition','expand','storage', 'zoning', 'grading']
pattern_avoid = '|'.join(keywords_avoid)
filtered2 = filtered1[~filtered1['A_DESCRIPT'].str.contains(pattern_avoid, case=False, na=False)]

In [36]:
#filter for all status dates after 2020
filtered2['A_STATUS_D'] = pd.to_datetime(filtered2['A_STATUS_D'])
filtered3 = filtered2[filtered2['A_STATUS_D'].dt.year>=2020]
# filtered3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [37]:
# function to extract unit quantity from description
def extract_units(description):
    description = re.sub(pattern=r'(\d+|\d{1,3}(,\d{3})*)(\s+[A-Za-z-]+){0,2}?\s*(SF|square feet|sq. ft.|sqft)', 
                         repl='', string=description, flags=re.IGNORECASE) # remove square footage descriptors

    match_pattern = r'(\d{1,3})(?:\s+[A-Za-z-]+){0,3}?\s+(units?|lots?|homes?|houses?|townhomes?|townhouses?|town homes?|town houses?|apartments?|duplex(?:es)?)'
    matches = re.findall(match_pattern, description, flags=re.IGNORECASE) # match residence types with their quantity

    return matches # returns a list of tuples in format (quantity, residence type)

In [56]:
filtered3['match_results'] = filtered3['A_DESCRIPT'].apply(extract_units)
filtered3 = filtered3.to_crs('EPSG:4326')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [58]:
high_school_boundaries = gpd.read_file('/Users/leahwallihan/Durham_school_planning/HS_regions')
high_school_boundaries = high_school_boundaries.to_crs('EPSG:4326')

In [66]:
filtered3_with_school = filtered3.copy()

for i,geometry in enumerate(high_school_boundaries['geometry']):
    print(high_school_boundaries.loc[i, 'ShortName'])
    
    in_geometry = geometry.contains(filtered3['geometry'])

    high_school_name = high_school_boundaries.loc[i, 'ShortName']

    filtered3_with_school.loc[in_geometry, 'high_school'] = high_school_name
    

Riverside
Southern School of Energy and Sustainability
Northern
Jordan
Hillside


In [68]:
filtered3_with_school.head()

Unnamed: 0,A_NUMBER,A_TYPE,A_DATE,A_STATUS,A_STATUS_D,A_PROJECT_,A_DESCRIPT,A_USER_ID,A_CASE_PLA,StatCode,...,CasePlanne,EMAIL,ORIG_FID,CreationDa,Creator,EditDate,Editor,geometry,match_results,high_school
90,D2000291,PL_MINSP,2020-12-02,APP,2021-09-03,Umstead Grove Conservation Subdivision,"50 Single - family lots, 1 stormwater pond, ad...",JESSICADO,COURTNEYMC,APP,...,Courtney McQueen,Courtney.McQueen@durhamnc.gov,91,2025-05-21,gisproc_sys,2025-05-21,gisproc_sys,POINT (-78.94561 36.07501),"[(50, lots)]",Riverside
224,D1800378,PL_MINSP,2018-11-28,APP,2020-04-14,Elan Innovation District,Mixe of uses including retail and residential....,JOHNRA,TREYFI,APP,...,Trey Figueroa,Trey.Figueroa@durhamnc.gov,225,2025-05-21,gisproc_sys,2025-05-21,gisproc_sys,POINT (-78.90421 35.99991),[],Riverside
242,D2100156,PL_MINSP,2021-06-04,APP,2022-02-02,ALTA Rutherford,Multifamily Apartments,KIMRO,TREYFI,APP,...,Trey Figueroa,Trey.Figueroa@durhamnc.gov,243,2025-05-21,gisproc_sys,2025-05-21,gisproc_sys,POINT (-78.92991 36.01005),[],Riverside
265,D1900171,PL_MINSP,2019-05-30,APP,2020-04-03,Ellis Road Phase 3,"37 Townhome units, 102 Attached S-F units (cal...",ROBINSH,COLERE,APP,...,Cole Renigar,Cole.Renigar@durhamnc.gov,266,2025-05-21,gisproc_sys,2025-05-21,gisproc_sys,POINT (-78.86167 35.9519),"[(37, Townhome), (102, units), (146, lots)]",Hillside
290,A1900003,PL_CPAA,2019-02-14,COM,2020-08-17,Cole Property,"From Rural Density Residential (RDR, 0.5DU/acr...",DCULTRA,EMILYST,COM,...,Emily Struthers,Emily.Struthers@durhamnc.gov,291,2025-05-21,gisproc_sys,2025-05-21,gisproc_sys,POINT (-78.77719 35.98944),[],Southern School of Energy and Sustainability
