In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import re

In [None]:
# Load the Res_Development layer into a GeoDataFrame
res_development_path = r'/Users/leahwallihan/Durham_school_planning/geospatial files/Res_Development'
all_cases = gpd.read_file(res_development_path)

In [None]:
# A_TYPE codes of the cases to keep
types = ['PL_MINSP', 'PL_SSP_SM', 'PL_SSM_SM2', 'PL_CPAA', 'PL_MINPP', 'PL_MAJSP', 'PL_MAJSUP', 'PL_PPA', 'PL_MAJPP'] 
has_wanted_type = all_cases['A_TYPE'].isin(types)
filter_cases_type = all_cases.loc[has_wanted_type]

# A_STATUS codes to keep: every status present in the data except the excluded ones
excluded_status = ['WITH', 'VOID','DEN','DISAP','EXP']
observed_status = all_cases['A_STATUS'].unique()
status = observed_status[~np.isin(observed_status, excluded_status)]

# Of the type-filtered cases, keep only those with a retained status
has_wanted_status = filter_cases_type['A_STATUS'].isin(status)
filter_cases_status = filter_cases_type.loc[has_wanted_status]

In [None]:
# Keep cases whose description contains any of these substrings (case-insensitive).
# Cases with a missing description are dropped (na=False).
keywords = ['home', 'family', 'residen', 'mixed', 'mized', 'duplex', 'apartment', ' housing', 'condo', 'dwelling', 'tenant', 'affordable', 'units', 'townhouse']
pattern = '|'.join(keywords)
mentions_keyword = filter_cases_status['A_DESCRIPT'].str.contains(pattern, case=False, na=False)
filtered1 = filter_cases_status.loc[mentions_keyword]

# Then discard cases whose description contains any of the substrings to avoid
keywords_avoid = ['expand','storage']
pattern_avoid = '|'.join(keywords_avoid)
mentions_avoided = filtered1['A_DESCRIPT'].str.contains(pattern_avoid, case=False, na=False)
filtered2 = filtered1.loc[~mentions_avoided]

In [None]:
# Keep only cases whose status date falls in 2020 or later.
# filtered2 is a boolean-mask slice of filtered1; assigning a column to such a
# slice triggers pandas' SettingWithCopyWarning, so work on an explicit copy.
filtered2 = filtered2.copy()
filtered2['A_STATUS_D'] = pd.to_datetime(filtered2['A_STATUS_D'])
# Copy the result as well so columns can be added to filtered3 without the same warning.
filtered3 = filtered2[filtered2['A_STATUS_D'].dt.year >= 2020].copy()

In [None]:
def normalize_for_regex(term):
    """Turn a housing term into a regex fragment tolerant of spacing and hyphens.

    Every run of hyphens and/or whitespace in ``term`` is replaced by a
    sub-pattern that matches optional whitespace, an optional hyphen, and
    optional whitespace. All other characters are returned unchanged (they are
    not regex-escaped).
    """
    flexible_gap = r'\s*-?\s*'
    pieces = re.split(r'[-\s]+', term)
    return flexible_gap.join(pieces)

def extract_units(description):
    """Extract housing-unit counts from a free-text case description.

    Square-footage figures are stripped first so that their numbers are not
    mistaken for unit counts. The remaining text is scanned for a quantity
    followed (optionally after "attached"/"detached" and up to two other
    words) by a housing term, optionally followed by a suffix word.

    Parameters
    ----------
    description : str
        Free-text description. Any non-string value (e.g. None or NaN for a
        missing description) yields an empty list.

    Returns
    -------
    list of tuple
        One ``(quantity, modifier, housing_type, suffix)`` tuple per match:
        ``quantity`` is an int, ``modifier`` is "attached"/"detached" or None,
        ``housing_type`` is the normalized term from ``term_map``, and
        ``suffix`` is one of "units"/"lots"/"homes"/"houses" or None.
    """
    # A missing description carries no unit counts; re.sub would raise
    # TypeError on None/NaN, so return the empty result explicitly.
    if not isinstance(description, str):
        return []

    # Remove square footage
    description = re.sub(
        r'(\d+|\d{1,3}(,\d{3})*)(\s+[A-Za-z-]+){0,2}?\s*(SF|square feet|sq\.?\s*ft\.?|sqft)',
        '', description, flags=re.IGNORECASE
    )

    # Housing normalization: every spelling variant (plural, hyphenated,
    # abbreviated) maps to one canonical term, e.g. houses -> home,
    # s-f -> single family.
    term_map = {
        "home": "home", "homes": "home", "house": "home", "houses": "home",
        "duplex": "duplex", "duplexes": "duplex",
        "condo": "condo", "condominium": "condo", "condominiums": "condo", "condos": "condo", 
        "apartment": "apartment", "apartments": "apartment",
        "townhome": "townhouse", "townhomes": "townhouse",
        "townhouse": "townhouse", "townhouses": "townhouse",
        "town home": "townhouse", "town homes": "townhouse",
        "town house": "townhouse", "town houses": "townhouse",
        "multifamily": "multifamily", "multi-family": "multifamily", "multi - family": "multifamily", "multi family": "multifamily",
        "single family": "single family", "single-family": "single family", 
        "single - family": "single family", "s-f": "single family", "s - f": "single family", "s f": "single family"
    }

    # Optional leading and trailing terms
    modifiers = ["attached", "detached"]
    suffixes = ["units", "lots", "homes", "houses"]

    # Build regex patterns
    housing_pattern = "|".join([normalize_for_regex(term) for term in term_map])
    modifier_pattern = "|".join(modifiers)
    suffix_pattern = "|".join(suffixes)

    # The quantity is either comma-grouped thousands ("1,200") or 1-4 plain
    # digits. The lookbehind rejects a start position that is the tail of a
    # larger number ("1,200" -> "200", "2.5" -> "5"); a bare word boundary
    # accepts those positions because "," and "." are non-word characters.
    match_pattern = rf'''
        \b
        (?<!\d[,.])
        (?P<qty>\d{{1,3}}(?:,\d{{3}})+|\d{{1,4}})
        \s+
        (?:(?P<mod>{modifier_pattern})\s+)?
        (?:[A-Za-z-]+\s+){{0,2}}?
        (?P<type>{housing_pattern})
        (?:\s+(?P<suffix>{suffix_pattern}))?
        \b
    '''

    matches = re.finditer(match_pattern, description, flags=re.IGNORECASE | re.VERBOSE)

    result = []
    for match in matches:
        qty = match.group("qty")
        raw_type = match.group("type")
        raw_mod = match.group("mod")
        raw_suffix = match.group("suffix")

        # Normalize type: collapse hyphen/whitespace runs to one space so the
        # matched text lines up with a term_map key (a variant with no map
        # entry falls back to the collapsed text itself).
        norm_key = re.sub(r'[-\s]+', ' ', raw_type.lower()).strip()
        normalized_type = term_map.get(norm_key, norm_key)

        # Build output tuple (thousands separators removed before int())
        result.append((
            int(qty.replace(',', '')),
            raw_mod.lower() if raw_mod else None,
            normalized_type,
            raw_suffix.lower() if raw_suffix else None
        ))

    return result

In [None]:
# Parse each description into unit tuples and reproject the cases to EPSG:4326.
# assign() returns a new frame rather than writing a column onto filtered3,
# which is a slice of filtered2 (in-place assignment there triggers
# SettingWithCopyWarning), and it keeps this cell safe to re-run.
filtered3 = (
    filtered3
    .assign(match_results=filtered3['A_DESCRIPT'].apply(extract_units))
    .to_crs('EPSG:4326')
)

In [None]:
# Names of the per-case unit-count columns, one per housing category
housing_types = ['sf_detached', 'sf_attached', 'duplex/triplex', 'multifamily', 'condo']

# Normalized housing term -> the count column it contributes to
housing_type_dict = {
    'townhouse': 'sf_attached',
    'home': 'sf_detached',
    'single family': 'sf_detached',
    'duplex': 'duplex/triplex',
    'apartment': 'multifamily',
    'multifamily': 'multifamily',
    'condo': 'condo',
}

# Start every count column at zero
for h_type in housing_types:
    filtered3[h_type] = 0
      
    
# Sum the extracted unit counts of one case into the housing-type columns
def fill_types(match_results):
    """Total the unit quantities of one case per housing-type column.

    Parameters
    ----------
    match_results : iterable of tuple
        Tuples whose first item is the quantity, second (if present) the
        modifier and third (if present) the normalized housing term.

    Returns
    -------
    pandas.Series
        One entry per name in ``housing_types`` holding the summed quantity.
        Housing terms without an entry in ``housing_type_dict`` are ignored.
    """
    totals = dict.fromkeys(housing_types, 0)

    for entry in match_results:
        count = int(entry[0])
        modifier = entry[1] if len(entry) > 1 else None
        kind = entry[2] if len(entry) > 2 else None

        # "attached single family" is the only case where the modifier
        # overrides the term's default column.
        if kind == 'single family' and modifier == 'attached':
            column = 'sf_attached'
        else:
            column = housing_type_dict.get(kind)
            if column is None:
                continue

        totals[column] += count

    return pd.Series(totals)
            

In [None]:
# Tally units per housing category for every case and store them in the count columns
unit_counts = filtered3['match_results'].apply(fill_types)
filtered3[housing_types] = unit_counts
filtered3.head()

In [None]:
# Load the HS_regions layer and reproject it to EPSG:4326
hs_regions_path = r'/Users/leahwallihan/Durham_school_planning/geospatial files/HS_regions'
high_school_boundaries = gpd.read_file(hs_regions_path).to_crs('EPSG:4326')

In [None]:
# Tag each case with the 'region' value of the boundary polygon that contains it.
filtered3_with_school = filtered3.copy()

# Create the column up front so it exists even when the loop assigns nothing
# (object dtype so that string names can be written into it).
if 'region' not in filtered3_with_school.columns:
    filtered3_with_school['region'] = pd.Series(
        np.nan, index=filtered3_with_school.index, dtype=object
    )

# Pair each polygon with its own region value directly instead of using the
# enumerate position as a .loc label, which is only correct when the
# boundaries index happens to be 0..n-1.
for boundary, high_school_name in zip(
    high_school_boundaries['geometry'], high_school_boundaries['region']
):
    # GeoSeries.within(boundary) is the element-wise form of
    # boundary.contains(case geometry) and returns a boolean Series aligned
    # to the cases' index.
    in_boundary = filtered3['geometry'].within(boundary)
    filtered3_with_school.loc[in_boundary, 'region'] = high_school_name
    

In [None]:
# filtered3_with_school.head(30)

In [None]:
# filtered3_with_school.to_file('filtered3_with_school.json')

In [None]:
# filtered3_with_school.to_file("resdev_cases.geojson", driver="GeoJSON")