In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import re
pd.set_option('display.max_columns', None)

In [2]:
# geostl nghbd codes:
    # gravois park = 19
    # benton park west = 30
    # dutchtown = 16
    # mount pleasant = 17
# https://dynamic.stlouis-mo.gov/citydata/newdesign/sqlsearch.cfm
# gravois_total_parcels = 1733
# benton_park_west_total_parcels = 1693
# dutchtown_total_parcels = 4850

In [3]:
csv_folder = 'stl_vacancy_data/'
path = "stl_vacancy_data/*.csv"
csv_list = []

gj_vacancy_cat_df = pd.DataFrame(columns=['Date', 'V_Indeterminate', 'V_Possible', 'V_Very_likely', 'V_Definite'])
gj_burden_cat_df = pd.DataFrame(columns=['Date', 'B_Zero', 'B_Low', 'B_Medium', 'B_High'])

for fname in glob.glob(path):
    csv_name = re.findall(r'stl_vacancy_data_\d\d\d\d-\d\d-\d\d.csv', fname)[0]
    csv_list.append(csv_name)

In [4]:
parcels = pd.read_csv('grav-jeff-parcels.csv')
nhd_num_list = [16, 17, 19, 30]
mask = parcels['NBRHD'].isin(nhd_num_list)
parcels_df = parcels[mask]

parcels_df['SITEADDR'] = parcels_df['SITEADDR'].replace(r'\s+', ' ', regex=True)

  parcels = pd.read_csv('grav-jeff-parcels.csv')


In [5]:
gj_vacancy_cat_df = pd.DataFrame(columns=['Date', 'V_Indeterminate', 'V_Possible', 'V_Very_likely', 'V_Definite'])
gj_burden_cat_df = pd.DataFrame(columns=['Date', 'B_Zero', 'B_Low', 'B_Medium', 'B_High'])

In [6]:
def combine_columns(row):
    return str(row['StAddrNum']) + ' ' + row['StNameFull']

In [7]:
class VacancyTransformer():
    def __init__(self, csv_name):
        self.date = re.findall(r'\d\d\d\d-\d\d-\d\d', csv_name)[0]
        self.csv_name = csv_name
        
    def load_raw_df(self):
        self.raw_df = pd.read_csv(csv_folder+self.csv_name)
        return self.raw_df

    def create_regional_df(self, parcel_df):

        # nhd_names = ['Gravois Park', 'Benton Park West', 'Dutchtown', 'Mount Pleasant']
        # mask = self.raw_df['NhdName'].isin(nhd_names)
        # regional_df = raw_df[mask]

        full_nhds = ['Gravois Park', 'Benton Park West']
        partial_nhds = ['Dutchtown', 'Mount Pleasant']

        full_mask = self.raw_df['NhdName'].isin(full_nhds)
        bpw_gp_df = self.raw_df[full_mask]

        partial_mask = self.raw_df['NhdName'].isin(partial_nhds)
        partial_df = self.raw_df[partial_mask]

        matching_parcels = []
        for handle in list(partial_df['Handle']):
            if handle in list(parcels_df['HANDLE']):
                matching_parcels.append(handle)

        handle_mask = partial_df['NhdName'].isin(matching_parcels)
        handle_match_df = partial_df[handle_mask]

        partial_df['SITEADDR'] = partial_df.apply(combine_columns, axis=1)
        
        matching_parcels = []
        for site_addr in list(partial_df['SITEADDR']):
            if site_addr in list(parcels_df['SITEADDR']):
                matching_parcels.append(site_addr)

        addr_mask = partial_df['SITEADDR'].isin(matching_parcels)
        addr_match_df = partial_df[addr_mask]

        joint_df = pd.concat([handle_match_df, addr_match_df], ignore_index=True)
        dt_mp_df = joint_df.drop_duplicates(ignore_index=True)

        regional_df = pd.concat([bpw_gp_df, dt_mp_df], ignore_index=True)
        
        # gj_addr = list(regional_df['SITEADDR'])
        # site_mask = parcels['SITEADDR'].isin(gj_addr)
        # site_handles = parcels['HANDLE'][site_mask]

        # for handle in list(regional_df['Handle']):
        #     if handle in list(parcels_df['HANDLE']):
        #         matching_parcels.append(handle)
        #     elif 

        return regional_df
        
    def calc_vacancy_cats(self, df, aggregate_df):
        counts = df['VacancyCat'].value_counts()
        data = [self.date] + counts[['Indeterminant', 'Possible', 'Very Likely', 'Definite']].tolist()
        
        aggregate_df.loc[len(aggregate_df.index)] = data

    def calc_burden_cats(self, df, aggregate_df):
        counts = df['BurdenCat'].value_counts()
        zero_cat = counts['Zero']
        low_cat = sum(counts[['Minimal', 'Very Low', 'Low']])
        med_cat = sum(counts[['Medium Low', 'Medium', 'Medium High', 'Somewhat High']])
        high_cat = sum(counts[['High', 'Very High', 'Extremely High']])
        
        aggregate_df.loc[len(aggregate_df.index)] = [self.date, zero_cat, low_cat, med_cat, high_cat]

    def calc_groupby_counts(self, df, cat_list, region):
        
        # valid = {'stl','bpw-gp', 'dutchtown', 'gravois-jefferson'}
        # if region not in valid:
        #     raise ValueError("results: status must be one of %r." % valid)
        
        for cat in cat_list:
            
            # VacancyCat:
            vac_df = df.groupby(cat)['VacancyCat'].value_counts().to_frame().unstack()
            vac_df.columns = vac_df.columns.droplevel()
            vac_df.fillna(value=0, inplace=True)
            vac_value_name = vac_df.columns.name
            vac_index_name = vac_df.index.name
            vac_df = vac_df[['Indeterminant', 'Possible', 'Very Likely', 'Definite']]
            vac_df.rename(columns={'Indeterminant': 'V_Indeterminant',
                               'Possible': 'V_Possible',
                               'Very Likely': 'V_Very_Likely',
                               'Definite': 'V_Definite'},
                          inplace=True)
            vac_df.reset_index(inplace=True)
            vac_df.insert(loc=0, column='Date', value=self.date)
            vac_df.to_csv(f'data/temp/type_vacancy_data/{region}/{region}_{vac_index_name}_{vac_value_name}_{self.date}.csv')

            # BurdenCat
            bur_df =  df.groupby(cat)['BurdenCat'].value_counts().to_frame().unstack()
            bur_df.columns = bur_df.columns.droplevel()
            bur_df.fillna(value=0, inplace=True)
            bur_index_name = bur_df.index.name
            bur_value_name = bur_df.columns.name
            bur_df['B_Zero'] = bur_df['Zero']
            bur_df['B_Low'] = bur_df['Minimal'] + bur_df['Very Low'] + bur_df['Low']
            bur_df['B_Medium'] = bur_df['Medium Low'] + bur_df['Medium'] + bur_df['Medium High'] + bur_df['Somewhat High']
            bur_df['B_High'] = bur_df['High'] + bur_df['Very High'] + bur_df['Extremely High']
            bur_df.reset_index(inplace=True)
            bur_df.insert(loc=0, column='Date', value=self.date)
            bur_df = bur_df[['Date', 'Type', 'B_Zero', 'B_Low', 'B_Medium', 'B_High']]
            bur_df.to_csv(f'data/temp/type_burden_data/{region}/{region}_{bur_index_name}_{bur_value_name}_{self.date}.csv')

In [8]:
for csv in csv_list:

    d = VacancyTransformer(csv)
    d.load_raw_df()
    reg_df = d.create_regional_df(parcel_df=parcels_df)
    d.calc_vacancy_cats(df=reg_df, aggregate_df=gj_vacancy_cat_df)
    d.calc_burden_cats(df=reg_df, aggregate_df=gj_burden_cat_df)
    d.calc_groupby_counts(df=reg_df, cat_list=['Type'], region='gravois-jefferson')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partial_df['SITEADDR'] = partial_df.apply(combine_columns, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partial_df['SITEADDR'] = partial_df.apply(combine_columns, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partial_df['SITEADDR'] = partial_df.apply(combine_columns, axis=1)
A va

In [9]:
gj_vacancy_cat_df

Unnamed: 0,Date,V_Indeterminate,V_Possible,V_Very_likely,V_Definite
0,2021-03-01,176,54,536,319
1,2021-04-01,116,12,434,278
2,2021-04-13,116,12,654,321
3,2021-06-24,64,9,604,320
4,2021-07-17,65,10,602,320
5,2021-08-18,67,10,606,320
6,2021-09-22,70,11,611,320
7,2021-10-15,72,11,617,320
8,2021-11-24,70,11,627,319
9,2021-12-20,70,12,628,319


In [10]:
gj_burden_cat_df

Unnamed: 0,Date,B_Zero,B_Low,B_Medium,B_High
0,2021-03-01,56,373,313,343
1,2021-04-01,83,307,224,226
2,2021-04-13,195,387,240,281
3,2021-06-24,154,307,181,355
4,2021-07-17,152,308,181,356
5,2021-08-18,159,301,186,357
6,2021-09-22,165,301,188,358
7,2021-10-15,168,303,192,357
8,2021-11-24,173,301,197,356
9,2021-12-20,175,298,198,358


In [11]:
path = "data/temp/type_burden_data/gravois-jefferson/*.csv"
burden_list = []
for fname in glob.glob(path):
    df = pd.read_csv(fname, index_col=0)
    burden_list.append(df)
dutchtown_type_burden_all = pd.concat(burden_list, ignore_index=True)
dutchtown_type_burden_all.to_csv('data/gravois-jefferson_type_burden_all.csv')

In [12]:
path = "data/temp/type_vacancy_data/gravois-jefferson/*.csv"
burden_list = []
for fname in glob.glob(path):
    df = pd.read_csv(fname, index_col=0)
    burden_list.append(df)
dutchtown_type_burden_all = pd.concat(burden_list, ignore_index=True)
dutchtown_type_burden_all.to_csv('data/gravois-jefferson_type_vacancy_all.csv')

In [14]:
gj_burden_cat_df.to_csv('data/temp/gravois-jefferson_burden.csv')
gj_vacancy_cat_df.to_csv('data/temp/gravois-jefferson_vacancy.csv')

### now split the type counts by categories

In [17]:
# csv_folder = 'data/'
path = "data/temp/type_burden_data/gravois-jefferson/*.csv"
csv_list = []

for fname in glob.glob(path):
    df = pd.read_csv(fname, index_col=0)
    
    date = df['Date'][0]
    res_cats = ['Duplex', 'Multi-Unit', 'Single-Family']
    res_mask = df['Type'].isin(res_cats)
    res_df = df[res_mask]
    
    df.loc[len(df.index)] = [date, 'Residential', res_df['B_Zero'].sum(), res_df['B_Low'].sum(), res_df['B_Medium'].sum(), res_df['B_High'].sum()]

Unnamed: 0,Date,Type,B_Zero,B_Low,B_Medium,B_High
0,2021-03-01,Commercial,2.0,20.0,11.0,15.0
1,2021-03-01,Duplex,10.0,91.0,82.0,102.0
2,2021-03-01,Empty Lot,5.0,57.0,50.0,62.0
3,2021-03-01,Multi-Unit,6.0,31.0,31.0,43.0
4,2021-03-01,Other,0.0,12.0,6.0,11.0
5,2021-03-01,Single-Family,33.0,162.0,133.0,110.0


In [24]:
df = csv_list[0]

date = df['Date'][0]
res_cats = ['Duplex', 'Multi-Unit', 'Single-Family']
res_mask = df['Type'].isin(res_cats)
res_df = df[res_mask]
res_df['B_Zero'].sum()

49.0

In [23]:
res_df

Unnamed: 0,Date,Type,B_Zero,B_Low,B_Medium,B_High
1,2021-03-01,Duplex,10.0,91.0,82.0,102.0
3,2021-03-01,Multi-Unit,6.0,31.0,31.0,43.0
5,2021-03-01,Single-Family,33.0,162.0,133.0,110.0


In [26]:
date = df['Date'][0]
res_cats = ['Duplex', 'Multi-Unit', 'Single-Family']
res_mask = df['Type'].isin(res_cats)
res_df = df[res_mask]
df.loc[len(df.index)] = [date, 'Residential', res_df['B_Zero'].sum(), res_df['B_Low'].sum(), res_df['B_Medium'].sum(), res_df['B_High'].sum()]
df

Unnamed: 0,Date,Type,B_Zero,B_Low,B_Medium,B_High
0,2021-03-01,Commercial,2.0,20.0,11.0,15.0
1,2021-03-01,Duplex,10.0,91.0,82.0,102.0
2,2021-03-01,Empty Lot,5.0,57.0,50.0,62.0
3,2021-03-01,Multi-Unit,6.0,31.0,31.0,43.0
4,2021-03-01,Other,0.0,12.0,6.0,11.0
5,2021-03-01,Single-Family,33.0,162.0,133.0,110.0
6,2021-03-01,Residential,49.0,284.0,246.0,255.0
