In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import re
# Lift pandas' column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

In [None]:
# Load the parcel table and keep only the parcels in the study-area neighbourhoods.
parcels = pd.read_csv('grav-jeff-parcels.csv')
nhd_num_list = [16, 17, 19, 30]
mask = parcels['NBRHD'].isin(nhd_num_list)

# .copy() gives parcels_df its own data: assigning a column to a bare slice of
# `parcels` triggers SettingWithCopyWarning and may not write where intended.
parcels_df = parcels[mask].copy()

# Collapse runs of whitespace to a single space in the site addresses.
parcels_df['SITEADDR'] = parcels_df['SITEADDR'].replace(r'\s+', ' ', regex=True)

# TODO: set to the path of the vacancy CSV to process; the empty string is a placeholder.
csv = ''

In [None]:
# Running summary tables: each calc_vacancy_cats / calc_burden_cats call appends
# one row to the frame it is given. Three scopes are tracked separately: the whole
# Gravois-Jefferson area (gj), LRA parcels only (lra), and everything else (non_lra).
vacancy_columns = ['Date', 'V_Indeterminate', 'V_Possible', 'V_Very_likely', 'V_Definite']
burden_columns = ['Date', 'B_Zero', 'B_Low', 'B_Medium', 'B_High']

gj_vacancy_cat_df, lra_vacancy_cat_df, non_lra_vacancy_cat_df = (
    pd.DataFrame(columns=vacancy_columns) for _ in range(3)
)
gj_burden_cat_df, lra_burden_cat_df, non_lra_burden_cat_df = (
    pd.DataFrame(columns=burden_columns) for _ in range(3)
)

In [None]:
class VacancyDataTransformer():
    """Turn one vacancy CSV snapshot into vacancy/burden category counts.

    Attributes:
        csv: Optional default path used by ``load_raw_df``.
        date: Label written into the ``Date`` column of every row the
            ``calc_*`` methods produce. It is never derived automatically;
            pass it to the constructor or assign ``instance.date`` yourself.
        raw_df: The frame most recently read by ``load_raw_df`` (else None).
    """

    # Labels as they appear in the 'VacancyCat' column ('Indeterminant' is the
    # spelling the counting code looks up, so it is kept as-is).
    VACANCY_CATS = ['Indeterminant', 'Possible', 'Very Likely', 'Definite']

    # Output column -> the 'BurdenCat' labels that are summed into it.
    BURDEN_BUCKETS = {
        'B_Zero': ['Zero'],
        'B_Low': ['Minimal', 'Very Low', 'Low'],
        'B_Medium': ['Medium Low', 'Medium', 'Medium High', 'Somewhat High'],
        'B_High': ['High', 'Very High', 'Extremely High'],
    }

    def __init__(self, csv=None, date=None):
        """Store the optional default CSV path and the date label.

        Both arguments are optional so ``VacancyDataTransformer()`` keeps
        working for callers that set attributes afterwards.
        """
        self.csv = csv
        self.date = date
        self.raw_df = None

    def load_raw_df(self, csv=None):
        """Read the raw vacancy CSV, remember it on ``self.raw_df`` and return it.

        Args:
            csv: Path to read. Falls back to the path given to the constructor.

        Raises:
            ValueError: If no path was given here or to the constructor.
        """
        path = self.csv if csv is None else csv
        if path is None:
            raise ValueError('load_raw_df: no csv path given (pass one here or to the constructor)')
        self.raw_df = pd.read_csv(path)
        return self.raw_df

    def create_regional_df(self, raw_df, parcel_df):
        """Restrict ``raw_df`` to the study area.

        Rows in 'Gravois Park' and 'Benton Park West' are kept outright. Rows
        in 'Dutchtown' and 'Mount Pleasant' are kept only when their 'Handle'
        appears in ``parcel_df['HANDLE']`` or their derived 'SITEADDR' appears
        in ``parcel_df['SITEADDR']``.

        Args:
            raw_df: Frame with at least 'NhdName' and 'Handle' columns.
            parcel_df: Parcel lookup with 'HANDLE' and 'SITEADDR' columns.

        Returns:
            A new frame with a fresh RangeIndex. Rows from the partially
            included neighbourhoods carry an extra 'SITEADDR' column; for the
            other rows that column is NaN after concatenation.
        """
        full_nhds = ['Gravois Park', 'Benton Park West']
        partial_nhds = ['Dutchtown', 'Mount Pleasant']

        bpw_gp_df = raw_df[raw_df['NhdName'].isin(full_nhds)]

        # Copy because a column is added below; writing into a slice of the
        # caller's frame would raise SettingWithCopyWarning.
        partial_df = raw_df[raw_df['NhdName'].isin(partial_nhds)].copy()

        if partial_df.empty:
            # Row-wise apply on an empty frame does not reliably return a
            # Series, so create the (empty) address column directly.
            partial_df['SITEADDR'] = pd.Series(index=partial_df.index, dtype=object)
        else:
            # NOTE(review): combine_columns is not defined in the visible part
            # of this notebook; presumably it assembles a site address string
            # from address columns of the row - confirm where it comes from.
            partial_df['SITEADDR'] = partial_df.apply(combine_columns, axis=1)

        # Missing lookup values are dropped so a missing handle/address can
        # never count as a match.
        handle_mask = partial_df['Handle'].isin(parcel_df['HANDLE'].dropna())
        addr_mask = partial_df['SITEADDR'].isin(parcel_df['SITEADDR'].dropna())

        # A row qualifies through either key; exact duplicate rows are removed.
        dt_mp_df = partial_df[handle_mask | addr_mask].drop_duplicates(ignore_index=True)

        regional_df = pd.concat([bpw_gp_df, dt_mp_df], ignore_index=True)

        return regional_df

    def calc_vacancy_cats(self, df, aggregate_df):
        """Append one row of 'VacancyCat' counts for ``df`` to ``aggregate_df``.

        The row is ``[self.date, Indeterminant, Possible, Very Likely,
        Definite]``; labels absent from ``df`` count as 0. ``aggregate_df`` is
        modified in place and nothing is returned.
        """
        counts = df['VacancyCat'].value_counts().to_dict()
        row = [self.date] + [counts.get(label, 0) for label in self.VACANCY_CATS]

        aggregate_df.loc[len(aggregate_df.index)] = row

    def calc_burden_cats(self, df, aggregate_df):
        """Append one row of bucketed 'BurdenCat' counts for ``df`` to ``aggregate_df``.

        The row is ``[self.date, zero, low, medium, high]`` where each bucket
        sums the labels listed in ``BURDEN_BUCKETS``; absent labels count as 0.
        ``aggregate_df`` is modified in place and nothing is returned.
        """
        counts = df['BurdenCat'].value_counts().to_dict()
        bucket_totals = [
            sum(counts.get(label, 0) for label in labels)
            for labels in self.BURDEN_BUCKETS.values()
        ]

        aggregate_df.loc[len(aggregate_df.index)] = [self.date] + bucket_totals

    @staticmethod
    def _count_table(df, group_col, value_col, categories):
        """Count rows per (group_col, value_col) pair as a wide table, one column per category."""
        pair_counts = df.groupby([group_col, value_col]).size()
        if pair_counts.empty:
            # Nothing to pivot: start from an empty table with the right columns.
            wide = pd.DataFrame(columns=categories, dtype='int64')
        else:
            wide = pair_counts.unstack(fill_value=0)
        # Fixed column set and order; categories never observed become 0.
        wide = wide.reindex(columns=categories, fill_value=0)
        wide.columns.name = None
        return wide

    def calc_groupby_counts(self, df, cat_list, region):
        """Count vacancy and burden categories per value of a grouping column.

        Args:
            df: Frame with 'VacancyCat', 'BurdenCat' and every column named in
                ``cat_list``.
            cat_list: Grouping column names. Every entry is processed, but only
                the tables for the LAST entry are returned.
            region: Currently unused; it only fed the CSV export that used to
                live in this method.

        Returns:
            ``(vac_df, bur_df)``. ``vac_df`` has columns Date, Type,
            V_Indeterminant, V_Possible, V_Very_Likely, V_Definite. ``bur_df``
            has columns Date, Type, B_Zero, B_Low, B_Medium, B_High. 'Type'
            holds the grouping values. Both are empty (columns only) when
            ``df`` has no countable rows.

        Raises:
            ValueError: If ``cat_list`` is empty.
        """
        # NOTE(review): 'V_Indeterminant' / 'V_Very_Likely' here differ from the
        # 'V_Indeterminate' / 'V_Very_likely' columns of the notebook's aggregate
        # frames. Left unchanged because renaming would alter the output schema.
        if len(cat_list) == 0:
            raise ValueError('calc_groupby_counts: cat_list must contain at least one column name')

        vacancy_renames = {
            'Indeterminant': 'V_Indeterminant',
            'Possible': 'V_Possible',
            'Very Likely': 'V_Very_Likely',
            'Definite': 'V_Definite',
        }
        burden_labels = [label for labels in self.BURDEN_BUCKETS.values() for label in labels]

        for cat in cat_list:
            # VacancyCat: one column per label, one row per grouping value.
            vac_table = self._count_table(df, cat, 'VacancyCat', self.VACANCY_CATS)
            vac_df = vac_table.rename(columns=vacancy_renames).reset_index(names='Type')
            vac_df.insert(loc=0, column='Date', value=self.date)

            # BurdenCat: count every label, then collapse them into four buckets.
            bur_table = self._count_table(df, cat, 'BurdenCat', burden_labels)
            bur_df = pd.DataFrame({
                bucket: bur_table[labels].sum(axis=1)
                for bucket, labels in self.BURDEN_BUCKETS.items()
            }).reset_index(names='Type')
            bur_df.insert(loc=0, column='Date', value=self.date)

        return vac_df, bur_df

In [None]:
# Date label stamped onto every summary row (the calc_* methods read `vt.date`).
# TODO: set this to the date of the snapshot that `csv` points at.
snapshot_date = None

# The class defined above is VacancyDataTransformer; the CSV path goes to
# load_raw_df, which is the method that actually reads the file.
vt = VacancyDataTransformer()
vt.date = snapshot_date
raw = vt.load_raw_df(csv)

# Split the raw rows on the IsLRA flag; anything that is not True counts as non-LRA.
is_lra = raw['IsLRA'].isin([True])
lra = raw.loc[is_lra]
non_lra = raw.loc[~is_lra]

# calc the grav-jeff totals
reg_df = vt.create_regional_df(raw_df=raw, parcel_df=parcels_df)
vt.calc_vacancy_cats(df=reg_df, aggregate_df=gj_vacancy_cat_df)
vt.calc_burden_cats(df=reg_df, aggregate_df=gj_burden_cat_df)
reg_vac, reg_bur = vt.calc_groupby_counts(df=reg_df, cat_list=['Type'], region='gravois-jefferson')

# calc the lra totals
lra_df = vt.create_regional_df(raw_df=lra, parcel_df=parcels_df)
vt.calc_vacancy_cats(df=lra_df, aggregate_df=lra_vacancy_cat_df)
vt.calc_burden_cats(df=lra_df, aggregate_df=lra_burden_cat_df)
lra_vac, lra_bur = vt.calc_groupby_counts(df=lra_df, cat_list=['Type'], region='gj_lra')

# calc the non-lra totals
non_lra_df = vt.create_regional_df(raw_df=non_lra, parcel_df=parcels_df)
vt.calc_vacancy_cats(df=non_lra_df, aggregate_df=non_lra_vacancy_cat_df)
vt.calc_burden_cats(df=non_lra_df, aggregate_df=non_lra_burden_cat_df)
non_lra_vac, non_lra_bur = vt.calc_groupby_counts(df=non_lra_df, cat_list=['Type'], region='gj_non-lra')

In [None]:
# gj_burden_cat_df.to_csv('data/temp/gravois-jefferson_burden.csv')
# gj_vacancy_cat_df.to_csv('data/temp/gravois-jefferson_vacancy.csv')

# lra_burden_cat_df.to_csv('data/temp/gj_lra_burden.csv')
# lra_vacancy_cat_df.to_csv('data/temp/gj_lra_vacancy.csv')

# non_lra_burden_cat_df.to_csv('data/temp/gj_non-lra_burden.csv')
# non_lra_vacancy_cat_df.to_csv('data/temp/gj_non-lra_vacancy.csv')

In [None]:
'09A': non-negligent manslaughter and murder
'11A': rape
'13A', '13A*': aggravated assault