In [90]:
import pandas as pd
import numpy as np
import io
import requests
from datetime import datetime


In [91]:
url = 'https://phl.carto.com/api/v2/sql?q=SELECT+*,+ST_Y(the_geom)+AS+lat,+ST_X(the_geom)+AS+lng+FROM+opa_properties_public&filename=opa_properties_public&format=csv&skipfields=cartodb_id,the_geom,the_geom_webmercator'
df = pd.read_csv(url)


In [93]:
### new columns
added_columns = [
    '# UNITS',
    'REIS Submarket',
    'CITY',
    'STATE',
    'RESI',
    'CONDO',
    'UNIT',
    'COMM',
    'TOT ASSD $',
    'RE TAXES',
]

### OVERWRITE instructions for the original data file
### these instructions could be manually altered  
rename_dict = {
    'number_of_rooms': {
        'delete': 0,
        'new_name': '# ROOMS',
    },
    'assessment_date': {
        'delete': 0,
        'new_name': 'assessment_date',
    },
    'beginning_point': {
        'delete': 1,
    },
    'book_and_page': {
        'delete': 1,
    },
    'building_code': {
        'delete': 1,
    },
    'building_code_description': {
        'delete': 0,
        'new_name': 'BLDG CODE',
    },
    'category_code': {
        'delete': 1,
    },
    'category_code_description': {
        'delete': 0,
        'new_name': 'BLDG CAT',
    },
    'census_tract': {
        'delete': 1,
    },
    'central_air': {
        'delete': 0,
        'new_name': 'CENTRAL AIR',
    },
    'cross_reference': {
        'delete': 1,
    },
    'date_exterior_condition': {
        'delete': 0,
        'new_name': 'EXT CONDITION DATE',
    },
    'depth': {
        'delete': 0,
        'new_name': 'LOT DEPTH',
    },
    'exempt_building': {
        'delete': 0,
        'new_name': 'BLDG EXEMPT',
    },
    'exempt_land': {
        'delete': 0,
        'new_name': 'LAND EXEMPT',
    },
    'exterior_condition': {
        'delete': 0,
        'new_name': 'EXT CONDITION',
    },
    'fireplaces': {
        'delete': 0,
        'new_name': '# FIREPLACE',
    },
    'frontage': {
        'delete': 0,
        'new_name': 'LOT FRONTAGE',
    },
    'fuel': {
        'delete': 1,
    },
    'garage_spaces': {
        'delete': 0,
        'new_name': 'GARAGE',
    },
    'garage_type': {
        'delete': 0,
        'new_name': 'GARAGE TYPE',
    },
    'general_construction': {
        'delete': 1,
    },
    'geographic_ward': {
        'delete': 1,
    },
    'homestead_exemption': {
        'delete': 0,
        'new_name': 'homestead_exemption',
    },
    'house_extension': {
        'delete': 1,
    },
    'house_number': {
        'delete': 1,
    },
    'interior_condition': {
        'delete': 0,
        'new_name': 'INT CONDITION',
    },
    'location': {
        'delete': 0,
        'new_name': 'ADDRESS',
    },
    'mailing_address_1': {
        'delete': 1,
    },
    'mailing_address_2': {
        'delete': 1,
    },
    'mailing_care_of': {
        'delete': 1,
    },
    'mailing_city_state': {
        'delete': 0,
        'new_name': 'OWNER CITY',
    },
    'mailing_street': {
        'delete': 0,
        'new_name': 'OWNER ADDRESS',
    },
    'mailing_zip': {
        'delete': 0,
        'new_name': 'OWNER ZIP',
    },
    'market_value': {
        'delete': 0,
        'new_name': 'MARKET VALUE',
    },
    'market_value_date': {
        'delete': 1,
    },
    'number_of_bathrooms': {
        'delete': 0,
        'new_name': '# BATH',
    },
    'number_of_bedrooms': {
        'delete': 0,
        'new_name': '# BED',
    },
    'basements': {
        'delete': 0,
        'new_name': 'BASEMENT',
    },
    'number_stories': {
        'delete': 0,
        'new_name': '# FLOORS',
    },
    'off_street_open': {
        'delete': 0,
        'new_name': 'off_street_open'
    },
    'other_building': {
        'delete': 0,
        'new_name': 'BUILDING',
    },
    'owner_1': {
        'delete': 0,
        'new_name': 'OWNER',
    },
    'owner_2': {
        'delete': 1,
    },
    'parcel_number': {
        'delete': 0,
        'new_name': 'PARCEL ID',
    },
    'parcel_shape': {
        'delete': 0,
        'new_name': 'PARCEL SHAPE',
    },
    'quality_grade': {
        'delete': 1
    },
    'recording_date': {
        'delete': 0,
        'new_name': 'RECORDING DATE',
    },
    'registry_number': {
        'delete': 1
    },
    'sale_date': {
        'delete': 0,
        'new_name': 'SALE DATE',
    },
    'sale_price': {
        'delete': 0,
        'new_name': 'SALE PRICE',
    },
    'separate_utilities': {
        'delete': 1,
    },
    'sewer': {
        'delete': 1,
    },
    'site_type': {
        'delete': 1,
    },
    'state_code': {
        'delete': 1,
    },
    'street_code': {
        'delete': 1,
    },
    'street_designation': {
        'delete': 1,
    },
    'street_direction': {
        'delete': 1,
    },
    'street_name': {
        'delete': 1,
    },
    'suffix': {
        'delete': 1,
    },
    'taxable_building': {
        'delete': 0,
        'new_name': 'BLDG ASSD $',
    },
    'taxable_land': {
        'delete': 0,
        'new_name': 'LAND ASSD $',
    },
    'topography': {
        'delete': 0,
        'new_name': 'TOPOGRAPHY',
    },
    'total_area': {
        'delete': 0,
        'new_name': 'LAND SF',
    },
    'total_livable_area': {
        'delete': 0,
        'new_name': 'GSF',
    },
    'type_heater': {
        'delete': 1,
    },
    'unfinished': {
        'delete': 1,
    },
    'unit': {
        'delete': 0,
        'new_name': 'UNIT #',
    },
    'utility': {
        'delete': 1,
    },
    'view_type': {
        'delete': 0,
        'new_name': 'VIEW',
    },
    'year_built': {
        'delete': 0,
        'new_name': 'YEAR BUILT',
    },
    'year_built_estimate': {
        'delete': 1,
    },
    'zip_code': {
        'delete': 0,
        'new_name': 'ZIP',
    },
    'zoning': {
        'delete': 0,
        'new_name': 'ZONING',
    },
    'objectid': {
        'delete': 1,
    },
    'lat': {
        'delete': 0,
        'new_name': 'LATITUDE',
    },
    'lng': {
        'delete': 0,
        'new_name': 'LONGITUDE',
    },
}

instructions = {
    'added_columns': added_columns,
    'rename_dict': rename_dict,
}

In [274]:
def pre_clean_df(df, instructions):
    added_columns = instructions['added_columns']
    rename_dict = instructions['rename_dict']
    orig_columns = list(instructions['rename_dict'].keys())
    df_new = df.copy()[orig_columns]
    
    for column in orig_columns:
        if rename_dict[column]['delete'] == 1:
            df_new = df_new.drop([column], axis=1)
        if rename_dict[column]['delete'] == 0:
            df_new = df_new.rename(columns={column: rename_dict[column]['new_name']})
    
    df_new = df_new.reindex(df_new.columns.tolist()+added_columns, axis=1)\
                   .astype(dtype={'SALE DATE': str})
    
    df_new['SALE DATE'] = pd.to_datetime(df_new['SALE DATE'], errors='coerce')
    df_new = df_new.sort_values(by=['SALE DATE'], ascending=False)\
                   .drop(df_new[df_new['SALE DATE']==pd.NaT].index)\
                   .reset_index(drop=True)
    
    return df_new

def subset_df_date(df_new, deltadays):
    delta = pd.Timedelta(deltadays)
    df_new = df_new.sort_values(by=['SALE DATE'], ascending=False)
    latest_date = df_new['SALE DATE'].iloc[0]
    earliest_date = latest_date-delta
    keep_index = df_new[(df_new['SALE DATE']>=earliest_date) & 
                        (df_new['SALE DATE']<=latest_date)].index
    df_sub = df_new.iloc[keep_index]\
                   .reset_index(drop=True)
    return df_sub

def update_PLUTO(pluto, df_sub):
    pluto_addresses = pluto['ADDRESS'].tolist()
    sub_addresses = df_sub['ADDRESS'].tolist()
    pluto_update = pluto.copy()
    
    ### loop through all the addresses in the new data 
    ### to match the addresses in the PLUTO dataset 
    for address in sub_addresses:
        if address in pluto_addresses:
            print(f'1: {address}')
            added = df_sub.loc[df_sub['ADDRESS']==address]
            original = pluto.loc[pluto['ADDRESS']==address]
            
            added_unit = added['UNIT'].values.tolist()[0]
            orig_unit = original['UNIT'].values.tolist()[0]
            # address in the PLUTO whose data need to be updated 
            if added_unit == orig_unit:
                pluto_update.at[
                    pluto_update[pluto_update['ADDRESS']==address].index,
                    ['GSF', 'SALE PRICE', 'SALE DATE']
                ] = df_sub.loc[df_sub['ADDRESS']==address]\
                          .loc[['GSF', 
                                'SALE PRICE', 
                                'SALE DATE']]\
                          .values\
                          .tolist()
            else:
                # to account for the addresses that have multiple properties
                added_rows = df_sub.loc[df_sub['ADDRESS']==address]
                for i in range(added_rows.shape[0]):
                    pluto_update = pluto_update.append(added_rows.iloc[i], 
                                                       ignore_index=True)
        else:
            print(f'2: {address}')
            added_row = df_sub.loc[df_sub['ADDRESS']==address]
            pluto_update = pluto_update.append(added_row, ignore_index=True)
    
    return pluto_update

In [234]:
df_new = pre_clean_df(df, instructions)
df_sub = subset_df_date(df_new, '40 days')

In [235]:
pluto = pd.read_csv('../data/project/PHLPL-001 All_Properties [byaddress;location] PLUTO.csv')

In [236]:
pluto['SALE DATE'] = pd.to_datetime(pluto['SALE DATE'], errors='coerce')

In [237]:
pluto = pluto.sort_values(by=['SALE DATE'], ascending=False)

In [275]:
pluto_update = update_PLUTO(pluto, df_sub)

1: 2323 W CUMBERLAND ST
1: 2643 SEPVIVA ST
2: 3300 CONRAD ST
1: 2849 HEDLEY ST
1: 3135 CROATAN PL
1: 1916 N 61ST ST
1: 2614 S MARSHALL ST
1: 1110 BUTTONWOOD ST
1: 1310 NARRAGANSETT ST
1: 3319 ENGLEWOOD ST
1: 2833 S ISEMINGER ST
1: 7245 OAKLAND ST
1: 2143 E CLEARFIELD ST
2: 5940 MARKET ST
2: 2012 WHARTON ST
1: 2557 N JESSUP ST
1: 3618 N MARVINE ST
1: 1227 E VENANGO ST
1: 611 W LURAY ST
1: 2923 S CARLISLE ST
1: 12521 RICHTON RD
1: 636 E TABOR RD
1: 219 W DUVAL ST
1: 602 W LURAY ST
1: 1824 MEDARY AVE
1: 6308 EASTWOOD ST
1: 414 E WYOMING AVE
1: 2203 DISSTON ST
1: 478 HART LN
1: 2643 N 31ST ST
2: 2324-26 E SUSQUEHANNA AVE
1: 1510 LOMBARD ST
1: 333 DALY ST
1: 3158 MILLER ST
1: 5659 HADFIELD ST
1: 2113 E WILLIAM ST
1: 2842 UNRUH AVE
1: 1818 HARTEL AVE
2: 2602 W DAUPHIN ST
1: 3335 DISSTON ST
1: 12046 GLENFIELD ST
1: 1217 S 49TH ST
1: 530 MONTROSE ST
1: 5605 SPRUCE ST
1: 3003 S 17TH ST
1: 6439 EASTWOOD ST
1: 3214 H ST
1: 816 GRIFFITH ST
1: 3051 HARTVILLE ST
1: 2327 WATKINS ST
2: 4403 CHESTNUT S

1: 909 S 16TH ST
1: 4946 PINE ST
1: 5100 CONVENT LN
1: 1559 BEVERLY RD
1: 3109 WELSH RD
1: 1520 S CORLIES ST
1: 1709 WHARTON ST
1: 407 W MOUNT PLEASANT AVE
1: 2122 S 64TH ST
1: 7927 BUSTLETON AVE
1: 4727 SHEFFIELD AVE
1: 133 W WEAVER ST
1: 7023 VALLEY AVE
1: 648-50 N 33RD ST
1: 2415 S AMERICAN ST
1: 910 N FALLON ST
1: 2523 S SARTAIN ST
1: 4328 MITCHELL ST
1: 208 MC KEAN ST
1: 5239 C ST
1: 7126 RUTLAND ST
1: 1814 S 24TH ST
1: 4622 TACONY ST
2: 3301 N FRONT ST
1: 5861 CEDAR AVE
1: 1216 N REDFIELD ST
1: 2540 S 66TH ST
1: 5241 BEAUMONT ST
1: 2618 S 17TH ST
1: 7062 GRAYS AVE
1: 604-36 S WASHINGTON SQ
2: 1344 PALETHORP ST
1: 3512 VAUX ST
1: 2534 BERBRO ST
1: 450 S 55TH ST
1: 2313 RHAWN ST
1: 4469 RICHMOND ST
1: 707 CLYMER ST
1: 2824 CEDAR ST
1: 3373 RED LION RD
1: 3128 KNORR ST
1: 2946 N 25TH ST
2: 9622 HOFF ST
1: 2601 PENNSYLVANIA AVE
1: 3058 AMBER ST
1: 648 VAN KIRK ST
2: 2547 S BROAD ST
1: 1937 WATKINS ST
1: 3013 S 68TH ST
2: 2815 KENSINGTON AVE
2: 1732 S 58TH ST
1: 5107 WALKER ST
1: 2771

1: 1745 DOUNTON ST
1: 836 MERCER ST
1: 1822 STANWOOD ST
1: 4708 HAWTHORNE ST
1: 2233 FONTAIN ST
1: 2729 N GARNET ST
1: 1318 HALE ST
1: 12007 TYRONE RD
1: 1444 GERMANTOWN AVE
2: 1711 FERNON ST
2: 2420 RIDGE AVE
1: 1233 FULLER ST
1: 8800 BLUE GRASS RD
1: 1614 ELLSWORTH ST
1: 608 HERMITAGE ST
2: 700-06 CALEDONIA ST
1: 32 N ITHAN ST
1: 2337 E FLETCHER ST
1: 2502 S AMERICAN ST
1: 7601 CRITTENDEN ST
1: 1547 S 28TH ST
1: 2926 NICHOLAS ST
1: 6446 MORRIS PARK RD
1: 3664 FRANKFORD AVE
1: 9281 ANGUS PL
1: 3154 DERRY RD
1: 5228 N FRONT ST
1: 707 MARCHMAN RD
1: 5024 GREENE ST
1: 1915 N DARIEN ST
1: 1713 W OXFORD ST
1: 8029 ALBION ST
2: 107 E DUVAL ST
1: 6039 COLGATE ST
1: 4213 PALMETTO ST
1: 2927 FANSHAWE ST
1: 2216 E THOMPSON ST
1: 418 E ALLENS LN
1: 1541 N EDGEWOOD ST
1: 1228 S 51ST ST
1: 5005 MC KEAN AVE
1: 908 MARLYN RD
1: 4517 N HICKS ST
1: 1958 GRANITE ST
1: 1247 N DOVER ST
2: 2247 WILDER ST
1: 5406 ADDISON ST
1: 3435 N HOWARD ST
1: 113 W COURTLAND ST
1: 9317 TROUT RD
1: 217 STEARLY ST
1: 161

KeyboardInterrupt: 

In [257]:
pluto_update.loc[pluto_update['ADDRESS']=='4326 POTTER ST'][['GSF']]

Unnamed: 0,GSF
353206,


In [272]:
df_sub[df_sub['ADDRESS']=='2330 SANSOM ST'][['BLDG CODE', 'BLDG CAT','SALE DATE', 'SALE PRICE', 'UNIT']]

Unnamed: 0,BLDG CODE,BLDG CAT,SALE DATE,SALE PRICE,UNIT
172,RES CONDO 2 STY MASONRY,Single Family,2019-11-25,3500000.0,
174,COM CONDO 1 STY MASONRY,Commercial,2019-11-25,3500000.0,
177,RES CONDO 2 STY MASONRY,Single Family,2019-11-25,3500000.0,


In [277]:
set([1,2,3]) == set([3,2,1])

True

In [278]:
pluto.shape

(488410, 53)

In [280]:
pluto['PARCEL ID'].unique().shape

(488410,)

In [283]:
pluto[pluto['ADDRESS']=='6751 N 13TH ST'][['ADDRESS', 'PARCEL ID', 'SALE DATE', 'SALE PRICE']]

Unnamed: 0,ADDRESS,PARCEL ID,SALE DATE,SALE PRICE
136014,6751 N 13TH ST,888610063,2017-05-19,1600
136019,6751 N 13TH ST,888610019,2016-11-28,1600
136152,6751 N 13TH ST,888610087,2016-01-05,3100
136149,6751 N 13TH ST,888610081,2015-09-09,3100
136018,6751 N 13TH ST,888610017,2015-04-17,6500
136287,6751 N 13TH ST,888610037,2015-02-17,5000
136289,6751 N 13TH ST,888610041,2015-02-17,1800
136020,6751 N 13TH ST,888610021,2015-02-17,4200
136291,6751 N 13TH ST,888610045,2015-02-04,1600
136141,6751 N 13TH ST,888610075,2015-01-13,4000


In [284]:
df_sub[df_sub['ADDRESS']=='6751 N 13TH ST'][['ADDRESS', 'PARCEL ID', 'SALE DATE', 'SALE PRICE']]

Unnamed: 0,ADDRESS,PARCEL ID,SALE DATE,SALE PRICE
969,6751 N 13TH ST,888610065,2019-11-13,1000000.0
999,6751 N 13TH ST,888610043,2019-11-13,1000000.0
1015,6751 N 13TH ST,888610059,2019-11-13,1000000.0
1019,6751 N 13TH ST,888610031,2019-11-13,1000000.0
1035,6751 N 13TH ST,888610021,2019-11-13,1000000.0
1036,6751 N 13TH ST,888610105,2019-11-13,1000000.0
1037,6751 N 13TH ST,888610087,2019-11-13,1000000.0
1038,6751 N 13TH ST,888610085,2019-11-13,1000000.0
1039,6751 N 13TH ST,888610045,2019-11-13,1000000.0
1042,6751 N 13TH ST,888610049,2019-11-13,1000000.0
