Skip to content

Commit

Permalink
add DOHMH rodent inspections dataset (#337)
Browse files Browse the repository at this point in the history
Adds the DOHMH's Rodent Inspections dataset

While working on this I caught a bug in typecast.time(): it wasn't handling 12 PM correctly, causing nulls. I've simplified the approach and added some additional tests.
  • Loading branch information
austensen committed Jun 14, 2024
1 parent 7395870 commit 388d1f9
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 11 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Residents, lawyers, tenants, and organizers who want to use data in their strugg
- [Major Capital Improvements (MCI) Applications](https://github.com/nycdb/nycdb/wiki/Dataset:-Major-Capital-Improvements-(MCI)-Applications) - From [FOIL request by Winnie Shen](https://github.com/wshenyc/nyc_mci_map)
- [DOB Safety Violations](https://data.cityofnewyork.us/Housing-Development/DOB-Safety-Violations/855j-jady)
- [DHS Daily Shelter Census](https://www.nyc.gov/assets/dhs/downloads/pdf/dailyreport.pdf) - From [scraping by Adrian Nesta and Patrick Spauster](https://github.com/anesta95/nyc_shelter_count/tree/main)
- [DOHMH Rodent Inspections](https://github.com/nycdb/nycdb/wiki/Dataset:-DOHMH-Rodent-Inspections)


## Using the database
Expand Down
10 changes: 10 additions & 0 deletions src/nycdb/dataset_transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,16 @@ def dhs_daily_shelter_count(dataset):
return to_csv(dataset.files[0].dest, header_replacements={'table': 'series_name'})


def dohmh_rodent_inspections(dataset):
    """Transform the downloaded DOHMH rodent inspections CSV.

    Reads the dataset's first file, drops every column named in the
    schema's ``skip`` list (names are lower-cased before being handed to
    ``skip_fields``), and passes the result through ``with_bbl`` using
    ``borocode`` as the borough field.
    """
    rows = to_csv(dataset.files[0].dest)
    # The schema lists skipped columns in mixed case; lower-case them here.
    skipped_columns = [name.lower() for name in dataset.dataset["schema"]["skip"]]
    without_skipped = skip_fields(rows, skipped_columns)
    return with_bbl(without_skipped, borough="borocode")


def hpd_aep(dataset):
    """Read the dataset's first file as CSV, renaming one header.

    The ``ofbcviolationsatstart`` header is replaced with
    ``bcviolationsatstart``.
    """
    renamed_headers = {'ofbcviolationsatstart': 'bcviolationsatstart'}
    source_path = dataset.files[0].dest
    return to_csv(source_path, header_replacements=renamed_headers)

Expand Down
37 changes: 37 additions & 0 deletions src/nycdb/datasets/dohmh_rodent_inspections.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
---
# Dataset definition for the DOHMH Rodent Inspections table.
# Source file to download and the local name it is saved under.
files:
-
url: https://data.cityofnewyork.us/api/views/p937-wjvj/rows.csv?accessType=DOWNLOAD
dest: dohmh_rodent_inspections.csv
# SQL script holding the CREATE INDEX statements for this table.
sql:
- dohmh_rodent_inspections.sql
schema:
table_name: dohmh_rodent_inspections
# Column name -> SQL type for the imported table.
fields:
InspectionType: text
JobTicketOrWorkOrderId: integer
JobId: text
JobProgress: integer
BoroCode: char(1)
Block: char(5)
Lot: char(4)
HouseNumber: text
StreetName: text
ZipCode: text
XCoord: numeric
YCoord: numeric
Latitude: numeric
Longitude: numeric
Borough: text
InspectionDate: timestamp
Result: text
ApprovedDate: timestamp
CommunityBoard: text
CouncilDistrict: text
CensusTract: text
Bin: char(7)
Nta: text
# The source's own BBL column is listed under `skip` below; the dataset
# transformation passes rows through with_bbl(borough="borocode").
# NOTE(review): presumably with_bbl rebuilds this value from
# BoroCode/Block/Lot -- confirm against with_bbl.
bbl: char(10)
# Source columns the transformation drops before import (the names are
# lower-cased before being passed to skip_fields).
skip:
- Bbl
- Location
6 changes: 6 additions & 0 deletions src/nycdb/sql/dohmh_rodent_inspections.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- Indexes for dohmh_rodent_inspections.
-- Identifiers are written in lowercase throughout; PostgreSQL folds unquoted
-- identifiers to lowercase, so this matches the stored column names.

-- Lookup by property identifier.
CREATE INDEX dohmh_rodent_inspections_bbl_idx
    ON dohmh_rodent_inspections (bbl);

-- Date columns.
CREATE INDEX dohmh_rodent_inspections_inspectiondate_idx
    ON dohmh_rodent_inspections (inspectiondate);
CREATE INDEX dohmh_rodent_inspections_approveddate_idx
    ON dohmh_rodent_inspections (approveddate);

-- Inspection outcome and type.
CREATE INDEX dohmh_rodent_inspections_result_idx
    ON dohmh_rodent_inspections (result);
CREATE INDEX dohmh_rodent_inspections_inspectiontype_idx
    ON dohmh_rodent_inspections (inspectiontype);

-- Composite index on inspection type plus job ticket / work order id.
CREATE INDEX dohmh_rodent_inspections_inspectiontype_jobticketorworkorderid_idx
    ON dohmh_rodent_inspections (inspectiontype, jobticketorworkorderid);
13 changes: 6 additions & 7 deletions src/nycdb/typecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,14 +163,13 @@ def time(x):
"""
if isinstance(x, datetime.time):
return x
if isinstance(x, str) and re.match(
r"^\d{1,2}:\d{1,2}:\d{1,2}(\s+[AP]M)?$", x.strip(), flags=re.IGNORECASE
):
if isinstance(x, str):
try:
time = re.search(r"(\d{1,2}):(\d{1,2}):(\d{1,2})", x.strip())
pm = True if re.match(r"^.*?PM$", x.strip(), flags=re.IGNORECASE) else False
hour, minute, second = map(int, time.groups())
return datetime.time(hour + (pm * 12), minute, second)
time_str = re.compile(r"\s+").sub(' ', x.strip().upper())
if re.match(r"^\d{1,2}:\d{1,2}:\d{1,2}\s[AP]M$", time_str):
return datetime.datetime.strptime(time_str, '%I:%M:%S %p').time()
elif re.match(r"^\d{1,2}:\d{1,2}:\d{1,2}$", time_str):
return datetime.datetime.strptime(time_str, '%H:%M:%S').time()
except ValueError:
return None

Expand Down
6 changes: 6 additions & 0 deletions src/tests/integration/data/dohmh_rodent_inspections.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
INSPECTION_TYPE,JOB_TICKET_OR_WORK_ORDER_ID,JOB_ID,JOB_PROGRESS,BBL,BORO_CODE,BLOCK,LOT,HOUSE_NUMBER,STREET_NAME,ZIP_CODE,X_COORD,Y_COORD,LATITUDE,LONGITUDE,BOROUGH,INSPECTION_DATE,RESULT,APPROVED_DATE,LOCATION,COMMUNITY BOARD,COUNCIL DISTRICT,CENSUS TRACT,BIN,NTA
Initial,13661515,PC8122655,1,,1,00529,0009,,,0,,,,,Manhattan,06/07/2023 04:36:13 PM,Passed,06/08/2023 10:38:30 AM,,,,,,
Initial,11507405,PC7504205,1,,5,00062,0113,000,YORK AVENUE,,,,0,0,Staten Island,09/16/2010 04:00:00 PM,Failed for Other R,09/17/2010 09:47:30 AM,"(0.0, 0.0)",,,,,
Initial,13264492,PC7744572,1,,2,02857,0095,0000,JEROME AVENUE,,,,0,0,Bronx,07/28/2021 12:48:53 PM,Passed,07/29/2021 01:19:32 PM,"(0.0, 0.0)",,,,,
Initial,13252138,PC7732344,1,,2,02409,0059,,,10451,,,,,Bronx,07/07/2021 08:49:45 AM,Passed,07/08/2021 10:18:56 AM,,,,,,
Initial,13213464,PC7684724,1,1000520021,1,00052,0021,125,Cedar Street,10006,,,40.709622926374,-74.012635256681,Manhattan,03/26/2021 12:45:41 PM,Failed for Other R,03/29/2021 10:42:23 AM,"(40.709622926374, -74.012635256681)",1,1,13,1001040,Financial District-Battery Park City
15 changes: 15 additions & 0 deletions src/tests/integration/test_nycdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,21 @@ def test_dhs_daily_shelter_count(conn):
assert row_count(conn, 'dhs_daily_shelter_count') == 5


def test_dohmh_rodent_inspections(conn):
    """Import the rodent inspections fixture and verify the loaded table.

    Checks the row count, that the bbl index was created, and that the
    timestamps of the fixture row with a BBL were parsed. That row's
    inspection time is 12:45:41 PM, so the full timestamp is asserted
    (not only the date) to guard the 12 PM parsing case.
    """
    drop_table(conn, 'dohmh_rodent_inspections')
    dataset = nycdb.Dataset('dohmh_rodent_inspections', args=ARGS)
    dataset.db_import()
    assert row_count(conn, 'dohmh_rodent_inspections') == 5
    assert has_one_row(conn, "select 1 where to_regclass('public.dohmh_rodent_inspections_bbl_idx') is NOT NULL")
    with conn.cursor(row_factory=dict_row) as curs:
        curs.execute("select * from dohmh_rodent_inspections WHERE bbl = '1000520021'")
        rec = curs.fetchone()
        assert rec is not None
        # Fixture values: 03/26/2021 12:45:41 PM and 03/29/2021 10:42:23 AM.
        assert rec['inspectiondate'].strftime("%Y-%m-%d %H:%M:%S") == '2021-03-26 12:45:41'
        assert rec['approveddate'].strftime("%Y-%m-%d %H:%M:%S") == '2021-03-29 10:42:23'


def test_hpd_aep(conn):
drop_table(conn, 'hpd_aep')
dataset = nycdb.Dataset('hpd_aep', args=ARGS)
Expand Down
12 changes: 8 additions & 4 deletions src/tests/unit/test_typecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,12 @@ def test_date_mm_dd_yyyy_with_timestamp():


def test_time():
assert typecast.time('15:01:00') == datetime.time(hour=15, minute=1, second=0)
assert typecast.time('15:1:0') == datetime.time(hour=15, minute=1, second=0)
assert typecast.time('3:01:00 PM') == datetime.time(hour=15, minute=1, second=0)
assert typecast.time('3:01:00 AM') == datetime.time(hour=3, minute=1, second=0)
assert typecast.time('15:01:00') == datetime.time(15, 1, 0)
assert typecast.time('15:1:0') == datetime.time(15, 1, 0)
assert typecast.time('3:01:00 PM') == datetime.time(15, 1, 0)
assert typecast.time('3:01:00 AM') == datetime.time(3, 1, 0)
assert typecast.time('12:45:41 PM') == datetime.time(12, 45, 41)
assert typecast.time('12:45:41 AM') == datetime.time(0, 45, 41)
assert typecast.time(datetime.time.min) == datetime.time.min
assert typecast.time('RIGHT NOW') is None
assert typecast.time(None) is None
Expand All @@ -114,6 +116,8 @@ def test_timestamp():
assert typecast.timestamp('05/13/2020 23:30:00') == datetime.datetime(2020, 5, 13, 23, 30, 0)
assert typecast.timestamp('2020-05-13 11:30:00 PM') == datetime.datetime(2020, 5, 13, 23, 30, 0)
assert typecast.timestamp('2020-05-13 11:30:00 AM') == datetime.datetime(2020, 5, 13, 11, 30, 0)
assert typecast.timestamp('03/26/2021 12:45:41 PM') == datetime.datetime(2021, 3, 26, 12, 45, 41)
assert typecast.timestamp('03/26/2021 12:45:41 AM') == datetime.datetime(2021, 3, 26, 0, 45, 41)
assert typecast.timestamp(datetime.datetime(2020, 5, 13, 11, 30, 0)) == datetime.datetime(2020, 5, 13, 11, 30, 0)


Expand Down

0 comments on commit 388d1f9

Please sign in to comment.