In [63]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import feather as fth # fast file format for reading whole columns

precinct = '12'
squad = 'A'
date ='09/27/2016'

datadirIn = '../../data/nyc_parking_tickets/'
datadirOut = '../../data/nyc_parking_tickets/squad_route/'
fileNameIn = 'Parking_Violations_Issued_-_Fiscal_Year_2017'
#fileNameOut = 'squad_route_time_' + precinct + squad + '_Parking_Violations_Issued_-_Fiscal_Year_2017'
fileFormatIn = '.csv'
fileFormatOut = '.fth'

pathIn = datadirIn + fileNameIn + fileFormatIn
pathOut = datadirOut + 'squad_route_time_' + squad + precinct + '_' + date.replace('/', '') + '_Parking_Violations_17' + fileFormatOut

print(pathIn)
print(pathOut)

../../data/nyc_parking_tickets/Parking_Violations_Issued_-_Fiscal_Year_2017.csv
../../data/nyc_parking_tickets/squad_route/squad_route_time_A12_09272016_Parking_Violations_17.fth


Before we can analyse the tickets recorded by single squad on a specific date, we have to load the dataset and extract the entries with matching squad, precinct and date.

For better performance we decided to read the .csv-file in chunks, so the memory does not overflow.

After we extracted the tuples, we save the file in our data-directory for later use.

We chose the feather format for our saved files, because we always read whole columns and the format is up to 30x faster than common csv doing that.

In [66]:
#load whole dataset in chunks
#select data by precinct, squad and date
#write to feather


dtype = {'Street Name': object, 'House Number': object, 'Issuer Squad': object, 'Violation Precinct': object, 'Violation Time': object}
cols = ['Street Name','House Number','Issue Date', 'Issuer Squad', 'Violation Precinct', 'Violation Time']
csize = 100000

data = pd.DataFrame()
readcount = 0
foundcount = 0

reader = pd.read_csv(pathIn, chunksize = csize, sep=',', dtype=dtype, usecols=cols)
#, error_bad_lines=False

for chunk in reader:
        chunk = chunk[chunk['Issue Date'].str.contains(date) & (chunk['Issuer Squad'] == squad) & chunk['Violation Precinct'].str.contains(precinct)]
        data = pd.concat([data, chunk])
        readcount = readcount + csize
        print('Read: ' + str(readcount) + ' Found: ' + str(len(data)), end='\r')

print('Read: ' + str(readcount) + ' Found: ' + str(len(data)))
fth.write_dataframe(data, pathOut)
print('File saved as ' + pathOut)
print('Successful')

Read: 7700000 Found: 63
File saved as ../../data/nyc_parking_tickets/squad_route/squad_route_time_A12_09272016_Parking_Violations_17.fth
Successful


If you want to have a look at the file saved by the cell above, just run the following cell.

In [67]:
#check a feather file's dataset
df = fth.read_dataframe(pathOut)
print(df.head())

   Issue Date Violation Precinct Issuer Squad Violation Time House Number  \
0  09/27/2016                122            A          0851A          360   
1  09/27/2016                120            A          0930A           42   
2  09/27/2016                120            A          1022A          294   
3  09/27/2016                122            A          1003A          275   
4  09/27/2016                120            A          0827A           25   

   Street Name  
0  Seaview Ave  
1  Norwood Ave  
2     Bard Ave  
3    Mason Ave  
4     Wayne St  
