In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt
import json
import re

In [3]:
# Load the raw Asheville citations export.
# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
AsheCits = pd.read_csv(
    '../data/Asheville_Citations.csv',
    low_memory=False,
)

## Note: this dataset includes driving/traffic citations; the Pittsburgh dataset does not.

In [4]:
# Show the loaded frame (pandas elides the middle rows in the rendered output).
AsheCits #6494788 rows

Unnamed: 0,OBJECTID,citation_date,citation_time,address,citation_type,subject_race,subject_gender,agency,tcmainid,objectid_1
0,11659052,2018/06/04,1530,US 25/MARCELLUS ST,OPEN CONTAINER ALCOHOL VIOL,W,M,APD,258536,176303
1,11659053,2018/06/04,1530,US 25/MARCELLUS ST,SECOND DEGREE TRESPASS,W,M,APD,258536,176304
2,11659054,2018/06/14,1033,100-BLK SW PACK SQ,OPEN CONTAINER ALCOHOL VIOL,W,M,APD,258888,176305
3,11659055,2018/07/26,1930,126 BROADWAY ST,SECOND DEGREE TRESPASS,W,M,APD,259949,176306
4,11659056,2018/08/25,1843,US 25/240 E,SOLICIT ALMS/BEG FOR MONEY,W,M,APD,260857,176307
...,...,...,...,...,...,...,...,...,...,...
6494783,18153835,2020/11/26,1435,1400-BLK PATTON AVE/DEAVERVIEW RD,FAIL TO STOP-STEADY RED LIGHT,W,M,APD,291013,198620
6494784,18153836,2020/11/26,1435,1400-BLK PATTON AVE/DEAVERVIEW RD,NO OPERATORS LICENSE,W,M,APD,291013,198621
6494785,18153837,2020/11/01,1800,I 40/SMOKEY PARK HWY,SPEEDING,W,F,APD,291191,198622
6494786,18153838,2020/11/04,2017,200-BLK HAYWOOD RD,OPERATE VEH NO INS,W,M,APD,291275,198623


In [7]:
# Parse the citation_date strings into datetime64 values, then put the rows
# in chronological order (oldest first).
AsheCits['citation_date'] = pd.to_datetime(AsheCits['citation_date'])
AsheCits = AsheCits.sort_values(by='citation_date')
# Date range covered: 2012-01-01 - 2020-11-30
AsheCits.tail()

Unnamed: 0,OBJECTID,citation_date,citation_time,address,citation_type,subject_race,subject_gender,agency,tcmainid,objectid_1
4584593,16243645,2020-11-30,1551,ACTON CIRCLE/127 ACTON CIRCLE,MISDEMEANOR LARCENY,W,F,APD,290533,197115
5928348,17587400,2020-11-30,1907,128 BINGHAM RD/128 BINGHAM RD,SECOND DEGREE TRESPASS,W,M,APD,290535,195540
5551998,17211050,2020-11-30,1551,ACTON CIRCLE/127 ACTON CIRCLE,MISDEMEANOR LARCENY,W,F,APD,290533,197115
6331180,17990232,2020-11-30,2105,US 19/20TH ST,OPERATE VEH NO INS,W,M,APD,290556,195967
5390646,17049698,2020-11-30,1649,I 240/I40W,SPEEDING,W,M,APD,290555,196701


In [8]:
# Periodical cicadas are confirmed in Asheville for Brood VI (2017).
# Reduce the dataset to the surrounding window: citations dated strictly after
# 2016-05-01 and strictly before 2018-07-01.
AsheCitstrim = AsheCits[
    (AsheCits['citation_date'] > dt.datetime(2016, 5, 1))
    & (AsheCits['citation_date'] < dt.datetime(2018, 7, 1))
]

In [9]:
# Summarize the columns, dtypes and memory usage of the full frame
# (this inspects AsheCits, not the trimmed AsheCitstrim).
AsheCits.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6494788 entries, 1479629 to 5390646
Data columns (total 10 columns):
 #   Column          Dtype         
---  ------          -----         
 0   OBJECTID        int64         
 1   citation_date   datetime64[ns]
 2   citation_time   int64         
 3   address         object        
 4   citation_type   object        
 5   subject_race    object        
 6   subject_gender  object        
 7   agency          object        
 8   tcmainid        int64         
 9   objectid_1      int64         
dtypes: datetime64[ns](1), int64(4), object(5)
memory usage: 545.1+ MB


## Getting the relevant years

In [12]:
#2016
# Keep citations dated strictly after 2016-05-01 and strictly before 2016-07-01.
# Both bounds are applied in a single mask. Filtering AsheCitstrim in two separate
# assignments made the second assignment overwrite the first, so the lower bound
# was silently discarded (harmless here only because AsheCitstrim already starts
# after 2016-05-01).
# 154981 rows were reported for this window, which seems like a lot.
# NOTE(review): the source appears to contain repeated records (identical fields and
# objectid_1 under different OBJECTID values) -- consider de-duplicating upstream.
# NOTE(review): `>` excludes citations dated exactly 2016-05-01; use `>=` if May 1
# should be part of the window.
AsheCits2016 = AsheCitstrim[
    (AsheCitstrim['citation_date'] > dt.datetime(2016, 5, 1))
    & (AsheCitstrim['citation_date'] < dt.datetime(2016, 7, 1))
]

In [15]:
#2017
# Keep citations dated strictly after 2017-05-01 and strictly before 2017-07-01.
# Both bounds are applied in a single mask. Filtering AsheCitstrim in two separate
# assignments made the second assignment overwrite the first, which dropped the
# lower bound and kept every row from the start of AsheCitstrim up to 2017-07-01.
# That is where the earlier surprising count of 926666 rows came from; re-check
# the row count after running this cell.
# NOTE(review): `>` excludes citations dated exactly 2017-05-01; use `>=` if May 1
# should be part of the window.
AsheCits2017 = AsheCitstrim[
    (AsheCitstrim['citation_date'] > dt.datetime(2017, 5, 1))
    & (AsheCitstrim['citation_date'] < dt.datetime(2017, 7, 1))
]

In [16]:
#2018
# Keep citations dated strictly after 2018-05-01 and strictly before 2018-07-01.
# Both bounds are applied in a single mask. Filtering AsheCitstrim in two separate
# assignments made the second assignment overwrite the first, which dropped the
# lower bound and returned every row of AsheCitstrim dated before 2018-07-01.
# That is where the earlier count of 1521834 rows came from; re-check the row
# count after running this cell.
# NOTE(review): `>` excludes citations dated exactly 2018-05-01; use `>=` if May 1
# should be part of the window.
AsheCits2018 = AsheCitstrim[
    (AsheCitstrim['citation_date'] > dt.datetime(2018, 5, 1))
    & (AsheCitstrim['citation_date'] < dt.datetime(2018, 7, 1))
]

In [17]:
# Stack the three per-year frames into one dataset, in the order 2016, 2017, 2018.
# The original index labels are kept (no re-indexing).
years = [AsheCits2016, AsheCits2017, AsheCits2018]
AsheCits_final = pd.concat(objs=years)

## What else can we learn about the data?

In [18]:
# What kinds of citations are there?
# NOTE(review): the values shown in the output are right-padded with spaces;
# consider AsheCits_final['citation_type'].str.strip() before grouping or exporting.
AsheCits_final['citation_type'].unique()

array(['NO OPERATORS LICENSE                                        ',
       'FOLLOWING TOO CLOSELY                                       ',
       'SPEEDING                                                    ',
       'DWLR NOT IMPAIRED REV                                       ',
       'EXPIRED REGISTRATION CARD/TAG                               ',
       'EXPIRED/NO INSPECTION                                       ',
       'SECOND DEGREE TRESPASS                                      ',
       'POSSESS DRUG PARAPHERNALIA                                  ',
       'FAIL MAINTAIN LANE CONTROL                                  ',
       'OPEN CONTAINER ALCOHOL VIOL                                 ',
       'OPERATE VEH NO INS                                          ',
       'CANCL/REVOK/SUSP CERTIF/TAG                                 ',
       'AID & ABET LARCENY (M)                                      ',
       'FAIL TO REPORT ACCIDENT                                     ',
      

## Export

In [20]:
# Even after the date filtering this is still a huge amount of data, so trim off
# columns that are not needed, starting with the street address.
AsheCits_final = AsheCits_final.drop(columns=['address'])

In [21]:
# tcmainid: meaning unknown and not useful for this analysis, so remove it.
AsheCits_final = AsheCits_final.drop(columns=['tcmainid'])

In [22]:
# Remove the objectid_1 column.
AsheCits_final = AsheCits_final.drop(columns=['objectid_1'])

In [26]:
# Remove the agency column (Asheville PD).
AsheCits_final = AsheCits_final.drop(columns=['agency'])

In [23]:
# Time of day seems out of scope for this investigation, but could be revisited.
# For the record, periodical cicadas appear to typically sing between 10am and 5pm.
AsheCits_final = AsheCits_final.drop(columns=['citation_time'])

In [27]:
# Preview the first rows to check which columns remain after the drops.
AsheCits_final.head()

Unnamed: 0,OBJECTID,citation_date,citation_type,subject_race,subject_gender
1601774,13260826,2016-05-02,NO OPERATORS LICENSE ...,W,F
399400,12058452,2016-05-02,FOLLOWING TOO CLOSELY ...,W,M
193291,11852343,2016-05-02,SPEEDING ...,W,M
1120542,12779594,2016-05-02,DWLR NOT IMPAIRED REV ...,B,M
2158530,13817582,2016-05-02,EXPIRED REGISTRATION CARD/TAG ...,W,M


In [28]:
# Write the trimmed dataset to a CSV in the current working directory,
# with a header row and without the DataFrame index.
AsheCits_final.to_csv(
    'AsheCits_final.csv',
    header=True,
    index=False,
)