In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt
import json
import re

In [2]:
# Load the original citations CSV. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
PittCits = pd.read_csv(
    '../data/PittsburgNonTrafficCitations.csv',
    low_memory=False,
)

In [9]:
# Summarize the raw frame: column names, dtypes and non-null counts.
# NOTE(review): the recorded output already lists CITEDTIME as datetime64[ns],
# which is what the to_datetime cell further down produces - it looks like the
# cells were run out of order; re-run top to bottom to confirm.
PittCits.info()  # 11412 rows in the recorded output

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11412 entries, 2964 to 11346
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PK                     11412 non-null  int64         
 1   CCR                    11412 non-null  object        
 2   GENDER                 11372 non-null  object        
 3   RACE                   11365 non-null  object        
 4   AGE                    11367 non-null  float64       
 5   CITEDTIME              11412 non-null  datetime64[ns]
 6   INCIDENTLOCATION       11412 non-null  object        
 7   OFFENSES               11412 non-null  object        
 8   NEIGHBORHOOD           11261 non-null  object        
 9   ZONE                   11412 non-null  object        
 10  INCIDENTTRACT          11265 non-null  object        
 11  COUNCIL_DISTRICT       10299 non-null  float64       
 12  PUBLIC_WORKS_DIVISION  10299 non-null  float64       
 13

In [10]:
# Parse CITEDTIME into pandas datetimes, then order the citations chronologically
# (earliest first).
PittCits = (
    PittCits
    .assign(CITEDTIME=pd.to_datetime(PittCits['CITEDTIME']))
    .sort_values(by='CITEDTIME')
)

## Trimming to relevant years and months

In [7]:
# Periodical cicadas are confirmed in Pittsburgh for Brood V (2016), and there are nearby
# confirmations for Brood VIII (2019), so data around both years is included in this dataframe.

In [11]:
# Keep citations from 1 May 2015 up to (but not including) 1 July 2020, the span
# that brackets both emergence years of interest.
# The lower bound is inclusive so a citation stamped exactly at midnight on
# 1 May is not dropped; the upper bound stays exclusive.
PittCitstrim = PittCits[
    (PittCits['CITEDTIME'] >= dt.datetime(2015, 5, 1))
    & (PittCits['CITEDTIME'] < dt.datetime(2020, 7, 1))
]

In [12]:
# 2015
# 0 entries. Reviewing the original file suggests that citations were not really
# tracked here until 2016, so there is no data for the year prior to the emergence.
# May-June window: the lower bound is inclusive so a citation stamped exactly at
# midnight on 1 May is kept; the upper bound (1 July) is exclusive.
PittCits2015 = PittCitstrim[
    (PittCitstrim['CITEDTIME'] >= dt.datetime(2015, 5, 1))
    & (PittCitstrim['CITEDTIME'] < dt.datetime(2015, 7, 1))
]
PittCits2015.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PK                     0 non-null      int64         
 1   CCR                    0 non-null      object        
 2   GENDER                 0 non-null      object        
 3   RACE                   0 non-null      object        
 4   AGE                    0 non-null      float64       
 5   CITEDTIME              0 non-null      datetime64[ns]
 6   INCIDENTLOCATION       0 non-null      object        
 7   OFFENSES               0 non-null      object        
 8   NEIGHBORHOOD           0 non-null      object        
 9   ZONE                   0 non-null      object        
 10  INCIDENTTRACT          0 non-null      object        
 11  COUNCIL_DISTRICT       0 non-null      float64       
 12  PUBLIC_WORKS_DIVISION  0 non-null      float64       
 13  X                

In [36]:
# 2016
# 23 entries in the recorded run (taken with a strict `>` lower bound; re-check
# the count after re-running).
# May-June window: the lower bound is inclusive so a citation stamped exactly at
# midnight on 1 May is kept; the upper bound (1 July) is exclusive.
PittCits2016 = PittCitstrim[
    (PittCitstrim['CITEDTIME'] >= dt.datetime(2016, 5, 1))
    & (PittCitstrim['CITEDTIME'] < dt.datetime(2016, 7, 1))
]
PittCits2016.head(30)

Unnamed: 0,PK,CCR,GENDER,RACE,AGE,CITEDTIME,INCIDENTLOCATION,OFFENSES,NEIGHBORHOOD,ZONE,INCIDENTTRACT,COUNCIL_DISTRICT,PUBLIC_WORKS_DIVISION,X,Y
13,1078094,16083547,M,B,31.0,2016-05-07 13:47:00,"700 Block COLLINS AV PITTSBURGH, PA",601.10 (a) Spitting,East Liberty,5,1113.0,9.0,2.0,-79.918244,40.467392
12,1078093,16083547,M,B,31.0,2016-05-07 13:50:00,"700 Block COLLINS AV PITTSBURGH, PA",5503(a)(3) DISORDERLY CONDUCT - OBSCENE LANG/GEST,East Liberty,5,1113.0,9.0,2.0,-79.918244,40.467392
8,1078089,16083638,M,W,50.0,2016-05-07 16:30:00,"40TH ST & DAVISON ST PITTSBURGH, PA",5505 Public Drunkenness,Lower Lawrenceville,2,603.0,7.0,2.0,-79.960903,40.467585
14,1078042,16083707,M,B,30.0,2016-05-07 18:20:00,"300 Block S BRADDOCK AV PITTSBURGH, PA","6501(a)(1) Scattering Rubbish; Any waste, dang...",Point Breeze,4,1406.0,9.0,3.0,-79.89635,40.445425
174,1078022,16083920,M,O,23.0,2016-05-08 01:17:00,"60 Block S 15TH ST PITTSBURGH, PA",5505 Public Drunkenness,South Side Flats,3,1702.0,3.0,3.0,-79.983657,40.429305
173,1078021,16083956,M,W,26.0,2016-05-08 02:15:00,"1700 Block CAREY WY PITTSBURGH, PA",601.16(b)(1)(a) Any Person who urinates or def...,South Side Flats,3,1702.0,3.0,3.0,-79.982058,40.428323
175,1078023,16083961,F,W,22.0,2016-05-08 02:17:00,"S 15TH ST & ROLAND ST PITTSBURGH, PA",601.16(b)(1)(a) Any Person who urinates or def...,South Side Flats,3,1702.0,3.0,3.0,-79.983435,40.428555
273,1078017,16088619,M,W,30.0,2016-05-14 19:05:00,"2000 Block WHARTON SQ PITTSBURGH, PA",3304(a)(5) CRIMINAL MISCHIEF - Intentionally D...,South Side Flats,1,1702.0,3.0,3.0,-79.976813,40.430021
274,1078018,16088623,M,B,58.0,2016-05-14 19:46:00,"4300 Block STAFFORD ST PITTSBURGH, PA",3304(a)(5) CRIMINAL MISCHIEF - Intentionally D...,Sheraden,6,2018.0,2.0,5.0,-80.055093,40.46074
275,1078019,16088766,F,B,37.0,2016-05-14 23:55:00,"1200 Block MUTUAL ST PITTSBURGH, PA",2709(a)(3) Harassment No Legitimate Purpose,Sheraden,6,2022.0,2.0,5.0,-80.063519,40.452682


In [17]:
# 2017
# 557 entries in the recorded run (taken with a strict `>` lower bound; re-check
# the count after re-running).
# May-June window: the lower bound is inclusive so a citation stamped exactly at
# midnight on 1 May is kept; the upper bound (1 July) is exclusive.
PittCits2017 = PittCitstrim[
    (PittCitstrim['CITEDTIME'] >= dt.datetime(2017, 5, 1))
    & (PittCitstrim['CITEDTIME'] < dt.datetime(2017, 7, 1))
]
PittCits2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 557 entries, 2192 to 2730
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PK                     557 non-null    int64         
 1   CCR                    557 non-null    object        
 2   GENDER                 553 non-null    object        
 3   RACE                   555 non-null    object        
 4   AGE                    554 non-null    float64       
 5   CITEDTIME              557 non-null    datetime64[ns]
 6   INCIDENTLOCATION       557 non-null    object        
 7   OFFENSES               557 non-null    object        
 8   NEIGHBORHOOD           547 non-null    object        
 9   ZONE                   557 non-null    object        
 10  INCIDENTTRACT          547 non-null    object        
 11  COUNCIL_DISTRICT       530 non-null    float64       
 12  PUBLIC_WORKS_DIVISION  530 non-null    float64       
 13  X

In [18]:
# 2018
# 437 entries in the recorded run (taken with a strict `>` lower bound; re-check
# the count after re-running).
# May-June window: the lower bound is inclusive so a citation stamped exactly at
# midnight on 1 May is kept; the upper bound (1 July) is exclusive.
PittCits2018 = PittCitstrim[
    (PittCitstrim['CITEDTIME'] >= dt.datetime(2018, 5, 1))
    & (PittCitstrim['CITEDTIME'] < dt.datetime(2018, 7, 1))
]
PittCits2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 437 entries, 5396 to 5417
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PK                     437 non-null    int64         
 1   CCR                    437 non-null    object        
 2   GENDER                 435 non-null    object        
 3   RACE                   436 non-null    object        
 4   AGE                    434 non-null    float64       
 5   CITEDTIME              437 non-null    datetime64[ns]
 6   INCIDENTLOCATION       437 non-null    object        
 7   OFFENSES               437 non-null    object        
 8   NEIGHBORHOOD           426 non-null    object        
 9   ZONE                   437 non-null    object        
 10  INCIDENTTRACT          426 non-null    object        
 11  COUNCIL_DISTRICT       385 non-null    float64       
 12  PUBLIC_WORKS_DIVISION  385 non-null    float64       
 13  X

In [19]:
# 2019
# 485 entries in the recorded run (taken with a strict `>` lower bound; re-check
# the count after re-running).
# May-June window: the lower bound is inclusive so a citation stamped exactly at
# midnight on 1 May is kept; the upper bound (1 July) is exclusive.
PittCits2019 = PittCitstrim[
    (PittCitstrim['CITEDTIME'] >= dt.datetime(2019, 5, 1))
    & (PittCitstrim['CITEDTIME'] < dt.datetime(2019, 7, 1))
]
PittCits2019.head(30)

Unnamed: 0,PK,CCR,GENDER,RACE,AGE,CITEDTIME,INCIDENTLOCATION,OFFENSES,NEIGHBORHOOD,ZONE,INCIDENTTRACT,COUNCIL_DISTRICT,PUBLIC_WORKS_DIVISION,X,Y
7976,1092541,19081552,F,B,38.0,2019-05-01 08:00:00,"7200 Block FORMOSA WY PITTSBURGH, PA","2709(a)(1) Harassment by Physical Contact, or ...",Homewood South,5,1303,9.0,2.0,-79.897209,40.455061
7588,1091801,19081671,M,W,58.0,2019-05-01 13:28:00,"5TH AV & THOMAS BL PITTSBURGH, PA",601.16(b)(1)(a) Any Person who urinates or def...,Larimer,5,1208,9.0,2.0,-79.911422,40.454923
7589,1091802,19081897,M,B,49.0,2019-05-01 17:49:00,"7200 Block FRANKSTOWN AV PITTSBURGH, PA",601.08(a) Alcohol or Liquor Consumption on Str...,Homewood South,5,1303,9.0,2.0,-79.89627,40.457389
7590,1091803,19082033,M,B,33.0,2019-05-01 20:05:00,"E CARSON ST & S 23RD ST PITTSBURGH, PA",5505 Public Drunkenness,South Side Flats,3,1609,3.0,3.0,-79.97304,40.428328
7591,1091804,19082277,M,B,51.0,2019-05-02 03:13:00,"W CARSON ST & CORLISS STREET TUN PITTSBURGH, PA","5507(a) Obstructing highway, railroad or publi...",Sheraden,6,2018,,,0.0,0.0
7592,1091805,19082647,M,B,15.0,2019-05-02 16:00:00,"REED ST & BRACKENRIDGE ST PITTSBURGH, PA",5503(a)(1) DISORDERLY CONDUCT - ENGAGE IN FIGH...,Middle Hill,2,501,6.0,3.0,-79.971867,40.445001
7593,1091806,19082821,M,B,21.0,2019-05-02 20:20:00,"60 Block BRACEY DR PITTSBURGH, PA",627.02(b)(i) Possessing a small amount of mari...,East Hills,5,1306,,,0.0,0.0
7594,1091807,19082821,M,B,32.0,2019-05-02 20:20:00,"60 Block BRACEY DR PITTSBURGH, PA",627.02(b)(i) Possessing a small amount of mari...,East Hills,5,1306,,,0.0,0.0
7620,1091833,19082877,M,B,59.0,2019-05-02 21:20:00,"1100 Block BROWNSVILLE RD PITTSBURGH, PA",5505 Public Drunkenness,Carrick,3,2902,4.0,3.0,-79.990903,40.404947
7596,1091809,19083036,M,W,26.0,2019-05-03 02:10:00,"SIDNEY ST & S 18TH ST PITTSBURGH, PA",5505 Public Drunkenness,South Side Flats,3,1702,3.0,3.0,-79.980472,40.429687


In [20]:
# 2020
# 138 entries in the recorded run (taken with a strict `>` lower bound; re-check
# the count after re-running).
# May-June window: the lower bound is inclusive so a citation stamped exactly at
# midnight on 1 May is kept; the upper bound (1 July) is exclusive.
PittCits2020 = PittCitstrim[
    (PittCitstrim['CITEDTIME'] >= dt.datetime(2020, 5, 1))
    & (PittCitstrim['CITEDTIME'] < dt.datetime(2020, 7, 1))
]
PittCits2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138 entries, 10255 to 10810
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PK                     138 non-null    int64         
 1   CCR                    138 non-null    object        
 2   GENDER                 137 non-null    object        
 3   RACE                   137 non-null    object        
 4   AGE                    137 non-null    float64       
 5   CITEDTIME              138 non-null    datetime64[ns]
 6   INCIDENTLOCATION       138 non-null    object        
 7   OFFENSES               138 non-null    object        
 8   NEIGHBORHOOD           137 non-null    object        
 9   ZONE                   138 non-null    object        
 10  INCIDENTTRACT          137 non-null    object        
 11  COUNCIL_DISTRICT       125 non-null    float64       
 12  PUBLIC_WORKS_DIVISION  125 non-null    float64       
 13 

## Bring the Dataframes together and remove extraneous columns


In [22]:
# Stack the per-year slices, oldest first, into a single frame. The original
# row index of each slice is carried over unchanged.
years = [
    PittCits2015,
    PittCits2016,
    PittCits2017,
    PittCits2018,
    PittCits2019,
    PittCits2020,
]
PittCits_final = pd.concat(years, axis=0)

In [None]:
# Remove the extraneous neighborhood/zone/tract/district columns.
# errors='ignore' lets this self-reassigning cell be re-run without a KeyError
# once the columns are already gone (note: it also ignores misspelled names).
PittCits_final = PittCits_final.drop(
    columns=['NEIGHBORHOOD', 'ZONE', 'INCIDENTTRACT', 'COUNCIL_DISTRICT', 'PUBLIC_WORKS_DIVISION'],
    errors='ignore',
)

In [30]:
# Remove the X/Y coordinate columns.
# errors='ignore' lets this self-reassigning cell be re-run without a KeyError
# once the columns are already gone (note: it also ignores misspelled names).
PittCits_final = PittCits_final.drop(columns=['X', 'Y'], errors='ignore')

In [32]:
# Remove the incident location and CCR columns.
# errors='ignore' lets this self-reassigning cell be re-run without a KeyError
# once the columns are already gone (note: it also ignores misspelled names).
PittCits_final = PittCits_final.drop(columns=['INCIDENTLOCATION', 'CCR'], errors='ignore')

In [33]:
# Preview the first rows after the column drops; the recorded output shows
# PK, GENDER, RACE, AGE, CITEDTIME and OFFENSES remaining.
PittCits_final.head()

Unnamed: 0,PK,GENDER,RACE,AGE,CITEDTIME,OFFENSES
13,1078094,M,B,31.0,2016-05-07 13:47:00,601.10 (a) Spitting
12,1078093,M,B,31.0,2016-05-07 13:50:00,5503(a)(3) DISORDERLY CONDUCT - OBSCENE LANG/GEST
8,1078089,M,W,50.0,2016-05-07 16:30:00,5505 Public Drunkenness
14,1078042,M,B,30.0,2016-05-07 18:20:00,"6501(a)(1) Scattering Rubbish; Any waste, dang..."
174,1078022,M,O,23.0,2016-05-08 01:17:00,5505 Public Drunkenness


In [34]:
# The date was never separated from the time in the CITEDTIME column, so add a
# date-only 'Date' column here (the time of day stays available in CITEDTIME).
# .dt.normalize() resets every timestamp to midnight in one vectorized step,
# giving the same datetime64 values as building Python date objects row by row
# and re-parsing them. The to_datetime call is a no-op when the column is
# already datetime64 and guards the .dt accessor otherwise.
PittCits_final['Date'] = pd.to_datetime(PittCits_final['CITEDTIME']).dt.normalize()

## Export

In [35]:
# Write the final frame to a CSV in the working directory: header row included,
# DataFrame index omitted.
PittCits_final.to_csv(
    'PittCits_final.csv',
    header=True,
    index=False,
)