In [1]:
#Python Data Analysis on NYC Service Requests Data 

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import matplotlib.pyplot as plt 
import seaborn as sns

## Data Collection

A subset of the 311 NYC dataset was exported from the NYC Open Data portal at https://data.cityofnewyork.us/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9/data. I selected a subset of all the complaints according to the following criteria:
1. Created Date is between 7/1/19 and 7/1/21 
2. Status is 'Closed'
3. Closed Date is before 7/2/21 12:00:00 AM 

The data originally had 41 features:
 (0, 'Unique Key'),
 (1, 'Created Date'),
 (2, 'Closed Date'),
 (3, 'Agency'),
 (4, 'Agency Name'),
 (5, 'Complaint Type'),
 (6, 'Descriptor'),
 (7, 'Location Type'),
 (8, 'Incident Zip'),
 (9, 'Incident Address'),
 (10, 'Street Name'),
 (11, 'Cross Street 1'),
 (12, 'Cross Street 2'),
 (13, 'Intersection Street 1'),
 (14, 'Intersection Street 2'),
 (15, 'Address Type'),
 (16, 'City'),
 (17, 'Landmark'),
 (18, 'Facility Type'),
 (19, 'Status'),
 (20, 'Due Date'),
 (21, 'Resolution Description'),
 (22, 'Resolution Action Updated Date'),
 (23, 'Community Board'),
 (24, 'BBL'),
 (25, 'Borough'),
 (26, 'X Coordinate (State Plane)'),
 (27, 'Y Coordinate (State Plane)'),
 (28, 'Open Data Channel Type'),
 (29, 'Park Facility Name'),
 (30, 'Park Borough'),
 (31, 'Vehicle Type'),
 (32, 'Taxi Company Borough'),
 (33, 'Taxi Pick Up Location'),
 (34, 'Bridge Highway Name'),
 (35, 'Bridge Highway Direction'),
 (36, 'Road Ramp'),
 (37, 'Bridge Highway Segment'),
 (38, 'Latitude'),
 (39, 'Longitude'),
 (40, 'Location')
 
**I will keep the 20 features that provide information on the complaint logged, the agency involved, the time of case creation and closing, the mode of creation, and the geovalidated fields related to the location of the incident.**

## Data Loading

In [4]:
### Read the exported 311 dataset, keeping only the selected columns
# NOTE(review): filepath is an absolute, machine-specific directory; point it
# at your own copy of the export before running this cell elsewhere.
filepath = '/Users/nikiagrawal/Desktop/NYCDSA/'
filename = '311_Service_Requests.csv'
sr_data = pd.read_csv(
    f'{filepath}{filename}',
    # Zero-based positions of the 20 columns to keep out of the export's 41
    usecols=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        19, 21, 25, 26, 27, 28,
        38, 39, 40,
    ],
)

In [5]:
#The loaded frame has 4659228 observations (rows) and 20 features (columns)
sr_data.shape

(4659228, 20)

In [44]:
#Preview data: show up to 30 columns and 30 rows when rendering a frame,
#then display the first record
pd.set_option('display.max_columns', 30, 'display.max_rows', 30)
sr_data.head(1)

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Status,Resolution Description,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Open Data Channel Type,Latitude,Longitude,Location
0,43179970,2019-07-01,2019-07-08,DOHMH,Department of Health and Mental Hygiene,Standing Water,Puddle in Ground,1-2 Family Mixed Use Building,11435.0,90-24 148 STREET,148 STREET,Closed,,QUEENS,1037709.0,195334.0,PHONE,40.702662,-73.807195,"(40.702661991089535, -73.80719482768488)"


In [43]:
#Pair each retained column name with its position in the loaded frame
list(enumerate(sr_data.columns))

[(0, 'Unique Key'),
 (1, 'Created Date'),
 (2, 'Closed Date'),
 (3, 'Agency'),
 (4, 'Agency Name'),
 (5, 'Complaint Type'),
 (6, 'Descriptor'),
 (7, 'Location Type'),
 (8, 'Incident Zip'),
 (9, 'Incident Address'),
 (10, 'Street Name'),
 (11, 'Status'),
 (12, 'Resolution Description'),
 (13, 'Borough'),
 (14, 'X Coordinate (State Plane)'),
 (15, 'Y Coordinate (State Plane)'),
 (16, 'Open Data Channel Type'),
 (17, 'Latitude'),
 (18, 'Longitude'),
 (19, 'Location')]

### Datetime conversion with pd.to_datetime():

In [38]:
###With the relevant columns loaded, parse the two timestamp columns from
###strings (e.g. month/day/year plus a 12-hour clock with AM/PM) into datetime objects.
for date_column in ('Created Date', 'Closed Date'):
    sr_data[date_column] = pd.to_datetime(
        sr_data[date_column],
        format="%m/%d/%Y %I:%M:%S %p",
    )

In [34]:
###Check that the dates converted properly and the data is within the expected date & time range
# sr_data['Created Date'].min()
# sr_data['Created Date'].max()
# sr_data['Closed Date'].min()
# sr_data['Closed Date'].max()

###Save edited dataframe as csv file
# sr_data.to_csv('311_Service_Requests_edited',index = False)

In [36]:
#Confirm the column dtypes; both date columns should now report datetime64[ns]
sr_data.dtypes

Unique Key                             int64
Created Date                  datetime64[ns]
Closed Date                   datetime64[ns]
Agency                                object
Agency Name                           object
Complaint Type                        object
Descriptor                            object
Location Type                         object
Incident Zip                         float64
Incident Address                      object
Street Name                           object
Status                                object
Resolution Description                object
Borough                               object
X Coordinate (State Plane)           float64
Y Coordinate (State Plane)           float64
Open Data Channel Type                object
Latitude                             float64
Longitude                            float64
Location                              object
dtype: object

# Background 
- NYC Noise Codes 
- How does 311 work? What happens when a call comes in? Why was this set up and who is involved? How is it monitored? 
- Motivation - Who is interested in this analysis:
1. Government agencies
        a. Are they effectively responding to complaints? 
            1. Time of case duration 
            2. Percent of repeat complaints from same location over time 
            3. Analyze text of the Resolution Description 
            4. How many calls logged per day on average?
               How many calls logged per year on average?
            5. Which season has the most/least noise complaints?
            6. Which borough has the most / least complaints?

2. Current NYC residents and new renters 
        a. Noise as an indicator of quality of life 
            1. Which neighborhoods have parties?
            2. Which neighborhoods complain the most (residential)?
            3. Which neighborhoods have a lot of construction / development?
            4. Which parks are spots of conflict? 
            5. WFH paradigm - residential noise - 
            6. How does day time compare to night time? 

In [None]:
#sr_data['Time Elapsed'] = sr_data['Closed Date']-sr_data['Created Date']
#creates a Series of Timedelta values. Can use .dt.components or .dt.total_seconds() on the column (Timedelta.components or Timedelta.total_seconds() on a single value)


In [None]:
#Show the distinct values found in the 'Complaint Type' column
sr_data['Complaint Type'].unique()

In [None]:
#Count the distinct 'Complaint Type' values (displayed as a one-element shape tuple)
sr_data['Complaint Type'].unique().shape

In [None]:
#Number of requests per complaint type as a plain list (type labels are dropped)
sr_data['Complaint Type'].value_counts().tolist()