In [2]:
import pandas as pd 
import numpy as np
import re

pd.set_option('display.max_columns',30)
pd.set_option('display.max_rows',30)


## Load in Noise complaints csv 

In [3]:
## Load in full dataset with selected columns 
filepath = '/Users/nikiagrawal/Desktop/EDA/Service-Requests/'
filename = '311_Service_Requests_Noise.csv'
sr_noise_data = pd.read_csv(filepath + filename,\
                      usecols=[0,1,2,3,4,5,6,7,8,9,10,19,21,25,26,27,28,38,39,40])

In [4]:
#sr_noise_data.shape

In [5]:
sr_noise_data.columns

Index(['Unique Key', 'Created Date', 'Closed Date', 'Agency', 'Agency Name',
       'Complaint Type', 'Descriptor', 'Location Type', 'Incident Zip',
       'Incident Address', 'Street Name', 'Status', 'Resolution Description',
       'Borough', 'X Coordinate (State Plane)', 'Y Coordinate (State Plane)',
       'Open Data Channel Type', 'Latitude', 'Longitude', 'Location'],
      dtype='object')

In [6]:
### Datetime conversion with pd.to_datetime():
sr_noise_data['Created Date'] = pd.to_datetime(sr_noise_data['Created Date'], format = "%m/%d/%Y %I:%M:%S %p")
sr_noise_data['Closed Date'] = pd.to_datetime(sr_noise_data['Closed Date'], format = "%m/%d/%Y %I:%M:%S %p")


In [7]:
#sr_noise_data.dtypes

### Noise Feature Engineering

There are all currently all kinds of labels used to describe noise.  
I will simplify the categoization of the noise data by creating 2 new features, "Noise Complaint Type" and "Noise Descriptor" based on the 3 features, 'Complaint Type','Descriptor', and 'Location Type' listed below. 

In [8]:
 sr_noise_data[['Complaint Type','Descriptor','Location Type','Unique Key']]\
            .groupby(['Complaint Type','Descriptor','Location Type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unique Key
Complaint Type,Descriptor,Location Type,Unnamed: 3_level_1
Collection Truck Noise,21 Collection Truck Noise,Sidewalk,118
Noise - Commercial,Banging/Pounding,Club/Bar/Restaurant,1603
Noise - Commercial,Banging/Pounding,Store/Commercial,10379
Noise - Commercial,Car/Truck Horn,Store/Commercial,2803
Noise - Commercial,Car/Truck Music,Store/Commercial,4870
Noise - Commercial,Loud Music/Party,Club/Bar/Restaurant,33834
Noise - Commercial,Loud Music/Party,Store/Commercial,41438
Noise - Commercial,Loud Talking,Club/Bar/Restaurant,3069
Noise - Commercial,Loud Talking,Store/Commercial,6320
Noise - Commercial,Loud Television,Club/Bar/Restaurant,73


In [9]:
## Helper function to check if string contains an element from a list
def contains_elements_from(string, lst):
    """This function will return True if 
    a given string contains any element from a
    given list. 
    
    Args: 
    string: input string
    lst: a list of strings 
    
    Returns:
    True or False 
    """
    return any(ele in string for ele in lst)

In [10]:
def get_description(complaint_descriptor):
    """This function takes in the 'Complaint Type' and 'Descriptor' elements 
    and outputs an appropriate relabel for the new feature 'Noise Descriptor.'
    
    Args:
    complaint_descriptor: contains 2 elements, complaint type and descriptor
    
    Returns:
    string label that will be used for new feature, Noise Descriptor. """
    
    complaint = complaint_descriptor[0]
    descriptor = complaint_descriptor[1]  
    #clean up DEP noise descriptor categories 
    if complaint == "Noise": 
        temp = re.search("(^Noise?[:,]*)([\s\w\/]*)", descriptor).group(2).strip(" ").capitalize()
    elif descriptor == descriptor:
        if descriptor == '21 Collection Truck Noise':
            temp = "truck"
        else:
            temp = descriptor.split("/",2)[0]
    else: 
        return "Unspecified"
        
        
    if contains_elements_from(temp,['dog','animals']):
        temp = "Animal"
    elif contains_elements_from(temp,['Car','Engine','truck','carting','Vehicle','Boat']):
        temp ="Vehicle"
    elif contains_elements_from(temp,['NYPD','News','Other']):
        temp ="Helicopter"
    elif contains_elements_from(temp,['Construction','Jack']):
        temp ="Construction"
    elif contains_elements_from(temp,['Music','Loud music']):
        temp ="Loud Music/Party"
    elif contains_elements_from(temp,['Manufacturing']):
        temp = "Manufacturing"
    elif contains_elements_from(temp,['Other noise sources']):
        temp = "Other Sources"
    return temp       

In [11]:
s = '21 Collection Truck Noise'
contains_elements_from(s,['21'])
s.split("/",2)[0]

'21 Collection Truck Noise'

In [12]:
d = "Noise, Ice Cream Truck (NR4)"
temp = d.split("/",2)[0]
temp

'Noise, Ice Cream Truck (NR4)'

In [13]:
def get_complaint_type(complaint_descriptor_location):
    """This function takes in the Complaint Type, Descriptor, and Location Type 
    elements and outputs an appropriate relabel for the new feature, 'Noise Complaint Type.'
    
    Args:
    complaint_descriptor_location: contains 3 elements, complaint type, descriptor, and location type
    
    Returns:
    string label that will be used for new feature, Noise Complaint Type."""
    
    complaint = complaint_descriptor_location[0]
    descriptor = complaint_descriptor_location[1]
    location = complaint_descriptor_location[2]
    if complaint == "Noise":
        return "Unspecified"
    elif complaint =="Collection Truck Noise":
        return "Vehicle"
    else:
        temp = re.split('Noise -',complaint)[1].strip(" ").capitalize() 
        if ((temp == "Commercial") & (location == location)):
            temp = location
    return temp

In [14]:
### Add new feature column "Noise Descriptor"
sr_noise_data["Noise Descriptor"] =\
    sr_noise_data[['Complaint Type','Descriptor']].apply(get_description, axis=1)

In [15]:
list(enumerate(sr_noise_data["Noise Descriptor"].unique()))

[(0, 'Loud Music/Party'),
 (1, 'Banging'),
 (2, 'Construction'),
 (3, 'Loud Talking'),
 (4, 'Vehicle'),
 (5, 'Alarms'),
 (6, 'Animal'),
 (7, 'Air condition/ventilation equipment'),
 (8, 'Loud Television'),
 (9, 'Helicopter'),
 (10, 'Lawn care equipment'),
 (11, 'Manufacturing'),
 (12, 'Unspecified')]

In [16]:
### Add new feature column "Noise Complaint Type"
sr_noise_data["Noise Complaint Type"] =\
    sr_noise_data[['Complaint Type','Descriptor','Location Type']].apply(get_complaint_type, axis=1)

In [17]:
list(enumerate(sr_noise_data["Noise Complaint Type"].unique()))

[(0, 'Residential'),
 (1, 'Club/Bar/Restaurant'),
 (2, 'Street/sidewalk'),
 (3, 'Unspecified'),
 (4, 'Store/Commercial'),
 (5, 'Vehicle'),
 (6, 'Park'),
 (7, 'Helicopter'),
 (8, 'House of worship'),
 (9, 'Commercial')]

In [19]:
sr_noise_data = sr_noise_data.drop(columns =['Status','Agency Name','Complaint Type',\
                             'Descriptor','Location Type'])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1664108 entries, 0 to 1664107
Data columns (total 17 columns):
 #   Column                      Non-Null Count    Dtype         
---  ------                      --------------    -----         
 0   Unique Key                  1664108 non-null  int64         
 1   Created Date                1664108 non-null  datetime64[ns]
 2   Closed Date                 1664108 non-null  datetime64[ns]
 3   Agency                      1664108 non-null  object        
 4   Incident Zip                1662656 non-null  float64       
 5   Incident Address            1631999 non-null  object        
 6   Street Name                 1631976 non-null  object        
 7   Resolution Description      1633253 non-null  object        
 8   Borough                     1664062 non-null  object        
 9   X Coordinate (State Plane)  1651888 non-null  float64       
 10  Y Coordinate (State Plane)  1651924 non-null  float64       
 11  Open Data Channel Type  

In [69]:
sr_noise_data['Time Elapsed'] = sr_noise_data['Closed Date']-sr_noise_data['Created Date']

sr_noise_data['Date_created']=pd.DatetimeIndex(sr_noise_data['Created Date']).date
sr_noise_data['Year_month_created']=pd.DatetimeIndex(sr_noise_data['Date_created']).to_period('M').to_timestamp()

sr_noise_data['Year_created'] = pd.DatetimeIndex(sr_noise_data['Created Date']).year
sr_noise_data['Month_created'] = pd.DatetimeIndex(sr_noise_data['Created Date']).month
sr_noise_data['Day_created'] = pd.DatetimeIndex(sr_noise_data['Created Date']).dayofweek
sr_noise_data['Hour_created'] = pd.DatetimeIndex(sr_noise_data['Created Date']).hour

In [63]:
sr_noise_data.info()

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Incident Zip,Incident Address,Street Name,Resolution Description,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Open Data Channel Type,Latitude,Longitude,Location,Noise Descriptor,Noise Complaint Type,Date_created,Year_month_created
0,41312454,2019-01-01 00:00:29,2019-01-01 02:22:46,NYPD,11231.0,135 RICHARDS STREET,RICHARDS STREET,The Police Department responded to the complai...,BROOKLYN,981379.0,185999.0,MOBILE,40.677201,-74.010351,"(40.677200581770165, -74.01035058400049)",Loud Music/Party,Residential,2019-01-01,2019-01-01
1,41310350,2019-01-01 00:00:43,2019-01-01 02:21:44,NYPD,11365.0,192-20C 67 AVENUE,67 AVENUE,The Police Department responded to the complai...,QUEENS,1044863.0,208248.0,ONLINE,40.738062,-73.781277,"(40.738061586139345, -73.78127702459108)",Loud Music/Party,Residential,2019-01-01,2019-01-01
2,41307350,2019-01-01 00:02:13,2019-01-02 02:07:11,NYPD,10003.0,106 3 AVENUE,3 AVENUE,The Police Department responded to the complai...,MANHATTAN,987682.0,206223.0,ONLINE,40.73271,-73.987617,"(40.73271049805848, -73.9876165807211)",Loud Music/Party,Club/Bar/Restaurant,2019-01-01,2019-01-01
3,41313215,2019-01-01 00:03:08,2019-01-01 07:01:51,NYPD,11417.0,103-21 106 STREET,106 STREET,The Police Department responded to the complai...,QUEENS,1029824.0,188632.0,ONLINE,40.684311,-73.835678,"(40.68431070731603, -73.83567813039491)",Loud Music/Party,Residential,2019-01-01,2019-01-01
4,41312661,2019-01-01 00:04:26,2019-07-29 12:17:22,NYPD,,95 STREET,95 STREET,Your complaint has been forwarded to the New Y...,Unspecified,,,ONLINE,,,,Banging,Residential,2019-01-01,2019-01-01
