In [None]:
import pandas as pd 
import numpy as np
import re

# Cap DataFrame display at 30 columns and 30 rows.
pd.options.display.max_columns = 30
pd.options.display.max_rows = 30


## Load the noise complaints CSV

In [None]:
## Read the noise CSV, keeping only the columns at the listed positions
filepath = '/Users/nikiagrawal/Desktop/EDA/Service-Requests/'
filename = '311_Service_Requests_Noise.csv'
sr_noise_data = pd.read_csv(
    f"{filepath}{filename}",
    usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 19, 21, 25, 26, 27, 28, 38, 39, 40],
)

In [None]:
#sr_noise_data.shape

In [None]:
#sr_noise_data.columns

In [None]:
### Parse the created/closed timestamps with pd.to_datetime()
# Both columns share the same 12-hour "month/day/year hour:minute:second AM/PM" layout.
for _date_column in ('Created Date', 'Closed Date'):
    sr_noise_data[_date_column] = pd.to_datetime(
        sr_noise_data[_date_column], format="%m/%d/%Y %I:%M:%S %p"
    )


In [None]:
#sr_noise_data.dtypes

### Noise Feature Engineering

There are currently all kinds of labels used to describe noise.  
I will simplify the categorization of the noise data by creating 2 new features, "Noise Complaint Type" and "Noise Descriptor", based on the 3 features 'Complaint Type', 'Descriptor', and 'Location Type' listed below. 

In [None]:
#  sr_noise_data[['Complaint Type','Descriptor','Location Type','Unique Key']]\
#             .groupby(['Complaint Type','Descriptor','Location Type']).count()

In [None]:
## Helper function to check if string contains an element from a list
def contains_elements_from(string, lst):
    """Report whether any element of ``lst`` occurs inside ``string``.

    Args:
    string: text to search in
    lst: candidate strings; each one is tested with the ``in`` operator

    Returns:
    True as soon as one element is found in ``string``, otherwise False
    (including when ``lst`` is empty).
    """
    for candidate in lst:
        if candidate in string:
            return True
    return False

In [None]:
def get_description(complaint_descriptor):
    """Relabel a raw (Complaint Type, Descriptor) pair for the 'Noise Descriptor' feature.

    Args:
    complaint_descriptor: row-like object (pandas row, list or tuple) whose
        first two elements are the complaint type and the descriptor.

    Returns:
    string label that will be used for the new feature, Noise Descriptor.
    "Unspecified" is returned when the descriptor is missing (not a string).
    """
    # Unpack by iteration instead of row[0] / row[1]: integer-key lookups on a
    # label-indexed pandas Series are deprecated, while iteration yields the
    # values in order for Series, lists and tuples alike.
    complaint, descriptor = list(complaint_descriptor)[:2]

    # A missing descriptor (NaN / None) cannot be cleaned up, whatever the complaint type.
    if not isinstance(descriptor, str):
        return "Unspecified"

    if complaint == "Noise":
        # Clean up DEP noise descriptor categories: skip the leading
        # "Noise:" / "Noise," prefix and keep the following run of
        # word / whitespace / slash characters (stops at e.g. "(").
        match = re.search(r"(^Noise?[:,]*)([\s\w\/]*)", descriptor)
        # Descriptors without the expected prefix are kept whole instead of raising.
        raw_label = match.group(2) if match else descriptor
        temp = raw_label.strip(" ").capitalize()
    elif descriptor == '21 Collection Truck Noise':
        temp = "truck"
    else:
        # Keep only the text before the first "/" (e.g. "Car/Truck Horn" -> "Car").
        temp = descriptor.split("/", 2)[0]

    # Keyword matching is case-sensitive and the first matching rule wins.
    if contains_elements_from(temp, ['dog', 'animals']):
        temp = "Animal"
    elif contains_elements_from(temp, ['Car', 'Engine', 'truck', 'carting', 'Vehicle', 'Boat']):
        temp = "Vehicle"
    elif contains_elements_from(temp, ['Other noise sources']):
        # Must be tested before the Helicopter rule: its 'Other' keyword also
        # matches this text and would otherwise make this branch unreachable.
        temp = "Other Sources"
    elif contains_elements_from(temp, ['NYPD', 'News', 'Other']):
        temp = "Helicopter"
    elif contains_elements_from(temp, ['Construction', 'Jack']):
        temp = "Construction"
    elif contains_elements_from(temp, ['Music', 'Loud music']):
        temp = "Loud Music/Party"
    elif contains_elements_from(temp, ['Manufacturing']):
        temp = "Manufacturing"
    return temp

In [None]:
def get_complaint_type(complaint_descriptor_location):
    """Relabel a raw row for the 'Noise Complaint Type' feature.

    Args:
    complaint_descriptor_location: row-like object (pandas row, list or tuple)
        whose first three elements are the complaint type, the descriptor and
        the location type. The descriptor is accepted but not used.

    Returns:
    string label that will be used for the new feature, Noise Complaint Type:
    - "Unspecified" for the generic "Noise" complaint or a missing complaint type
    - "Vehicle" for "Collection Truck Noise"
    - otherwise the capitalized text after "Noise -"; for "Commercial" the
      location type is returned instead when one is present
    - a complaint type without the "Noise -" prefix is returned capitalized
    """
    # Unpack by iteration instead of row[0] / row[2]: integer-key lookups on a
    # label-indexed pandas Series are deprecated.
    complaint, _, location = list(complaint_descriptor_location)[:3]

    if not isinstance(complaint, str) or complaint == "Noise":
        return "Unspecified"
    if complaint == "Collection Truck Noise":
        return "Vehicle"

    parts = complaint.split('Noise -')
    if len(parts) < 2:
        # No "Noise -" prefix to strip: keep the complaint text rather than raising.
        return complaint.strip(" ").capitalize()

    temp = parts[1].strip(" ").capitalize()
    # Commercial complaints are refined by their location type when it is not missing.
    if temp == "Commercial" and isinstance(location, str):
        temp = location
    return temp

In [None]:
### Add new feature column "Noise Descriptor"
# get_description is called once per row with the two source columns.
sr_noise_data["Noise Descriptor"] = (
    sr_noise_data[['Complaint Type', 'Descriptor']]
    .apply(get_description, axis=1)
)

In [None]:
# Numbered list of the distinct "Noise Descriptor" labels, in order of first appearance.
list(enumerate(pd.unique(sr_noise_data["Noise Descriptor"])))

In [None]:
### Add new feature column "Noise Complaint Type"
# get_complaint_type is called once per row with the three source columns.
sr_noise_data["Noise Complaint Type"] = (
    sr_noise_data[['Complaint Type', 'Descriptor', 'Location Type']]
    .apply(get_complaint_type, axis=1)
)

In [None]:
# Numbered list of the distinct "Noise Complaint Type" labels, in order of first appearance.
list(enumerate(pd.unique(sr_noise_data["Noise Complaint Type"])))

In [None]:
# Drop columns that are no longer useful
sr_noise_data = sr_noise_data.drop(
    columns=['Status', 'Agency Name', 'Complaint Type', 'Descriptor', 'Location Type']
)


I will create some new date and time columns that will help in visualizations. 

In [None]:
# Add columns related to date and time
# Duration between creation and closure of each request.
sr_noise_data['Time Elapsed'] = sr_noise_data['Closed Date'] - sr_noise_data['Created Date']

# Calendar date of creation, and the first day of the creation month.
sr_noise_data['Date_created'] = sr_noise_data['Created Date'].dt.date
sr_noise_data['Year_month_created'] = (
    pd.to_datetime(sr_noise_data['Date_created']).dt.to_period('M').dt.to_timestamp()
)

# Individual calendar components of the creation timestamp.
sr_noise_data['Year_created'] = sr_noise_data['Created Date'].dt.year
sr_noise_data['Month_created'] = sr_noise_data['Created Date'].dt.month
sr_noise_data['Day_created'] = sr_noise_data['Created Date'].dt.dayofweek  # day of week, Monday=0
sr_noise_data['Hour_created'] = sr_noise_data['Created Date'].dt.hour

I will add some 'denominator' and 'numerator count' columns that will aid in visualizations. 

In [None]:
# Broadcast per-group complaint counts onto every row with groupby().transform('count').
# Each statement only reads existing columns and (re)assigns a single output column, so
# the cell can be re-run safely and no longer relies on merge suffixes
# ('Unique Key_x' / 'Unique Key_y') followed by renames.
# NOTE: rows whose grouping key is missing are kept (with a NaN count) rather than
# being dropped, as the former inner merge on 'Year_month_created' did.

#1) Denominator: total number of complaints created every month
sr_noise_data['Month_Count_All'] = (
    sr_noise_data.groupby('Year_month_created')['Unique Key'].transform('count')
)

#2) Numerator #1: number of complaints grouped by Noise Descriptor, per month
sr_noise_data['Month_Count_Descriptor'] = (
    sr_noise_data.groupby(['Noise Descriptor', 'Year_month_created'])['Unique Key']
    .transform('count')
)

#3) Numerator #2: number of complaints grouped by Noise Complaint Type, per month
sr_noise_data['Month_Count_Complaint_Type'] = (
    sr_noise_data.groupby(['Noise Complaint Type', 'Year_month_created'])['Unique Key']
    .transform('count')
)


In [None]:
### Check that all month_count_groups add up to the month_count_all total for a single month (Jan 2019)
# a = sr_noise_data[sr_noise_data['Month_created']==1]
# b = a[a['Year_created']==2019]
# print(b['Month_Count_Complaint_Type'].unique().sum())
# print(b['Month_Count_All'][0])

In [None]:
# Print a summary of the table (columns, non-null counts, dtypes).
sr_noise_data.info()

In [None]:
# Write the cleaned table to a CSV file, leaving out the DataFrame index.
filename = "311_Service_Requests_Noise_Cleaned.csv"
sr_noise_data.to_csv(path_or_buf=filename, index=False)