# Part 1 - Initial Cleaning and Transforming of the Complaints Data

The following ETL section was done in VScode using pandas. \
Read in file containing complaints dataset from NYC Open Data Website 
https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i

In [None]:
import pandas as pd

# Load the raw NYPD complaints export. low_memory=False makes pandas read the
# file in one pass instead of in chunks, which avoids mixed-type column inference.
crime = pd.read_csv('NYPD_Complaint_Data_Historic.csv', low_memory = False)

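Split the complaint date into its three components so that incorrectly entered years can be replaced. Note that the source dates are written month/day/year, so the column labelled `DAY` actually holds the month and the column labelled `MONTH` holds the day; the pieces are rejoined in the same order, so the result is unaffected.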

In [None]:
# Split on the first two '/' only, giving exactly three pieces per date.
# `n` is passed by keyword: from pandas 2.0 every argument of Series.str.split
# after `pat` is keyword-only, so the positional form no longer works.
crime[['DAY','MONTH','YEAR']] = crime['CMPLNT_FR_DT'].str.split('/', n = 2, expand = True)

Replace incorrectly entered years

In [None]:
# Mistyped year -> the year it is corrected to. No corrected value is itself a
# key, so applying the whole mapping at once matches applying each fix in turn.
year_fixes = {
    '1028': '2018',
    '1017': '2017',
    '1018': '2018',
    '1016': '2016',
    '1027': '2017',
    '1015': '2015',
    '1025': '2015',
    '1021': '2021',
    '1019': '2019',
    '1029': '2019',
    '1010': '2020',
    '1020': '2020',
    '1026': '2016',
}
crime['YEAR'] = crime['YEAR'].replace(year_fixes)

Recombine day, month and year to date

In [None]:
# Rejoin the three pieces with '/' using vectorised string concatenation; a
# row-by-row apply(axis=1) does the same work far more slowly on a dataset
# of this size.
crime['DATE'] = (
    crime['DAY'].astype(str) + '/'
    + crime['MONTH'].astype(str) + '/'
    + crime['YEAR'].astype(str)
)

Drop previously created columns and column with incorrect dates

In [None]:
# The helper columns and the original (uncorrected) date are no longer needed.
crime = crime.drop(columns = ['CMPLNT_FR_DT','DAY','MONTH','YEAR'])

Convert time related columns to datetime, then create a combined date and time column - This will take a while to run

In [None]:
import datetime
from datetime import date, time

# Both pieces must be text before they can be glued together.
crime = crime.astype({'DATE':str, 'CMPLNT_FR_TM':str})
# "<date> <time>" for every row; anything that cannot be parsed becomes NaT.
timestamp_text = crime['DATE'].str.cat(crime['CMPLNT_FR_TM'], sep = ' ')
crime['DATE_AND_TIME'] = pd.to_datetime(timestamp_text, errors = 'coerce')

Convert date column to datetime as well

In [None]:
# Parse the rebuilt month/day/year strings. errors='coerce' turns any string
# that does not match the format (for example the text produced from a missing
# date) into NaT instead of aborting the whole conversion, which matches how
# DATE_AND_TIME is parsed from the same strings.
crime['DATE'] = pd.to_datetime(crime['DATE'], format='%m/%d/%Y', errors='coerce')

Filter to complaints from 2017 onward

In [None]:
# Keep complaints on or after 1 Jan 2017; rows whose timestamp is NaT never
# satisfy the comparison and are dropped as well.
cutoff = pd.Timestamp('2017/01/01')
crime = crime.loc[crime['DATE_AND_TIME'] >= cutoff]

Remove rows with null coordinates

In [None]:
# A row is discarded if either coordinate is missing.
crime = crime.dropna(subset = ['Latitude', 'Longitude'])

Export to csv in a folder for exported data - Uploaded to a blob in a storage container to pull from later

In [None]:
# `{file_path}` is a literal placeholder (this is a raw string, not an
# f-string); substitute the real user folder before running.
# index=False keeps the pandas row index out of the file so that the CSV's
# first column is CMPLNT_NUM. The column list used when this file is read
# back has no slot for an extra unnamed index column.
crime.to_csv(r'C:\Users\{file_path}\Exports\crime.csv', header=True, index=False)

# Part 2 - Further Cleaning of the Complaints Data

In [None]:
import pandas as pd
from config import SAS_TOKEN, CONTAINER, STOR_ACCT
import datetime
import re

# Register the SAS token so Spark can read from the blob container.
spark.conf.set(f'fs.azure.sas.{CONTAINER}.{STOR_ACCT}.blob.core.windows.net', SAS_TOKEN)

# NOTE(review): ROOT_PATH is not defined or imported in the visible cells -
# confirm where it is set before running this cell.
read_path = ROOT_PATH + 'mta-nypd/crime.csv'

# Rows that cannot be parsed are dropped; multiLine lets quoted fields span lines.
csv_options = {'header': True, 'mode': "DROPMALFORMED", 'multiLine': True}
all_nyc_complaints_spark = spark.read.csv(read_path, **csv_options)

all_nyc_complaints_spark.display()

CMPLNT_NUM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,LAW_CAT_CD,BORO_NM,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,JURIS_DESC,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,Latitude,Longitude,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,DATE,DATE_AND_TIME
100016410,100.0,03/01/2018,126,MISCELLANEOUS PENAL LAW,198.0,CRIMINAL CONTEMPT 1,FELONY,QUEENS,FRONT OF,RESIDENCE - PUBLIC HOUSING,N.Y. HOUSING POLICE,25-44,BLACK,M,40.58766357000008,-73.81055977899997,PATROL BORO QUEENS SOUTH,,25-44,BLACK,F,02/22/2018,2018-02-22 22:30:00
100022845,102.0,11/17/2021,578,HARRASSMENT 2,637.0,"HARASSMENT,SUBD 1,CIVILIAN",VIOLATION,QUEENS,INSIDE,RESIDENCE-HOUSE,N.Y. POLICE DEPT,45-64,WHITE HISPANIC,F,40.69387313600004,-73.86731416099997,PATROL BORO QUEENS SOUTH,,18-24,WHITE HISPANIC,F,11/17/2021,2021-11-17 18:30:00
100053585,112.0,07/31/2020,112,THEFT-FRAUD,739.0,"FRAUD,UNCLASSIFIED-FELONY",FELONY,QUEENS,INSIDE,RESIDENCE-HOUSE,N.Y. POLICE DEPT,UNKNOWN,UNKNOWN,U,40.72279798100004,-73.85195635699995,PATROL BORO QUEENS NORTH,,45-64,ASIAN / PACIFIC ISLANDER,F,07/30/2020,2020-07-30 18:00:00
100055459,101.0,07/03/2019,233,SEX CRIMES,681.0,"CHILD, ENDANGERING WELFARE",MISDEMEANOR,QUEENS,INSIDE,RESIDENCE - APT. HOUSE,N.Y. POLICE DEPT,UNKNOWN,WHITE HISPANIC,M,40.60295159100008,-73.74997626099997,PATROL BORO QUEENS SOUTH,,<18,WHITE HISPANIC,M,06/26/2019,2019-06-26 10:00:00
100071324,105.0,05/10/2018,344,ASSAULT 3 & RELATED OFFENSES,101.0,ASSAULT 3,MISDEMEANOR,QUEENS,REAR OF,RESIDENCE-HOUSE,N.Y. POLICE DEPT,45-64,BLACK,F,40.70168393200005,-73.731421707,PATROL BORO QUEENS SOUTH,,<18,BLACK,F,05/10/2018,2018-05-10 19:35:00
100117814,108.0,03/21/2019,109,GRAND LARCENY,409.0,"LARCENY,GRAND BY EXTORTION",FELONY,QUEENS,INSIDE,COMMERCIAL BUILDING,N.Y. POLICE DEPT,,,,40.74296235400004,-73.95527424799997,PATROL BORO QUEENS NORTH,,25-44,WHITE HISPANIC,M,03/18/2019,2019-03-18 17:00:00
100124123,103.0,07/21/2019,109,GRAND LARCENY,404.0,"LARCENY,GRAND FROM PERSON,PERSONAL ELECTRONIC DEVICE(SNATCH)",FELONY,QUEENS,,BUS STOP,N.Y. POLICE DEPT,25-44,BLACK,M,40.706529671000055,-73.76057775999993,PATROL BORO QUEENS SOUTH,,25-44,BLACK,F,07/21/2019,2019-07-21 17:00:00
100137955,115.0,09/29/2017,578,HARRASSMENT 2,638.0,"HARASSMENT,SUBD 3,4,5",VIOLATION,QUEENS,INSIDE,OTHER,N.Y. POLICE DEPT,UNKNOWN,UNKNOWN,U,40.754639784,-73.89419876,PATROL BORO QUEENS NORTH,,45-64,WHITE HISPANIC,M,09/29/2017,2017-09-29 11:10:00
100159615,101.0,11/10/2018,578,HARRASSMENT 2,638.0,"HARASSMENT,SUBD 3,4,5",VIOLATION,QUEENS,INSIDE,RESIDENCE-HOUSE,N.Y. POLICE DEPT,UNKNOWN,BLACK,M,40.60377844000004,-73.75928604599994,PATROL BORO QUEENS SOUTH,,45-64,BLACK,F,11/09/2018,2018-11-09 17:00:00
100161373,109.0,02/12/2018,106,FELONY ASSAULT,105.0,STRANGULATION 1ST,FELONY,QUEENS,INSIDE,RESIDENCE-HOUSE,N.Y. POLICE DEPT,25-44,ASIAN / PACIFIC ISLANDER,F,40.76655815000004,-73.80125681899995,PATROL BORO QUEENS NORTH,,25-44,WHITE,M,02/12/2018,2018-02-12 21:05:00


The code above provides access to the crime.csv file from a storage blob. The file is imported as a PySpark dataframe and covers complaints from 2017 to 2021 in all five boroughs. Because this is such a large dataset, we will break it down by borough and year. First, we remove columns that were deemed unnecessary, such as the park's name (if the crime occurred at a park), using the code below.

In [None]:
# Columns that are not used anywhere in the later analysis.
unused_columns = [
    'CMPLNT_FR_TM',
    'CMPLNT_TO_DT',
    'CMPLNT_TO_TM',
    'CRM_ATPT_CPTD_CD',
    'JURISDICTION_CODE',
    'PARKS_NM',
    'HADEVELOPT',
    'HOUSING_PSA',
    'X_COORD_CD',
    'Y_COORD_CD',
    'TRANSIT_DISTRICT',
    'Lat_Lon',
]
nyc_complaints_spark = all_nyc_complaints_spark.drop(*unused_columns)


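The large dataset is broken down into smaller subsets since it was too large for a single pandas dataframe. `randomSplit` was used to distribute the rows randomly into 10 subsets of roughly equal size (the weights are targets, so the subset sizes are not exactly equal).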

In [None]:
# Ten equal weights give ten random partitions; the fixed seed makes the
# split repeatable.
split_weights = [0.1] * 10
df1, df2, df3, df4, df5, df6, df7, df8, df9, df10 = nyc_complaints_spark.randomSplit(split_weights, seed = 0)


In [None]:
def only_borough(df):
    """Append the rows of ``df`` to the module-level per-borough frames.

    Rows are routed by their ``BORO_NM`` value to ``BK``, ``MN``, ``QN``,
    ``SI`` or ``BX``; rows with a null ``BORO_NM`` go to ``NA``. Each of those
    globals is *rebound* to a new, longer dataframe, so they must already
    exist before the first call. Nothing is returned.
    """
    global BK, MN, BX, QN, SI, NA

    boro = df['BORO_NM']

    def grow(collected, mask):
        # Stack this chunk's matching rows underneath what was gathered so far.
        return pd.concat([collected, df[mask]])

    BK = grow(BK, boro == 'BROOKLYN')
    MN = grow(MN, boro == 'MANHATTAN')
    QN = grow(QN, boro == 'QUEENS')
    SI = grow(SI, boro == 'STATEN ISLAND')
    BX = grow(BX, boro == 'BRONX')
    NA = grow(NA, pd.isnull(boro))

# Six independent, empty accumulators: one per borough plus NA for rows
# that do not name a borough.
BK, MN, QN, SI, BX, NA = (pd.DataFrame() for _ in range(6))

Six new dataframes were created, one for each borough and one for rows that did not name a borough. The helper function above sorts each row into the dataframe for its borough. Below is the actual iteration through all 10 subsets. The resulting six dataframes were converted back into PySpark dataframes and saved as CSVs.

In [None]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType

# Pull each Spark subset into pandas and append its rows to the per-borough
# frames.
data = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10]
for subset in data:
    only_borough(subset.toPandas())

# only_borough() rebinds the module-level BK/MN/QN/SI/BX/NA names on every
# call, so this list must be built AFTER the loop; built earlier it would
# still point at the original empty frames.
bor = ['MN', 'QN', 'BK', 'SI', 'NA', 'BX']
boroughs = [MN, QN, BK, SI, NA, BX]

# Every column is written back as a nullable string.
column_names = [
    "CMPLNT_NUM", "ADDR_PCT_CD", "RPT_DT", "KY_CD", "OFNS_DESC", "PD_CD",
    "PD_DESC", "LAW_CAT_CD", "BORO_NM", "LOC_OF_OCCUR_DESC", "PREM_TYP_DESC",
    "JURIS_DESC", "SUSP_AGE_GROUP", "SUSP_RACE", "SUSP_SEX", "Latitude",
    "Longitude", "PATROL_BORO", "STATION_NAME", "VIC_AGE_GROUP", "VIC_RACE",
    "VIC_SEX", "DATE", "DATE_AND_TIME",
]
mySchema = StructType([StructField(name, StringType(), True) for name in column_names])

# Write all six frames (pairing each with its two-letter code), not just the
# first five, so that the Bronx file read in Part 3 is actually produced.
for code, frame in zip(bor, boroughs):
    complaints = spark.createDataFrame(frame, schema=mySchema)
    complaints.coalesce(1).write.mode('overwrite').csv(ROOT_PATH + f"/{code}_complaints.csv", header = 'True')


# Part 3 - Transformations: Connect Complaints to Stations

In [None]:
# Register the SAS token so Spark can read from the blob container.
spark.conf.set(f'fs.azure.sas.{CONTAINER}.{STOR_ACCT}.blob.core.windows.net', SAS_TOKEN)

read_path = ROOT_PATH + 'BX_complaints.csv'

# inferSchema lets Spark pick column types instead of reading everything as text.
bx_read_options = {
    'header': True,
    'mode': "DROPMALFORMED",
    'inferSchema': True,
    'multiLine': True,
}
BX_complaints_spark = spark.read.csv(read_path, **bx_read_options)

BX_complaints_spark.display()

CMPLNT_NUM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,LAW_CAT_CD,BORO_NM,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,JURIS_DESC,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,Latitude,Longitude,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,DATE,DATE_AND_TIME
100020324,47.0,2017-06-14,351,CRIMINAL MISCHIEF & RELATED OF,259.0,"CRIMINAL MISCHIEF,UNCLASSIFIED 4",MISDEMEANOR,BRONX,INSIDE,RESIDENCE - PUBLIC HOUSING,N.Y. HOUSING POLICE,18-24,BLACK,M,40.887332818,-73.847250013,PATROL BORO BRONX,,18-24,BLACK,F,2017-06-14,2017-06-14T10:50:00.000+0000
100043252,52.0,2019-02-18,578,HARRASSMENT 2,638.0,"HARASSMENT,SUBD 3,4,5",VIOLATION,BRONX,INSIDE,RESIDENCE - APT. HOUSE,N.Y. POLICE DEPT,UNKNOWN,UNKNOWN,F,40.86652284100006,-73.89404245199995,PATROL BORO BRONX,,45-64,WHITE,F,2019-02-18,2019-02-18T12:30:00.000+0000
100079454,49.0,2019-03-24,105,ROBBERY,394.0,"ROBBERY,LICENSED FOR HIRE VEHICLE",FELONY,BRONX,FRONT OF,STREET,N.Y. POLICE DEPT,18-24,BLACK,M,40.87193544900003,-73.85174002899998,PATROL BORO BRONX,,45-64,UNKNOWN,M,2019-03-24,2019-03-24T02:05:00.000+0000
100114671,52.0,2020-01-10,578,HARRASSMENT 2,637.0,"HARASSMENT,SUBD 1,CIVILIAN",VIOLATION,BRONX,INSIDE,OTHER,N.Y. POLICE DEPT,<18,WHITE HISPANIC,M,40.86940749200004,-73.87999831299999,PATROL BORO BRONX,,UNKNOWN,UNKNOWN,E,2020-01-10,2020-01-10T16:20:00.000+0000
100114720,46.0,2018-09-24,344,ASSAULT 3 & RELATED OFFENSES,101.0,ASSAULT 3,MISDEMEANOR,BRONX,FRONT OF,PRIVATE/PAROCHIAL SCHOOL,N.Y. POLICE DEPT,<18,UNKNOWN,M,40.85238190900003,-73.91351577099994,PATROL BORO BRONX,,<18,BLACK HISPANIC,M,2018-09-24,2018-09-24T15:30:00.000+0000
100119422,47.0,2018-10-14,344,ASSAULT 3 & RELATED OFFENSES,113.0,"MENACING,UNCLASSIFIED",MISDEMEANOR,BRONX,INSIDE,RESIDENCE - APT. HOUSE,N.Y. POLICE DEPT,25-44,BLACK,F,40.88561881400005,-73.86608159099995,PATROL BORO BRONX,,25-44,BLACK,M,2018-10-14,2018-10-14T03:01:00.000+0000
100127747,52.0,2018-10-16,361,OFF. AGNST PUB ORD SENSBLTY &,639.0,AGGRAVATED HARASSMENT 2,MISDEMEANOR,BRONX,INSIDE,HOSPITAL,N.Y. POLICE DEPT,25-44,WHITE,M,40.880335555000045,-73.87874183599996,PATROL BORO BRONX,,45-64,WHITE,M,2018-10-14,2018-10-14T02:23:00.000+0000
100137802,48.0,2020-12-01,105,ROBBERY,388.0,"ROBBERY,RESIDENTIAL COMMON AREA",FELONY,BRONX,INSIDE,RESIDENCE - APT. HOUSE,N.Y. POLICE DEPT,45-64,BLACK,M,40.84678146700002,-73.89245825499995,PATROL BORO BRONX,,UNKNOWN,UNKNOWN,E,2020-11-30,2020-11-30T21:55:00.000+0000
100155584,50.0,2021-02-22,109,GRAND LARCENY,439.0,"LARCENY,GRAND FROM OPEN AREAS, UNATTENDED",FELONY,BRONX,INSIDE,MAILBOX OUTSIDE,N.Y. POLICE DEPT,,,,40.87825607100008,-73.90301705399997,PATROL BORO BRONX,,45-64,WHITE,M,2020-05-01,2020-05-01T12:00:00.000+0000
100169781,47.0,2017-11-16,113,FORGERY,729.0,"FORGERY,ETC.,UNCLASSIFIED-FELO",FELONY,BRONX,,TRANSIT - NYC SUBWAY,N.Y. TRANSIT POLICE,45-64,BLACK,M,40.887313634,-73.847271758,PATROL BORO BRONX,GUN HILL ROAD,UNKNOWN,UNKNOWN,E,2017-11-16,2017-11-16T17:45:00.000+0000


At this point, each of the csvs we saved only contains complaints from one borough. The Bronx dataset is shown above. It is worth noting that the inferred imported schema has changed the last column to a timestamp, which will come in handy when the rows are further separated out into years. The code above was also altered so that it can also read in the datasets for the other boroughs. The first helper function below filters out rows that are between two dates and the second saves the rows as a new csv.

In [None]:
from pyspark.sql import functions as F

def only_year(old_df, first_date, second_date):
    """Return the rows of ``old_df`` whose DATE lies strictly between two dates.

    Both bounds are exclusive: a row is kept only when
    ``first_date < DATE < second_date``.
    """
    date_col = F.col('DATE')
    after_start = date_col > F.lit(first_date)
    before_end = date_col < F.lit(second_date)
    return old_df.filter(after_start & before_end)

def save(df, borough, year):
    """Write ``df`` to ``Borough_Year_Complaints/<borough>_<year>.csv``.

    ``df`` is converted to a Spark dataframe, collapsed to one partition and
    written with a header row under ROOT_PATH, overwriting any existing output
    at that location.
    """
    spark_frame = spark.createDataFrame(df)
    target = ROOT_PATH + f"/Borough_Year_Complaints/{borough}_{year}.csv"
    single_partition = spark_frame.coalesce(1)
    single_partition.write.mode('overwrite').csv(target, header = 'True')

The only_year function was used to separate the datasets of each borough into different years, ranging from 2017 to 2021. Below is an example of how the Manhattan crimes were separated out.

In [None]:
# only_year() excludes both bounds, so each year runs from the day after
# 31 Dec of the previous year to the day before 1 Jan of the following year.
MN_2021, MN_2020, MN_2019, MN_2018, MN_2017 = [
    only_year(MN_complaints_spark, f'{yr - 1}-12-31', f'{yr + 1}-01-01')
    for yr in (2021, 2020, 2019, 2018, 2017)
]

A dataset regarding the locations of the train stations was imported in as a pyspark dataframe in the cell below. This dataset will help us determine the closest train station to a complaint and what the distance is. It should be noted that the train stations dataset does not include Staten Island train stations.

In [None]:
read_path = ROOT_PATH + 'mta-nypd/stopsNYCgrouped.csv'

# Same reader settings as for the complaints files: header row, inferred
# column types, malformed rows dropped, quoted fields may span lines.
station_read_options = {
    'header': True,
    'inferSchema': True,
    'mode': "DROPMALFORMED",
    'multiLine': True,
}
stations = spark.read.csv(read_path, **station_read_options)

In [None]:
# Replace the space in each column name with an underscore so the values can
# be read as attributes of the collected rows.
for spaced_name in ('Station Name', 'Station Latitude', 'Station Longitude', 'All Lines'):
    stations = stations.withColumnRenamed(spaced_name, spaced_name.replace(' ', '_'))

# Bring every station row to the driver as a plain Python list.
s = stations.collect()

Two more helper functions below. The distance function was designed to determine the distance between two locations using latitudes and longitudes. Using the distance function, the closest_station function would iterate through all the stations' location and the location of a complaint to determine which station is the closest.

In [None]:
from math import cos, asin, sqrt, pi

def distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance between two points.

    All four arguments are in decimal degrees. The result is in kilometres:
    12742 is the Earth's diameter in km (2 * 6371). NaN inputs yield NaN.
    """
    p = pi/180  # degrees -> radians
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
    # Mathematically 0 <= a <= 1, but rounding can push it a hair outside that
    # range for (near-)antipodal points, which would make asin() raise.
    # Explicit comparisons are used so that NaN passes through unchanged.
    if a < 0.0:
        a = 0.0
    elif a > 1.0:
        a = 1.0
    return 12742 * asin(sqrt(a))

def closest_station(lat, long, stations):
    """Find the station nearest to the point (``lat``, ``long``).

    ``stations`` is a sequence of rows exposing ``Station_Latitude``,
    ``Station_Longitude``, ``Station_Name`` and ``All_Lines`` attributes.
    Every row is examined, however many there are.

    Returns a tuple ``(station_name, distance, lines)`` where ``distance`` is
    whatever ``distance()`` returns for the winning station. If ``stations``
    is empty, or no distance compares as smaller than infinity (for example
    when the coordinates are NaN), the result is ``('', inf, '')``.
    """
    min_distance = float('inf')
    closest = ''
    line = ''  # defined up front so the return is safe when nothing matches
    for station in stations:
        tmp = distance(
                    lat,
                    long,
                    station.Station_Latitude,
                    station.Station_Longitude)
        if tmp < min_distance:
            min_distance = tmp
            closest = station.Station_Name
            line = station.All_Lines
    return closest, min_distance, line

The code below converts each PySpark dataframe to a pandas dataframe, loops through each yearly dataset for a borough, and adds columns for the closest station, the distance between the crime and that station, and all the train lines that stop at that station. The dataframes are then saved to CSV files. As with the previous code, the cell below was altered for each borough by swapping in that borough's two-letter code.

In [None]:
datas = [SI_2021, SI_2020, SI_2019, SI_2018, SI_2017]
years = [2021, 2020, 2019, 2018, 2017]

# Each dataset is paired explicitly with its year, so the label used in the
# file name cannot drift out of step with the list.
for year, data in zip(years, datas):
    data = data.drop('STATION_NAME')
    pd_data = data.toPandas()

    # One (station name, distance, lines) tuple per complaint.
    nearest = pd_data.apply(
        lambda row: closest_station(row['Latitude'], row['Longitude'], s),
        axis=1)

    # Spread the tuples over three new columns, keeping the row alignment.
    pd_data[['closest_station','station_distance','station_line']] = pd.DataFrame(
        nearest.tolist(),
        index = pd_data.index)

    save(pd_data, 'Staten_Island', year)
    print(f'{year} for Staten Island data saved')
    
    

2021 for Staten Island data saved
2020 for Staten Island data saved
2019 for Staten Island data saved
2018 for Staten Island data saved
2017 for Staten Island data saved
