### DSC Challenge - Section-1

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

%matplotlib inline

#### Load NYPD Motor Vehicle Collisions Data

In [None]:
# Read the whole collisions export into memory; low_memory=False makes pandas
# parse the file in one pass rather than in internal chunks.
collisionsCsv = 'NYPD_Motor_Vehicle_Collisions.csv'
mvc_data = pd.read_csv(collisionsCsv, low_memory=False)
# Preview the first rows as the cell output
mvc_data.head()

#### Filter the data to 2016 and count collisions that injured or killed persons, pedestrians, cyclists, and motorists

In [None]:
def _from_2016(date_text):
    """Tell whether a DATE string reads '2016' from character index 6 onward."""
    return date_text[6:] == '2016'


def _collisions_with_casualties(frame, injured_col, killed_col):
    """Count the rows of ``frame`` where either casualty column is above zero."""
    harmed = (frame[injured_col] > 0) | (frame[killed_col] > 0)
    return harmed.sum()


# Report the size of the full dataset before any filtering
print("mvc_data.shape - (rows, columns): {0}".format(mvc_data.shape))

# Keep only the collisions whose DATE falls in 2016
reportedIn2016 = mvc_data['DATE'].apply(_from_2016)
totalIncidentsIn2016 = mvc_data[reportedIn2016]
print("totalIncidentsIn2016.shape - (rows, columns): {0}".format(totalIncidentsIn2016.shape))

# Each value below is a number of collisions (rows), not a head count:
# a row counts once if its injured or killed column is positive.
# 2016 collisions with any person injured or killed
tpinjAndkld = _collisions_with_casualties(
    totalIncidentsIn2016, 'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED')
# 2016 collisions with a cyclist injured or killed
tcinjAndkld = _collisions_with_casualties(
    totalIncidentsIn2016, 'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED')
# 2016 collisions with a pedestrian injured or killed
tpdinjAndkld = _collisions_with_casualties(
    totalIncidentsIn2016, 'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED')
# 2016 collisions with a motorist injured or killed
tminjAndkld = _collisions_with_casualties(
    totalIncidentsIn2016, 'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED')

# Print the four counts in the same order they were computed
for template, count in (
    ("totalIncidentsIn2016 / persons - injured and killed: {0}", tpinjAndkld),
    ("totalIncidentsIn2016 / cyclist - injured and killed: {0}", tcinjAndkld),
    ("totalIncidentsIn2016 / pedestrians - injured and killed: {0}", tpdinjAndkld),
    ("totalIncidentsIn2016 / motorist - injured and killed: {0}", tminjAndkld),
):
    print(template.format(count))
print()

In [None]:
# Number of collisions reported in 2016 (non-null DATE entries in the 2016 subset)
incReportedIn2016 = totalIncidentsIn2016['DATE'].count()

# Count every 2016 collision with at least one injury or death exactly once.
# Adding the persons, cyclist, pedestrian and motorist collision counts together
# would count a collision once for each category it appears in, which inflates
# the total and shrinks any proportion computed against it.
casualtyColumns = [
    'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
]
# A row qualifies when any casualty column is positive
tinjAndKld = (totalIncidentsIn2016[casualtyColumns] > 0).any(axis=1).sum()
print ("total incidents in 2016: {0}; total injured or killed: {1}; cyclist injured and killed: {2}".format(incReportedIn2016, tinjAndKld, tcinjAndkld))

#### Proportion of 2016 collisions in which a cyclist was injured or killed

In [None]:
# Cyclist casualty collisions relative to the combined injured-or-killed total
wrtTotalTemplate = "Proportion of cyclist injured or killed in 2016 collisions wrt total injured or killed: {:.10f}"
propcycInjRkldWrtTotinjRkld = tcinjAndkld / tinjAndKld
print(wrtTotalTemplate.format(propcycInjRkldWrtTotinjRkld))

# Cyclist casualty collisions relative to every collision reported in 2016
wrtAllTemplate = "Proportion of cyclist injured or killed in 2016 collisions: {:.10f}"
propinjRkld = tcinjAndkld / incReportedIn2016
print(wrtAllTemplate.format(propinjRkld))

#### Count the vehicles involved in 2016 collisions per ZIP code and find the maximum

In [None]:
# Columns that each hold the type of one vehicle involved in a collision
vehicleTypeColumns = [
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
    'VEHICLE TYPE CODE 3',
    'VEHICLE TYPE CODE 4',
    'VEHICLE TYPE CODE 5',
]
vehiclesAndZipCodeIn2016 = totalIncidentsIn2016[['ZIP CODE'] + vehicleTypeColumns]

# Per ZIP code, count the non-null entries in every vehicle-type column
byZipCode = vehiclesAndZipCodeIn2016.groupby(['ZIP CODE']).count()

# Add the five per-column counts for each ZIP code and keep the largest total
vehiclesPerZip = byZipCode.sum(axis=1)
maxInZip = vehiclesPerZip.max()
print("Maximum number of vehicles: {0}".format(maxInZip))

#### Compute Total Number of collisions (2013-2018)

In [None]:
# Collect collision data for the years 2013 through 2018.
# The year is read from DATE at character index 6 onward (the same slice the
# earlier 2016 filter uses). Extracting it once avoids rescanning DATE for every
# year, and rows with a missing DATE simply match no year instead of raising.
collisionYears = range(2013, 2019)
yearOfIncident = mvc_data['DATE'].str[6:]
incidentsByYear = {year: mvc_data[yearOfIncident == str(year)] for year in collisionYears}

# Keep the per-year names available for any cell that refers to them
totalIncidentsIn2013 = incidentsByYear[2013]
totalIncidentsIn2014 = incidentsByYear[2014]
totalIncidentsIn2015 = incidentsByYear[2015]
totalIncidentsIn2016 = incidentsByYear[2016]
totalIncidentsIn2017 = incidentsByYear[2017]
totalIncidentsIn2018 = incidentsByYear[2018]

# Create a dataframe with year and number of collisions, reporting every year's count
labels = ['Year', 'Collisions']
collisions = [(year, len(incidentsByYear[year].index)) for year in collisionYears]
for year, count in collisions:
    print ("Collision counts for {0}: {1}".format(year, count))
collisionsDF = pd.DataFrame.from_records(collisions, columns=labels)

# NOTE(review): if the export stops partway through 2018, that year's total is
# partial and will pull the fitted trend down - confirm the data's end date.
sns.regplot(x='Year', y='Collisions', data=collisionsDF)

# Least-squares line through the six (year, collisions) points
slope, intercept, r_value, p_value, std_err = stats.linregress(collisionsDF['Year'], collisionsDF['Collisions'])
print ("Slope of the linear regression: {:.10f}; intercept: {:.5f}; r_value: {:.5f}; p_value: {:.5f}".format(slope, intercept, r_value, p_value))