In [1]:
import pylab as pl
import pandas as pd
import numpy as np
import os
import scipy.stats
import statsmodels.formula.api as smf
from datetime import timedelta, date, datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf

pl.style.use('ggplot')
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


#Data Wrangling

In [2]:
complaints = pd.read_csv('311_complaints.csv')
collisions = pd.read_csv('matched.csv')

In [3]:
complaints.shape

(287899, 8)

In [4]:
merged = pd.merge(complaints, collisions, on=['created_date', 'closed_date'], how = 'left')

In [5]:
merged['collision'].fillna(0, inplace=True)

In [6]:
merged['created_date'] = pd.to_datetime(pd.Series(merged['created_date']))

In [7]:
#Here we create two columns for evaluating the time of day and if the event occurred during peak travel hours

daytime = 6
nighttime = 19
peak_start1 = 6
peak_end1 = 10
peak_start2 = 16
peak_end2 = 18

#Creating two columns based on evaluations of the TIME column
merged['Night'] = merged.created_date.apply(lambda x:  (x.hour <= daytime or x.hour >= nighttime))
merged['Peak'] = merged.created_date.apply(lambda x: ((peak_start1 <= x.hour < peak_end1) | (peak_start2 <= x.hour < peak_end2)))

In [8]:
merged['Night'] = merged['Night'].astype(int)
merged['Peak'] = merged['Peak'].astype(int)

In [9]:
merged.head(3)

Unnamed: 0.1,Unnamed: 0,weather_condition,created_date,closed_date,agency,complaint_type,descriptor,borough,collision,Night,Peak
0,0,Clear,2012-07-01 23:40:00,7/1/2012 23:50,DOT,Traffic Signal Condition,Junction Box,BRONX,0,1,0
1,1,Clear,2012-07-01 23:25:00,7/1/2012 23:35,DOT,Traffic Signal Condition,Base Door,QUEENS,0,1,0
2,2,Clear,2012-07-01 23:21:00,7/2/2012 10:02,DOT,Street Light Condition,Street Light Out,QUEENS,0,1,0


#Logistic Regression

### I get a weird linear algebra error when I run the logit with too many variables, so I have two different models

In [10]:
logit_mod = smf.logit('collision ~ complaint_type + borough', data = merged).fit()
print (logit_mod.summary())

         Current function value: 0.096612
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:              collision   No. Observations:               289575
Model:                          Logit   Df Residuals:                   289567
Method:                           MLE   Df Model:                            7
Date:                Sun, 13 Dec 2015   Pseudo R-squ.:                 0.02305
Time:                        13:14:38   Log-Likelihood:                -27976.
converged:                      False   LL-Null:                       -28636.
                                        LLR p-value:                7.354e-281
                                                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------------------------
Intercept                                     -4.2356      0.048    -88.270     



In [11]:
marginal = logit_mod.get_margeff()
print (marginal.summary())

        Logit Marginal Effects       
Dep. Variable:              collision
Method:                          dydx
At:                           overall
                                                dy/dx    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------------------------
complaint_type[T.Traffic]                     -0.3504      6.915     -0.051      0.960       -13.903    13.202
complaint_type[T.Traffic Signal Condition]    -0.0066      0.001    -11.144      0.000        -0.008    -0.005
borough[T.BROOKLYN]                            0.0147      0.001     14.145      0.000         0.013     0.017
borough[T.MANHATTAN]                           0.0232      0.001     17.153      0.000         0.021     0.026
borough[T.QUEENS]                              0.0082      0.001      7.847      0.000         0.006     0.010
borough[T.STATEN ISLAND]                      -0.0090      0.002     -5

##Statistically Significant Variables:
####TRAFFIC SIGNAL CONDITION has a NEGATIVE impact on the probability of a collision occuring. One additional traffic signal condition decreases this probability by 0.0066 or 0.66%. 
####STATEN ISLAND 311 calls have NEGATIVE impact on the probability of a collision occuring. One additional 311 call from Staten Island decreases this probability by 0.009 or 0.9%..
####BROOKLYN 311 calls have POSITIVE impact on the probability of a collision occuring. One additional 311 call from Brooklyn increases this probability by 0.0147 or 1.47%.
####MANAHTTAN 311 calls have POSITIVE impact on the probability of a collision occuring. One additional 311 call from Manhattan increases this probability by 0.0232 or 2.32%. 
####QUEENS 311 calls have POSITIVE impact on the probability of a collision occuring. One additional 311 call from Queens increases this probability by 0.0082 or 0.82%. 

In [12]:
logit_mod = smf.logit('collision ~ weather_condition + Night + Peak', data = merged).fit()
print (logit_mod.summary())

         Current function value: 0.098304
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:              collision   No. Observations:               289575
Model:                          Logit   Df Residuals:                   289562
Method:                           MLE   Df Model:                           12
Date:                Sun, 13 Dec 2015   Pseudo R-squ.:                0.005943
Time:                        13:14:50   Log-Likelihood:                -28466.
converged:                      False   LL-Null:                       -28636.
                                        LLR p-value:                 1.494e-65
                                               coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------------
Intercept                                   -3.8512      0.020   -193.055      0.000



In [13]:
marginal = logit_mod.get_margeff()
print (marginal.summary())

        Logit Marginal Effects       
Dep. Variable:              collision
Method:                          dydx
At:                           overall
                                              dy/dx    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------------
weather_condition[T.Haze]                    0.0057      0.002      3.113      0.002         0.002     0.009
weather_condition[T.Light Freezing Rain]    -0.4188    296.673     -0.001      0.999      -581.888   581.050
weather_condition[T.Light Rain]              0.0011      0.001      0.759      0.448        -0.002     0.004
weather_condition[T.Light Snow]             -0.0148      0.003     -5.602      0.000        -0.020    -0.010
weather_condition[T.Mostly Cloudy]          -0.0052      0.002     -2.972      0.003        -0.009    -0.002
weather_condition[T.Overcast]                0.0014      0.001      2.388      0.017 

##Statistically Significant Variables:

####311 calls during the NIGHT have a NEGATIVE impact on the probability of a collision occuring. One additional 311 call during the night decreases this probability by -0.0041 or -0.41%. 

####311 calls during PEAK hours have a NEGATIVE impact on the probability of a collision occuring. One additional 311 call during peak hours decreases this probability by -0.0027 or -0.27%. 

####311 calls during LIGHT SNOW have a NEGATIVE impact on the probability of a collision occuring. One additional 311 call during light now decreases this probability by -0.0148 or -1.48%. 

####311 calls during MOSTLY CLOUDY weather have a NEGATIVE impact on the probability of a collision occuring. One additional 311 call during mostly cloudy weather decreases this probability by -0.0052 or -0.52%. 

####311 calls during HAZE have a POSITIVE impact on the probability of a collision occuring. One additional 311 call during haze increases this probability by 0.0057 or 0.57%. 

####311 calls during OVERCAST weather have a POSITIVE impact on the probability of a collision occuring. One additional 311 call during overcast weather increases this probability by 0.0014 or 0.14%. 

####311 calls during RAIN have a POSITIVE impact on the probability of a collision occuring. One additional 311 call during rain increases this probability by 0.0141 or 1.41%. 

####311 calls during SNOW have a POSITIVE impact on the probability of a collision occuring. One additional 311 call during snow increases this probability by 0.018 or 1.8%. 

####311 calls during UNKNOWN weather have a POSITIVE impact on the probability of a collision occuring. One additional 311 call during unknown weather increases this probability by 0.023 or 2.3%. 

