# Exploratory Data Analysis and Data Cleaning

In [231]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from dateutil import parser
from sklearn.model_selection import train_test_split
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

%matplotlib inline
sns.set()

In [2]:
crashes = pd.read_csv('../Traffic_Crashes_-_Crashes.csv')

In [3]:
# Low_memory=False, because it has mixed types of data in the same column/too many rows
people = pd.read_csv('../Traffic_Crashes_-_People.csv', low_memory=False)

In [4]:
vehicles = pd.read_csv('../Traffic_Crashes_-_Vehicles.csv', low_memory=False)

In [5]:
def howmanyunique(data):
    tempo = []
    nombre = []
    for x in data.columns:
        tempo.append(len(data[x].value_counts().unique()))
        nombre.append(x)
    print(tempo)
    print('')
    print(nombre)

In [6]:
def howmanynan(data):
    print(data.isna().sum())   

In [7]:
def rows_w_nan(data):
   
    temp = []
    for x in data.columns:
        if data[x].isna().sum() > 0:
            percentage_of_data = data[x].isna().sum()/(list(data.shape)[0])
            print("Row: {} : is made {}% of NaN Values.".format(x, percentage_of_data.round(3)))
            if percentage_of_data > 0.3:
                temp.append(x)
    print(temp)

In [8]:
def howmanyduplicates(data):
    print(data.duplicated().sum())

In [9]:
def rows_w_nan1(data):
   
    for x in data.columns:
        print(x)

# Crashes Data Set

In [10]:
crashes.shape

(484022, 49)

In [11]:
howmanyunique(crashes)

[1, 1, 2, 21, 28, 19, 8, 12, 6, 18, 20, 22, 6, 7, 7, 3, 2, 2, 2, 2, 3, 12, 40, 39, 424, 4, 497, 259, 2, 2, 2, 2, 4, 2, 13, 5, 14, 5, 8, 14, 12, 24, 1, 24, 7, 12, 137, 137, 137]

['CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE_EST_I', 'CRASH_DATE', 'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I', 'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO', 'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE', 'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS', 'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL', 'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',

In [12]:
howmanyduplicates(crashes)

0


In [13]:
rows_w_nan(crashes)

Row: RD_NO : is made 0.007% of NaN Values.
Row: CRASH_DATE_EST_I : is made 0.925% of NaN Values.
Row: LANE_CNT : is made 0.589% of NaN Values.
Row: REPORT_TYPE : is made 0.024% of NaN Values.
Row: INTERSECTION_RELATED_I : is made 0.774% of NaN Values.
Row: NOT_RIGHT_OF_WAY_I : is made 0.953% of NaN Values.
Row: HIT_AND_RUN_I : is made 0.706% of NaN Values.
Row: STREET_DIRECTION : is made 0.0% of NaN Values.
Row: STREET_NAME : is made 0.0% of NaN Values.
Row: BEAT_OF_OCCURRENCE : is made 0.0% of NaN Values.
Row: PHOTOS_TAKEN_I : is made 0.987% of NaN Values.
Row: STATEMENTS_TAKEN_I : is made 0.98% of NaN Values.
Row: DOORING_I : is made 0.997% of NaN Values.
Row: WORK_ZONE_I : is made 0.994% of NaN Values.
Row: WORK_ZONE_TYPE : is made 0.995% of NaN Values.
Row: WORKERS_PRESENT_I : is made 0.998% of NaN Values.
Row: MOST_SEVERE_INJURY : is made 0.002% of NaN Values.
Row: INJURIES_TOTAL : is made 0.002% of NaN Values.
Row: INJURIES_FATAL : is made 0.002% of NaN Values.
Row: INJURIES_INCA

In [14]:
# We are going to drop all the columns with a lot of of NaN values, except for location that we can say right ahead that is
# a multicolinearity case, since it has the Latiutude and Longitude together
crashes = crashes.drop(columns=['CRASH_DATE_EST_I', 'LANE_CNT', 'INTERSECTION_RELATED_I',
                                'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'PHOTOS_TAKEN_I', 
                                'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE',
                                'WORKERS_PRESENT_I', 'LOCATION'])

### Data Manipulation with Missing Values (NaN)

> **RD_NO**

So we can see here that RD_NO and REPORT_TYPE still have a lot of missing data and that can be explained by the website, where they say and I quote:


> " For privacy reasons, this column is blank for recent crashes."

So since the RD_NO is another way to connect the cases between datasets, we can easily discard the whole column, because we still have CRASH_RECORD_ID, that has no missing values

> **REPORT_TYPE**

In [15]:
crashes.REPORT_TYPE.value_counts()

NOT ON SCENE (DESK REPORT)    278124
ON SCENE                      193888
AMENDED                          240
Name: REPORT_TYPE, dtype: int64

By checking the REPORT_TYPE column we realize that it doesn't offer any predictive value to our model, but still might use it for the business understanding part.

> So we decide to drop the RD_NO columns because we have a good substitute and drop the rows where the remaining NaN values are, so we have a cleaner dataset

In [16]:
crashes = crashes.drop(columns=['RD_NO'])

In [17]:
# Now we can drop the remaining rows with NaN values
crashes.dropna(inplace=True)

In [18]:
crashes.isna().sum().sum()

0

### Since "Injuries_Fatal" is important, let's give some more attention to it

We want to transform the column into a Yes or No kind of answer instead of numbers of deads per accident.

In [19]:
crashes.INJURIES_FATAL.value_counts(dropna=False)

0.0    468289
1.0       348
2.0        18
3.0         5
Name: INJURIES_FATAL, dtype: int64

In [20]:
crashes['INJURIES_FATAL'] = np.where(crashes['INJURIES_FATAL']>0, 1, 0)

In [21]:
crashes.INJURIES_FATAL.value_counts(dropna=False)

0    468289
1       371
Name: INJURIES_FATAL, dtype: int64

## Some more data manipulation with useful columns

### WEATHER_CONDITION

In [22]:
crashes.WEATHER_CONDITION.value_counts()

CLEAR                       369648
RAIN                         41293
UNKNOWN                      20530
SNOW                         19487
CLOUDY/OVERCAST              13910
OTHER                         1512
FOG/SMOKE/HAZE                 797
SLEET/HAIL                     692
FREEZING RAIN/DRIZZLE          549
BLOWING SNOW                   154
SEVERE CROSS WIND GATE          86
BLOWING SAND, SOIL, DIRT         2
Name: WEATHER_CONDITION, dtype: int64

In [23]:
crashes.WEATHER_CONDITION = np.where(crashes.WEATHER_CONDITION=='BLOWING SNOW', 'SNOW', crashes.WEATHER_CONDITION)

In [24]:
crashes.WEATHER_CONDITION = np.where(crashes.WEATHER_CONDITION=='FREEZING RAIN/DRIZZLE', 'RAIN', crashes.WEATHER_CONDITION)

In [25]:
crashes.WEATHER_CONDITION = np.where(crashes.WEATHER_CONDITION=='FOG/SMOKE/HAZE', 'OTHER', crashes.WEATHER_CONDITION)

In [26]:
crashes.WEATHER_CONDITION = np.where(crashes.WEATHER_CONDITION=='SLEET/HAIL', 'OTHER', crashes.WEATHER_CONDITION)

In [27]:
crashes.WEATHER_CONDITION = np.where(crashes.WEATHER_CONDITION=='BLOWING SAND, SOIL, DIRT', 'OTHER', crashes.WEATHER_CONDITION)

In [28]:
crashes.WEATHER_CONDITION = np.where(crashes.WEATHER_CONDITION=='SEVERE CROSS WIND GATE', 'OTHER', crashes.WEATHER_CONDITION)

In [29]:
crashes.WEATHER_CONDITION.value_counts()

CLEAR              369648
RAIN                41842
UNKNOWN             20530
SNOW                19641
CLOUDY/OVERCAST     13910
OTHER                3089
Name: WEATHER_CONDITION, dtype: int64

### ALIGNMENT

In [30]:
crashes.ALIGNMENT.value_counts()

STRAIGHT AND LEVEL       457042
STRAIGHT ON GRADE          5775
CURVE, LEVEL               3507
STRAIGHT ON HILLCREST      1440
CURVE ON GRADE              667
CURVE ON HILLCREST          229
Name: ALIGNMENT, dtype: int64

### POSTED_SPEED_LIMIT

In [31]:
# Bining Posted_Speed_Limit
crashes.POSTED_SPEED_LIMIT.value_counts()

30    345555
35     32216
25     28456
20     18121
15     16350
10      9874
0       6611
40      4282
5       3551
45      2696
55       360
3        113
50        94
9         89
99        66
39        51
1         33
60        24
2         19
24        16
32        14
33        10
34        10
65        10
6          7
11         5
36         5
70         3
7          2
14         2
12         2
18         2
31         2
26         2
49         1
38         1
22         1
63         1
4          1
23         1
29         1
Name: POSTED_SPEED_LIMIT, dtype: int64

In [32]:
# Let's bin the speed limit in 9 groups, the last one being 45 miles/hour or above
crashes.POSTED_SPEED_LIMIT = pd.cut(crashes.POSTED_SPEED_LIMIT,[0, 5, 10, 15, 20, 25, 30, 35, 40, 45],
                                    precision=0, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])

In [33]:
crashes.POSTED_SPEED_LIMIT.value_counts()

5    345558
6     32252
4     28474
3     18123
2     16359
1      9972
7      4339
0      3717
8      2696
Name: POSTED_SPEED_LIMIT, dtype: int64

In [34]:
howmanyunique(crashes)

[1, 21, 9, 19, 8, 6, 6, 18, 20, 6, 7, 7, 3, 2, 3, 12, 40, 40, 419, 4, 477, 256, 13, 5, 14, 2, 7, 13, 12, 25, 1, 24, 7, 12, 137, 137]

['CRASH_RECORD_ID', 'CRASH_DATE', 'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'DAMAGE', 'DATE_POLICE_NOTIFIED', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO', 'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE', 'NUM_UNITS', 'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL', 'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'LATITUDE', 'LONGITUDE']


In [35]:
crashes.DAMAGE.value_counts()

OVER $1,500      270353
$501 - $1,500    137706
$500 OR LESS      60601
Name: DAMAGE, dtype: int64

In [36]:
crashes.CRASH_DAY_OF_WEEK.value_counts()

6    76212
7    69228
3    67493
5    67214
4    66835
2    65195
1    56483
Name: CRASH_DAY_OF_WEEK, dtype: int64

In [37]:
crashes.CRASH_MONTH.value_counts()

10    45053
12    42845
9     41972
11    41620
8     40634
1     40078
2     40053
7     38888
6     36792
5     35784
3     34476
4     30465
Name: CRASH_MONTH, dtype: int64

In [38]:
crashes.CRASH_HOUR.value_counts()

16    35862
15    35718
17    35351
14    31849
18    29277
13    28993
12    27929
8     25128
11    24209
9     22104
10    21710
19    21359
7     20286
20    16865
21    14982
22    13776
23    11552
6     10548
0      9418
1      7845
2      6799
5      6385
3      5575
4      5140
Name: CRASH_HOUR, dtype: int64

In [39]:
crashes.CRASH_HOUR = pd.cut(crashes.CRASH_HOUR,[0, 6, 12, 18, 24],
                                    precision=0, labels=[0, 1, 2, 3])

In [40]:
crashes.CRASH_HOUR.value_counts()

2    197050
1    141366
3     78534
0     42292
Name: CRASH_HOUR, dtype: int64

In [41]:
crashes.ROADWAY_SURFACE_COND.value_counts()

DRY                348218
WET                 64433
UNKNOWN             31200
SNOW OR SLUSH       19664
ICE                  3810
OTHER                1124
SAND, MUD, DIRT       211
Name: ROADWAY_SURFACE_COND, dtype: int64

In [42]:
crashes.TRAFFICWAY_TYPE.value_counts()

NOT DIVIDED                        209625
DIVIDED - W/MEDIAN (NOT RAISED)     82982
ONE-WAY                             62008
PARKING LOT                         33261
DIVIDED - W/MEDIAN BARRIER          27768
FOUR WAY                            14658
OTHER                               13163
ALLEY                                7731
UNKNOWN                              5111
CENTER TURN LANE                     3992
T-INTERSECTION                       3112
DRIVEWAY                             1622
RAMP                                 1442
UNKNOWN INTERSECTION TYPE             907
Y-INTERSECTION                        373
FIVE POINT, OR MORE                   370
TRAFFIC ROUTE                         270
NOT REPORTED                          118
ROUNDABOUT                             94
L-INTERSECTION                         53
Name: TRAFFICWAY_TYPE, dtype: int64

In [43]:
crashes.MOST_SEVERE_INJURY.value_counts()

NO INDICATION OF INJURY     409702
NONINCAPACITATING INJURY     32671
REPORTED, NOT EVIDENT        18532
INCAPACITATING INJURY         7384
FATAL                          371
Name: MOST_SEVERE_INJURY, dtype: int64

In [44]:
crashes['MOST_SEVERE_INJURY'] = np.where(crashes['MOST_SEVERE_INJURY']=='REPORTED, NOT EVIDENT',
                                         'NONINCAPACITATING INJURY', crashes['MOST_SEVERE_INJURY'])

In [45]:
crashes['MOST_SEVERE_INJURY'] = np.where(crashes['MOST_SEVERE_INJURY']=='FATAL',
                                         'INCAPACITATING INJURY', crashes['MOST_SEVERE_INJURY'])

In [46]:
crashes['MOST_SEVERE_INJURY'] = np.where(crashes['MOST_SEVERE_INJURY']=='INCAPACITATING INJURY',
                                         'INCAPACITATING INJURY/FATAL', crashes['MOST_SEVERE_INJURY'])

In [47]:
crashes.MOST_SEVERE_INJURY.value_counts()

NO INDICATION OF INJURY        409702
NONINCAPACITATING INJURY        51203
INCAPACITATING INJURY/FATAL      7755
Name: MOST_SEVERE_INJURY, dtype: int64

In [48]:
# We could check to see how long the street is OR
# We could create a kind of multi-column based on the other column "TRAFFICWAY_TYPE" OR
# Just group them together by that same column
crashes.STREET_NAME.value_counts()

WESTERN AVE              12896
PULASKI RD               11244
CICERO AVE               10252
ASHLAND AVE              10239
HALSTED ST                8978
                         ...  
LUIS MUNOZ MARIN DR S        1
HOLLETT DR                   1
FRANKLIN ST XR               1
JULIA CT                     1
BURKHARDT DR                 1
Name: STREET_NAME, Length: 1520, dtype: int64

In [49]:
crashes.STREET_DIRECTION.value_counts()

W    166935
S    155627
N    113430
E     32668
Name: STREET_DIRECTION, dtype: int64

### Working with Date-Time Columns

- Can we explore and see if holidays play a part on accidents?

In [50]:
# CRASH_DATE is a string
crashes['CRASH_DATE'][0]

'07/10/2019 05:56:00 PM'

In [51]:
# We can check that here
type(crashes['CRASH_DATE'][0])

str

In [52]:
# We create a new column Date-Time with the previous column
crashes['DATE_ACCIDENT']= pd.to_datetime(crashes['CRASH_DATE'], format='%m/%d/%Y %I:%M:%S %p')

In [53]:
# We confirm that it worked
type(crashes.DATE_ACCIDENT[0])

pandas._libs.tslibs.timestamps.Timestamp

In [54]:
# Our earliest and latest date
print("Earliest Date: {} \nLatest Date: {}".format(crashes['DATE_ACCIDENT'].min(), crashes['DATE_ACCIDENT'].max()))

Earliest Date: 2013-03-03 16:48:00 
Latest Date: 2021-03-09 23:10:00


In [55]:
holidays = pd.tseries.holiday.USFederalHolidayCalendar().holidays(start='2012', end='2022').to_pydatetime()

In [56]:
holidays_date = [holiday.date() for holiday in holidays]


In [57]:
holidays_date[:3]

[datetime.date(2012, 1, 2),
 datetime.date(2012, 1, 16),
 datetime.date(2012, 2, 20)]

In [58]:
def isitaholiday(date):
    ''' super useful function'''
    if date.date() in holidays_date:
        return 1
    else: 
        return 0

In [59]:
isitaholiday(crashes['DATE_ACCIDENT'][100])

0

In [60]:
holidays[1].date()

datetime.date(2012, 1, 16)

In [61]:
crashes['IS_A_HOLIDAY'] = crashes['DATE_ACCIDENT'].apply(isitaholiday)

In [62]:
crashes['HOLIDAY_NAME'] = crashes['DATE_ACCIDENT'].apply(isitaholiday)

In [63]:
crashes['IS_A_HOLIDAY'].sum()

11064

In [64]:
crashes['DATE_ACCIDENT'][0].date()

datetime.date(2019, 7, 10)

In [65]:
crashes.drop(['CRASH_DATE'], axis = 1, inplace = True)

In [66]:
crashes.head(3)

Unnamed: 0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,...,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,DATE_ACCIDENT,IS_A_HOLIDAY,HOLIDAY_NAME
0,4fd0a3e0897b3335b94cd8d5b2d2b350eb691add56c62d...,6,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,ONE-WAY,STRAIGHT AND LEVEL,DRY,...,3.0,0.0,2,4,7,41.919664,-87.773288,2019-07-10 17:56:00,0,0
1,009e9e67203442370272e1a13d6ee51a4155dac65e583d...,6,STOP SIGN/FLASHER,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,...,3.0,0.0,2,6,6,41.741804,-87.740954,2017-06-30 16:00:00,0,0
2,ee9283eff3a55ac50ee58f3d9528ce1d689b1c4180b4c4...,5,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,FOUR WAY,STRAIGHT AND LEVEL,DRY,...,3.0,0.0,1,6,7,41.773456,-87.585022,2020-07-10 10:25:00,0,0


In [67]:
crashes_holiday = crashes[crashes['IS_A_HOLIDAY']==1]

In [68]:
crashes_holiday.head(3)

Unnamed: 0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,...,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,DATE_ACCIDENT,IS_A_HOLIDAY,HOLIDAY_NAME
155,3bfec94191bdc869600d2c429c45a68871eb9733ce7ec6...,4,NO CONTROLS,NO CONTROLS,SNOW,DAYLIGHT,PARKED MOTOR VEHICLE,NOT DIVIDED,STRAIGHT AND LEVEL,SNOW OR SLUSH,...,1.0,0.0,1.0,6,11,41.932213,-87.653694,2017-11-10 12:45:00,1,1
168,43292e98b1ba60cb4c6efeb987a67a1ec0291acea4357e...,7,NO CONTROLS,OTHER,RAIN,"DARKNESS, LIGHTED ROAD",TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,WET,...,4.0,0.0,,2,2,41.88066,-87.740728,2018-02-19 00:25:00,1,1
234,5e45d4537c4d0bd12bfa11a7f75526f4fe0d61c68a0ec1...,1,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,PARKING LOT,STRAIGHT AND LEVEL,WET,...,1.0,0.0,0.0,3,1,41.879714,-87.774496,2019-01-01 02:40:00,1,1


In [69]:
crashes.isna().sum().sum()

16588

In [70]:
crashes.dropna(inplace=True)

In [71]:
crashes_holiday.isna().sum().sum()

477

In [72]:
crashes_holiday.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crashes_holiday.dropna(inplace=True)


### DateTime :


In [73]:
test = pd.tseries.holiday.USFederalHolidayCalendar().holidays(start='2012', end='2022', return_name=True)

In [74]:
test.keys()

DatetimeIndex(['2012-01-02', '2012-01-16', '2012-02-20', '2012-05-28',
               '2012-07-04', '2012-09-03', '2012-10-08', '2012-11-12',
               '2012-11-22', '2012-12-25',
               ...
               '2021-01-18', '2021-02-15', '2021-05-31', '2021-07-05',
               '2021-09-06', '2021-10-11', '2021-11-11', '2021-11-25',
               '2021-12-24', '2021-12-31'],
              dtype='datetime64[ns]', length=101, freq=None)

In [75]:
test.keys()[0].date()

datetime.date(2012, 1, 2)

In [76]:
aidento = []
for x in test:
    aidento.append(x)

In [77]:
aifora = []
for x in test.keys():
    aifora.append(x.date())

In [78]:
dicionarioinutil = dict(zip(aifora, aidento))

In [79]:
dates_holidays = pd.DataFrame.from_dict(dicionarioinutil, orient='index')

In [80]:
crashes_holiday['DATE_REAL'] = crashes_holiday['DATE_ACCIDENT'].apply(lambda x : x.date())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crashes_holiday['DATE_REAL'] = crashes_holiday['DATE_ACCIDENT'].apply(lambda x : x.date())


### **To Delete?**
- 'FIRST_CRASH_TYPE'
- 'INJURIES_INCAPACITATING' 
- 'INJURIES_NON_INCAPACITATING'
- 'INJURIES_REPORTED_NOT_EVIDENT'
- 'INJURIES_TOTAL'
- 'MOST_SEVERE_INJURY'
- 'NUM_UNITS'
- 'BEAT_OF_OCCURRENCE'
- 'DATE_POLICE_NOTIFIED'
- 'PRIM_CONTRIBUTORY_CAUSE'
- 'SEC_CONTRIBUTORY_CAUSE'

# People Data Set

In [81]:
people.shape

(1069147, 30)

In [82]:
howmanyunique(people)

[1, 6, 45, 45, 43, 51, 11, 335, 49, 364, 3, 108, 72, 61, 19, 7, 5, 5, 187, 175, 96, 20, 14, 12, 23, 4, 8, 4, 30, 2]

['PERSON_ID', 'PERSON_TYPE', 'CRASH_RECORD_ID', 'RD_NO', 'VEHICLE_ID', 'CRASH_DATE', 'SEAT_NO', 'CITY', 'STATE', 'ZIPCODE', 'SEX', 'AGE', 'DRIVERS_LICENSE_STATE', 'DRIVERS_LICENSE_CLASS', 'SAFETY_EQUIPMENT', 'AIRBAG_DEPLOYED', 'EJECTION', 'INJURY_CLASSIFICATION', 'HOSPITAL', 'EMS_AGENCY', 'EMS_RUN_NO', 'DRIVER_ACTION', 'DRIVER_VISION', 'PHYSICAL_CONDITION', 'PEDPEDAL_ACTION', 'PEDPEDAL_VISIBILITY', 'PEDPEDAL_LOCATION', 'BAC_RESULT', 'BAC_RESULT VALUE', 'CELL_PHONE_USE']


In [83]:
howmanyduplicates(people)

0


In [84]:
crashes.shape

(452176, 38)

In [85]:
crashes.head()

Unnamed: 0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,...,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,DATE_ACCIDENT,IS_A_HOLIDAY,HOLIDAY_NAME
0,4fd0a3e0897b3335b94cd8d5b2d2b350eb691add56c62d...,6,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,ONE-WAY,STRAIGHT AND LEVEL,DRY,...,3.0,0.0,2,4,7,41.919664,-87.773288,2019-07-10 17:56:00,0,0
1,009e9e67203442370272e1a13d6ee51a4155dac65e583d...,6,STOP SIGN/FLASHER,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,...,3.0,0.0,2,6,6,41.741804,-87.740954,2017-06-30 16:00:00,0,0
2,ee9283eff3a55ac50ee58f3d9528ce1d689b1c4180b4c4...,5,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,FOUR WAY,STRAIGHT AND LEVEL,DRY,...,3.0,0.0,1,6,7,41.773456,-87.585022,2020-07-10 10:25:00,0,0
3,f8960f698e870ebdc60b521b2a141a5395556bc3704191...,5,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN (NOT RAISED),STRAIGHT AND LEVEL,DRY,...,3.0,0.0,0,7,7,41.802119,-87.622115,2020-07-11 01:00:00,0,0
5,00e47f189660cd8ba1e85fc63061bf1d8465184393f134...,5,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,...,2.0,0.0,3,5,3,41.741804,-87.740954,2019-03-21 22:50:00,0,0


In [86]:
rows_w_nan(people)

Row: RD_NO : is made 0.007% of NaN Values.
Row: VEHICLE_ID : is made 0.02% of NaN Values.
Row: SEAT_NO : is made 0.796% of NaN Values.
Row: CITY : is made 0.261% of NaN Values.
Row: STATE : is made 0.253% of NaN Values.
Row: ZIPCODE : is made 0.325% of NaN Values.
Row: SEX : is made 0.015% of NaN Values.
Row: AGE : is made 0.285% of NaN Values.
Row: DRIVERS_LICENSE_STATE : is made 0.407% of NaN Values.
Row: DRIVERS_LICENSE_CLASS : is made 0.485% of NaN Values.
Row: SAFETY_EQUIPMENT : is made 0.003% of NaN Values.
Row: AIRBAG_DEPLOYED : is made 0.019% of NaN Values.
Row: EJECTION : is made 0.012% of NaN Values.
Row: INJURY_CLASSIFICATION : is made 0.001% of NaN Values.
Row: HOSPITAL : is made 0.818% of NaN Values.
Row: EMS_AGENCY : is made 0.884% of NaN Values.
Row: EMS_RUN_NO : is made 0.981% of NaN Values.
Row: DRIVER_ACTION : is made 0.206% of NaN Values.
Row: DRIVER_VISION : is made 0.207% of NaN Values.
Row: PHYSICAL_CONDITION : is made 0.206% of NaN Values.
Row: PEDPEDAL_ACTION : 

In [87]:
people = people.drop(columns=['CELL_PHONE_USE', 'BAC_RESULT VALUE', 'PEDPEDAL_LOCATION', 'PEDPEDAL_VISIBILITY',
                              'EMS_RUN_NO', 'EMS_AGENCY', 'HOSPITAL', 'DRIVERS_LICENSE_CLASS', 
                              'DRIVERS_LICENSE_STATE', 'ZIPCODE', 'SEAT_NO', 'PEDPEDAL_ACTION'])

In [88]:
people.dropna(inplace=True)

In [89]:
people.isna().sum()

PERSON_ID                0
PERSON_TYPE              0
CRASH_RECORD_ID          0
RD_NO                    0
VEHICLE_ID               0
CRASH_DATE               0
CITY                     0
STATE                    0
SEX                      0
AGE                      0
SAFETY_EQUIPMENT         0
AIRBAG_DEPLOYED          0
EJECTION                 0
INJURY_CLASSIFICATION    0
DRIVER_ACTION            0
DRIVER_VISION            0
PHYSICAL_CONDITION       0
BAC_RESULT               0
dtype: int64

In [90]:
people.INJURY_CLASSIFICATION.value_counts()

NO INDICATION OF INJURY     540485
NONINCAPACITATING INJURY     22849
REPORTED, NOT EVIDENT        15391
INCAPACITATING INJURY         4186
FATAL                          237
Name: INJURY_CLASSIFICATION, dtype: int64

In [91]:
people.AIRBAG_DEPLOYED.value_counts()

DID NOT DEPLOY                            425305
NOT APPLICABLE                             91019
DEPLOYMENT UNKNOWN                         26896
DEPLOYED, FRONT                            22228
DEPLOYED, COMBINATION                      12206
DEPLOYED, SIDE                              5208
DEPLOYED OTHER (KNEE, AIR, BELT, ETC.)       286
Name: AIRBAG_DEPLOYED, dtype: int64

In [92]:
# Useless?
people.EJECTION.value_counts()

NONE                  572084
UNKNOWN                 8366
TOTALLY EJECTED         1786
PARTIALLY EJECTED        546
TRAPPED/EXTRICATED       366
Name: EJECTION, dtype: int64

In [93]:
people.PHYSICAL_CONDITION.value_counts()

NORMAL                          497653
UNKNOWN                          73147
IMPAIRED - ALCOHOL                3471
REMOVED BY EMS                    2036
FATIGUED/ASLEEP                   2023
OTHER                             1549
EMOTIONAL                         1481
ILLNESS/FAINTED                    709
HAD BEEN DRINKING                  439
IMPAIRED - DRUGS                   399
IMPAIRED - ALCOHOL AND DRUGS       148
MEDICATED                           93
Name: PHYSICAL_CONDITION, dtype: int64

In [94]:
people.PERSON_TYPE.value_counts()

DRIVER                 583110
NON-CONTACT VEHICLE        38
Name: PERSON_TYPE, dtype: int64

# Vehicles Data Set

**SEAT_NO**
- Code for seating position of motor vehicle occupant: 1= driver, 2= center front, 3 = front passenger, 4 = second row left, 5 = second row center, 6 = second row right, 7 = enclosed passengers, 8 = exposed passengers, 9= unknown position, 10 = third row left, 11 = third row center, 12 = third row right

In [95]:
vehicles.shape

(988131, 72)

In [96]:
howmanyunique(vehicles)

[1, 17, 17, 40, 17, 9, 22, 1, 2, 149, 540, 51, 68, 17, 21, 25, 9, 28, 2, 2, 25, 2, 130, 119, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 14, 1, 33, 11, 11, 4, 40, 37, 39, 59, 2, 3, 8, 3, 3, 1, 3, 1, 3, 3, 9, 2, 3, 3, 26, 4, 39, 11, 8, 9, 6, 2, 2, 7]

['CRASH_UNIT_ID', 'CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE', 'UNIT_NO', 'UNIT_TYPE', 'NUM_PASSENGERS', 'VEHICLE_ID', 'CMRC_VEH_I', 'MAKE', 'MODEL', 'LIC_PLATE_STATE', 'VEHICLE_YEAR', 'VEHICLE_DEFECT', 'VEHICLE_TYPE', 'VEHICLE_USE', 'TRAVEL_DIRECTION', 'MANEUVER', 'TOWED_I', 'FIRE_I', 'OCCUPANT_CNT', 'EXCEED_SPEED_LIMIT_I', 'TOWED_BY', 'TOWED_TO', 'AREA_00_I', 'AREA_01_I', 'AREA_02_I', 'AREA_03_I', 'AREA_04_I', 'AREA_05_I', 'AREA_06_I', 'AREA_07_I', 'AREA_08_I', 'AREA_09_I', 'AREA_10_I', 'AREA_11_I', 'AREA_12_I', 'AREA_99_I', 'FIRST_CONTACT_POINT', 'CMV_ID', 'USDOT_NO', 'CCMC_NO', 'ILCC_NO', 'COMMERCIAL_SRC', 'GVWR', 'CARRIER_NAME', 'CARRIER_STATE', 'CARRIER_CITY', 'HAZMAT_PLACARDS_I', 'HAZMAT_NAME', 'UN_NO', 'HAZMAT_PRESENT_I', 'HAZMAT_REPORT

In [97]:
howmanyduplicates(vehicles)

0


In [98]:
rows_w_nan(vehicles)

Row: RD_NO : is made 0.007% of NaN Values.
Row: UNIT_TYPE : is made 0.002% of NaN Values.
Row: NUM_PASSENGERS : is made 0.85% of NaN Values.
Row: VEHICLE_ID : is made 0.023% of NaN Values.
Row: CMRC_VEH_I : is made 0.981% of NaN Values.
Row: MAKE : is made 0.023% of NaN Values.
Row: MODEL : is made 0.023% of NaN Values.
Row: LIC_PLATE_STATE : is made 0.106% of NaN Values.
Row: VEHICLE_YEAR : is made 0.181% of NaN Values.
Row: VEHICLE_DEFECT : is made 0.023% of NaN Values.
Row: VEHICLE_TYPE : is made 0.023% of NaN Values.
Row: VEHICLE_USE : is made 0.023% of NaN Values.
Row: TRAVEL_DIRECTION : is made 0.023% of NaN Values.
Row: MANEUVER : is made 0.023% of NaN Values.
Row: TOWED_I : is made 0.887% of NaN Values.
Row: FIRE_I : is made 0.999% of NaN Values.
Row: OCCUPANT_CNT : is made 0.023% of NaN Values.
Row: EXCEED_SPEED_LIMIT_I : is made 0.998% of NaN Values.
Row: TOWED_BY : is made 0.918% of NaN Values.
Row: TOWED_TO : is made 0.948% of NaN Values.
Row: AREA_00_I : is made 0.961% of 

In [99]:
vehicles = vehicles.drop(columns=['NUM_PASSENGERS', 'CMRC_VEH_I', 'TOWED_I', 'FIRE_I', 'EXCEED_SPEED_LIMIT_I', 
                                  'TOWED_BY', 'TOWED_TO', 'AREA_00_I', 'AREA_01_I', 'AREA_02_I', 'AREA_03_I', 
                                  'AREA_04_I', 'AREA_05_I', 'AREA_06_I', 'AREA_07_I', 'AREA_08_I', 'AREA_09_I', 
                                  'AREA_10_I', 'AREA_11_I', 'AREA_12_I', 'AREA_99_I', 'CMV_ID', 'USDOT_NO', 'CCMC_NO', 
                                  'ILCC_NO', 'COMMERCIAL_SRC', 'GVWR', 'CARRIER_NAME', 'CARRIER_STATE', 'CARRIER_CITY',
                                  'HAZMAT_PLACARDS_I', 'HAZMAT_NAME', 'UN_NO', 'HAZMAT_PRESENT_I', 'HAZMAT_REPORT_I',
                                  'HAZMAT_REPORT_NO', 'MCS_REPORT_I', 'MCS_REPORT_NO', 'HAZMAT_VIO_CAUSE_CRASH_I',
                                  'MCS_VIO_CAUSE_CRASH_I', 'IDOT_PERMIT_NO', 'WIDE_LOAD_I', 'TRAILER1_WIDTH',
                                  'TRAILER2_WIDTH', 'TRAILER1_LENGTH', 'TRAILER2_LENGTH', 'TOTAL_VEHICLE_LENGTH',
                                  'AXLE_CNT', 'VEHICLE_CONFIG', 'CARGO_BODY_TYPE', 'LOAD_TYPE', 'HAZMAT_OUT_OF_SERVICE_I',
                                  'MCS_OUT_OF_SERVICE_I', 'HAZMAT_CLASS', 'LIC_PLATE_STATE'])

In [100]:
vehicles.dropna(inplace=True)

In [101]:
vehicles.isna().sum()

CRASH_UNIT_ID          0
CRASH_RECORD_ID        0
RD_NO                  0
CRASH_DATE             0
UNIT_NO                0
UNIT_TYPE              0
VEHICLE_ID             0
MAKE                   0
MODEL                  0
VEHICLE_YEAR           0
VEHICLE_DEFECT         0
VEHICLE_TYPE           0
VEHICLE_USE            0
TRAVEL_DIRECTION       0
MANEUVER               0
OCCUPANT_CNT           0
FIRST_CONTACT_POINT    0
dtype: int64

### Done Treating the Data

In [102]:
def columns_repeat(data, data1, data2):
    data_columns = []
    data1_columns = []
    data2_columns = []
    data_data1 = []
    data_data2 = []
    data_data3 = []
    for x in data.columns:
        data_columns.append(x)
    for y in data1.columns:
        data1_columns.append(y)
    for z in data2.columns:
        data2_columns.append(z)
     
    for a in data_columns:
        if a in data1_columns:
            data_data1.append(a)
            
    for b in data_columns:
        if b in data2_columns:
            data_data2.append(b)
    
    for c in data1_columns:
        if c in data2_columns:
            data_data3.append(c)
        
    return(data_data1, data_data2, data_data3)
        
        
        

In [103]:
columns_repeat(people, crashes, vehicles)

(['CRASH_RECORD_ID'],
 ['CRASH_RECORD_ID', 'RD_NO', 'VEHICLE_ID', 'CRASH_DATE'],
 ['CRASH_RECORD_ID'])

In [104]:
# Since we only want ONE columns to be repeated, we drop the others that repeat
vehicles = vehicles.drop(columns=['RD_NO', 'VEHICLE_ID', 'CRASH_DATE'])

In [105]:
people.shape

(583148, 18)

In [106]:
crashes.shape

(452176, 38)

In [107]:
vehicles.shape

(797587, 14)

In [108]:
# Join the 3 datasets together
inner_merged_total = pd.merge(vehicles, crashes, on=['CRASH_RECORD_ID'])
inner_merged_total.shape

(740710, 51)

In [109]:
inner_merged_total.shape

(740710, 51)

In [110]:
inner_merged_total = pd.merge(inner_merged_total, people, on=['CRASH_RECORD_ID'])

In [111]:
inner_merged_total.shape

(1020442, 68)

In [112]:
inner_merged_total.isna().sum().sum()

0

In [113]:
inner_merged_total.duplicated().sum()

0

In [114]:
inner_merged_total.columns

Index(['CRASH_UNIT_ID', 'CRASH_RECORD_ID', 'UNIT_NO', 'UNIT_TYPE', 'MAKE',
       'MODEL', 'VEHICLE_YEAR', 'VEHICLE_DEFECT', 'VEHICLE_TYPE',
       'VEHICLE_USE', 'TRAVEL_DIRECTION', 'MANEUVER', 'OCCUPANT_CNT',
       'FIRST_CONTACT_POINT', 'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE',
       'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
       'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT',
       'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE',
       'DAMAGE', 'DATE_POLICE_NOTIFIED', 'PRIM_CONTRIBUTORY_CAUSE',
       'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO', 'STREET_DIRECTION',
       'STREET_NAME', 'BEAT_OF_OCCURRENCE', 'NUM_UNITS', 'MOST_SEVERE_INJURY',
       'INJURIES_TOTAL', 'INJURIES_FATAL', 'INJURIES_INCAPACITATING',
       'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT',
       'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'LATITUDE', 'LONGITUDE',
       'DATE_ACCID

# Notes:

In [115]:
#plt.style.use('ggplot')

1) Chicago Car Crashes (Links to an external site.)
Note this links also to Vehicle Data (Links to an external site.) and to Driver/Passenger Data (Links to an external site.).

Build a classifier to predict the primary contributory cause of a car accident, given information about the car,
the people in the car,the road conditions etc. You might imagine your audience as a Vehicle Safety Board who's
interested in reducing traffic accidents, or as the City of Chicago who's interested in becoming aware of any
interesting patterns. Note that there is a multi-class classification problem. You will almost certainly want to bin 
or trim or otherwise limit the number of target categories on which you ultimately predict. Note e.g. that some
primary contributory causes have very few samples.

This criterion is linked to a Learning OutcomeBusiness Understanding

Explains the project's real-world value for a specific stakeholder

This criterion is linked to a Learning OutcomeData Understanding

Explicitly relates data's source and properties to real-world problem

This criterion is linked to a Learning OutcomeData Preparation

Data preparation is reproducible and well-documented with valid justifications

Modeling

Model development is iterative and documented with valid justifications

This criterion is linked to a Learning OutcomeClassification Results

Explains how well the project solves the real-world problem

This criterion is linked to a Learning OutcomeVisualization

Includes three polished visualizations of relevant findings

GitHub Repository

Repository uses all "best practices" for README, structure, and commits

Presentation Content

Presentation clearly shows stakeholders how well the project meets their needs

This criterion is linked to a Learning OutcomeSlide Style

Slides have a fully professional style

This criterion is linked to a Learning OutcomePresentation Delivery

Presentation engagingly and clearly delivers the intended content

This criterion is linked to a Learning OutcomeAnswers to Questions

Answers are fully clear and appropriate

# Modeling

In [116]:
crashes.MOST_SEVERE_INJURY.value_counts()

NO INDICATION OF INJURY        395211
NONINCAPACITATING INJURY        49506
INCAPACITATING INJURY/FATAL      7459
Name: MOST_SEVERE_INJURY, dtype: int64

In [117]:
crashes.head()

Unnamed: 0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,...,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,DATE_ACCIDENT,IS_A_HOLIDAY,HOLIDAY_NAME
0,4fd0a3e0897b3335b94cd8d5b2d2b350eb691add56c62d...,6,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,TURNING,ONE-WAY,STRAIGHT AND LEVEL,DRY,...,3.0,0.0,2,4,7,41.919664,-87.773288,2019-07-10 17:56:00,0,0
1,009e9e67203442370272e1a13d6ee51a4155dac65e583d...,6,STOP SIGN/FLASHER,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,...,3.0,0.0,2,6,6,41.741804,-87.740954,2017-06-30 16:00:00,0,0
2,ee9283eff3a55ac50ee58f3d9528ce1d689b1c4180b4c4...,5,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,FOUR WAY,STRAIGHT AND LEVEL,DRY,...,3.0,0.0,1,6,7,41.773456,-87.585022,2020-07-10 10:25:00,0,0
3,f8960f698e870ebdc60b521b2a141a5395556bc3704191...,5,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN (NOT RAISED),STRAIGHT AND LEVEL,DRY,...,3.0,0.0,0,7,7,41.802119,-87.622115,2020-07-11 01:00:00,0,0
5,00e47f189660cd8ba1e85fc63061bf1d8465184393f134...,5,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,...,2.0,0.0,3,5,3,41.741804,-87.740954,2019-03-21 22:50:00,0,0


In [118]:
crashes.columns

Index(['CRASH_RECORD_ID', 'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE',
       'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
       'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT',
       'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE',
       'DAMAGE', 'DATE_POLICE_NOTIFIED', 'PRIM_CONTRIBUTORY_CAUSE',
       'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO', 'STREET_DIRECTION',
       'STREET_NAME', 'BEAT_OF_OCCURRENCE', 'NUM_UNITS', 'MOST_SEVERE_INJURY',
       'INJURIES_TOTAL', 'INJURIES_FATAL', 'INJURIES_INCAPACITATING',
       'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT',
       'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'LATITUDE', 'LONGITUDE',
       'DATE_ACCIDENT', 'IS_A_HOLIDAY', 'HOLIDAY_NAME'],
      dtype='object')

In [119]:
X_model_features = ['IS_A_HOLIDAY', 'STREET_NO', 'DAMAGE', 'ROADWAY_SURFACE_COND', 'POSTED_SPEED_LIMIT', 
                    'WEATHER_CONDITION', 'LIGHTING_CONDITION']

In [120]:
X = crashes[X_model_features]

In [121]:
from sklearn.preprocessing import OneHotEncoder

In [122]:
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe.fit_transform(X.select_dtypes(include='object'))

array([[0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.]])

In [123]:
ohe.get_feature_names()

array(['x0_$500 OR LESS', 'x0_$501 - $1,500', 'x0_OVER $1,500', 'x1_DRY',
       'x1_ICE', 'x1_OTHER', 'x1_SAND, MUD, DIRT', 'x1_SNOW OR SLUSH',
       'x1_UNKNOWN', 'x1_WET', 'x2_CLEAR', 'x2_CLOUDY/OVERCAST',
       'x2_OTHER', 'x2_RAIN', 'x2_SNOW', 'x2_UNKNOWN', 'x3_DARKNESS',
       'x3_DARKNESS, LIGHTED ROAD', 'x3_DAWN', 'x3_DAYLIGHT', 'x3_DUSK',
       'x3_UNKNOWN'], dtype=object)

In [124]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score


kf = KFold()
precision_scores = []
for trained_indices, val_indices in kf.split(X, y):
    X_t = X.iloc[trained_indices]
    X_val = X.iloc[val_indices]
    y_t = y.iloc[trained_indices]
    y_val = y.iloc[val_indices]
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_t_cat = ohe.fit_transform(X_t.select_dtypes(include='object'))
    X_t_num = ohe.fit_transform(X_t.select_dtypes(exclude='object'))
    
    
    precision_score(average='weighted')
    
    dates_holidays crashes_holiday['DATE_REAL'] inner_merged_total = pd.merge(inner_merged_total, people, on=['CRASH_RECORD_ID'])

In [125]:
y = crashes['MOST_SEVERE_INJURY']

In [126]:
y.value_counts()

NO INDICATION OF INJURY        395211
NONINCAPACITATING INJURY        49506
INCAPACITATING INJURY/FATAL      7459
Name: MOST_SEVERE_INJURY, dtype: int64

In [127]:
dates_holidays.head(3)

Unnamed: 0,0
2012-01-02,New Years Day
2012-01-16,Martin Luther King Jr. Day
2012-02-20,Presidents Day


In [128]:
dates_holidays.index.name = 'DATE_REAL'

In [129]:
crashes_holiday.head(3)

Unnamed: 0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,...,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,DATE_ACCIDENT,IS_A_HOLIDAY,HOLIDAY_NAME,DATE_REAL
155,3bfec94191bdc869600d2c429c45a68871eb9733ce7ec6...,4,NO CONTROLS,NO CONTROLS,SNOW,DAYLIGHT,PARKED MOTOR VEHICLE,NOT DIVIDED,STRAIGHT AND LEVEL,SNOW OR SLUSH,...,0.0,1,6,11,41.932213,-87.653694,2017-11-10 12:45:00,1,1,2017-11-10
234,5e45d4537c4d0bd12bfa11a7f75526f4fe0d61c68a0ec1...,1,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,PARKING LOT,STRAIGHT AND LEVEL,WET,...,0.0,0,3,1,41.879714,-87.774496,2019-01-01 02:40:00,1,1,2019-01-01
456,c13c5b5561f60d0df2d3ee215a7bfdceb2ab82fa3806d8...,5,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,RAIN,DARKNESS,REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,WET,...,0.0,3,4,7,41.869729,-87.624268,2018-07-04 23:00:00,1,1,2018-07-04


In [130]:
final_holidays = pd.merge(dates_holidays, crashes_holiday, on=['DATE_REAL'])

In [131]:
final_holidays.head(3)

Unnamed: 0,DATE_REAL,0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,...,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,DATE_ACCIDENT,IS_A_HOLIDAY,HOLIDAY_NAME
0,2014-11-11,Veterans Day,b15daae7f809226ff0cfc93865ddcf9ff60edcf4187dc4...,5,NO CONTROLS,NO CONTROLS,UNKNOWN,UNKNOWN,PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN (NOT RAISED),...,1.0,0.0,3,3,11,41.99827,-87.662997,2014-11-11 20:00:00,1,1
1,2015-09-07,Labor Day,0335e90fd6f5f3d1d54bf4113afa1fe5c283b140bfaf6b...,5,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,SIDESWIPE SAME DIRECTION,ONE-WAY,...,2.0,0.0,2,2,9,41.88847,-87.629413,2015-09-07 16:45:00,1,1
2,2015-09-07,Labor Day,03f950b1dbe2177e773fca438b58da82ccfde31f9af265...,5,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,ONE-WAY,...,1.0,0.0,2,2,9,41.682311,-87.645861,2015-09-07 14:00:00,1,1


In [132]:
df = inner_merged_total.sort_values(by=['MOST_SEVERE_INJURY'], ascending=False)

In [133]:
df['MOST_SEVERE_INJURY'].value_counts()

NO INDICATION OF INJURY        864359
NONINCAPACITATING INJURY       137131
INCAPACITATING INJURY/FATAL     18952
Name: MOST_SEVERE_INJURY, dtype: int64

In [134]:
df['MOST_SEVERE_INJURY']

271515       NONINCAPACITATING INJURY
799590       NONINCAPACITATING INJURY
253194       NONINCAPACITATING INJURY
253195       NONINCAPACITATING INJURY
253196       NONINCAPACITATING INJURY
                     ...             
747095    INCAPACITATING INJURY/FATAL
506748    INCAPACITATING INJURY/FATAL
506747    INCAPACITATING INJURY/FATAL
747094    INCAPACITATING INJURY/FATAL
510221    INCAPACITATING INJURY/FATAL
Name: MOST_SEVERE_INJURY, Length: 1020442, dtype: object

In [135]:
df.shape

(1020442, 68)

In [136]:
df = df[118131:]

In [137]:
df['MOST_SEVERE_INJURY'] = np.where(df['MOST_SEVERE_INJURY']=='NO INDICATION OF INJURY',
                                         'aNO INDICATION OF INJURY', df['MOST_SEVERE_INJURY'])

In [138]:
df['MOST_SEVERE_INJURY'].value_counts()

aNO INDICATION OF INJURY       864359
NONINCAPACITATING INJURY        19000
INCAPACITATING INJURY/FATAL     18952
Name: MOST_SEVERE_INJURY, dtype: int64

In [139]:
df = df.sort_values(by=['MOST_SEVERE_INJURY'], ascending=False)

In [140]:
df['MOST_SEVERE_INJURY']

203997       aNO INDICATION OF INJURY
17205        aNO INDICATION OF INJURY
17193        aNO INDICATION OF INJURY
17180        aNO INDICATION OF INJURY
17181        aNO INDICATION OF INJURY
                     ...             
320703    INCAPACITATING INJURY/FATAL
108314    INCAPACITATING INJURY/FATAL
815407    INCAPACITATING INJURY/FATAL
815408    INCAPACITATING INJURY/FATAL
510221    INCAPACITATING INJURY/FATAL
Name: MOST_SEVERE_INJURY, Length: 902311, dtype: object

In [141]:
df = df[845359:]

In [142]:
df['MOST_SEVERE_INJURY'].value_counts()

aNO INDICATION OF INJURY       19000
NONINCAPACITATING INJURY       19000
INCAPACITATING INJURY/FATAL    18952
Name: MOST_SEVERE_INJURY, dtype: int64

In [143]:
df['MOST_SEVERE_INJURY'] = np.where(df['MOST_SEVERE_INJURY']=='aNO INDICATION OF INJURY',
                                         'NO INDICATION OF INJURY', df['MOST_SEVERE_INJURY'])

In [144]:
df.isna().sum().sum()

0

In [145]:
df.head(3)

Unnamed: 0,CRASH_UNIT_ID,CRASH_RECORD_ID,UNIT_NO,UNIT_TYPE,MAKE,MODEL,VEHICLE_YEAR,VEHICLE_DEFECT,VEHICLE_TYPE,VEHICLE_USE,...,SEX,AGE,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,EJECTION,INJURY_CLASSIFICATION,DRIVER_ACTION,DRIVER_VISION,PHYSICAL_CONDITION,BAC_RESULT
915909,792811,5ea48c577199355af854c24b15801ffbab4225ab0c5a3d...,2,DRIVER,FORD,TAURUS,2013.0,UNKNOWN,PASSENGER,PERSONAL,...,F,60.0,USAGE UNKNOWN,DID NOT DEPLOY,NONE,NO INDICATION OF INJURY,UNKNOWN,UNKNOWN,UNKNOWN,TEST NOT OFFERED
933899,809434,d40dfc549fbe7322fc518e2663056326ce56b3fd0e8ef5...,2,DRIVER,FORD,Focus,2013.0,NONE,PASSENGER,PERSONAL,...,M,47.0,SAFETY BELT USED,DID NOT DEPLOY,NONE,NO INDICATION OF INJURY,FOLLOWED TOO CLOSELY,UNKNOWN,NORMAL,TEST NOT OFFERED
933939,809479,1d69a100333f07cea11211024e928317060efba3493f59...,1,DRIVER,NISSAN,SENTRA (DATSUN AND NISSAN HAVE MERGED),2009.0,UNKNOWN,PASSENGER,PERSONAL,...,M,26.0,SAFETY BELT USED,DID NOT DEPLOY,NONE,NO INDICATION OF INJURY,OTHER,NOT OBSCURED,NORMAL,TEST NOT OFFERED


In [146]:
df.shape

(56952, 68)

In [147]:
df_a = df

In [148]:
def basic_info(data):
    print("Dataset shape is: ", data.shape)
    print("Dataset size is: ", data.size)
    print("Dataset columns are: ", data.columns)
    print("Dataset info is: ", data.info())
    categorical = []
    numerical = []
    for i in data.columns:
        if data[i].dtype == object:
            categorical.append(i)
        else:
            numerical.append(i)
    print("Categorical variables are:\n ", categorical)
    print("Numerical variables are:\n ", numerical)
    return categorical, numerical

In [149]:
categorical, numerical = basic_info(df_a)

Dataset shape is:  (56952, 68)
Dataset size is:  3872736
Dataset columns are:  Index(['CRASH_UNIT_ID', 'CRASH_RECORD_ID', 'UNIT_NO', 'UNIT_TYPE', 'MAKE',
       'MODEL', 'VEHICLE_YEAR', 'VEHICLE_DEFECT', 'VEHICLE_TYPE',
       'VEHICLE_USE', 'TRAVEL_DIRECTION', 'MANEUVER', 'OCCUPANT_CNT',
       'FIRST_CONTACT_POINT', 'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE',
       'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
       'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT',
       'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE',
       'DAMAGE', 'DATE_POLICE_NOTIFIED', 'PRIM_CONTRIBUTORY_CAUSE',
       'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO', 'STREET_DIRECTION',
       'STREET_NAME', 'BEAT_OF_OCCURRENCE', 'NUM_UNITS', 'MOST_SEVERE_INJURY',
       'INJURIES_TOTAL', 'INJURIES_FATAL', 'INJURIES_INCAPACITATING',
       'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT',
       'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN', 'CRASH_HOUR',
       

In [150]:
df_a2 = df_a.copy(deep = True)

In [151]:
df_a2.isnull().sum().sum()

0

In [152]:
#categorical2

In [153]:
#numerical2

In [154]:
#df2 = df[numerical2]

In [155]:
#df2 = df2.drop(columns=['DATE_ACCIDENT', 'VEHICLE_ID', 'HOLIDAY_NAME', 'LONGITUDE', 'LATITUDE', 'BEAT_OF_OCCURRENCE', 'VEHICLE_YEAR'])

In [156]:
#df2

In [157]:
#df2.hist(figsize = [20,20], bins = 50)
#plt.show()

# MODEL

In [158]:
df1 = df.drop(columns=['MOST_SEVERE_INJURY'])

In [159]:
categoricalx, numericalx = basic_info(df1)

Dataset shape is:  (56952, 67)
Dataset size is:  3815784
Dataset columns are:  Index(['CRASH_UNIT_ID', 'CRASH_RECORD_ID', 'UNIT_NO', 'UNIT_TYPE', 'MAKE',
       'MODEL', 'VEHICLE_YEAR', 'VEHICLE_DEFECT', 'VEHICLE_TYPE',
       'VEHICLE_USE', 'TRAVEL_DIRECTION', 'MANEUVER', 'OCCUPANT_CNT',
       'FIRST_CONTACT_POINT', 'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE',
       'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
       'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT',
       'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE',
       'DAMAGE', 'DATE_POLICE_NOTIFIED', 'PRIM_CONTRIBUTORY_CAUSE',
       'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO', 'STREET_DIRECTION',
       'STREET_NAME', 'BEAT_OF_OCCURRENCE', 'NUM_UNITS', 'INJURIES_TOTAL',
       'INJURIES_FATAL', 'INJURIES_INCAPACITATING',
       'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT',
       'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', '

In [160]:
df3 = df1[categoricalx]

In [161]:
deletar = []
for x in df3.columns:
    print("Unique Values: {} :  {} Column Name.".format(len(df3[x].value_counts()), x))
    if len(df3[x].value_counts()) > 50:
        deletar.append(x)

Unique Values: 18081 :  CRASH_RECORD_ID Column Name.
Unique Values: 4 :  UNIT_TYPE Column Name.
Unique Values: 202 :  MAKE Column Name.
Unique Values: 1020 :  MODEL Column Name.
Unique Values: 14 :  VEHICLE_DEFECT Column Name.
Unique Values: 21 :  VEHICLE_TYPE Column Name.
Unique Values: 23 :  VEHICLE_USE Column Name.
Unique Values: 9 :  TRAVEL_DIRECTION Column Name.
Unique Values: 28 :  MANEUVER Column Name.
Unique Values: 14 :  FIRST_CONTACT_POINT Column Name.
Unique Values: 19 :  TRAFFIC_CONTROL_DEVICE Column Name.
Unique Values: 8 :  DEVICE_CONDITION Column Name.
Unique Values: 6 :  WEATHER_CONDITION Column Name.
Unique Values: 6 :  LIGHTING_CONDITION Column Name.
Unique Values: 18 :  FIRST_CRASH_TYPE Column Name.
Unique Values: 20 :  TRAFFICWAY_TYPE Column Name.
Unique Values: 6 :  ALIGNMENT Column Name.
Unique Values: 7 :  ROADWAY_SURFACE_COND Column Name.
Unique Values: 7 :  ROAD_DEFECT Column Name.
Unique Values: 3 :  REPORT_TYPE Column Name.
Unique Values: 2 :  CRASH_TYPE Colu

In [162]:
deletar

['CRASH_RECORD_ID',
 'MAKE',
 'MODEL',
 'DATE_POLICE_NOTIFIED',
 'STREET_NAME',
 'PERSON_ID',
 'RD_NO',
 'CRASH_DATE',
 'CITY']

In [163]:
df4 = df3.drop(columns=['CRASH_RECORD_ID',
 'MAKE',
 'MODEL',
 'DATE_POLICE_NOTIFIED',
 'STREET_NAME',
 'PERSON_ID',
 'RD_NO',
 'CRASH_DATE',
 'CITY'])

In [164]:
categoricalx, numericalx = basic_info(df4)

Dataset shape is:  (56952, 33)
Dataset size is:  1879416
Dataset columns are:  Index(['UNIT_TYPE', 'VEHICLE_DEFECT', 'VEHICLE_TYPE', 'VEHICLE_USE',
       'TRAVEL_DIRECTION', 'MANEUVER', 'FIRST_CONTACT_POINT',
       'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION',
       'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE',
       'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE',
       'CRASH_TYPE', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE',
       'SEC_CONTRIBUTORY_CAUSE', 'STREET_DIRECTION', 'PERSON_TYPE', 'STATE',
       'SEX', 'SAFETY_EQUIPMENT', 'AIRBAG_DEPLOYED', 'EJECTION',
       'INJURY_CLASSIFICATION', 'DRIVER_ACTION', 'DRIVER_VISION',
       'PHYSICAL_CONDITION', 'BAC_RESULT'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 56952 entries, 915909 to 510221
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   UNIT_TYPE       

In [165]:
def making_new_df(data, columnlist):
    for i in columnlist:
        dummy = pd.get_dummies(data[i])
        #print(dummy)
        del dummy[dummy.columns[-1]]
        data = pd.concat([data, dummy], axis = 1)
    return data

In [166]:
df5 = making_new_df(df4, categoricalx)

In [167]:
list(df5.columns)

['UNIT_TYPE',
 'VEHICLE_DEFECT',
 'VEHICLE_TYPE',
 'VEHICLE_USE',
 'TRAVEL_DIRECTION',
 'MANEUVER',
 'FIRST_CONTACT_POINT',
 'TRAFFIC_CONTROL_DEVICE',
 'DEVICE_CONDITION',
 'WEATHER_CONDITION',
 'LIGHTING_CONDITION',
 'FIRST_CRASH_TYPE',
 'TRAFFICWAY_TYPE',
 'ALIGNMENT',
 'ROADWAY_SURFACE_COND',
 'ROAD_DEFECT',
 'REPORT_TYPE',
 'CRASH_TYPE',
 'DAMAGE',
 'PRIM_CONTRIBUTORY_CAUSE',
 'SEC_CONTRIBUTORY_CAUSE',
 'STREET_DIRECTION',
 'PERSON_TYPE',
 'STATE',
 'SEX',
 'SAFETY_EQUIPMENT',
 'AIRBAG_DEPLOYED',
 'EJECTION',
 'INJURY_CLASSIFICATION',
 'DRIVER_ACTION',
 'DRIVER_VISION',
 'PHYSICAL_CONDITION',
 'BAC_RESULT',
 'DISABLED VEHICLE',
 'DRIVER',
 'DRIVERLESS',
 'BRAKES',
 'CARGO',
 'ENGINE/MOTOR',
 'FUEL SYSTEM',
 'LIGHTS',
 'NONE',
 'OTHER',
 'SIGNALS',
 'STEERING',
 'SUSPENSION',
 'TIRES',
 'UNKNOWN',
 'WHEELS',
 '3-WHEELED MOTORCYCLE (2 REAR WHEELS)',
 'ALL-TERRAIN VEHICLE (ATV)',
 'AUTOCYCLE',
 'BUS OVER 15 PASS.',
 'BUS UP TO 15 PASS.',
 'FARM EQUIPMENT',
 'MOPED OR MOTORIZED BICYCLE

In [168]:
df_a2 = df_a2.drop(columns=['CRASH_RECORD_ID', 'MODEL', 'DATE_POLICE_NOTIFIED', 'STREET_NAME', 'PERSON_ID', 'RD_NO', 'CRASH_DATE', 'CITY', 'STATE'])

In [169]:
df_a2 = df_a2.drop(columns=['CRASH_UNIT_ID',
 'MAKE',
 'VEHICLE_YEAR',
 'STREET_NO',
 'BEAT_OF_OCCURRENCE',
 'LATITUDE',
 'LONGITUDE',
 'DATE_ACCIDENT',
 'VEHICLE_ID'])

In [170]:
categorical2 = df_a2.columns

In [171]:
df_a3 = making_new_df(df_a2, categorical2)

In [172]:
deletar = []
for x in df_a3.columns:
    print("Unique Values: {} :  {} Column Name.".format(len(df_a2[x].value_counts()), x))
    if len(df_a2[x].value_counts()) > 50:
        deletar.append(x)

Unique Values: 11 :  UNIT_NO Column Name.
Unique Values: 4 :  UNIT_TYPE Column Name.
Unique Values: 14 :  VEHICLE_DEFECT Column Name.
Unique Values: 21 :  VEHICLE_TYPE Column Name.
Unique Values: 23 :  VEHICLE_USE Column Name.
Unique Values: 9 :  TRAVEL_DIRECTION Column Name.
Unique Values: 28 :  MANEUVER Column Name.
Unique Values: 19 :  OCCUPANT_CNT Column Name.
Unique Values: 14 :  FIRST_CONTACT_POINT Column Name.
Unique Values: 9 :  POSTED_SPEED_LIMIT Column Name.
Unique Values: 19 :  TRAFFIC_CONTROL_DEVICE Column Name.
Unique Values: 8 :  DEVICE_CONDITION Column Name.
Unique Values: 6 :  WEATHER_CONDITION Column Name.
Unique Values: 6 :  LIGHTING_CONDITION Column Name.
Unique Values: 18 :  FIRST_CRASH_TYPE Column Name.
Unique Values: 20 :  TRAFFICWAY_TYPE Column Name.
Unique Values: 6 :  ALIGNMENT Column Name.
Unique Values: 7 :  ROADWAY_SURFACE_COND Column Name.
Unique Values: 7 :  ROAD_DEFECT Column Name.
Unique Values: 3 :  REPORT_TYPE Column Name.
Unique Values: 2 :  CRASH_TYP

KeyError: 1

In [173]:
deletar

['AGE']

In [174]:
df_a3.head(3)

Unnamed: 0,UNIT_NO,UNIT_TYPE,VEHICLE_DEFECT,VEHICLE_TYPE,VEHICLE_USE,TRAVEL_DIRECTION,MANEUVER,OCCUPANT_CNT,FIRST_CONTACT_POINT,POSTED_SPEED_LIMIT,...,IMPAIRED - ALCOHOL,IMPAIRED - ALCOHOL AND DRUGS,IMPAIRED - DRUGS,MEDICATED,NORMAL,OTHER,REMOVED BY EMS,TEST NOT OFFERED,"TEST PERFORMED, RESULTS UNKNOWN",TEST REFUSED
915909,2,DRIVER,UNKNOWN,PASSENGER,PERSONAL,NW,TURNING LEFT,1.0,OTHER,6,...,0,0,0,0,0,0,0,1,0,0
933899,2,DRIVER,NONE,PASSENGER,PERSONAL,S,STRAIGHT AHEAD,1.0,REAR-LEFT,5,...,0,0,0,0,1,0,0,1,0,0
933939,1,DRIVER,UNKNOWN,PASSENGER,PERSONAL,UNKNOWN,LEAVING TRAFFIC LANE TO PARK,1.0,SIDE-RIGHT,2,...,0,0,0,0,1,0,0,1,0,0


In [175]:
df_a3.isna().sum().sum()

0

In [176]:
df_a3.duplicated().sum()

81

In [177]:
df_a3.drop_duplicates(inplace=True)

In [178]:
# ALL OVER AGAIN




In [179]:
df_a3

Unnamed: 0,UNIT_NO,UNIT_TYPE,VEHICLE_DEFECT,VEHICLE_TYPE,VEHICLE_USE,TRAVEL_DIRECTION,MANEUVER,OCCUPANT_CNT,FIRST_CONTACT_POINT,POSTED_SPEED_LIMIT,...,IMPAIRED - ALCOHOL,IMPAIRED - ALCOHOL AND DRUGS,IMPAIRED - DRUGS,MEDICATED,NORMAL,OTHER,REMOVED BY EMS,TEST NOT OFFERED,"TEST PERFORMED, RESULTS UNKNOWN",TEST REFUSED
915909,2,DRIVER,UNKNOWN,PASSENGER,PERSONAL,NW,TURNING LEFT,1.0,OTHER,6,...,0,0,0,0,0,0,0,1,0,0
933899,2,DRIVER,NONE,PASSENGER,PERSONAL,S,STRAIGHT AHEAD,1.0,REAR-LEFT,5,...,0,0,0,0,1,0,0,1,0,0
933939,1,DRIVER,UNKNOWN,PASSENGER,PERSONAL,UNKNOWN,LEAVING TRAFFIC LANE TO PARK,1.0,SIDE-RIGHT,2,...,0,0,0,0,1,0,0,1,0,0
933941,1,DRIVER,NONE,PASSENGER,PERSONAL,SE,TURNING RIGHT,1.0,FRONT,5,...,0,0,0,0,1,0,0,1,0,0
916097,2,DRIVER,UNKNOWN,PASSENGER,PERSONAL,N,STRAIGHT AHEAD,1.0,ROOF,5,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320703,1,DRIVER,UNKNOWN,PASSENGER,PERSONAL,S,TURNING LEFT,1.0,FRONT-LEFT,5,...,0,0,0,0,1,0,0,1,0,0
108314,1,DRIVER,UNKNOWN,PASSENGER,UNKNOWN/NA,S,SLOW/STOP IN TRAFFIC,1.0,FRONT,8,...,0,0,0,0,1,0,0,0,1,0
815407,3,PARKED,NONE,PASSENGER,NOT IN USE,E,PARKED,0.0,ROOF,5,...,0,0,0,0,0,0,0,1,0,0
815408,3,PARKED,NONE,PASSENGER,NOT IN USE,E,PARKED,0.0,ROOF,5,...,0,0,0,0,1,0,0,1,0,0


In [180]:
list(df_a3.dtypes)

[dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('float64'),
 dtype('O'),
 CategoricalDtype(categories=[0, 1, 2, 3, 4, 5, 6, 7, 8], ordered=True),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('O'),
 dtype('float64'),
 dtype('int32'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 CategoricalDtype(categories=[0, 1, 2, 3], ordered=True),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('float64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),

In [181]:
list(df_a3.columns)

['UNIT_NO',
 'UNIT_TYPE',
 'VEHICLE_DEFECT',
 'VEHICLE_TYPE',
 'VEHICLE_USE',
 'TRAVEL_DIRECTION',
 'MANEUVER',
 'OCCUPANT_CNT',
 'FIRST_CONTACT_POINT',
 'POSTED_SPEED_LIMIT',
 'TRAFFIC_CONTROL_DEVICE',
 'DEVICE_CONDITION',
 'WEATHER_CONDITION',
 'LIGHTING_CONDITION',
 'FIRST_CRASH_TYPE',
 'TRAFFICWAY_TYPE',
 'ALIGNMENT',
 'ROADWAY_SURFACE_COND',
 'ROAD_DEFECT',
 'REPORT_TYPE',
 'CRASH_TYPE',
 'DAMAGE',
 'PRIM_CONTRIBUTORY_CAUSE',
 'SEC_CONTRIBUTORY_CAUSE',
 'STREET_DIRECTION',
 'NUM_UNITS',
 'MOST_SEVERE_INJURY',
 'INJURIES_TOTAL',
 'INJURIES_FATAL',
 'INJURIES_INCAPACITATING',
 'INJURIES_NON_INCAPACITATING',
 'INJURIES_REPORTED_NOT_EVIDENT',
 'INJURIES_NO_INDICATION',
 'INJURIES_UNKNOWN',
 'CRASH_HOUR',
 'CRASH_DAY_OF_WEEK',
 'CRASH_MONTH',
 'IS_A_HOLIDAY',
 'HOLIDAY_NAME',
 'PERSON_TYPE',
 'SEX',
 'AGE',
 'SAFETY_EQUIPMENT',
 'AIRBAG_DEPLOYED',
 'EJECTION',
 'INJURY_CLASSIFICATION',
 'DRIVER_ACTION',
 'DRIVER_VISION',
 'PHYSICAL_CONDITION',
 'BAC_RESULT',
 1,
 2,
 3,
 4,
 5,
 6,
 7,

In [182]:
df_a4 = df_a3.drop(columns=['UNIT_TYPE',
 'VEHICLE_DEFECT',
 'VEHICLE_TYPE',
 'VEHICLE_USE',
 'TRAVEL_DIRECTION',
 'MANEUVER', 'FIRST_CONTACT_POINT', 'TRAFFIC_CONTROL_DEVICE',
 'DEVICE_CONDITION',
 'WEATHER_CONDITION',
 'LIGHTING_CONDITION',
 'FIRST_CRASH_TYPE',
 'TRAFFICWAY_TYPE',
 'ALIGNMENT',
 'ROADWAY_SURFACE_COND',
 'ROAD_DEFECT',
 'REPORT_TYPE',
 'CRASH_TYPE',
 'DAMAGE',
 'PRIM_CONTRIBUTORY_CAUSE',
 'SEC_CONTRIBUTORY_CAUSE',
 'STREET_DIRECTION', 'MOST_SEVERE_INJURY'])

In [183]:
df_a4.shape

(56871, 592)

In [184]:
list(df_a4.dtypes)

[dtype('int64'),
 dtype('float64'),
 CategoricalDtype(categories=[0, 1, 2, 3, 4, 5, 6, 7, 8], ordered=True),
 dtype('int64'),
 dtype('float64'),
 dtype('int32'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 CategoricalDtype(categories=[0, 1, 2, 3], ordered=True),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('float64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('uint8'),
 dtype('u

In [185]:
list(df_a4.columns)

['UNIT_NO',
 'OCCUPANT_CNT',
 'POSTED_SPEED_LIMIT',
 'NUM_UNITS',
 'INJURIES_TOTAL',
 'INJURIES_FATAL',
 'INJURIES_INCAPACITATING',
 'INJURIES_NON_INCAPACITATING',
 'INJURIES_REPORTED_NOT_EVIDENT',
 'INJURIES_NO_INDICATION',
 'INJURIES_UNKNOWN',
 'CRASH_HOUR',
 'CRASH_DAY_OF_WEEK',
 'CRASH_MONTH',
 'IS_A_HOLIDAY',
 'HOLIDAY_NAME',
 'PERSON_TYPE',
 'SEX',
 'AGE',
 'SAFETY_EQUIPMENT',
 'AIRBAG_DEPLOYED',
 'EJECTION',
 'INJURY_CLASSIFICATION',
 'DRIVER_ACTION',
 'DRIVER_VISION',
 'PHYSICAL_CONDITION',
 'BAC_RESULT',
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 'DISABLED VEHICLE',
 'DRIVER',
 'DRIVERLESS',
 'BRAKES',
 'CARGO',
 'ENGINE/MOTOR',
 'FUEL SYSTEM',
 'LIGHTS',
 'NONE',
 'OTHER',
 'SIGNALS',
 'STEERING',
 'SUSPENSION',
 'TIRES',
 'UNKNOWN',
 'WHEELS',
 '3-WHEELED MOTORCYCLE (2 REAR WHEELS)',
 'ALL-TERRAIN VEHICLE (ATV)',
 'AUTOCYCLE',
 'BUS OVER 15 PASS.',
 'BUS UP TO 15 PASS.',
 'FARM EQUIPMENT',
 'MOPED OR MOTORIZED BICYCLE',
 'MOTOR DRIVEN CYCLE',
 'MOTORCYCLE (OVER 150CC)',
 'OTH

In [186]:
df['MOST_SEVERE_INJURY']

915909        NO INDICATION OF INJURY
933899        NO INDICATION OF INJURY
933939        NO INDICATION OF INJURY
933941        NO INDICATION OF INJURY
916097        NO INDICATION OF INJURY
                     ...             
320703    INCAPACITATING INJURY/FATAL
108314    INCAPACITATING INJURY/FATAL
815407    INCAPACITATING INJURY/FATAL
815408    INCAPACITATING INJURY/FATAL
510221    INCAPACITATING INJURY/FATAL
Name: MOST_SEVERE_INJURY, Length: 56952, dtype: object

In [187]:
df.shape

(56952, 68)

In [188]:
ds = df.drop(columns=['MOST_SEVERE_INJURY'])

In [189]:
ds.shape

(56952, 67)

In [190]:
def shallwedelete(data):
    deletar = []
    for x in data.columns:
        print("Unique Values: {} :  {} Column Name.".format(len(data[x].value_counts()), x))
        if len(data[x].value_counts()) > 50:
            deletar.append(x)

In [191]:
deletar = []
for x in ds.columns:
    print("Unique Values: {} :  {} Column Name.".format(len(ds[x].value_counts()), x))
    if len(ds[x].value_counts()) > 50:
            deletar.append(x)

Unique Values: 32999 :  CRASH_UNIT_ID Column Name.
Unique Values: 18081 :  CRASH_RECORD_ID Column Name.
Unique Values: 11 :  UNIT_NO Column Name.
Unique Values: 4 :  UNIT_TYPE Column Name.
Unique Values: 202 :  MAKE Column Name.
Unique Values: 1020 :  MODEL Column Name.
Unique Values: 67 :  VEHICLE_YEAR Column Name.
Unique Values: 14 :  VEHICLE_DEFECT Column Name.
Unique Values: 21 :  VEHICLE_TYPE Column Name.
Unique Values: 23 :  VEHICLE_USE Column Name.
Unique Values: 9 :  TRAVEL_DIRECTION Column Name.
Unique Values: 28 :  MANEUVER Column Name.
Unique Values: 19 :  OCCUPANT_CNT Column Name.
Unique Values: 14 :  FIRST_CONTACT_POINT Column Name.
Unique Values: 9 :  POSTED_SPEED_LIMIT Column Name.
Unique Values: 19 :  TRAFFIC_CONTROL_DEVICE Column Name.
Unique Values: 8 :  DEVICE_CONDITION Column Name.
Unique Values: 6 :  WEATHER_CONDITION Column Name.
Unique Values: 6 :  LIGHTING_CONDITION Column Name.
Unique Values: 18 :  FIRST_CRASH_TYPE Column Name.
Unique Values: 20 :  TRAFFICWAY_T

In [192]:
deletar

['CRASH_UNIT_ID',
 'CRASH_RECORD_ID',
 'MAKE',
 'MODEL',
 'VEHICLE_YEAR',
 'DATE_POLICE_NOTIFIED',
 'STREET_NO',
 'STREET_NAME',
 'BEAT_OF_OCCURRENCE',
 'LATITUDE',
 'LONGITUDE',
 'DATE_ACCIDENT',
 'PERSON_ID',
 'RD_NO',
 'VEHICLE_ID',
 'CRASH_DATE',
 'CITY',
 'AGE']

In [193]:
ds1 = ds.drop(columns=['CRASH_UNIT_ID',
 'CRASH_RECORD_ID',
 'MAKE',
 'MODEL',
 'VEHICLE_YEAR',
 'DATE_POLICE_NOTIFIED',
 'STREET_NO',
 'STREET_NAME',
 'BEAT_OF_OCCURRENCE',
 'LATITUDE',
 'LONGITUDE',
 'DATE_ACCIDENT',
 'PERSON_ID',
 'RD_NO',
 'VEHICLE_ID',
 'CRASH_DATE',
 'CITY',
 'AGE'])

In [194]:
ds1.columns

Index(['UNIT_NO', 'UNIT_TYPE', 'VEHICLE_DEFECT', 'VEHICLE_TYPE', 'VEHICLE_USE',
       'TRAVEL_DIRECTION', 'MANEUVER', 'OCCUPANT_CNT', 'FIRST_CONTACT_POINT',
       'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT',
       'REPORT_TYPE', 'CRASH_TYPE', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE',
       'SEC_CONTRIBUTORY_CAUSE', 'STREET_DIRECTION', 'NUM_UNITS',
       'INJURIES_TOTAL', 'INJURIES_FATAL', 'INJURIES_INCAPACITATING',
       'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT',
       'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'IS_A_HOLIDAY', 'HOLIDAY_NAME',
       'PERSON_TYPE', 'STATE', 'SEX', 'SAFETY_EQUIPMENT', 'AIRBAG_DEPLOYED',
       'EJECTION', 'INJURY_CLASSIFICATION', 'DRIVER_ACTION', 'DRIVER_VISION',
       'PHYSICAL_CONDITION', 'BAC_RESULT']

In [195]:
ds1 = ds1.drop(columns=['INJURIES_TOTAL', 'INJURIES_FATAL', 'INJURIES_INCAPACITATING',
       'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT',
       'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN'])

In [196]:
ds1 = ds1.drop(columns=['INJURY_CLASSIFICATION'])

In [197]:
ds1.head()

Unnamed: 0,UNIT_NO,UNIT_TYPE,VEHICLE_DEFECT,VEHICLE_TYPE,VEHICLE_USE,TRAVEL_DIRECTION,MANEUVER,OCCUPANT_CNT,FIRST_CONTACT_POINT,POSTED_SPEED_LIMIT,...,PERSON_TYPE,STATE,SEX,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,EJECTION,DRIVER_ACTION,DRIVER_VISION,PHYSICAL_CONDITION,BAC_RESULT
915909,2,DRIVER,UNKNOWN,PASSENGER,PERSONAL,NW,TURNING LEFT,1.0,OTHER,6,...,DRIVER,IL,F,USAGE UNKNOWN,DID NOT DEPLOY,NONE,UNKNOWN,UNKNOWN,UNKNOWN,TEST NOT OFFERED
933899,2,DRIVER,NONE,PASSENGER,PERSONAL,S,STRAIGHT AHEAD,1.0,REAR-LEFT,5,...,DRIVER,IL,M,SAFETY BELT USED,DID NOT DEPLOY,NONE,FOLLOWED TOO CLOSELY,UNKNOWN,NORMAL,TEST NOT OFFERED
933939,1,DRIVER,UNKNOWN,PASSENGER,PERSONAL,UNKNOWN,LEAVING TRAFFIC LANE TO PARK,1.0,SIDE-RIGHT,2,...,DRIVER,IL,M,SAFETY BELT USED,DID NOT DEPLOY,NONE,OTHER,NOT OBSCURED,NORMAL,TEST NOT OFFERED
933941,1,DRIVER,NONE,PASSENGER,PERSONAL,SE,TURNING RIGHT,1.0,FRONT,5,...,DRIVER,IL,F,SAFETY BELT USED,DID NOT DEPLOY,NONE,NONE,NOT OBSCURED,NORMAL,TEST NOT OFFERED
916097,2,DRIVER,UNKNOWN,PASSENGER,PERSONAL,N,STRAIGHT AHEAD,1.0,ROOF,5,...,DRIVER,IL,F,USAGE UNKNOWN,DEPLOYMENT UNKNOWN,NONE,UNKNOWN,UNKNOWN,UNKNOWN,TEST NOT OFFERED


In [198]:
ds1.dtypes

UNIT_NO                       int64
UNIT_TYPE                    object
VEHICLE_DEFECT               object
VEHICLE_TYPE                 object
VEHICLE_USE                  object
TRAVEL_DIRECTION             object
MANEUVER                     object
OCCUPANT_CNT                float64
FIRST_CONTACT_POINT          object
POSTED_SPEED_LIMIT         category
TRAFFIC_CONTROL_DEVICE       object
DEVICE_CONDITION             object
WEATHER_CONDITION            object
LIGHTING_CONDITION           object
FIRST_CRASH_TYPE             object
TRAFFICWAY_TYPE              object
ALIGNMENT                    object
ROADWAY_SURFACE_COND         object
ROAD_DEFECT                  object
REPORT_TYPE                  object
CRASH_TYPE                   object
DAMAGE                       object
PRIM_CONTRIBUTORY_CAUSE      object
SEC_CONTRIBUTORY_CAUSE       object
STREET_DIRECTION             object
NUM_UNITS                     int64
CRASH_HOUR                 category
CRASH_DAY_OF_WEEK           

In [199]:
#'UNIT_TYPE, VEHICLE_DEFECT, VEHICLE_TYPE, VEHICLE_USE, TRAVEL_DIRECTION, MANEUVER, PERSON_TYPE, STATE, SEX, SAFETY_EQUIPMENT,


In [200]:
categoricalz, numericalz = basic_info(ds1)

Dataset shape is:  (56952, 41)
Dataset size is:  2335032
Dataset columns are:  Index(['UNIT_NO', 'UNIT_TYPE', 'VEHICLE_DEFECT', 'VEHICLE_TYPE', 'VEHICLE_USE',
       'TRAVEL_DIRECTION', 'MANEUVER', 'OCCUPANT_CNT', 'FIRST_CONTACT_POINT',
       'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT',
       'REPORT_TYPE', 'CRASH_TYPE', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE',
       'SEC_CONTRIBUTORY_CAUSE', 'STREET_DIRECTION', 'NUM_UNITS', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'IS_A_HOLIDAY', 'HOLIDAY_NAME',
       'PERSON_TYPE', 'STATE', 'SEX', 'SAFETY_EQUIPMENT', 'AIRBAG_DEPLOYED',
       'EJECTION', 'DRIVER_ACTION', 'DRIVER_VISION', 'PHYSICAL_CONDITION',
       'BAC_RESULT'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 56952 entries, 915909 to 510221
Data columns (total 41 columns):
 

In [201]:
def making_new_df(data, columnlist):
    for i in columnlist:
        dummy = pd.get_dummies(data[i])
        #print(dummy)
        del dummy[dummy.columns[-1]]
        data = pd.concat([data, dummy], axis = 1)
    return data

In [202]:
ds2 = making_new_df(ds1, categoricalz)

In [203]:
ds2 = ds2.drop(categoricalz, axis = 1)

In [204]:
ds2

Unnamed: 0,UNIT_NO,OCCUPANT_CNT,POSTED_SPEED_LIMIT,NUM_UNITS,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,IS_A_HOLIDAY,HOLIDAY_NAME,DISABLED VEHICLE,...,IMPAIRED - ALCOHOL,IMPAIRED - ALCOHOL AND DRUGS,IMPAIRED - DRUGS,MEDICATED,NORMAL,OTHER,REMOVED BY EMS,TEST NOT OFFERED,"TEST PERFORMED, RESULTS UNKNOWN",TEST REFUSED
915909,2,1.0,6,2,2,7,11,0,0,0,...,0,0,0,0,0,0,0,1,0,0
933899,2,1.0,5,2,2,2,12,0,0,0,...,0,0,0,0,1,0,0,1,0,0
933939,1,1.0,2,2,1,3,12,0,0,0,...,0,0,0,0,1,0,0,1,0,0
933941,1,1.0,5,2,2,3,12,0,0,0,...,0,0,0,0,1,0,0,1,0,0
916097,2,1.0,5,2,0,7,11,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320703,1,1.0,5,2,1,6,10,0,0,0,...,0,0,0,0,1,0,0,1,0,0
108314,1,1.0,8,2,2,3,4,0,0,0,...,0,0,0,0,1,0,0,0,1,0
815407,3,0.0,5,4,1,7,7,0,0,0,...,0,0,0,0,0,0,0,1,0,0
815408,3,0.0,5,4,1,7,7,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [205]:
ds2.isna().sum().sum()

0

In [206]:
dfs = df

In [207]:
dfs['MOST_SEVERE_INJURY'].value_counts()

NO INDICATION OF INJURY        19000
NONINCAPACITATING INJURY       19000
INCAPACITATING INJURY/FATAL    18952
Name: MOST_SEVERE_INJURY, dtype: int64

In [208]:
dfs['MOST_SEVERE_INJURY'] = np.where(dfs['MOST_SEVERE_INJURY']=='NONINCAPACITATING INJURY', 0, dfs['MOST_SEVERE_INJURY'])

In [209]:
dfs['MOST_SEVERE_INJURY'] = np.where(dfs['MOST_SEVERE_INJURY']=='NO INDICATION OF INJURY', 1, dfs['MOST_SEVERE_INJURY'])

In [210]:
dfs['MOST_SEVERE_INJURY'] = np.where(dfs['MOST_SEVERE_INJURY']=='INCAPACITATING INJURY/FATAL', 2, dfs['MOST_SEVERE_INJURY'])

In [211]:
Y = dfs['MOST_SEVERE_INJURY']

In [212]:
Y.value_counts()

1    19000
0    19000
2    18952
Name: MOST_SEVERE_INJURY, dtype: int64

In [213]:
Y=Y.astype('int')

In [214]:
Y.shape

(56952,)

In [215]:
X = ds2

In [216]:
X.shape

(56952, 399)

In [217]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size = 0.20, random_state=42)

In [218]:
LR = LogisticRegression(max_iter=5000)

In [219]:
model_lr = LR.fit(train_x, train_y)

In [220]:
y_lr_predict = model_lr.predict(test_x)

In [221]:
LR_df = pd.DataFrame(data = {"Actual": test_y, "Predicted": y_lr_predict})

In [222]:
LR_df

Unnamed: 0,Actual,Predicted
934741,1,1
917975,1,1
906955,1,1
890296,1,1
975386,0,2
...,...,...
977488,0,1
847410,2,0
16132,0,0
936028,0,1


In [223]:
model_lr.score(test_x, test_y)

0.7502414186638574

In [224]:
rfc = RandomForestClassifier()

In [225]:
model_rfr = rfc.fit(train_x, train_y)
y_rfr_predict = model_rfr.predict(test_x)

In [226]:
RFR_df = pd.DataFrame(data = {"Actual": test_y, "Predicted": y_rfr_predict})

In [227]:
RFR_df

Unnamed: 0,Actual,Predicted
934741,1,1
917975,1,1
906955,1,1
890296,1,1
975386,0,2
...,...,...
977488,0,0
847410,2,0
16132,0,0
936028,0,2


In [228]:
model_rfr.score(test_x, test_y)

0.9338951804055834

In [230]:
precision_score(test_y, y_rfr_predict, average='weighted')

0.9344489562934016

In [232]:
f1_score(test_y, y_rfr_predict, average='weighted')

0.9341260822853127

In [233]:
recall_score(test_y, y_rfr_predict, average='weighted')

0.9338951804055834

In [None]:
model_rfr.