In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
df_train = pd.read_csv('pp_data_train.csv')
df_test = pd.read_csv('pp_data_all.csv')

print(df_train.shape)
print(df_test.shape)

(145063, 65)
(289671, 64)


In [3]:
df_train['mod_lb'] = 'train'
df_test['mod_lb'] = 'test'

In [4]:
# define an encoded function

def encode_y(y, encoding):
    return y.map(encoding)

In [5]:
# encode the response variable 'injury'

encoding_y = {'Unharmed': 0, 'Possible': 1, 'Minor': 2, 'Major': 3, 'Fatal': 4}

y = encode_y(df_train['injury'], encoding_y)

In [6]:
df_train.drop(['injury'], axis = 1,inplace = True)

In [7]:
print(df_train.shape)
print(df_test.shape)

(145063, 65)
(289671, 65)


In [8]:
df_train.head()

Unnamed: 0,row_id,incident_id,vehicle_id,occupant_id,total_veh_in_inc,total_occ_in_inc,total_occ_in_veh,actual_veh_in_inc,actual_per_in_inc,actual_occ_in_veh,...,inc_speeding,veh_speed,veh_braking,veh_stability,veh_movement,veh_impact,inc_descrip,veh_descrip,veh_damage,mod_lb
0,2,2,1,1,1,2,2,1,2,2,...,No,-1,Swerving,Skidding with twist,Departed road,Not a collision,Instability,Instability,Disabling,train
1,3,2,1,2,1,2,2,1,2,2,...,No,-1,Swerving,Skidding with twist,Departed road,Not a collision,Instability,Instability,Disabling,train
2,10,4,1,1,2,3,2,2,3,2,...,No,-1,No maneuver,Wheel misalignment,Departed lane,Side left,Collision with in-transit vehicle,Collision with in-transit vehicle,Disabling,train
3,11,4,1,2,2,3,2,2,3,2,...,No,-1,No maneuver,Wheel misalignment,Departed lane,Side left,Collision with in-transit vehicle,Collision with in-transit vehicle,Disabling,train
4,12,4,2,1,2,3,1,2,3,1,...,No,-1,Braking,Skidding,Stayed in lane,Head on,Collision with in-transit vehicle,Collision with in-transit vehicle,Disabling,train


In [9]:
df_all = pd.concat([df_train, df_test], axis = 0)
print(df_all.shape)

(434734, 65)


In [1]:
# calculate the missing value rate for each column

# for col in df_all.columns:
#     print('column name: ', col)
#     print(df_all[col].isnull().sum()/len(df_all))
#     print('\n')

In [11]:
# observe datetime columns

df_all['inc_datetime'] = pd.to_datetime(df_all['inc_datetime'], infer_datetime_format = True)
df_all['inc_datetime'].head()

0   2013-09-19 10:15:00
1   2013-09-19 10:15:00
2   2011-07-23 03:00:00
3   2011-07-23 03:00:00
4   2011-07-23 03:00:00
Name: inc_datetime, dtype: datetime64[ns]

In [12]:
# a protential problem for the datetime?

df_all['inc_datetime'][3]

3   2011-07-23 03:00:00
3   2014-01-26 10:15:00
Name: inc_datetime, dtype: datetime64[ns]

In [197]:
#pd.to_datetime(df_all['veh_year'], infer_datetime_format = True)

In [13]:
df_all['veh_year'].head()

0    2004
1    2004
2    2004
3    2004
4    2003
Name: veh_year, dtype: int64

In [14]:
df_all.loc[df_all['veh_year'] == -1, 'veh_year'] = np.nan

In [15]:
df_all['inc_datetime'].dt.year.head()

0    2013.0
1    2013.0
2    2011.0
3    2011.0
4    2011.0
Name: inc_datetime, dtype: float64

In [16]:
(df_all['inc_datetime'].dt.year - df_all['veh_year']).head()

0    9.0
1    9.0
2    7.0
3    7.0
4    8.0
dtype: float64

In [209]:
#pd.to_datetime(df_all['veh_year'], format = '%Y').head()

In [210]:
#df_all['veh_year'] = pd.to_datetime(df_all['veh_year'], infer_datetime_format = True)

In [17]:
# create a new feature: veh_age_new = inc_datetime - veh_year (in month)

df_all['veh_age_new'] = (df_all['inc_datetime'].dt.year - df_all['veh_year'])
df_all['veh_age_new'].head()

0    9.0
1    9.0
2    7.0
3    7.0
4    8.0
Name: veh_age_new, dtype: float64

In [18]:
524/12

43.666666666666664

In [19]:
df_all['inc_datetime'][0]

0   2013-09-19 10:15:00
0   2014-10-14 08:45:00
Name: inc_datetime, dtype: datetime64[ns]

In [20]:
df_all['inc_datetime'][1]

1   2013-09-19 10:15:00
1   2013-09-19 10:15:00
Name: inc_datetime, dtype: datetime64[ns]

In [21]:
df_all['inc_datetime'].head()

0   2013-09-19 10:15:00
1   2013-09-19 10:15:00
2   2011-07-23 03:00:00
3   2011-07-23 03:00:00
4   2011-07-23 03:00:00
Name: inc_datetime, dtype: datetime64[ns]

In [22]:
df_all['veh_year'].head()

0    2004.0
1    2004.0
2    2004.0
3    2004.0
4    2003.0
Name: veh_year, dtype: float64

In [23]:
#numeric encoding 4 datetime columns: drv_ban1_mo, drv_ban1_yr, drv_ban2_mo, drv_ban2_yr

encoding_month = {'Jan.': 1, 'Feb.': 2, 'Mar.': 3, 'Apr.': 4, 'May': 5, 'June': 6, 'July': 7,
                 'Aug.': 8, 'Sept.': 9, 'Oct.': 10, 'Nov.': 11, 'Dec.': 12, 'No record': np.nan, 'unknown': np.nan}

df_all['drv_ban1_mo'] = encode_y(df_all['drv_ban1_mo'], encoding_month)
df_all['drv_ban2_mo'] = encode_y(df_all['drv_ban2_mo'], encoding_month)

In [24]:
encoding_year = {'2007': 2007, '2008': 2008, '2009': 2009, '2010': 2010, '2011': 2011, '2012': 2012, '2013': 2013,
                 '2014': 2014, 'No record': np.nan, 'unknown': np.nan}

df_all['drv_ban1_yr'] = encode_y(df_all['drv_ban1_yr'], encoding_year)
df_all['drv_ban2_yr'] = encode_y(df_all['drv_ban2_yr'], encoding_year)

In [123]:
#df_all['drv_ban1_yr'].head()

In [124]:
#df_all['drv_ban1_yr'].head()

In [25]:
df_all['drv_ban_day'] = 1

In [26]:
df_all[['drv_ban1_yr', 'drv_ban1_mo', 'drv_ban_day']].head()

Unnamed: 0,drv_ban1_yr,drv_ban1_mo,drv_ban_day
0,,,1
1,,,1
2,,,1
3,,,1
4,2009.0,3.0,1


In [27]:
df_1 = df_all[['drv_ban1_yr', 'drv_ban1_mo', 'drv_ban_day']]
df_1.columns = [['Year', 'Month', 'Day']]
df_1.head()

Unnamed: 0,Year,Month,Day
0,,,1
1,,,1
2,,,1
3,,,1
4,2009.0,3.0,1


In [131]:
#pd.to_datetime(df_1)

In [28]:
df_all['drv_ban1_dt'] = pd.to_datetime(df_all['drv_ban1_yr']*10000+df_all['drv_ban1_mo']*100+df_all['drv_ban_day'], format = '%Y%m%d')
df_all['drv_ban1_dt'].head()

0          NaT
1          NaT
2          NaT
3          NaT
4   2009-03-01
Name: drv_ban1_dt, dtype: datetime64[ns]

In [29]:
df_all['drv_ban2_dt'] = pd.to_datetime(df_all['drv_ban2_yr']*10000+df_all['drv_ban2_mo']*100+df_all['drv_ban_day'], format = '%Y%m%d')
df_all['drv_ban2_dt'].head()

0          NaT
1          NaT
2          NaT
3          NaT
4   2009-03-01
Name: drv_ban2_dt, dtype: datetime64[ns]

In [30]:
df_all['drv_ban1_new'] = (df_all['inc_datetime'] - df_all['drv_ban1_dt'])/np.timedelta64(1, 'M')
df_all['drv_ban1_new'].head()

0          NaN
1          NaN
2          NaN
3          NaN
4    28.719276
Name: drv_ban1_new, dtype: float64

In [31]:
df_all['drv_ban2_new'] = (df_all['inc_datetime'] - df_all['drv_ban2_dt'])/np.timedelta64(1, 'M')
df_all['drv_ban2_new'].head()

0          NaN
1          NaN
2          NaN
3          NaN
4    28.719276
Name: drv_ban2_new, dtype: float64

In [32]:
# create 3 new features for driver age: drv_age_min, drv_age_max, drv_age_avg

df_drv = df_all[df_all['occ_role'] == 'Driver']

df_1 = pd.DataFrame(df_all.groupby('incident_id')['occ_age'].min())
df_2 = pd.DataFrame(df_all.groupby('incident_id')['occ_age'].max())
df_3 = pd.DataFrame(df_all.groupby('incident_id')['occ_age'].mean())

In [33]:
df_1.head()

Unnamed: 0_level_0,occ_age
incident_id,Unnamed: 1_level_1
1,67
2,72
3,10
4,21
5,37


In [34]:
df_1.columns

Index(['occ_age'], dtype='object')

In [35]:
df_1.rename(columns = {'occ_age': 'occ_age_min'}, inplace = True)

In [36]:
df_2.rename(columns = {'occ_age': 'occ_age_max'}, inplace = True)
df_3.rename(columns = {'occ_age': 'occ_age_mean'}, inplace = True)

In [37]:
df_1.head()

Unnamed: 0_level_0,occ_age_min
incident_id,Unnamed: 1_level_1
1,67
2,72
3,10
4,21
5,37


In [38]:
df_2.head()

Unnamed: 0_level_0,occ_age_max
incident_id,Unnamed: 1_level_1
1,67
2,75
3,54
4,29
5,58


In [39]:
df_3.head()

Unnamed: 0_level_0,occ_age_mean
incident_id,Unnamed: 1_level_1
1,67.0
2,73.5
3,22.166667
4,25.0
5,47.5


In [40]:
df_1.reset_index(level = 0, inplace = True)
df_2.reset_index(level = 0, inplace = True)
df_3.reset_index(level = 0, inplace = True)

In [2]:
# df_all_new = pd.merge(df_all, df_1, left_on = 'incident_id', right_on = 'incident_id')
# df_all_new.head()

In [3]:
# df_all_new = pd.merge(df_all_new, df_2, left_on = 'incident_id', right_on = 'incident_id')
# df_all_new = pd.merge(df_all_new, df_3, left_on = 'incident_id', right_on = 'incident_id')

# df_all_new.head()

In [43]:
#df_all.columns.values

In [148]:
df_all['inc_datetime'].head()

0   2013-09-19 10:15:00
1   2013-09-19 10:15:00
2   2011-07-23 03:00:00
3   2011-07-23 03:00:00
4   2011-07-23 03:00:00
Name: inc_datetime, dtype: datetime64[ns]

In [162]:
df_all['inc_datetime'].dt.year.head()

0    2013.0
1    2013.0
2    2011.0
3    2011.0
4    2011.0
Name: inc_datetime, dtype: float64

In [163]:
df_all['inc_datetime'].dt.month.head()

0    9.0
1    9.0
2    7.0
3    7.0
4    7.0
Name: inc_datetime, dtype: float64

In [166]:
df_all['inc_datetime'].dt.hour.head()

0    10.0
1    10.0
2     3.0
3     3.0
4     3.0
Name: inc_datetime, dtype: float64

In [44]:
# create 2 new feature: inc_month_new and inc_hour_new

df_all['inc_month_new'] = df_all['inc_datetime'].dt.month
df_all['inc_hour_new'] = df_all['inc_datetime'].dt.hour

In [4]:
# df_all.tail()

In [46]:
df_all['inc_datetime'].tail()

289666   2011-07-17 10:00:00
289667   2011-07-17 10:00:00
289668   2011-07-17 10:00:00
289669   2011-07-17 10:00:00
289670   2011-07-17 10:00:00
Name: inc_datetime, dtype: datetime64[ns]

In [47]:
#df_all.columns.values

In [48]:
df_4 = pd.DataFrame(df_all.groupby('incident_id')['veh_age_new'].max())
df_4.head()

Unnamed: 0_level_0,veh_age_new
incident_id,Unnamed: 1_level_1
1,9.0
2,9.0
3,12.0
4,8.0
5,1.0


In [49]:
df_4.rename(columns = {'veh_age_new': 'veh_inc_maxage'}, inplace = True)
df_4.head()

Unnamed: 0_level_0,veh_inc_maxage
incident_id,Unnamed: 1_level_1
1,9.0
2,9.0
3,12.0
4,8.0
5,1.0


In [50]:
df_4.reset_index(level = 0, inplace = True)
df_4.head()

Unnamed: 0,incident_id,veh_inc_maxage
0,1,9.0
1,2,9.0
2,3,12.0
3,4,8.0
4,5,1.0


In [51]:
df_all = pd.merge(df_all, df_4, left_on = 'incident_id', right_on = 'incident_id')

In [52]:
df_all.columns

Index(['row_id', 'incident_id', 'vehicle_id', 'occupant_id',
       'total_veh_in_inc', 'total_occ_in_inc', 'total_occ_in_veh',
       'actual_veh_in_inc', 'actual_per_in_inc', 'actual_occ_in_veh',
       'actual_ped', 'inc_datetime', 'inc_day', 'inc_region', 'inc_daylight',
       'inc_weather', 'veh_roadcond', 'veh_roadslope', 'veh_lanes',
       'veh_lanediv', 'veh_speedlim', 'occ_age', 'occ_sex', 'occ_role',
       'occ_position', 'occ_seatbelt', 'occ_airbag', 'occ_alcohol',
       'veh_owner', 'veh_type', 'veh_style', 'veh_weight', 'veh_manu',
       'veh_year', 'veh_doors', 'veh_wheels', 'veh_drive', 'veh_fuel',
       'veh_eng1', 'veh_eng2', 'veh_trailer', 'drv_height', 'drv_weight',
       'drv_licence', 'drv_status', 'drv_crash', 'drv_pts', 'drv_dui',
       'drv_spd', 'drv_ban1_mo', 'drv_ban1_yr', 'drv_ban2_mo', 'drv_ban2_yr',
       'drv_alcohol', 'inc_type', 'inc_speeding', 'veh_speed', 'veh_braking',
       'veh_stability', 'veh_movement', 'veh_impact', 'inc_descrip',
    

In [5]:
# df_all.drop(['inc_datetime', 'veh_year', 'drv_ban1_mo', 'drv_ban1_yr', 'drv_ban2_mo', 'drv_ban2_yr', 'drv_ban_day', 'drv_ban1_dt',
#             'drv_ban2_dt'], axis = 1, inplace = True)

# df_all.head()

In [6]:
# df_all.columns

In [55]:
# further data cleaning: 1. for numerical feature replace '-1' with np.nan 2. for categorical data also check NANs

In [7]:
# for col in df_all.columns:
#     print('column name: ', col)
#     print(df_all[col].isnull().sum()/len(df_all))
#     print('\n')

In [8]:
# df_all.select_dtypes(include = ['object'])

In [9]:
# for col in df_all.select_dtypes(include = ['object']).columns:
#     print('categorical feature name: ', col)
#     print(df_all[col].value_counts(dropna = False))
#     print('\n')

In [62]:
df_all.loc[df_all['drv_status'].isnull(), 'drv_status'] = 'unknown'

In [63]:
df_all['drv_status'].value_counts(dropna = False)

Clean         367415
Penalised      30634
No licence     24952
unknown         6185
Expired         5548
Name: drv_status, dtype: int64

In [67]:
df_all.select_dtypes(exclude = ['object']).head()

Unnamed: 0,row_id,incident_id,vehicle_id,occupant_id,total_veh_in_inc,total_occ_in_inc,total_occ_in_veh,actual_veh_in_inc,actual_per_in_inc,actual_occ_in_veh,...,drv_pts,drv_dui,drv_spd,veh_speed,veh_age_new,drv_ban1_new,drv_ban2_new,inc_month_new,inc_hour_new,veh_inc_maxage
0,2,2,1,1,1,2,2,1,2,2,...,0,0,0,-1,9.0,,,9.0,10.0,9.0
1,3,2,1,2,1,2,2,1,2,2,...,0,0,0,-1,9.0,,,9.0,10.0,9.0
2,2,2,1,1,1,2,2,1,2,2,...,0,0,0,-1,9.0,,,9.0,10.0,9.0
3,3,2,1,2,1,2,2,1,2,2,...,0,0,0,-1,9.0,,,9.0,10.0,9.0
4,10,4,1,1,2,3,2,2,3,2,...,0,0,0,-1,7.0,,,7.0,3.0,8.0


In [10]:
# print out median for each numerical feature

# for col in df_all.select_dtypes(exclude = ['object']).columns:
#     print('Feature name: ', col)
#     print(df_all.loc[df_all[col] != -1, col].median())
#     print('\n')

In [11]:
# df_all.select_dtypes(exclude = ['object']).columns

In [12]:
# numerical column names

# col_num = ['total_veh_in_inc', 'total_occ_in_inc', 'total_occ_in_veh',
#        'actual_veh_in_inc', 'actual_per_in_inc', 'actual_occ_in_veh',
#        'actual_ped', 'inc_region', 'veh_lanes', 'veh_speedlim', 'occ_age',
#        'veh_doors', 'veh_wheels', 'veh_eng1', 'veh_eng2', 'drv_height',
#        'drv_weight', 'drv_crash', 'drv_pts', 'drv_dui', 'drv_spd', 'veh_speed',
#        'veh_age_new', 'drv_ban1_new', 'drv_ban2_new', 'inc_month_new',
#        'inc_hour_new', 'veh_inc_maxage']

In [75]:
df_all.loc[df_all[col_num[-1]] == -1, col_num[-1]].head()

17948   -1.0
17949   -1.0
28668   -1.0
28669   -1.0
34276   -1.0
Name: veh_inc_maxage, dtype: float64

In [76]:
for col in col_num:
    df_all.loc[df_all[col] == -1, col] = np.nan

In [13]:
# for col in col_num:
#     print('Feature name: ', col)
#     print(df_all[col].isnull().sum()/len(df_all))

In [79]:
df_all[col_num[-1]].tail()

434729    4.0
434730    4.0
434731    4.0
434732    4.0
434733    4.0
Name: veh_inc_maxage, dtype: float64

In [14]:
# for col in df_all.columns:
#     print('Feature name: ', col)
#     print(df_all[col].isnull().sum()/len(df_all))
#     print('\n')

In [83]:
# pickle the datset with categorical faetures

df_all.to_pickle('df_all_1.pkl')

In [87]:
# encode the categorical levels into numerical values:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
#df_all['occ_role'].apply(le.fit_transform)
le.fit_transform(df_all['occ_role'])

array([0, 2, 0, ..., 2, 2, 2])

In [88]:
df_all.select_dtypes(include = ['object']).apply(le.fit_transform)

TypeError: ("'<' not supported between instances of 'str' and 'float'", 'occurred at index veh_type')

In [92]:
df_all['veh_type'].value_counts(dropna = False)

Truck            226383
Passenger Car    164808
Motorcycle        33645
NaN                9898
Name: veh_type, dtype: int64

In [91]:
le.fit_transform(df_all['veh_type'])

TypeError: '<' not supported between instances of 'str' and 'float'

In [93]:
le.fit_transform(df_all['inc_type'])

array([14, 14, 14, ..., 15, 15, 15])

In [94]:
df_all['inc_type'].value_counts(dropna = False)

Road departure                    68832
unknown                           66857
Head on collision                 50223
Impact pedestrian                 42746
Crossing paths                    39679
Road departure loss of control    36241
Side collision                    32930
Turning across path               30776
Turning into path                 19137
Rear ended slowed                 15316
Rear ended stationary             13125
Rear ended braking                 5182
Impact other                       3175
Rear ended other                   3030
Side collision changing lanes      2337
Impact parked vehicle              1621
Road departure avoid collision     1206
Impact other frontal                954
Reversing                           758
Impact fixed object                 609
Name: inc_type, dtype: int64

In [95]:
df_all['veh_style'].value_counts(dropna = False)

Saloon                         98027
SUV                            91984
Utility Vehicle                72073
Sedan                          29292
Passenger Van                  22032
Cab-Tractor Unit               18392
Coupe                          17640
Hatchback                      13160
Cruiser                        12854
Sports                         10769
NaN                             9898
Box Truck                       6289
Touring                         5449
Van_Cargo                       3974
Convertible                     3595
Van_Wagon                       3097
Van_MediumDuty                  2799
Bus                             2656
School Bus                      1690
Public Service Van              1435
Utility Truck                   1387
All-terrain Vehicle             1371
Historic Style                   979
Scooter                          700
Offroad Motorbike                687
Specialised Vocational Unit      633
All-terrain compact open         552
M

In [97]:
le.fit_transform(df_all['veh_style'].astype(str))

array([28, 28, 28, ...,  9,  9,  9])

In [98]:
le.fit_transform(df_all['veh_style'])

TypeError: '<' not supported between instances of 'str' and 'float'

In [100]:
df_all['veh_type']

AttributeError: 'Series' object has no attribute 'info'

In [15]:
# df_all.select_dtypes(include = ['object'])

In [103]:
df_all.select_dtypes(include = ['object']).columns

Index(['inc_day', 'inc_daylight', 'inc_weather', 'veh_roadcond',
       'veh_roadslope', 'veh_lanediv', 'occ_sex', 'occ_role', 'occ_position',
       'occ_seatbelt', 'occ_airbag', 'occ_alcohol', 'veh_owner', 'veh_type',
       'veh_style', 'veh_weight', 'veh_manu', 'veh_drive', 'veh_fuel',
       'veh_trailer', 'drv_licence', 'drv_status', 'drv_alcohol', 'inc_type',
       'inc_speeding', 'veh_braking', 'veh_stability', 'veh_movement',
       'veh_impact', 'inc_descrip', 'veh_descrip', 'veh_damage', 'mod_lb'],
      dtype='object')

In [109]:
df_all['veh_type'].value_counts(dropna = False)

Truck            226383
Passenger Car    164808
Motorcycle        33645
NaN                9898
Name: veh_type, dtype: int64

In [110]:
encoding_dic = {'Truck': 0, 'Passenger Car': 1, 'Motorcycle': 2}

In [112]:
df_all['veh_type'].map(encoding_dic).isnull().sum()

9898

In [116]:
np.unique(df_all.loc[~df_all['veh_type'].isnull(), 'veh_type'])

array(['Motorcycle', 'Passenger Car', 'Truck'], dtype=object)

In [16]:
# df_cat = df_all.select_dtypes(include = ['object']).copy()
# df_cat

In [120]:
len(df_cat.columns)

33

In [17]:
# df_cat.columns

In [122]:
col_names = ['inc_day', 'inc_daylight', 'inc_weather', 'veh_roadcond',
       'veh_roadslope', 'veh_lanediv', 'occ_sex', 'occ_role', 'occ_position',
       'occ_seatbelt', 'occ_airbag', 'occ_alcohol', 'veh_owner', 'veh_type',
       'veh_style', 'veh_weight', 'veh_manu', 'veh_drive', 'veh_fuel',
       'veh_trailer', 'drv_licence', 'drv_status', 'drv_alcohol', 'inc_type',
       'inc_speeding', 'veh_braking', 'veh_stability', 'veh_movement',
       'veh_impact', 'inc_descrip', 'veh_descrip', 'veh_damage']
col_names

['inc_day',
 'inc_daylight',
 'inc_weather',
 'veh_roadcond',
 'veh_roadslope',
 'veh_lanediv',
 'occ_sex',
 'occ_role',
 'occ_position',
 'occ_seatbelt',
 'occ_airbag',
 'occ_alcohol',
 'veh_owner',
 'veh_type',
 'veh_style',
 'veh_weight',
 'veh_manu',
 'veh_drive',
 'veh_fuel',
 'veh_trailer',
 'drv_licence',
 'drv_status',
 'drv_alcohol',
 'inc_type',
 'inc_speeding',
 'veh_braking',
 'veh_stability',
 'veh_movement',
 'veh_impact',
 'inc_descrip',
 'veh_descrip',
 'veh_damage']

In [131]:
for col in col_names:
    levels = np.unique(df_cat.loc[~df_cat[col].isnull(), col])
    dic_levels = {levels[i]: i for i in range(0, len(levels))}
    df_cat[col] = df_cat[col].map(dic_levels)

In [130]:
np.unique(df_cat.loc[~df_cat['veh_type'].isnull(), 'veh_type'])

array(['Motorcycle', 'Passenger Car', 'Truck'], dtype=object)

In [134]:
df_cat.isnull().sum()

inc_day               0
inc_daylight          0
inc_weather           0
veh_roadcond          0
veh_roadslope         0
veh_lanediv           0
occ_sex               0
occ_role              0
occ_position          0
occ_seatbelt          0
occ_airbag            0
occ_alcohol           0
veh_owner             0
veh_type           9898
veh_style          9898
veh_weight       400005
veh_manu          20496
veh_drive             0
veh_fuel              0
veh_trailer           0
drv_licence           0
drv_status            0
drv_alcohol           0
inc_type              0
inc_speeding          0
veh_braking           0
veh_stability         0
veh_movement          0
veh_impact            0
inc_descrip           0
veh_descrip           0
veh_damage            0
mod_lb                0
dtype: int64

In [136]:
# apply numeric encoding on the whole dataset

for col in col_names:
    levels = np.unique(df_all.loc[~df_all[col].isnull(), col])
    dic_levels = {levels[i]: i for i in range(0, len(levels))}
    df_all[col] = df_all[col].map(dic_levels)

In [18]:
# df_all

In [138]:
df_all.to_pickle('df_all_2.pkl')

In [140]:
y.to_pickle('resp.pkl')