# Data Preparation

## Data Load

In [1]:
% matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from scipy import stats

In [None]:
# Load the accident and vehicle CSV files from the working directory.
acc = pd.read_csv('./Accident15.csv')
veh = pd.read_csv('./Vehicle15.csv')

In [None]:
# Attach the accident attributes to the vehicle records. The left join keeps
# every vehicle row; the join key only needs to be named once.
df = pd.merge(veh, acc, how='left', on='Accident_Index')

In [None]:
# Column names of the merged frame.
# Other quick checks used while exploring:
#   df.shape, df.head(), df.ACC_VEH_ID.nunique(), df.Accident_Index.nunique()
df.columns.tolist()

In [None]:
# Shape of the merged frame as (rows, columns).
df.shape
#df.head()
#df.duplicated()


## Data Cleansing

#### Replace 'Unknown' and 'null' with nan

In [None]:
# Treat the literal markers 'null' and 'Unknown' as missing values (NaN).
df = df.replace(['null', 'Unknown'], np.nan)

# Number of missing values per column.
df.isnull().sum()

#### Remove nan values from data set

In [None]:
# Delete every observation that is missing any of the fields used later on.

#df.apply(pd.Series.nunique)    #number of distinct values

required_columns = [
    'Vehicle_Type',
    'Sex_of_Driver',
    'Age_of_Driver',
    'Engine_Capacity',
    'Age_of_Vehicle',
    'Time',
    'Road_Type',
    'Weather_Conditions',
    'Road_Surface_Conditions',
    'Towing_Ind',
    'Vehicle_Manoeuvre',
    'Skidding_and_Overturning',
    'Point_of_Impact',
]

# dropna(subset=...) removes the same rows in one pass that the thirteen
# chained notnull() filters did. The explicit copy makes df an independent
# frame, so adding columns later does not raise SettingWithCopyWarning.
df = df.dropna(subset=required_columns).copy()

df.isnull().sum()              #number of nan values

#### Create Target Feature

In [None]:
# Create the binary target: 1 when Accident_Severity is 'Fatal' or 'Serious', else 0.
# Work on an explicit copy rather than setting the `is_copy` attribute, which
# was deprecated and later removed from pandas.
df = df.copy()
df['Target_ind'] = df['Accident_Severity'].isin(['Fatal', 'Serious']).astype(int)
# Alternative definition (fatal accidents only):
#df['Target_ind'] = (df.Accident_Severity.isin (['Fatal'])).astype(int)

In [None]:
# Column names after adding the target flag.
df.columns.tolist()

## Data Understanding

### Columns to graph

In [None]:
# Columns to graph. Age_of_Driver, Engine_Capacity, Age_of_Vehicle, Date and
# Time are left out because they have too many distinct values for a bar chart.
columns = [
    'Vehicle_Type', 'Sex_of_Driver',
    'Towing_Ind', 'Vehicle_Manoeuvre', 'Skidding_and_Overturning', 'Point_of_Impact',
    'Accident_Severity',
    'Month', 'Day_of_Week', 'Weekend_Ind',
    'Road_Type', 'Speed_limit',
    'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions',
    'Urban_or_Rural',
]

### Frequency Charts

In [None]:
# One bar chart of category frequencies per selected column.
for col_name in columns:
    fig, ax = plt.subplots()
    counts = df[col_name].value_counts()
    counts.plot(kind='bar', title=col_name, ax=ax)

### 100% Stacked Bar Chart: Target vs Non Target

In [None]:
# 100% stacked bars: for every category of each column, the share of
# non-target (red) vs target (green) records.
for col_name in columns:
    compare = pd.crosstab(df[col_name], df.Target_ind.astype(bool))
    # Divide each row by its total so every bar sums to 1.
    shares = compare.div(compare.sum(axis=1).astype(float), axis=0)
    # rot=90 sets vertical tick labels on this chart's own axes; a single
    # plt.xticks call after the loop only affected the last figure.
    # The title says which column each chart belongs to.
    shares.plot(kind='bar', stacked=True, color=["red", "green"],
                rot=90, title=col_name)

# plt.show must be called; referencing it without parentheses does nothing.
plt.show()

### Convert strings to numerics

In [None]:
# Express the accident time as minutes elapsed since 00:00.
time = pd.DatetimeIndex(df['Time'])
df['Minutes_since_midnight'] = (60 * time.hour + time.minute).astype(int)

# Integer versions of the fields that were read as strings
# (Engine_Capacity is converted in place; the others get a new column).
for target_col, source_col in [
    ('Driver_Age', 'Age_of_Driver'),
    ('Vehicle_Age', 'Age_of_Vehicle'),
    ('Engine_Capacity', 'Engine_Capacity'),
    ('Speed_Limit', 'Speed_limit'),
]:
    df[target_col] = df[source_col].astype(int)

### Boxplots for numeric variables

In [None]:
# Boxplots of each numeric variable, split by the target flag.
# Vehicle age
df.boxplot(by='Target_ind', column='Vehicle_Age')
# Driver age
df.boxplot(by='Target_ind', column='Driver_Age')
# Engine capacity
df.boxplot(by='Target_ind', column='Engine_Capacity')
# Time of day (minutes since midnight)
df.boxplot(by='Target_ind', column='Minutes_since_midnight')

## Feature Creation

In [None]:
# 80th percentile of engine capacity.
np.percentile(df['Engine_Capacity'].values, q=80)

In [None]:
# Distinct values of a categorical column; used to decide which indicators to create.
df['Skidding_and_Overturning'].unique()

### Create New Variables

In [None]:
# Build the 0/1 indicator, band and decile features used for modelling.


def _band_flag(values, lower=None, upper=None):
    """Return a 0/1 Series flagging rows where lower < value <= upper.

    A bound of None leaves that side of the interval open.
    """
    mask = pd.Series(True, index=values.index)
    if lower is not None:
        mask &= values > lower
    if upper is not None:
        mask &= values <= upper
    return mask.astype(int)


# Indicators for categoricals: (new column, source column, values that set the flag).
category_flags = [
    #Driver Gender
    ('Male_ind', 'Sex_of_Driver', ['Male']),
    #Urban vs Rural
    ('Urban_ind', 'Urban_or_Rural', ['Urban']),
    #Weather Conditions
    ('Fine_Weather_ind', 'Weather_Conditions', ['Fine no high winds', 'Fine & high winds']),
    #Road Conditions
    ('Dry_Road_ind', 'Road_Surface_Conditions', ['Dry']),
    #Light Conditions
    ('Daylight_ind', 'Light_Conditions', ['Daylight']),
    #Month -- used Irish definition for season.
    # Autumn previously reused the summer months, so it duplicated Summer_ind
    # and Aug-Oct belonged to no season.
    # NOTE(review): assumes the month labels are 'Aug', 'Sep', 'Oct' like the
    # other three-letter abbreviations -- confirm against df.Month.unique().
    ('Spring_ind', 'Month', ['Feb', 'Mar', 'Apr']),
    ('Summer_ind', 'Month', ['May', 'Jun', 'Jul']),
    ('Autumn_ind', 'Month', ['Aug', 'Sep', 'Oct']),
    ('Winter_ind', 'Month', ['Jan', 'Nov', 'Dec']),
    #Vehicle Type
    ('Veh_Car_ind', 'Vehicle_Type', ['Car']),
    ('Veh_Motorcycle_ind', 'Vehicle_Type', ['Motorcycle']),
    ('Veh_Van_ind', 'Vehicle_Type', ['Van']),
    ('Veh_Truck_ind', 'Vehicle_Type', ['Truck']),
    ('Veh_Bus_ind', 'Vehicle_Type', ['Bus']),
    ('Veh_Agri_ind', 'Vehicle_Type', ['Agri_vehicle']),
    ('Veh_Other_ind', 'Vehicle_Type', ['Other']),
    #Road Type
    ('Rd_Dual_Crgwy_ind', 'Road_Type', ['Dual carriageway']),
    ('Rd_Sngl_Crgwy_ind', 'Road_Type', ['Single carriageway']),
    ('Rd_Rndabt_ind', 'Road_Type', ['Roundabout']),
    ('Rd_One_Way_St_ind', 'Road_Type', ['One way street']),
    ('Rd_Slip_Road_ind', 'Road_Type', ['Slip road']),
    #Day of Week
    ('Monday_ind', 'Day_of_Week', ['Monday']),
    ('Tuesday_ind', 'Day_of_Week', ['Tuesday']),
    ('Wednesday_ind', 'Day_of_Week', ['Wednesday']),
    ('Thursday_ind', 'Day_of_Week', ['Thursday']),
    ('Friday_ind', 'Day_of_Week', ['Friday']),
    ('Saturday_ind', 'Day_of_Week', ['Saturday']),
    ('Sunday_ind', 'Day_of_Week', ['Sunday']),
    #Point of impact
    ('POI_Nearside_ind', 'Point_of_Impact', ['Nearside']),
    ('POI_Front_ind', 'Point_of_Impact', ['Front']),
    ('POI_Back_ind', 'Point_of_Impact', ['Back']),
    ('POI_Offside_ind', 'Point_of_Impact', ['Offside']),
    ('POI_No_Impact_ind', 'Point_of_Impact', ['Did not impact']),
    #Towing
    ('Towing_ind', 'Towing_Ind', ['Y']),
    #Vehicle manoeuvre
    ('VM_Parked_ind', 'Vehicle_Manoeuvre', ['Parked']),
    ('VM_Reversing_ind', 'Vehicle_Manoeuvre', ['Reversing']),
    ('VM_Slowing_ind', 'Vehicle_Manoeuvre', ['Slowing or stopping']),
    ('VM_Moving_off_ind', 'Vehicle_Manoeuvre', ['Moving off']),
    ('VM_U_Turn_ind', 'Vehicle_Manoeuvre', ['U-turn']),
    ('VM_Turning_Left_ind', 'Vehicle_Manoeuvre', ['Turning left']),
    ('VM_Turning_Right_ind', 'Vehicle_Manoeuvre', ['Turning right']),
    ('VM_Waiting_ind', 'Vehicle_Manoeuvre',
     ['Waiting to turn right', 'Waiting to go held up', 'Waiting to turn left']),
    ('VM_Changing_lane_ind', 'Vehicle_Manoeuvre',
     ['Changing lane to left', 'Changing lane to right']),
    ('VM_Overtaking_ind', 'Vehicle_Manoeuvre',
     ['Overtaking nearside', 'Overtaking moving vehicle offside',
      'Overtaking static vehicle offside']),
    ('VM_Going_Ahead_ind', 'Vehicle_Manoeuvre',
     ['Going ahead other', 'Going ahead left-hand bend', 'Going ahead right-hand bend']),
    #Skidding and Overturning
    ('Skidded_ind', 'Skidding_and_Overturning', ['Skidded']),
    ('Jackknife_and_Overturn_ind', 'Skidding_and_Overturning', ['Jackknifed and overturned']),
    ('Jackknifed_ind', 'Skidding_and_Overturning', ['Jackknifed']),
    ('Overturned_ind', 'Skidding_and_Overturning', ['Overturned']),
    ('Skid_and_Overturn_ind', 'Skidding_and_Overturning', ['Skidded and overturned']),
]
for new_col, source_col, flag_values in category_flags:
    df[new_col] = df[source_col].isin(flag_values).astype(int)

#Speed Limit
# Compare against the integer Speed_Limit column created in the conversion
# cell. Matching the raw Speed_limit column against string literals yields
# all-zero flags whenever that column holds numbers rather than strings.
for limit in (10, 20, 30, 40, 50, 60, 70):
    df['SL_%d_ind' % limit] = (df['Speed_Limit'] == limit).astype(int)

# Deciles (1-10) and fixed bands for the numeric variables.
# Each band is (new column, exclusive lower bound, inclusive upper bound).
binned_features = [
    #Bin Driver Age
    ('Dr_Age_decile', 'Driver_Age', [
        ('Dr_Age_Up_to_25_ind', None, 25),
        ('Dr_Age_26_to_34_ind', 25, 34),
        ('Dr_Age_35_to_44_ind', 34, 44),
        ('Dr_Age_44_to_54_ind', 44, 54),   # covers ages 45-54; name kept for downstream cells
        ('Dr_Age_55_plus_ind', 54, None),
    ]),
    #Bin Vehicle Age
    ('Veh_Age_decile', 'Vehicle_Age', [
        ('Veh_Age_0_to_3_ind', None, 3),
        ('Veh_Age_4_to_6_ind', 3, 6),
        ('Veh_Age_7_to_9_ind', 6, 9),      # lower bound was 7, which left age 7 in no band
        ('Veh_Age_10_to_12_ind', 9, 12),
        ('Veh_Age_13_plus_ind', 12, None),
    ]),
    #Bin Engine Capacity
    ('Eng_Cap_decile', 'Engine_Capacity', [
        ('Eng_Cap_0_to_1229_ind', None, 1229),
        ('Eng_Cap_1230_to_1497_ind', 1229, 1497),
        ('Eng_Cap_1498_to_1794_ind', 1497, 1794),
        ('Eng_Cap_1795_to_1997_ind', 1794, 1997),
        ('Eng_Cap_1998_plus_ind', 1997, None),
    ]),
]
for decile_col, source_col, bands in binned_features:
    df[decile_col] = pd.qcut(df[source_col], 10, labels=False) + 1
    for band_col, lower, upper in bands:
        df[band_col] = _band_flag(df[source_col], lower, upper)

#Age Gender Interactions
age_bands = ['Up_to_25', '26_to_34', '35_to_44', '44_to_54', '55_plus']
for band in age_bands:
    df['Male_Dr_Age_%s_ind' % band] = (
        (df['Dr_Age_%s_ind' % band] == 1) & (df['Male_ind'] == 1)).astype(int)
for band in age_bands:
    df['Female_Dr_Age_%s_ind' % band] = (
        (df['Dr_Age_%s_ind' % band] == 1) & (df['Male_ind'] == 0)).astype(int)

#Time of day
df['Mins_since_12_decile'] = pd.qcut(df['Minutes_since_midnight'], 10, labels=False) + 1

#columns
list(df)

In [None]:
# Number of records in the largest engine-capacity band.
df['Eng_Cap_1998_plus_ind'].sum()

#### Remove Categorical Columns and ID column

In [None]:
# Keep only the target and the engineered numeric features; the raw
# categorical columns are dropped.
# ACC_VEH_ID is deliberately left out: it would only be needed to identify
# individual records when scoring the overall population.
keep_columns = [
    'Target_ind',
    'Weekend_Ind',
    'Male_ind', 'Urban_ind', 'Fine_Weather_ind', 'Dry_Road_ind', 'Daylight_ind',
    'Spring_ind', 'Summer_ind', 'Autumn_ind', 'Winter_ind',
    'Veh_Car_ind', 'Veh_Motorcycle_ind', 'Veh_Van_ind', 'Veh_Truck_ind',
    'Veh_Bus_ind', 'Veh_Agri_ind', 'Veh_Other_ind',
    'Rd_Dual_Crgwy_ind', 'Rd_Sngl_Crgwy_ind', 'Rd_Rndabt_ind',
    'Rd_One_Way_St_ind', 'Rd_Slip_Road_ind',
    'Monday_ind', 'Tuesday_ind', 'Wednesday_ind', 'Thursday_ind',
    'Friday_ind', 'Saturday_ind', 'Sunday_ind',
    'POI_Nearside_ind', 'POI_Front_ind', 'POI_Back_ind',
    'POI_Offside_ind', 'POI_No_Impact_ind',
    'Towing_ind',
    'VM_Parked_ind', 'VM_Reversing_ind', 'VM_Slowing_ind', 'VM_Moving_off_ind',
    'VM_U_Turn_ind', 'VM_Turning_Left_ind', 'VM_Turning_Right_ind',
    'VM_Waiting_ind', 'VM_Changing_lane_ind', 'VM_Overtaking_ind',
    'VM_Going_Ahead_ind',
    'Skidded_ind', 'Jackknife_and_Overturn_ind', 'Jackknifed_ind',
    'Overturned_ind', 'Skid_and_Overturn_ind',
    'Dr_Age_decile', 'Eng_Cap_decile', 'Veh_Age_decile', 'Mins_since_12_decile',
    'Dr_Age_Up_to_25_ind', 'Dr_Age_26_to_34_ind', 'Dr_Age_35_to_44_ind',
    'Dr_Age_44_to_54_ind', 'Dr_Age_55_plus_ind',
    'Veh_Age_0_to_3_ind', 'Veh_Age_4_to_6_ind', 'Veh_Age_7_to_9_ind',
    'Veh_Age_10_to_12_ind', 'Veh_Age_13_plus_ind',
    'Eng_Cap_0_to_1229_ind', 'Eng_Cap_1230_to_1497_ind',
    'Eng_Cap_1498_to_1794_ind', 'Eng_Cap_1795_to_1997_ind',
    'Eng_Cap_1998_plus_ind',
    'SL_10_ind', 'SL_20_ind', 'SL_30_ind', 'SL_40_ind',
    'SL_50_ind', 'SL_60_ind', 'SL_70_ind',
    'Speed_Limit',
    'Male_Dr_Age_Up_to_25_ind', 'Male_Dr_Age_26_to_34_ind',
    'Male_Dr_Age_35_to_44_ind', 'Male_Dr_Age_44_to_54_ind',
    'Male_Dr_Age_55_plus_ind',
    'Female_Dr_Age_Up_to_25_ind', 'Female_Dr_Age_26_to_34_ind',
    'Female_Dr_Age_35_to_44_ind', 'Female_Dr_Age_44_to_54_ind',
    'Female_Dr_Age_55_plus_ind',
]
df = df[keep_columns].copy()

#### Tests to ensure it's copied correctly

In [None]:
# Column names of the reduced frame (df.head() and df.shape are also useful here).
df.columns.tolist()

#### Check that all columns are numerics

In [None]:
# Sanity check: list any column whose dtype is not one of the numeric dtypes
# below. An empty list means every remaining column is numeric.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

newdf = df.select_dtypes(exclude=numerics)

newdf.columns.tolist()

# Modelling

## Creating Training and Test Sets

In [None]:
# Random 70% sample (fixed seed) for training; every row not sampled goes
# into the blind test set.
train = df.sample(frac=0.7, random_state=200)
test = df.drop(train.index, axis=0)

# Optional checks:
#   len(train), len(test)                            -> split sizes
#   train.Target_ind.sum(), test.Target_ind.sum()    -> target share should be roughly equal

#### Undersampling Training Set

In [None]:
# Undersample the training set because the target is a rare event.
# The masks are built from `train` itself. Building them from `df` only worked
# through implicit index alignment between two different frames.

# Step 1: all training rows with Target_ind == 1.
df4 = train.loc[train['Target_ind'] == 1]

# Step 2: an equally sized random sample of training rows with Target_ind == 0.
# (Replace n=len(df4) with frac=.25 to undersample to a quarter instead.)
df5 = train.loc[train['Target_ind'] == 0].sample(n=len(df4), random_state=17)

# Step 3: combine both into the final training set.
final_train = pd.concat([df4, df5])
len(final_train)

### Input Variables

In [None]:
# Candidate model inputs. The column labels of this empty frame serve as the
# feature-name list: list(data_cols) returns the names in this order.
# Candidates currently excluded: the season and weekday flags, the driver-age,
# vehicle-age and engine-capacity band flags, the SL_* speed-limit flags and
# the age-by-gender interaction flags.
data_cols = pd.DataFrame(columns=[
    'Weekend_Ind', 'Male_ind', 'Urban_ind',
    'Fine_Weather_ind', 'Dry_Road_ind', 'Daylight_ind',
    'Veh_Car_ind', 'Veh_Motorcycle_ind', 'Veh_Van_ind', 'Veh_Truck_ind',
    'Veh_Bus_ind', 'Veh_Agri_ind', 'Veh_Other_ind',
    'Rd_Dual_Crgwy_ind', 'Rd_Sngl_Crgwy_ind', 'Rd_Rndabt_ind',
    'Rd_One_Way_St_ind', 'Rd_Slip_Road_ind',
    'POI_Nearside_ind', 'POI_Front_ind', 'POI_Back_ind',
    'POI_Offside_ind', 'POI_No_Impact_ind',
    'Towing_ind',
    'VM_Parked_ind', 'VM_Reversing_ind', 'VM_Slowing_ind', 'VM_Moving_off_ind',
    'VM_U_Turn_ind', 'VM_Turning_Left_ind', 'VM_Turning_Right_ind',
    'VM_Waiting_ind', 'VM_Changing_lane_ind', 'VM_Overtaking_ind',
    'VM_Going_Ahead_ind',
    'Skidded_ind', 'Jackknife_and_Overturn_ind', 'Jackknifed_ind',
    'Overturned_ind', 'Skid_and_Overturn_ind',
    'Dr_Age_decile', 'Eng_Cap_decile', 'Veh_Age_decile', 'Mins_since_12_decile',
    'Speed_Limit',
])



### Training and Test splits and sklearn imports

In [None]:
#import useful functions from sklearn
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; the same helpers live in sklearn.model_selection. The fallback keeps
# the cell working on older installations.
try:
    from sklearn.model_selection import train_test_split  #didn't use because it doesn't accommodate undersampling
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn import linear_model as lm
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn import svm
from patsy import dmatrices
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn import calibration

# DataFrame.as_matrix was removed from pandas, and it was being called with a
# nested list of columns. Selecting the columns and taking .values gives the
# same 2-D array on old and new pandas alike.
feature_names = list(data_cols)

# Training inputs (2-D) and target (1-D) from the undersampled training set.
data_train = final_train[feature_names].values
target_train = final_train['Target_ind'].values

# Blind-test inputs (2-D) and target (1-D).
data_test = test[feature_names].values
target_test = test['Target_ind'].values

In [None]:
# Share of the blind test set with a serious accident (mean of the 0/1 target).
check = target_test.mean()
print(check)

## Feature Selection: Logistic Regression

### Recursive Feature Elimination: Logistic Regression

In [None]:
# Recursive feature elimination with a logistic-regression estimator,
# keeping 20 features.
model_RFE_lr = lm.LogisticRegression()
rfe_lr = RFE(estimator=model_RFE_lr, n_features_to_select=20).fit(data_train, target_train)

### Recursive Feature Elimination Cross Validated: Logistic Regression

In [None]:
# Cross-validated recursive feature elimination (10 folds, one feature
# removed per step) with a logistic-regression estimator, scored on accuracy
# ('roc_auc' is the alternative scorer).
model_RFE_lr = lm.LogisticRegression()
rfe_lr = RFECV(estimator=model_RFE_lr, step=1, cv=10, scoring='accuracy').fit(data_train, target_train)

#### Table of Columns to keep: Logistic Regression

In [None]:
# Table of candidate features with the RFE keep/drop decision and ranking.
result_lr = rfe_lr.support_    # boolean mask: True = feature selected
rank_lr = rfe_lr.ranking_      # selected features have rank 1

# Build the table directly from the three positionally aligned sequences.
# The DataFrame constructor raises if their lengths differ, whereas joining
# helper frames on a synthetic index key silently dropped unmatched rows.
keep_vars_lr = pd.DataFrame(
    {'Variable': list(data_cols), 'Keep': result_lr, 'Rank': rank_lr},
    columns=['Variable', 'Keep', 'Rank'])

pd.options.display.max_rows = 90
keep_vars_lr[:100]

## Model Creation:  Logistic Regression

#### Final Columns: Logistic Regression

In [None]:
# Features used by the final logistic-regression model. As with data_cols,
# the labels of this empty frame are used as the feature-name list.
final_cols_lr = pd.DataFrame(columns=[
    'Weekend_Ind', 'Male_ind', 'Urban_ind',
    'Fine_Weather_ind', 'Dry_Road_ind', 'Daylight_ind',
    'Veh_Car_ind', 'Veh_Motorcycle_ind', 'Veh_Van_ind', 'Veh_Truck_ind',
    'Veh_Bus_ind', 'Veh_Agri_ind',
    'Rd_Dual_Crgwy_ind', 'Rd_Sngl_Crgwy_ind', 'Rd_Rndabt_ind',
    'Rd_One_Way_St_ind', 'Rd_Slip_Road_ind',
    'POI_Nearside_ind', 'POI_Front_ind', 'POI_Back_ind',
    'POI_Offside_ind', 'POI_No_Impact_ind',
    'Towing_ind',
    'VM_Parked_ind', 'VM_Reversing_ind', 'VM_Slowing_ind', 'VM_Moving_off_ind',
    'VM_U_Turn_ind', 'VM_Turning_Left_ind', 'VM_Turning_Right_ind',
    'VM_Waiting_ind', 'VM_Changing_lane_ind', 'VM_Overtaking_ind',
    'VM_Going_Ahead_ind',
    'Skidded_ind', 'Jackknife_and_Overturn_ind', 'Jackknifed_ind',
    'Overturned_ind', 'Skid_and_Overturn_ind',
    'Dr_Age_decile', 'Veh_Age_decile',
    'Speed_Limit',
])

#### Final Train and Test sets: Logistic Regression

In [None]:
# Input matrices restricted to the logistic-regression feature set.
# Column selection plus .values replaces DataFrame.as_matrix, which was
# removed from pandas and was being passed a nested column list.
data_train_lr = final_train[list(final_cols_lr)].values
data_test_lr = test[list(final_cols_lr)].values

#### Model Build: Logistic Regression

In [None]:
# Fit a logistic regression with default settings on the undersampled training data.
model_lr = lm.LogisticRegression()
model_lr.fit(X=data_train_lr, y=target_train)

#### Coefficients: Logistic Regression

In [None]:
# Examine the coefficients, largest first.
# coef_ has shape (1, n_features) for a binary target; ravel() gives one
# scalar per feature. Zipping the transposed array stored 1-element arrays in
# the Coefficient column, which makes the sort unreliable.
coef = pd.DataFrame(
    {'Variable': list(final_cols_lr), 'Coefficient': model_lr.coef_.ravel()},
    columns=['Variable', 'Coefficient'])

coef.sort_values('Coefficient', ascending=False)

#### Predict on Blind Test: Logistic Regression

In [None]:
# Predicted class for every row of the blind test set.
pred_model_lr = model_lr.predict(X=data_test_lr)

# Class probabilities, one column per class.
pred_probs_model_lr = model_lr.predict_proba(X=data_test_lr)

## Feature Selection: Decision Tree

### Recursive Feature Elimination Cross Validated: Decision Tree

In [None]:
# Cross-validated recursive feature elimination (10 folds, one feature
# removed per step) with an entropy decision tree of depth 10, scored on
# accuracy ('roc_auc' is the alternative scorer).
# Fixed-size alternative: RFE(model_RFE_dt, n_features_to_select=10, step=1)
model_RFE_dt = dt(criterion='entropy', max_depth=10)
rfe_dt = RFECV(estimator=model_RFE_dt, step=1, cv=10, scoring='accuracy').fit(data_train, target_train)

#### Table of Columns to keep: Decision Tree

In [None]:
# Table of candidate features with the RFE keep/drop decision and ranking.
result_dt = rfe_dt.support_    # boolean mask: True = feature selected
rank_dt = rfe_dt.ranking_      # selected features have rank 1

# Build the table directly from the three positionally aligned sequences.
# The DataFrame constructor raises if their lengths differ, whereas joining
# helper frames on a synthetic index key silently dropped unmatched rows.
keep_vars_dt = pd.DataFrame(
    {'Variable': list(data_cols), 'Keep': result_dt, 'Rank': rank_dt},
    columns=['Variable', 'Keep', 'Rank'])

pd.options.display.max_rows = 90
keep_vars_dt[:100]

## Model Creation: Decision Tree

#### Final Columns: Decision Tree

In [None]:
# Features used by the final decision-tree model. The labels of this empty
# frame are used as the feature-name list.
final_cols_dt = pd.DataFrame(columns=[
    'Weekend_Ind', 'Male_ind', 'Urban_ind',
    'Fine_Weather_ind', 'Dry_Road_ind', 'Daylight_ind',
    'Veh_Car_ind', 'Veh_Motorcycle_ind', 'Veh_Van_ind', 'Veh_Truck_ind',
    'Rd_Dual_Crgwy_ind', 'Rd_Sngl_Crgwy_ind', 'Rd_Rndabt_ind',
    'POI_Nearside_ind', 'POI_Front_ind', 'POI_Back_ind',
    'POI_Offside_ind', 'POI_No_Impact_ind',
    'Towing_ind',
    'VM_Parked_ind', 'VM_Reversing_ind', 'VM_Slowing_ind', 'VM_Moving_off_ind',
    'VM_U_Turn_ind', 'VM_Turning_Left_ind', 'VM_Turning_Right_ind',
    'VM_Waiting_ind', 'VM_Changing_lane_ind', 'VM_Overtaking_ind',
    'VM_Going_Ahead_ind',
    'Skidded_ind', 'Overturned_ind', 'Skid_and_Overturn_ind',
    'Dr_Age_decile', 'Eng_Cap_decile', 'Veh_Age_decile', 'Mins_since_12_decile',
    'Speed_Limit',
])


#### Final Train and Test sets: Decision Tree

In [None]:
# Input matrices restricted to the decision-tree feature set.
# Column selection plus .values replaces DataFrame.as_matrix, which was
# removed from pandas and was being passed a nested column list.
data_train_dt = final_train[list(final_cols_dt)].values
data_test_dt = test[list(final_cols_dt)].values

#### Model Build: Decision Tree

In [None]:
# Fit an entropy-criterion decision tree, limited to depth 10, on the
# undersampled training data.
model_dt = dt(max_depth=10, criterion='entropy')
model_dt.fit(X=data_train_dt, y=target_train)

#### Predict on Blind Test: Decision Tree

In [None]:
# Predicted class for every row of the blind test set.
pred_model_dt = model_dt.predict(X=data_test_dt)

# Class probabilities, one column per class.
pred_probs_model_dt = model_dt.predict_proba(X=data_test_dt)

## Feature Selection: SVM

### Recursive Feature Elimination Cross Validated: SVM

In [None]:
# Cross-validated recursive feature elimination (10 folds, one feature
# removed per step) with a linear SVM (hinge loss, C=100).
# Accuracy is the scorer because roc_auc needs probabilities, which this
# estimator does not provide.
# Alternatives tried: svm.SVC(kernel='linear', C=1.0) as the estimator, and
# RFE(model_RFE_svm, n_features_to_select=8, step=1) as the selector.
model_RFE_svm = svm.LinearSVC(C=100, loss='hinge', max_iter=5000)
rfe_svm = RFECV(estimator=model_RFE_svm, step=1, cv=10, scoring='accuracy').fit(data_train, target_train)

#### Table of Columns to keep: SVM

In [None]:
# Table of candidate features with the RFE keep/drop decision and ranking.
result_svm = rfe_svm.support_    # boolean mask: True = feature selected
rank_svm = rfe_svm.ranking_      # selected features have rank 1

# Build the table directly from the three positionally aligned sequences.
# The DataFrame constructor raises if their lengths differ, whereas joining
# helper frames on a synthetic index key silently dropped unmatched rows.
keep_vars_svm = pd.DataFrame(
    {'Variable': list(data_cols), 'Keep': result_svm, 'Rank': rank_svm},
    columns=['Variable', 'Keep', 'Rank'])

pd.options.display.max_rows = 90
keep_vars_svm[:100]

## Model Creation: SVM

#### Final Columns: SVM

In [None]:
# Features used by the final SVM model. The labels of this empty frame are
# used as the feature-name list.
final_cols_svm = pd.DataFrame(columns=[
    'Urban_ind',
    'Veh_Motorcycle_ind', 'Veh_Van_ind',
    'Rd_Rndabt_ind', 'Rd_Slip_Road_ind',
    'POI_Back_ind',
    'VM_Parked_ind', 'VM_Reversing_ind', 'VM_Slowing_ind', 'VM_U_Turn_ind',
    'VM_Turning_Right_ind', 'VM_Overtaking_ind', 'VM_Going_Ahead_ind',
    'Jackknife_and_Overturn_ind',
])

#### Final Train and Test sets: SVM

In [None]:
# Input matrices restricted to the SVM feature set.
# Column selection plus .values replaces DataFrame.as_matrix, which was
# removed from pandas and was being passed a nested column list.
data_train_svm = final_train[list(final_cols_svm)].values
data_test_svm = test[list(final_cols_svm)].values

#### Model Build: SVM

In [None]:
# Linear SVM (hinge loss) wrapped in CalibratedClassifierCV so the fitted
# model exposes predict_proba. LinearSVC runs faster than the alternative
# svm.SVC(kernel='linear', max_iter=5000).
model_svm = calibration.CalibratedClassifierCV(
    svm.LinearSVC(loss='hinge', max_iter=5000))
model_svm.fit(X=data_train_svm, y=target_train)

#### Predict on Blind Test: SVM

In [None]:
# Predicted class for every row of the blind test set.
pred_model_svm = model_svm.predict(X=data_test_svm)

# Class probabilities, one column per class.
pred_probs_model_svm = model_svm.predict_proba(X=data_test_svm)

## Model Evaluation

### Evaluating the Performance of the model on the Blind Test

#### Evaluation Metrics

In [None]:
# Accuracy of each fitted model on its training matrix and on the blind test set.
for tag, fitted, train_matrix, test_matrix in [
    ('LR', model_lr, data_train_lr, data_test_lr),
    ('DT', model_dt, data_train_dt, data_test_dt),
    ('SVM', model_svm, data_train_svm, data_test_svm),
]:
    print('%s - Train Accuracy:     %f' % (tag, fitted.score(train_matrix, target_train)))
    print('%s - Test Accuracy:      %f' % (tag, fitted.score(test_matrix, target_test)))

In [None]:
# Blind-test metrics: one printed row per metric, the three models side by side.
class_preds = (pred_model_lr, pred_model_dt, pred_model_svm)
target_probs = (pred_probs_model_lr[:, 1], pred_probs_model_dt[:, 1], pred_probs_model_svm[:, 1])

aucs = [metrics.roc_auc_score(target_test, probs) for probs in target_probs]

metric_rows = [
    ('Accuracy:    ', [metrics.accuracy_score(target_test, preds) for preds in class_preds]),
    ('ROC AUC:     ', aucs),
    ('Gini:        ', [2 * auc - 1 for auc in aucs]),  # Gini = 2 * AUC - 1
    ('Recall:      ', [metrics.recall_score(target_test, preds) for preds in class_preds]),
    ('Precision:   ', [metrics.precision_score(target_test, preds) for preds in class_preds]),
]
for label, (lr_value, dt_value, svm_value) in metric_rows:
    print(label + 'LR - %f' % lr_value, ' DT - %f' % dt_value, ' SVM - %f' % svm_value)

#### ROC Curve

In [None]:
# Probability of target_ind = 1 for each model on the blind test set.
pred_probs_target_lr = model_lr.predict_proba(data_test_lr)[:, 1]
pred_probs_target_dt = model_dt.predict_proba(data_test_dt)[:, 1]
pred_probs_target_svm = model_svm.predict_proba(data_test_svm)[:, 1]

# False-positive rate, true-positive rate and thresholds per model.
fpr_lr, tpr_lr, thresholds_lr = metrics.roc_curve(target_test, pred_probs_target_lr)
fpr_dt, tpr_dt, thresholds_dt = metrics.roc_curve(target_test, pred_probs_target_dt)
fpr_svm, tpr_svm, thresholds_svm = metrics.roc_curve(target_test, pred_probs_target_svm)

# One ROC line per model.
for fpr, tpr, colour, model_name in [
    (fpr_lr, tpr_lr, 'c', 'Logistic Regression'),
    (fpr_dt, tpr_dt, 'm', 'Decision Tree'),
    (fpr_svm, tpr_svm, 'y', 'Support Vector Machine'),
]:
    plt.plot(fpr, tpr, linewidth=1.5, color=colour, label=model_name)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linewidth=1.5, color='0.5')

# Plot details. Both axes are rates on a 0-1 scale, so the x axis is
# 1 - specificity, not 100 - specificity.
plt.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0., fontsize=10)
plt.xlim(-0.02, 1.02)
plt.ylim(-0.02, 1.02)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)


#### Confusion Matrix

In [None]:
# Confusion-matrix counts and classification report per model on the blind
# test set. ravel() flattens the 2x2 matrix in the order TN, FP, FN, TP.
tn_lr, fp_lr, fn_lr, tp_lr = confusion_matrix(target_test, pred_model_lr).ravel()
tn_dt, fp_dt, fn_dt, tp_dt = confusion_matrix(target_test, pred_model_dt).ravel()
tn_svm, fp_svm, fn_svm, tp_svm = confusion_matrix(target_test, pred_model_svm).ravel()

for heading, counts, predictions in [
    ('Logistic Regression Model', (tn_lr, fp_lr, fn_lr, tp_lr), pred_model_lr),
    ('Decision Tree Model', (tn_dt, fp_dt, fn_dt, tp_dt), pred_model_dt),
    ('SVM Model', (tn_svm, fp_svm, fn_svm, tp_svm), pred_model_svm),
]:
    tn, fp, fn, tp = counts
    print(heading)
    print('TN: %d' % tn, 'FP: %d' % fp, 'FN: %d' % fn, 'TP: %d' % tp)
    print(metrics.classification_report(target_test, predictions))

### Cumulative Lift

#### Logistic Regression Cumulative Lift

In [None]:
# Decile table for the logistic-regression scores: actual outcome next to the
# predicted probability, one row per test record.
df3 = pd.DataFrame({'actual': target_test})
df2 = df3.join(pd.DataFrame({'probs': pred_probs_target_lr}), how='outer')

df2['index1'] = df2.index
# Decile 1 holds the lowest predicted probabilities, decile 10 the highest.
df2['decile_probs'] = pd.qcut(df2['probs'], 10, labels=False) + 1

actual_by_decile = df2.groupby('decile_probs')['actual']
probs_act = actual_by_decile.sum()     # actual targets per decile
dist_pop = actual_by_decile.count()    # records per decile

print(probs_act, dist_pop)
df2.groupby('decile_probs').mean()

# The cumulative lift per decile is then worked out in Excel from these
# per-decile sums and counts.

#### Decision Tree Cumulative Lift

In [None]:
# Decile table for the decision-tree scores: actual outcome next to the
# predicted probability, one row per test record.
df5 = pd.DataFrame({'actual': target_test})
df4 = df5.join(pd.DataFrame({'probs': pred_probs_target_dt}), how='outer')

# Deciles are cut on the rank of the probabilities (ties broken by order of
# appearance) because cutting the raw values gives non-unique bin edges.
prob_ranks = df4.probs.rank(method='first')
df4['decile_probs'] = pd.qcut(prob_ranks, 10, labels=False) + 1

actual_by_decile = df4.groupby('decile_probs')['actual']
probs_act_dt = actual_by_decile.sum()     # actual targets per decile
dist_pop_dt = actual_by_decile.count()    # records per decile

print(probs_act_dt, dist_pop_dt)
df4.groupby('decile_probs').mean()

# The cumulative lift per decile is then worked out in Excel from these
# per-decile sums and counts.

#### Support Vector Machine Cumulative Lift

In [None]:
# Decile table for the SVM scores: actual outcome next to the predicted
# probability, one row per test record.
df7 = pd.DataFrame({'actual': target_test})
df6 = df7.join(pd.DataFrame({'probs': pred_probs_target_svm}), how='outer')

# Deciles are cut on the rank of the probabilities (ties broken by order of
# appearance) because cutting the raw values gives non-unique bin edges.
prob_ranks = df6.probs.rank(method='first')
df6['decile_probs'] = pd.qcut(prob_ranks, 10, labels=False) + 1

actual_by_decile = df6.groupby('decile_probs')['actual']
probs_act_svm = actual_by_decile.sum()     # actual targets per decile
dist_pop_svm = actual_by_decile.count()    # records per decile

print(probs_act_svm, dist_pop_svm)
df6.groupby('decile_probs').mean()

# The cumulative lift per decile is then worked out in Excel from these
# per-decile sums and counts.

### Adjusting Thresholds

#### Check probabilities

In [None]:
# Distribution of the predicted probabilities from the logistic regression.
plt.hist(x=pred_probs_target_lr, bins=10, color='c')
plt.xlim(0, 1)
plt.ylabel('Frequency')
plt.xlabel('Predicted probabilities of Serious accident')
plt.title('Logistic Regression')

In [None]:
# Distribution of the predicted probabilities from the decision tree.
plt.hist(x=pred_probs_target_dt, bins=10, color='m')
plt.xlim(0, 1)
plt.ylabel('Frequency')
plt.xlabel('Predicted probabilities of Serious accident')
plt.title('Decision Tree')

In [None]:

# Distribution of the predicted probabilities from the calibrated SVM.
plt.hist(x=pred_probs_target_svm, bins=10, color='y')
plt.xlim(0, 1)
plt.ylabel('Frequency')
plt.xlabel('Predicted probabilities of Serious accident')
plt.title('SVM')

### Trying Different Probability Thresholds

In [None]:
# Re-classify the blind test set using a custom probability threshold per
# model: values strictly above the threshold become 1, all others 0.
from sklearn.preprocessing import binarize

# binarize expects a 2-D array and takes the threshold as a keyword argument.
# Reshape the 1-D probability vector to a single row and take that row back
# out afterwards; passing 1-D input and a positional threshold is rejected by
# current scikit-learn.
target_pred_class_lr = binarize(pred_probs_target_lr.reshape(1, -1), threshold=0.47)[0]
target_pred_class_dt = binarize(pred_probs_target_dt.reshape(1, -1), threshold=0.45)[0]
target_pred_class_svm = binarize(pred_probs_target_svm.reshape(1, -1), threshold=0.47)[0]

print(metrics.classification_report(target_test, target_pred_class_lr))
print(metrics.classification_report(target_test, target_pred_class_dt))
print(metrics.classification_report(target_test, target_pred_class_svm))