This notebook imputes the missing contraband values in the California traffic stop data set.

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, CategoricalEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import reciprocal
import os
# Show which files are available in the input directory.
print(os.listdir("../input"))

In [None]:
# Location of the cleaned California traffic-stop data.
f = '../input/CA_cleaned.csv'

# Only these columns are read from the CSV.
columns = [
    'stop_date', 'county_name', 'driver_gender', 'driver_age_raw',
    'driver_race', 'violation_raw', 'search_conducted', 'search_type_raw',
    'contraband_found', 'stop_outcome', 'is_arrested',
]

# Read the selected columns for every row of the file (no sampling happens here).
train = pd.read_csv(f, usecols=columns)

# ---- Preprocessing ----

# Gender becomes a boolean flag: True for 'M', False for anything else.
train['driver_gender'] = train['driver_gender'].eq('M')

# Parse the stop date so its calendar parts can be extracted.
train['stop_date'] = pd.to_datetime(train['stop_date'])

# Shift the year by 2009 (the first year in the data, which runs to 2016)
# so that time starts at 0; smaller magnitudes help numerical stability.
train['year'] = train['stop_date'].dt.year - 2009

# Month keeps its calendar number (Jan = 1, ..., Dec = 12) but is stored with
# object dtype rather than as a plain integer column.
train['month'] = train['stop_date'].dt.month.astype('O')

# Give stops without a county an explicit 'Missing' label.
train['county_name'] = train['county_name'].fillna('Missing')

# Numeric indicator of whether a search was conducted (flag multiplied by 1).
train['search'] = train['search_conducted'] * 1

We separate the rows with missing contraband values (~1 million rows) from the rest of the data, which will be used for training the imputer. We will use a model-based approach where we predict the probability of the contraband value being equal to 1. Then we probabilistically impute these values.

In [None]:
# Predictor columns shared by the training rows and the rows to impute.
training_columns = [
    'county_name', 'driver_gender', 'driver_age_raw', 'driver_race',
    'violation_raw', 'search_conducted', 'stop_outcome', 'is_arrested',
    'year', 'month',
]

# Rows whose contraband value is unknown are the ones to impute later.
contraband_unknown = train['contraband_found'].isna()
missing_contraband = train.loc[contraband_unknown, training_columns]

# Rows with a known contraband value form the training set; the target column
# is appended to the column list so that it is kept alongside the predictors.
training_columns.append('contraband_found')
train = train.loc[~contraband_unknown, training_columns]

# Keep only stops where a search was conducted - serves as a data reduction.
train = train[train['search_conducted'] == 1]



In [None]:
# Percentage of missing values per column in the training rows, measured BEFORE
# incomplete rows are removed. Computing it after dropna() would always report
# 0% for every column and hide how much data the drop discards.
columns = train.columns
percent_missing = train.isnull().sum() * 100 / len(train)
missing_value_df = pd.DataFrame({'column_name': columns,
                                 'percent_missing': percent_missing})

# Remove every training row that has a missing value in any remaining column.
train = train.dropna(axis=0)

# Display the pre-drop missingness table as the cell output.
missing_value_df

In [None]:
# (rows, columns) of the frame holding the stops whose contraband value is missing.
missing_contraband.shape

In [None]:
# Per-column percentage of missing values among the rows to be imputed.
columns = missing_contraband.columns
percent_missing = (
    missing_contraband.isnull().sum().mul(100).div(len(missing_contraband))
)
missing_value_df = pd.DataFrame(
    {
        'column_name': columns,
        'percent_missing': percent_missing,
    }
)
missing_value_df

Note that the dataframe I created, `missing_contraband`, consists only of stops where a search was conducted. This is easy to verify by comparing the length of the dataframe with the sum of `search_conducted`, since `search_conducted` only takes the values `1/0`. Since our data is fairly large, we will only take the traffic stops with `search_conducted = 1` as our training data, to serve as a data reduction. I don't *think* this will affect the final results in a meaningful way.

In [None]:
# True when the sum of the search flags equals the number of rows, i.e. every
# row in missing_contraband has search_conducted set.
missing_contraband.search_conducted.sum() == len(missing_contraband)

In [None]:
# Keep only the training rows where a search was conducted, then pull out the
# target as an integer series.
train = train.loc[train['search_conducted'] == True]
labels = train['contraband_found'].astype('int')

# Remove the search flag and the target from the feature frame, and remove the
# search flag from the rows to impute as well.
train.drop(columns=['search_conducted', 'contraband_found'], inplace=True)
missing_contraband.drop(columns='search_conducted', inplace=True)

Now we are going to encode the dataframe so that we can fit a model.

In [None]:
# One-hot encode both frames against the SAME category levels.
#
# Encoding the two frames independently is unsafe: a level that occurs in only
# one of them produces a column the other lacks, and drop_first=True may drop a
# different baseline level in each frame. Either problem makes the feature
# matrix of the rows to impute incompatible with the fitted model.
# The levels observed in the training data are therefore used for both frames.
categorical_cols = list(train.select_dtypes(include=['object', 'category']).columns)

# Work on copies so the categorical conversion never writes into a view of an
# earlier frame.
train = train.copy()
missing_contraband = missing_contraband.copy()

for col in categorical_cols:
    levels = pd.Categorical(train[col]).categories
    train[col] = pd.Categorical(train[col], categories=levels)
    # Levels never seen in training become NaN here and are encoded as all
    # zeros, i.e. they fall back to the baseline level.
    missing_contraband[col] = pd.Categorical(missing_contraband[col], categories=levels)

train = pd.get_dummies(train, columns=categorical_cols,
                       drop_first=True, sparse=True)
missing_contraband = pd.get_dummies(missing_contraband, columns=categorical_cols,
                                    drop_first=True, sparse=True)

# Fail loudly if the two design matrices still disagree.
assert train.columns.equals(missing_contraband.columns), \
    "train and missing_contraband must have identical encoded columns"

# NOTE(review): only `train` had rows with missing values dropped; the rows to
# impute may still contain NaN in non-categorical columns - confirm and handle
# before calling predict_proba.

In [None]:
# Base estimator. Its class_weight is overridden by the values tried in the search.
logit = LogisticRegression(class_weight='balanced', solver='liblinear')

# Search space: regularisation strength C drawn from a reciprocal (log-uniform)
# distribution on [1e-4, 1e3], with and without class balancing.
param_dist = {
    "C": reciprocal(1e-4, 1e3),
    'class_weight': ['balanced', None],
}

# 100 random draws, each scored by 5-fold cross-validated ROC AUC.
rnd_logit = RandomizedSearchCV(
    estimator=logit,
    param_distributions=param_dist,
    n_iter=100,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    random_state=1,
    verbose=2,
)

rnd_logit.fit(train, labels)

In [None]:
# Best mean cross-validated score (ROC AUC, per the `scoring` argument) found by the search.
rnd_logit.best_score_

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score.
rnd_logit.best_params_

In [None]:
# In-sample check: ROC AUC on the same rows the search was fitted on. This is
# an optimistic figure compared with the cross-validated best_score_.
y_probs = rnd_logit.predict_proba(train)[:, 1]
roc_auc_score(y_true=labels, y_score=y_probs)

In [None]:
# Predicted probability that contraband was found, for every row whose
# contraband value is missing (second column of predict_proba).
# The result keeps the index of `missing_contraband` so that each probability
# can be joined back to the stop it belongs to; a fresh 0..n-1 index would make
# that link depend on row order alone.
# The column name (including its spelling) is kept unchanged for anything that
# already reads this output.
contraband_probabilities = pd.DataFrame(
    {'prob_of_contranband': rnd_logit.predict_proba(missing_contraband)[:, 1]},
    index=missing_contraband.index,
)

Now let's send everything out as a CSV file. 

In [None]:
# Write the imputed probabilities to disk in CSV format
# (note: the output file name has no .csv extension).
contraband_probabilities.to_csv('imputed probabilities')

In [None]:
# Rebuild the un-encoded frames from the raw file (same steps as the cells at
# the top of the notebook), presumably so they can be exported as they were
# before one-hot encoding.
f = '../input/CA_cleaned.csv'

# Only these columns are read from the CSV.
columns = [
    'stop_date', 'county_name', 'driver_gender', 'driver_age_raw',
    'driver_race', 'violation_raw', 'search_conducted', 'search_type_raw',
    'contraband_found', 'stop_outcome', 'is_arrested',
]

# Read the selected columns for every row of the file (no sampling happens here).
train = pd.read_csv(f, usecols=columns)

# ---- Preprocessing ----

# Gender becomes a boolean flag: True for 'M', False for anything else.
train['driver_gender'] = train['driver_gender'].eq('M')

# Parse the stop date so its calendar parts can be extracted.
train['stop_date'] = pd.to_datetime(train['stop_date'])

# Shift the year by 2009 (the first year in the data, which runs to 2016)
# so that time starts at 0; smaller magnitudes help numerical stability.
train['year'] = train['stop_date'].dt.year - 2009

# Month keeps its calendar number (Jan = 1, ..., Dec = 12), stored as object dtype.
train['month'] = train['stop_date'].dt.month.astype('O')

# Give stops without a county an explicit 'Missing' label.
train['county_name'] = train['county_name'].fillna('Missing')

# Numeric indicator of whether a search was conducted (flag multiplied by 1).
train['search'] = train['search_conducted'] * 1

# Predictor columns shared by the training rows and the rows to impute.
training_columns = [
    'county_name', 'driver_gender', 'driver_age_raw', 'driver_race',
    'violation_raw', 'search_conducted', 'stop_outcome', 'is_arrested',
    'year', 'month',
]

# Rows whose contraband value is unknown.
contraband_unknown = train['contraband_found'].isna()
missing_contraband = train.loc[contraband_unknown, training_columns]

# Rows with a known contraband value, keeping the target column as well.
training_columns.append('contraband_found')
train = train.loc[~contraband_unknown, training_columns]

# Keep only searched stops (data reduction) and drop incomplete rows.
train = train[train['search_conducted'] == 1]
train = train.dropna(axis=0)

# Take only the relevant training samples and extract the integer target.
train = train.loc[train['search_conducted'] == True]
labels = train['contraband_found'].astype('int')

# Only the search flag is removed here; unlike the modelling cell above,
# `contraband_found` stays in `train`.
train.drop(columns=['search_conducted'], inplace=True)
missing_contraband.drop(columns='search_conducted', inplace=True)


In [None]:
# Export the training rows, the rows to impute, and the imputed probabilities,
# in that order. The last file name is the same one written earlier in the
# notebook, so that file is overwritten.
exports = [
    (train, 'impute_train.csv'),
    (missing_contraband, 'missing_contraband.csv'),
    (contraband_probabilities, 'imputed probabilities'),
]
for frame, filename in exports:
    frame.to_csv(filename)