# Kaggle Workshop


This dataset has attributes about police stops in Rhode Island. 

Can we predict the traffic stop outcome based on information about the stop?

# Load Data


First, we must load the police_project.csv with read_csv()

In [1]:
import numpy as np
import pandas as pd
import torch.nn as nn
# Read the stop records from the CSV (path is relative to the working directory) into a DataFrame.
policedata = pd.read_csv('police_project.csv')

We can check the loaded data with info()

In [2]:
# Summarise each column's non-null count and dtype to see where values are missing.
policedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91741 entries, 0 to 91740
Data columns (total 15 columns):
stop_date             91741 non-null object
stop_time             91741 non-null object
county_name           0 non-null float64
driver_gender         86406 non-null object
driver_age_raw        86414 non-null float64
driver_age            86120 non-null float64
driver_race           86408 non-null object
violation_raw         86408 non-null object
violation             86408 non-null object
search_conducted      91741 non-null bool
search_type           3196 non-null object
stop_outcome          86408 non-null object
is_arrested           86408 non-null object
stop_duration         86408 non-null object
drugs_related_stop    91741 non-null bool
dtypes: bool(2), float64(3), object(10)
memory usage: 9.3+ MB


Check the possible labels.

In [3]:
# List the distinct values of the target column, stop_outcome.
policedata["stop_outcome"].unique()

       'Arrest Passenger', 'No Action'], dtype=object)

# Extract Features and Clean Data

Now that we have the csv file loaded, we must select the features we want to examine and clean up the data.

We can select the columns with policedata[["feature1","feature2",...,"feature6"]]

In [4]:
# Keep the five predictor columns plus the target (stop_outcome, listed last),
# then preview the first rows of the selection.
df = policedata[
    [
        "driver_gender",
        "driver_age",
        "driver_race",
        "violation",
        "search_conducted",
        "stop_outcome",
    ]
]
df.head()

Unnamed: 0,driver_gender,driver_age,driver_race,violation,search_conducted,stop_outcome
0,M,20.0,White,Speeding,False,Citation
1,M,40.0,White,Speeding,False,Citation
2,M,33.0,White,Speeding,False,Citation
3,M,19.0,White,Other,False,Arrest Driver
4,F,21.0,White,Speeding,False,Citation


Let's check if there are any null values with isna().sum()

In [5]:
# Count the missing values in each selected column.
df.isna().sum()

driver_gender       5335
driver_age          5621
driver_race         5333
violation           5333
search_conducted       0
stop_outcome        5333
dtype: int64

Let's drop the incomplete entries with dropna() and check the result.

In [6]:
# Keep only the rows that have a value in every selected column,
# then confirm that no nulls remain.
df = df.dropna()
df.isna().sum()

driver_gender       0
driver_age          0
driver_race         0
violation           0
search_conducted    0
stop_outcome        0
dtype: int64

# Split x and y

We can look at the top 5 entries with head()

In [7]:
# Preview the first five rows of the cleaned frame.
df.head()

Unnamed: 0,driver_gender,driver_age,driver_race,violation,search_conducted,stop_outcome
0,M,20.0,White,Speeding,False,Citation
1,M,40.0,White,Speeding,False,Citation
2,M,33.0,White,Speeding,False,Citation
3,M,19.0,White,Other,False,Arrest Driver
4,F,21.0,White,Speeding,False,Citation


What possible outcomes are there? Check with stop_outcome.unique().

In [8]:
# Distinct outcome labels that remain after the incomplete rows were dropped.
df.stop_outcome.unique()

       'No Action'], dtype=object)

Let's split our dataframe into x and y variables.

In [9]:
# Select by column name instead of by position, so the split stays correct
# even if the list of selected columns is reordered or extended.
# x: every predictor column; y: the target column (stop_outcome).
x = df.drop(columns=["stop_outcome"])
y = df["stop_outcome"]

Use pd.get_dummies(x) to transform x into numeric values.

In [10]:
# Replace the categorical columns in x with indicator (dummy) columns.
x = pd.get_dummies(x)

Split into 80% training and 20% testing sets.

In [11]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Building SK-Learn Models

Building a random forest model

<img src="resources/randomforest.jpg" width=500>

In [12]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees, each limited to depth 2; random_state is fixed
# so repeated runs build the same forest.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)

Building a support vector machine model

<img src="resources/svm.png" width=500>

In [13]:
from sklearn.svm import SVC
# Three support vector classifiers that differ only in their kernel
# (RBF, polynomial, sigmoid). verbose=1 turns on the solver's progress output.
sv_rbf_clf = SVC(kernel='rbf', gamma='scale', verbose=1)
sv_poly_clf = SVC(kernel='poly', gamma='scale', verbose=1)
sv_sig_clf = SVC(kernel='sigmoid', gamma='scale', verbose=1)

Building a K Nearest Neighbors Model
<img src="resources/knn.png" width=300>

In [14]:
from sklearn import neighbors
# k-nearest-neighbours classifier configured to use 5 neighbours.
knn_clf = neighbors.KNeighborsClassifier(n_neighbors=5)

One way to validate the model is through the test set.

In [None]:
from sklearn import metrics
def fit_and_report(name, clf):
    """Fit ``clf`` on the training split and print its test-set metrics.

    Args:
        name: Label used as the prefix of the printed lines (e.g. "SVM RBF").
        clf: An unfitted scikit-learn style classifier.

    Returns:
        The class labels ``clf`` predicts for ``x_test``.
    """
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)  # predict class labels for the held-out samples
    print(name + " ACC:", metrics.accuracy_score(y_test, pred))
    print(" " + name + " SUMMARY:", metrics.classification_report(y_test, pred))
    return pred


rf_pred = fit_and_report("RANDOM FOREST", rf_clf)
sv_rbf_pred = fit_and_report("SVM RBF", sv_rbf_clf)
# Each SVM is scored with its *own* predictions. Previously the poly and
# sigmoid models were fitted but then evaluated with sv_rbf_clf.predict, so all
# three SVM reports showed the RBF results.
sv_poly_pred = fit_and_report("SVM POLY", sv_poly_clf)
sv_sig_pred = fit_and_report("SVM SIGMOID", sv_sig_clf)
knn_pred = fit_and_report("KNN", knn_clf)

RANDOM FOREST ACC: 0.8913081344713465


  'precision', 'predicted', average, warn_for)


 RANDOM FOREST SUMMARY:                   precision    recall  f1-score   support

   Arrest Driver       0.00      0.00      0.00       509
Arrest Passenger       0.00      0.00      0.00        74
        Citation       0.89      1.00      0.94     15351
             N/D       0.00      0.00      0.00       133
       No Action       0.00      0.00      0.00       106

        accuracy                           0.89     17223
       macro avg       0.15      0.17      0.16     17223
    weighted avg       0.79      0.89      0.84     17223

[LibSVM]SVM RBF ACC: 0.8913081344713465


  'precision', 'predicted', average, warn_for)


 SVM RBF SUMMARY:                   precision    recall  f1-score   support

   Arrest Driver       0.00      0.00      0.00       509
Arrest Passenger       0.00      0.00      0.00        74
        Citation       0.89      1.00      0.94     15351
             N/D       0.00      0.00      0.00       133
       No Action       0.00      0.00      0.00       106

        accuracy                           0.89     17223
       macro avg       0.15      0.17      0.16     17223
    weighted avg       0.79      0.89      0.84     17223

[LibSVM]

Another way is k-fold cross-validation, using either `cross_val_score` or `cross_val_predict`.
Cross-validation is useful for choosing a model and its hyperparameters.

<img src="resources/crossval.png" width=500>

In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validate on the training split. The test split must stay untouched so
# it can still give an unbiased estimate after a model has been chosen; the
# original call ran the folds on x_test / y_test instead.
scores = cross_val_score(rf_clf, x_train, y_train, cv=5, verbose=2)
# Report the mean fold accuracy with a +/- two standard deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Imbalanced Data

Let's say we have an imbalanced data set with 90% of one class.
Our model can classify everything as that class and achieve 90% accuracy.

Is this happening in our model?


In [None]:
# Count how many rows fall into each outcome class.
y.value_counts()

In [None]:
# Which classes did the random forest actually predict on the test set?
np.unique(rf_pred)

How can we fix this?

We can balance the class weights in the model.

Binary Solutions:

Increase minority examples through resample with replacement.

Decrease majority examples through resample without replacement.

Examine ROC curve instead of accuracy.

In [None]:
# Class counts again, for reference before re-weighting the classes.
y.value_counts()

In [None]:
# Rebuild the same shallow forest, this time with class_weight="balanced" to
# counter the class imbalance, then score it on the held-out test split.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
    class_weight="balanced",
)
rf_pred = rf_clf.fit(x_train, y_train).predict(x_test)
print(metrics.accuracy_score(y_test, rf_pred))

# PyTorch Time

## Define how to Load Data

A `Dataset` subclass must override 3 methods:
```
__init__()
__len__()
__getitem__(index)
```

In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np

class PoliceDataset(Dataset):
    """Map-style dataset built from the police-stop CSV.

    Reads the CSV, keeps five predictor columns plus ``stop_outcome``, drops
    rows with missing values and integer-encodes every non-numeric column.

    Args:
        csv_file: Path of the CSV file to read.
        label_column: Name of the column to use as the class label. It must
            be one of the selected columns; every other selected column
            becomes a feature.

    Attributes:
        label_map: dict mapping an integer class code to its original label.
        features: float64 array of shape (n_rows, n_features).
        labels: int64 array with one class code per row.
    """

    def __init__(self, csv_file, label_column="stop_outcome"):
        policedata = pd.read_csv(csv_file)
        # select appropriate columns
        columns = [
            "driver_gender",
            "driver_age",
            "driver_race",
            "violation",
            "search_conducted",
            "stop_outcome",
        ]
        # remove NAs; copy so later writes never touch a slice of policedata
        df = policedata[columns].dropna().copy()

        # Encode the label from label_column itself (it used to be hard-coded
        # to "stop_outcome", which ignored the parameter).
        label_categories = df[label_column].astype("category")
        self.label_map = dict(enumerate(label_categories.cat.categories))
        self.labels = label_categories.cat.codes.values.astype(np.int64)

        # Convert category names into numbers. Selecting "everything that is
        # not numeric or bool" also catches string columns on pandas versions
        # that no longer store text as dtype object.
        feature_df = df.drop(columns=[label_column])
        cat_columns = feature_df.select_dtypes(exclude=["number", "bool"]).columns
        for c in cat_columns:
            feature_df[c] = feature_df[c].astype("category").cat.codes

        # One homogeneous float array (bools become 0.0 / 1.0), so that
        # __getitem__ needs no per-sample dtype conversion.
        self.features = feature_df.astype(np.float64).values

    def __len__(self):
        """Return the number of complete rows kept from the CSV."""
        return len(self.features)

    def __getitem__(self, index):
        """Return ``(features, label)`` tensors for the row at ``index``."""
        # np.float was removed in NumPy 1.24; the arrays already have concrete
        # dtypes, so torch only needs the target dtype.
        return (
            torch.tensor(self.features[index], dtype=torch.float),
            torch.tensor(self.labels[index], dtype=torch.uint8),
        )

In [None]:
# Build the dataset from the CSV and pull out the first sample as a
# (features, label) pair.
my_dataset= PoliceDataset("police_project.csv")
features, labels = my_dataset[0]

In [None]:
from torch.utils.data import DataLoader
# Wrap the dataset in a loader that yields batches of 16 samples.
my_dataloader = DataLoader(my_dataset, batch_size=16)
my_dataloader

In [None]:
# Fetch the first batch from the loader and display its feature and label tensors.
features, labels = next(iter(my_dataloader))
features, labels

In [None]:
# Translate the batch's integer labels back into their outcome names.
[my_dataset.label_map[int(i)] for i in labels]

## Define a Model

An `nn.Module` subclass must override 2 methods:
```
__init__()
forward()
```

In [None]:
class MyNeuralNet(nn.Module):
    """Fully connected classifier with a configurable number of hidden blocks.

    The network is: ``Linear(input_size, width)``, then ``num_layers`` blocks
    of ``Dropout -> Linear(width, width) -> ReLU``, then
    ``Linear(width, num_classes)``.

    Args:
        input_size: Number of features in each input sample.
        num_layers: Number of hidden Dropout/Linear/ReLU blocks.
        num_classes: Number of output scores produced per sample.
        width: Number of units in every hidden layer.
        dropout: Dropout probability used in every hidden block.
    """

    def __init__(self, input_size, num_layers=3,num_classes= 6, width=1000, dropout=0.5):
        super().__init__()
        self.input_size = input_size
        self.num_layers=num_layers
        self.width=width
        self.dropout=dropout

        # input projection
        layers = [nn.Linear(input_size, width)]
        # hidden blocks
        layers += [
            nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(width, width),
                nn.ReLU()
            )
            for _ in range(num_layers)
        ]
        # output layer: one score per class
        layers.append(nn.Linear(width, num_classes))

        # A stray backtick here (`*layer`s`) used to make the whole cell a syntax error.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the class scores."""
        return self.net(x)

In [None]:
# Size the network from the data instead of hard-coding it. The hard-coded
# input_size=13 did not match the batches produced by PoliceDataset (one value
# per feature column), so the forward pass failed with a shape mismatch.
my_net = MyNeuralNet(
    input_size=features.shape[-1],          # number of feature columns per sample
    num_layers=5,
    num_classes=len(my_dataset.label_map),  # one output score per outcome label
)
my_net

Test it out

In [None]:
# Forward one batch through the untrained network to check that it runs end to end.
my_net(features)

## Training

Same training loop from [last workshop](https://github.com/sjsumlclub/workshop-anatomy-pytorch-project/blob/master/Anatomy%20of%20a%20PyTorch%20Project%20WORKSHOP.ipynb).

In [None]:
# Train my_net on my_dataloader and record the training accuracy of each epoch.
import torch
import torch.optim as optim
# tqdm_notebook is deprecated; tqdm.auto picks the notebook widget when one is
# available and falls back to the console bar otherwise.
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
my_net = my_net.to(device)

# instantiate an optimizer and the loss
optimizer = optim.Adam(my_net.parameters(), lr=0.001)
# referenced through torch.nn so this cell does not depend on a separate `nn` import
criterion = torch.nn.CrossEntropyLoss()

# train for n epochs. an epoch is a full iteration through our dataset
num_epochs = 10

# training accuracy of every epoch, as plain Python floats
accuracies = []

# loop over epochs
for epoch in tqdm(range(num_epochs), desc="Epoch"):
    # make sure dropout is active, even if the model was switched to eval() earlier
    my_net.train()

    # Count correct predictions as a plain int. The previous float start value
    # only became a tensor after the first batch, so `.double()` failed when
    # the loader was empty.
    correct_this_epoch = 0

    # loop over our data loader
    for data, labels in tqdm(my_dataloader, desc="Batch", leave=False):
        data = data.to(device)
        # CrossEntropyLoss expects int64 class indices
        labels = labels.to(device).long()

        # pass data through model
        outputs = my_net(data)
        # calculate the loss
        loss = criterion(outputs, labels)

        # Use our optimizer to update the network
        # 1: zero_grad our optimizer
        optimizer.zero_grad()
        # 2: run a backward pass
        loss.backward()
        # 3: make a step
        optimizer.step()

        # predicted class = index of the highest score
        _, preds = torch.max(outputs, dim=1)
        correct_this_epoch += (preds == labels).sum().item()

    accuracy_this_epoch = correct_this_epoch / len(my_dataset)
    print(accuracy_this_epoch)
    accuracies.append(accuracy_this_epoch)