# Disclaimer: This notebook uses snippets from this [source](https://github.com/sjsumlclub/kaggle-workshop/blob/master/kaggle_workshop_answers.ipynb) as the basis for its PyTorch classes.
* The Dataset, Training Loop, and Neural Net class are taken/adapted from the source linked above.

In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader

In [2]:
# Load the raw traffic-stop records; count() reports the non-null values per column.
df = pd.read_csv("police_project.csv")
df.count()

stop_date             91741
stop_time             91741
county_name               0
driver_gender         86406
driver_age_raw        86414
driver_age            86120
driver_race           86408
violation_raw         86408
violation             86408
search_conducted      91741
search_type            3196
stop_outcome          86408
is_arrested           86408
stop_duration         86408
drugs_related_stop    91741
dtype: int64

In [3]:
# Preview the first rows to spot columns that need cleaning.
df.head()

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,2005-01-02,01:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,2005-01-18,08:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,2005-01-23,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2005-02-20,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,2005-03-14,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


# Preprocessing

* So far we have loaded the dataframe and looked at its head to see whether anything needs to be removed.
* In the next cell I remove county_name and search_type from df, drop the rows that still contain NaN, and then remove stop_outcome, so that I can iterate safely over the columns of df_clean.
* county_name is entirely NaN and search_type is mostly NaN; stop_outcome is the result (the label to predict), so it must not be treated as a feature column.


In [3]:
# Collect the columns that are (almost) entirely missing, matched by substring.
raw_columns = [
    name for name in df.columns
    if "county_name" in name or "search_type" in name
]
# Drop those columns, discard incomplete rows, then remove the label column so
# that df_clean only holds candidate feature columns.
df_clean = (
    df.drop(columns=raw_columns)
      .dropna()
      .drop(columns=["stop_outcome"])
)
df_clean.columns

Index(['stop_date', 'stop_time', 'driver_gender', 'driver_age_raw',
       'driver_age', 'driver_race', 'violation_raw', 'violation',
       'search_conducted', 'is_arrested', 'stop_duration',
       'drugs_related_stop'],
      dtype='object')

# Creating the Dataset
* in this cell we create the Dataset class for the pytorch model, adapted from the link at the top
* this code loads the CSV into a dataframe, drops all of the columns passed in as bad_columns, removes the rows that contain NaN, one-hot encodes the features, and then saves the features and class values as attributes of the class
* lines of code that were commented out were part of the original implementation

In [4]:
import torch
from torch.utils.data import Dataset
# label encoding is used
class PoliceDataset(Dataset):
    """Traffic-stop records exposed as ``(features, label)`` tensor pairs.

    The CSV is loaded, unwanted columns are dropped, rows with any missing
    value are discarded, the label column is integer-encoded and the
    remaining columns are passed through ``pd.get_dummies`` (one-hot encoding).

    Args:
        csv_file: Path of the CSV file to read.
        label_column: Name of the column holding the class to predict.
        bad_columns: Columns to drop before incomplete rows are removed.
            When ``None`` or empty, ``county_name`` and ``search_type`` are
            dropped instead.

    Attributes:
        label_map: dict mapping each integer class code to its original name.
        labels: array of integer class codes, one per kept row.
        features: 2-D array of encoded feature values, one row per kept row.
    """

    def __init__(self, csv_file, label_column="stop_outcome", bad_columns=None):
        policedata = pd.read_csv(csv_file)
        # ``None`` replaces the former mutable ``[]`` default; an empty
        # sequence still selects the default drop list, exactly as before.
        if bad_columns is None or len(bad_columns) == 0:
            bad_columns = ["county_name", "search_type"]
        # Drop the unwanted columns first so dropna() only looks at kept ones.
        df = policedata.drop(columns=bad_columns).dropna()

        # Convert the category names into integer codes and remember the mapping.
        encoded = df[label_column].astype("category")
        self.label_map = dict(enumerate(encoded.cat.categories))
        self.labels = encoded.cat.codes.values

        # Everything except the label becomes a (one-hot encoded) feature.
        self.features = pd.get_dummies(df.drop(columns=[label_column])).values

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the ``(features, label)`` tensors for ``index``."""
        # The builtin ``float`` replaces ``np.float``, which NumPy 1.24 removed.
        return (
            torch.tensor(self.features[index].astype(float), dtype=torch.float),
            # uint8 is kept for compatibility; the training loop casts to long.
            torch.tensor(self.labels[index].astype(float), dtype=torch.uint8),
        )

# Verify Dataset works
* make a dataset and check length of features and the class values

In [7]:
# Build the dataset with an empty bad_columns list (the class then drops its default columns).
police= PoliceDataset(csv_file= "police_project.csv",bad_columns=[], label_column= "stop_outcome")
# Number of feature values per sample; used as the input size of the neural net.
len(police.features[0])

5235

In [8]:
# Number of samples left after rows with missing values were dropped.
len(police.features)

86113

In [9]:
# Mapping from integer class code to the original stop_outcome name.
police.label_map

{0: 'Arrest Driver',
 1: 'Arrest Passenger',
 2: 'Citation',
 3: 'N/D',
 4: 'No Action',

# Make Neural Net here
* Feed forward net code taken from link at the top
* this lets you define the width of your layers as well as the number of layers and the dropout probability
* input size is the number of features from the dataset

In [12]:
from torch import nn as nn
class MyNeuralNet(nn.Module):
    """Configurable feed-forward classifier.

    The network is an input projection, followed by ``num_layers`` hidden
    blocks (dropout -> linear -> ReLU), followed by an output projection
    producing one score per class.

    Args:
        input_size: Number of features in each input sample.
        num_layers: Number of hidden blocks.
        num_classes: Number of output scores.
        width: Number of units in every hidden layer.
        dropout: Dropout probability used in every hidden block.
    """

    def __init__(self, input_size, num_layers=3, num_classes=6, width=1000, dropout=0.5):
        super().__init__()
        # Keep the hyperparameters around for later inspection.
        self.input_size = input_size
        self.num_layers = num_layers
        self.width = width
        self.dropout = dropout

        # Input projection first, hidden blocks next, classifier head last.
        stack = [nn.Linear(input_size, width)]
        for _ in range(num_layers):
            hidden_block = nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(width, width),
                nn.ReLU(),
            )
            stack.append(hidden_block)
        stack.append(nn.Linear(width, num_classes))

        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the whole stack and return the class scores."""
        return self.net(x)

# Create training loop method here
* this was adapted from the link at the top and has been turned into a function so that different hyperparameters can be tested easily

In [17]:
from tqdm import tqdm_notebook as tqdm
def train(police_data, learning_rate, epochs, net_loop, batch_size=32):
    """Train ``net_loop`` on ``police_data`` and return the last epoch's accuracy.

    Args:
        police_data: Dataset yielding ``(features, label)`` pairs.
        learning_rate: Learning rate passed to the Adam optimizer.
        epochs: Number of full passes over ``police_data``; must be at least 1.
        net_loop: Network to train. It is moved to the GPU when one is
            available and its parameters are updated in place.
        batch_size: Number of samples per batch.

    Returns:
        float: fraction of samples classified correctly during the final
        epoch, measured on the training batches as they were processed.

    Raises:
        ValueError: if ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    # tqdm.auto supersedes the deprecated tqdm_notebook; progress bars are
    # optional, so fall back to plain iteration when tqdm is not installed.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    my_dataloader = DataLoader(police_data, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    my_net = net_loop.to(device)
    # Make sure dropout is active even if the caller left the net in eval mode.
    my_net.train()

    optimizer = torch.optim.Adam(my_net.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Accuracy of every epoch, in order.
    accuracies = []

    for _ in tqdm(range(epochs), desc="Epoch"):
        # A tensor accumulator keeps the count on the device and stays valid
        # even when the loader yields no batches.
        correct_this_epoch = torch.zeros((), dtype=torch.long, device=device)

        for data, labels in tqdm(my_dataloader, desc="Batch", leave=False):
            data = data.to(device)
            # CrossEntropyLoss expects int64 class indices.
            labels = labels.to(device).long()

            outputs = my_net(data)
            loss = criterion(outputs, labels)

            # Standard update: clear old gradients, backpropagate, step.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            _, preds = torch.max(outputs, dim=1)
            correct_this_epoch += torch.sum(preds == labels)

        # Divide by the size of the dataset actually trained on, not by the
        # unrelated notebook-level ``police`` dataset.
        accuracy_this_epoch = correct_this_epoch.double() / len(police_data)
        print(accuracy_this_epoch)
        accuracies.append(accuracy_this_epoch.item())
    return accuracies[-1]

# Finding Best Accuracy with all columns intact
* this cell loops over the learning rate and epochs to see which gives the best accuracy
* save result in a dictionary
* start with the dataset with no extra columns removed, aside from county_name and search_type, which are mostly missing values

In [18]:
# Baseline run: only the two (almost) empty columns are dropped.
police_data = PoliceDataset(csv_file="police_project.csv", bad_columns=["county_name", "search_type"], label_column="stop_outcome")
layers = 7
width = 1000
lr = [.001, .002, .003]
epochs = [1, 5, 10]
results = {}
# The best configuration is tracked across the WHOLE grid, so these are
# initialised once, outside the loops (they used to be reset every iteration).
highest_acc = 0
key_actual = ""
for rate in tqdm(lr, desc="Iter"):
    for epoch in tqdm(epochs, desc="SubIter", leave=False):
        # Build a fresh network per configuration: reusing one instance would
        # let later runs continue from weights trained by earlier runs.
        net_loop = MyNeuralNet(
            input_size=len(police_data.features[0]),
            num_layers=layers,
            num_classes=len(np.unique(police_data.labels)),
            width=width,
        )
        temp = train(police_data, rate, epoch, net_loop)
        key = f"Removed: none, lr: {rate}, epoch: {epoch}"
        print(key+" "+f"{temp}")
        # Record every configuration, not only the last one.
        results[key] = temp
        if temp > highest_acc:
            highest_acc = temp
            key_actual = key
results

HBox(children=(IntProgress(value=0, description='Iter', max=3, style=ProgressStyle(description_width='initial'…

HBox(children=(IntProgress(value=0, description='SubIter', max=3, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Batch', max=2692, style=ProgressStyle(description_width='init…

tensor(0.8915, dtype=torch.float64)

Removed: none, lr: 0.001, epoch: 1 0.8914565745009464


HBox(children=(IntProgress(value=0, description='Epoch', max=5, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Batch', max=2692, style=ProgressStyle(description_width='init…

tensor(0.8905, dtype=torch.float64)


HBox(children=(IntProgress(value=0, description='Batch', max=2692, style=ProgressStyle(description_width='init…

tensor(0.8909, dtype=torch.float64)


HBox(children=(IntProgress(value=0, description='Batch', max=2692, style=ProgressStyle(description_width='init…

KeyboardInterrupt: 

# Training loop for column removal
* Remove all columns one at a time
* save all accuracies in the same dictionary as the other loop
* finally print all of the dictionary

In [None]:
# Hyperparameter grid shared by every column-removal experiment.
layers = 7
width = 1000
lr = [.001, .002, .003]
epochs = [1, 5, 10]
for col in tqdm(df_clean.columns, desc="columns iterated through"):
    # search_type is always dropped below, so it is never removed "again".
    if "search_type" in col:
        continue
    police_data = PoliceDataset(csv_file="police_project.csv", bad_columns=["county_name", "search_type", col], label_column="stop_outcome")
    for rate in tqdm(lr, desc="Iter", leave=False):
        for epoch in tqdm(epochs, desc="SubIter", leave=False):
            # Build a fresh network per configuration so that runs do not
            # inherit weights trained under a previous configuration.
            net_loop = MyNeuralNet(
                input_size=len(police_data.features[0]),
                num_layers=layers,
                num_classes=len(np.unique(police_data.labels)),
                width=width,
            )
            key = f"Removed: {col}, lr: {rate}, epoch: {epoch}"
            # Store every accuracy; previously only the last configuration of
            # each column survived. The best run is max(results, key=results.get).
            results[key] = train(police_data, rate, epoch, net_loop)
results

HBox(children=(IntProgress(value=0, description='columns iterated through', max=12, style=ProgressStyle(descri…

HBox(children=(IntProgress(value=0, description='Iter', max=3, style=ProgressStyle(description_width='initial'…

HBox(children=(IntProgress(value=0, description='SubIter', max=3, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Batch', max=2692, style=ProgressStyle(description_width='init…

tensor(0.8912, dtype=torch.float64)



HBox(children=(IntProgress(value=0, description='Epoch', max=5, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Batch', max=2692, style=ProgressStyle(description_width='init…

tensor(0.8907, dtype=torch.float64)


HBox(children=(IntProgress(value=0, description='Batch', max=2692, style=ProgressStyle(description_width='init…