# Kaggle Workshop


This dataset has attributes about police stops in Rhode Island. 

Can we predict the traffic stop outcome based on information about the stop?

# Load Data


First, we must load the police_project.csv with read_csv()

In [None]:
# import pandas as pd (a later cell calls pd.get_dummies, so the alias must be `pd`),
# then read the csv "police_project.csv" with pd.read_csv()
policedata = ?

We can check the loaded data with info()

# Extract Features and Clean Data

Now that we have the csv file loaded, we must select the features we want to examine and clean up the data.

We can select the columns with policedata[["feature1","feature2",...,"feature6"]]

In [None]:
# Keep five candidate predictors plus the label (stop_outcome) as the last column.
df = policedata[
    [
        "driver_gender",
        "driver_age",
        "driver_race",
        "violation",
        "search_conducted",
        "stop_outcome",
    ]
]
# Preview the first rows of the selection.
df.head()

Let's check if there are any null values with isna().sum()

In [None]:
df.isna().sum()

Let's drop the incomplete entries with dropna() and check the result.

In [None]:
# Remove every row that has a missing value in any of the selected columns.
df = df.?
# Re-run the null count: after dropna() every column should report 0.
df.isna().sum()

# Split x and y

We can look at the top 5 entries with head()

In [None]:
df.?()

What possible outcomes are there? Check the unique values in the stop_outcome column.

Let's split our dataframe into x and y variables.

In [None]:
# Positional split: columns 0-4 are the features, column 5 (stop_outcome) is the label.
x, y = df.iloc[:, :5], df.iloc[:, 5]

Use pd.get_dummies(x) to transform x into numeric values.

In [None]:
# One-hot encode the categorical feature columns so the models receive numeric input.
x = pd.?(x)

Split into 80% training and 20% testing sets.

In [None]:
# import scikit-learn's train/test splitting helper
from ? import ?
# perform the split: test_size=0.2 holds out 20% of the rows for testing,
# random_state=0 fixes the shuffle so the split is reproducible
x_train, x_test, y_train, y_test = ?(x, y, test_size=0.2, random_state=0)

# Building SK-Learn Models

Building a random forest model

<img src="resources/randomforest.jpg" width=500>

In [None]:
# import the RandomForestClassifier
from ? import ?

# keep the name rf_clf: the evaluation and class-imbalance cells below reuse it
rf_clf = ? # initialize RandomForestClassifier

Building a support vector machine model

<img src="resources/svm.png" width=500>

In [None]:
# import SVC model
from ? import ?

# SVC with the "rbf" kernel; verbose=1 enables verbose training output
sv_rbf_clf = SVC(kernel='rbf', gamma='scale', verbose=1)
# make other svc versions using different kernels: "poly", "sigmoid"
sv_poly_clf = ? # change kernel to "poly"
sv_sig_clf = ? # change kernel to "sigmoid"

Building a K Nearest Neighbors Model
<img src="resources/knn.png" width=300>

In [None]:
# import the K-nearest-neighbors classifier
from ? import ?

knn_clf = ? # initialize KNeighbors

One way to validate the model is through the test set.

In [None]:
from sklearn import metrics
# Every model below follows the same recipe: fit on the training split, predict
# on the held-out test split, then print the accuracy and a per-class report.
# fit
rf_clf.?(x_train, y_train)
# predict
rf_pred = rf_clf.?(x_test)
print("RANDOM FOREST ACC:", metrics.accuracy_score(y_test, rf_pred))
print(" RANDOM FOREST SUMMARY:", metrics.classification_report(y_test, rf_pred))

sv_rbf_clf.?(x_train, y_train)
sv_rbf_pred = sv_rbf_clf.?(x_test) #Predict class labels for samples in X_test.
print("SVM RBF ACC:", metrics.accuracy_score(y_test, sv_rbf_pred))
print(" SVM RBF SUMMARY:", metrics.classification_report(y_test, sv_rbf_pred))

# Optional: uncomment to evaluate the "poly" kernel.
# sv_poly_clf.?(x_train, y_train)
# sv_poly_pred = sv_poly_clf.?(x_test) #Predict class labels for samples in X_test.
# print("SVM POLY ACC:", metrics.accuracy_score(y_test, sv_poly_pred))
# print(" SVM POLY SUMMARY:", metrics.classification_report(y_test, sv_poly_pred))

# Optional: uncomment to evaluate the "sigmoid" kernel.
# sv_sig_clf.?(x_train, y_train)
# sv_sig_pred = sv_sig_clf.?(x_test) #Predict class labels for samples in X_test.
# print("SVM SIGMOID ACC:", metrics.accuracy_score(y_test, sv_sig_pred))
# print(" SVM SIGMOID SUMMARY:", metrics.classification_report(y_test, sv_sig_pred))

knn_clf.?(x_train, y_train)
knn_pred = knn_clf.?(x_test) #Predict class labels for samples in X_test.
print("KNN ACC:", metrics.accuracy_score(y_test, knn_pred))
print(" KNN SUMMARY:", metrics.classification_report(y_test, knn_pred))

Another way is k-fold cross-validation, using either `cross_val_score` or `cross_val_predict`.
Cross validation is useful for choosing a model and its hyperparameters.

<img src="resources/crossval.png" width=500>

In [None]:
# import cross_val_score
from ? import ?
# cv=5 scores the model on 5 folds; verbose=2 logs progress for each fit.
# NOTE(review): this cross-validates on the held-out test split (x_test, y_test).
# When the scores are used to choose a model or its hyperparameters, the training
# split is the usual choice so the test set stays untouched — confirm the intent.
scores = cross_val_score(?YOUR_MODEL?, x_test, y_test, cv=5, verbose=2)
# report the mean score and a spread of two standard deviations across the folds
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Imbalanced Data

Let's say we have an imbalanced data set with 90% of one class.
Our model can classify everything as that class and achieve 90% accuracy.

Is this happening in our model?


In [None]:
y.value_counts()

In [None]:
# numpy is first imported in the PyTorch section further down, so import it here
# to keep this cell runnable when the notebook is executed top to bottom.
import numpy as np

# List the distinct classes the random forest actually predicted; if only the
# majority class shows up, the accuracy above is hiding the class imbalance.
np.unique(rf_pred)

How can we fix this?

We can balance the class weights in the model.

Solutions for binary classification problems:

Increase minority examples through resample with replacement.

Decrease majority examples through resample without replacement.

Examine ROC curve instead of accuracy.

In [None]:
y.value_counts()

In [None]:
# Rebuild the forest with class_weight="balanced" so that rarer outcomes are
# weighted more heavily during training.
rf_clf = RandomForestClassifier(
    class_weight="balanced",
    random_state=0,
    max_depth=2,
    n_estimators=100,
)
# fit() returns the estimator itself, so training and prediction can be chained.
rf_pred = rf_clf.fit(x_train, y_train).predict(x_test)
print(metrics.accuracy_score(y_test, rf_pred))

# PyTorch Time

## Define how to Load Data

`Dataset` class must override 3 functions:
```
__init__()
__len__()
__getitem__(index)
```

In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np

class PoliceDataset(Dataset):
    """Dataset over the police-stops csv that returns (features, labels) pairs.

    Object-dtype columns are replaced by integer category codes; ``label_map``
    maps each integer label code back to its original string.
    """

    def __init__(self, csv_file, label_column="stop_outcome"):
        """Load and clean the csv, then encode object-dtype columns as integers.

        Args:
            csv_file: path of the csv file to read.
            label_column: name of the label column. NOTE(review): this argument
                is never read below; "stop_outcome" is hard-coded instead.
        """
        # read the csv_file
        policedata = ?
        # select appropriate columns
        df = policedata[["driver_gender","driver_age","driver_race","violation", "search_conducted", "stop_outcome"]]
        # remove NAs
        df = df.?()

        # Convert category names into numbers.
        # Save a mapping of integers to label strings first: it has to be built
        # before the loop below overwrites the strings with their codes.
        self.label_map = dict(enumerate(df['stop_outcome'].astype("category").cat.categories))
        cat_columns = df.select_dtypes(['object']).columns
        for c in cat_columns:
            # each code is the position of the value in the column's categories,
            # so the label codes line up with the keys of label_map
            df[c] = df[c].astype("category").cat.codes

        # .values converts the selected columns to numpy arrays
        self.features = ?.values
        self.labels = ?.values

    def __len__(self):
        """Return the number of samples in the dataset."""
        return ? # [DEFINE LENGTH]

    def __getitem__(self,index):
        """Return the (features, labels) pair stored at position ``index``."""
        # return the features and the labels
        return (
            ?, # return features
            ? # return labels
        )

In [None]:
# construct a PoliceDataset
my_dataset = ?
# indexing calls PoliceDataset.__getitem__, which returns a (features, labels) pair
features, labels = my_dataset[0]

In [None]:
# wrap it in a DataLoader
from ? import ?
# batch_size=16: each iteration over the loader yields up to 16 samples at once
my_dataloader = ?(?, batch_size=16)
my_dataloader

In [None]:
features, labels = next(iter(my_dataloader))
features, labels

In [None]:
# check our label mapping
[my_dataset.label_map[int(i)] for i in labels]

## Define a Model

A subclass of `nn.Module` must override 2 methods:
```
__init__()
forward()
```

In [None]:
class MyNeuralNet(nn.Module):
    def __init__(self, input_size, num_layers=3,num_classes= 6, width=1000, dropout=0.5):
        super().__init__()
        # save our attributes
        self.input_size = input_size
        self.num_layers=num_layers
        self.width=width
        self.dropout=dropout
        
        # Define the layers of our neural net
        layers = [nn.Linear(input_size, width)]
        layers += [
            nn.Sequential(
                nn.?,
                nn.?,
                nn.?
            )
            for _ in range(num_layers)
        ]
        
        layers.append(nn.Linear(width, num_classes))

        self.net = nn.Sequential(*layer`s)
        
    def forward(self, x):
        return ?

In [None]:
# input_size must equal the length of one feature row returned by the dataset.
# NOTE(review): PoliceDataset keeps five predictor columns, so 13 may not match
# unless the features are expanded (e.g. one-hot encoded) — confirm.
my_net = MyNeuralNet(input_size=13, num_layers=5)
# display the layer structure
my_net

Test it out

In [None]:
my_net(features)

## Training

Same training loop from [last workshop](https://github.com/sjsumlclub/workshop-anatomy-pytorch-project/blob/master/Anatomy%20of%20a%20PyTorch%20Project%20WORKSHOP.ipynb).

In [None]:
# Training loop
import torch
# `nn` is needed for the loss below and is not imported by any earlier cell
import torch.nn as nn
import torch.optim as optim
# the top-level `tqdm_notebook` alias is deprecated; tqdm.notebook is its home
from tqdm.notebook import tqdm

# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
my_net = my_net.to(device)

# instantiate an optimizer and the loss function
optimizer = optim.Adam(my_net.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# train for n epochs. an epoch is a full iteration through our dataset
num_epochs = 10

# keep track of the accuracy over time (one float per epoch)
accuracies = []

# put the model in training mode (affects layers such as dropout)
my_net.train()

# loop over epochs
for epoch in tqdm(range(num_epochs), desc="Epoch"):

    # number of correct predictions seen so far in this epoch (plain int)
    correct_this_epoch = 0

    # loop over our data loader
    for data, labels in tqdm(my_dataloader, desc="Batch", leave=False):
        data, labels = data.to(device), labels.to(device)
        # CrossEntropyLoss expects integer (long) class indices as targets
        targets = labels.long()

        # pass data through model
        outputs = my_net(data)
        # calculate the loss
        loss = criterion(outputs, targets)

        # Use our optimizer to update the network
        # 1: zero_grad our optimizer
        optimizer.zero_grad()
        # 2: run a backward pass
        loss.backward()
        # 3: make a step
        optimizer.step()

        # the predicted class is the index of the largest output in each row
        _, preds = torch.max(outputs, dim=1)
        # .item() turns the count into a Python int, so the running total is
        # never a tensor and stays valid even if the loader yields no batches
        correct_this_epoch += torch.sum(preds == targets).item()

    # fraction of the dataset classified correctly during this epoch
    accuracy_this_epoch = correct_this_epoch / len(my_dataset)
    print(accuracy_this_epoch)
    accuracies.append(accuracy_this_epoch)