# Project 4 Models - Andres


## Imports

In [2]:
# !pip install scikeras

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.dummy import DummyClassifier

from scikeras.wrappers import KerasClassifier

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

## Import Data

In [3]:
data = pd.read_csv('./cleaned_hatecrime.csv', index_col=0)
data.head(2)

Unnamed: 0,RecordId,ClosedYear,MonthOccurrence,County,NCIC,TotalNumberOfVictims,TotalNumberOfIndividualVictims,SuspectsRaceAsAGroup,TotalNumberOfSuspects,MostSeriousUcr,MostSeriousUcrType,MostSeriousLocation,MostSeriousBias,MostSeriousBiasType,MostSeriousVictimType,WeaponType,Offensive_Act,label
5411,CA00-0000015217,2004,2,Alameda,Alameda Co. Sheriff's Department,2,2,Unknown,0,Intimidation,Violent Crimes,Residence/Home/Driveway,Anti-Black or African American,Race/Ethnicity/Ancestry,Person,,Daubing of swastika,4.0
5412,CA00-0000015122,2004,3,Alameda,Alameda,1,1,White,1,Destruction/Damage/Vandalism,Property Crimes,Residence/Home/Driveway,Anti-Asian,Race/Ethnicity/Ancestry,Person,,Threatening letters/flyers/email,4.0


## Transform and Scale Data

### Column Transformer

In [5]:
X = data.drop(columns=['RecordId', 'label'])
ct = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)
X_encoded = ct.fit_transform(X)
X_encoded
ct.get_feature_names_out()
X_encoded = pd.DataFrame(X_encoded, columns=ct.get_feature_names_out())

In [6]:
X_encoded.head(2)

Unnamed: 0,County_Alameda,County_Alpine,County_Amador,County_Butte,County_Calaveras,County_Colusa,County_Contra Costa,County_Del Norte,County_El Dorado,County_Fresno,County_Glenn,County_Humboldt,County_Imperial,County_Inyo,County_Kern,County_Kings,County_Lake,County_Lassen,County_Los Angeles,County_Madera,County_Marin,County_Mariposa,County_Mendocino,County_Merced,County_Mono,County_Monterey,County_Napa,County_Nevada,County_Orange,County_Placer,County_Plumas,County_Riverside,County_Sacramento,County_San Benito,County_San Bernardino,County_San Diego,County_San Francisco,County_San Joaquin,County_San Luis Obispo,County_San Mateo,...,MostSeriousVictimType_Business,MostSeriousVictimType_Financial,MostSeriousVictimType_Government,MostSeriousVictimType_Other,MostSeriousVictimType_Person,MostSeriousVictimType_Religious,"WeaponType_Arson, fire","WeaponType_Blunt object (blugeon, club, etc.)","WeaponType_Firearm (unknown whether handgun, rifle or shotgun)",WeaponType_Handgun,WeaponType_Knife or other cutting or stabbing instrument,WeaponType_None,"WeaponType_Other ( bottle, rocks, spitting)","WeaponType_Other gun (pellet, BB, stun gun, etc.)","WeaponType_Personal weapons (hands, feet, teeth, etc.)",WeaponType_Poison,WeaponType_Rifle,WeaponType_Ropes or garrote strangulation or hanging,WeaponType_Shotgun,WeaponType_Unknown,WeaponType_Vehicle,Offensive_Act_Annoying telephone calls/fax,Offensive_Act_Bombing,Offensive_Act_Cross burning,Offensive_Act_Damage to vehicle,Offensive_Act_Daubing of swastika,Offensive_Act_Disturbing public assembly/meeting,Offensive_Act_Explosion,Offensive_Act_Graffiti,Offensive_Act_Hanging in Effigy,Offensive_Act_Other,Offensive_Act_Rock throwing,Offensive_Act_Threatening letters/flyers/email,Offensive_Act_Unknown,Offensive_Act_Verbal slurs,ClosedYear,MonthOccurrence,TotalNumberOfVictims,TotalNumberOfIndividualVictims,TotalNumberOfSuspects
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2004.0,2.0,2.0,2.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2004.0,3.0,1.0,1.0,1.0


### Scaling

In [7]:
X_encoded_scaled = StandardScaler().fit_transform(X_encoded)

## Target

In [8]:
y = data['label']

In [9]:
y.unique()

array(['4.0', '2.0', '0.0', '1.0', '3.0', 'None'], dtype=object)

In [10]:
y_mapped = y.map(
    {'4.0': 4,
     '2.0': 2,
     '0.0': 0,
     '1.0': 1, 
     '3.0': 3,
     'None': 5}
)

In [11]:
y_categorical = to_categorical(y_mapped, 6)

## Baseline

In [13]:
y_mapped.value_counts(normalize=True)

4    0.521314
2    0.338187
0    0.095769
5    0.038265
1    0.005204
3    0.001261
Name: label, dtype: float64

## Test/Train Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded_scaled, y_categorical, stratify=y, random_state=13)

In [15]:
X_train.shape

(14268, 745)

In [16]:
y_train.shape

(14268, 6)

## Neural Network Models with Grid Searching

With help from Lab 7.01

In [17]:
# Got input shape error in grid search. Using this from lesson with Chuck given: "Model <keras.engine.sequential.Sequential object at 0x7fdf22f98410> cannot be saved because the input shapes have not been set. Usually, input shapes are automatically determined when calling `.fit()` or `.predict()`. To manually set the shapes, call `model.build(input_shape)"
n_input = X_train.shape[1]

In [23]:
# # Create model function. Required by KerasClassifier
# def create_model(dropout_rate, neurons):
#   model = Sequential()
#   model.add(Dense(neurons, input_dim=n_input, activation='relu'))
#   model.add(Dropout(dropout_rate))
#   model.add(Dense(6, activation='softmax'))
#   # Model compile
#   model.compile(
#     optimizer='adam',
#     loss='categorical_crossentropy',
#     metrics=['accuracy']
#   )
#   return model


In [24]:
# # Create/Instantiate? model.
# model = KerasClassifier(model=create_model, verbose=2)


In [28]:
# model.model

<function __main__.create_model>

In [29]:

# # Define grid search parameters. I'm using the same from my tutorial to start it off.
# params= {
#     'batch_size': [50],
#     'epochs': [100],
#     'callbacks': [EarlyStopping(monitor='loss', patience=5)],
#     'model__dropout_rate': [.25, .5, .75, .9],
#     'model__neurons': [12, 24, 36]
# }
# # Grid Search
# gs = GridSearchCV(
#     estimator=model,
#     param_grid=params,
#     n_jobs=-1,
# )
# gs_result = gs.fit(X_train, y_train)

# # Result summary
# print(f"Best score: {gs_result.best_score_}. Used these parameters: {gs_result.best_params_}")

# # This part copied from machine learning mastery prints out all results to check where improvements can be made
# means = gs_result.cv_results_['mean_test_score']
# stds = gs_result.cv_results_['std_test_score']
# params = gs_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))



Epoch 1/100
286/286 - 1s - loss: 1.7721 - accuracy: 0.4849 - 1s/epoch - 5ms/step
Epoch 2/100
286/286 - 1s - loss: 1.0605 - accuracy: 0.6236 - 602ms/epoch - 2ms/step
Epoch 3/100
286/286 - 1s - loss: 0.9037 - accuracy: 0.6543 - 596ms/epoch - 2ms/step
Epoch 4/100
286/286 - 1s - loss: 0.8330 - accuracy: 0.6740 - 589ms/epoch - 2ms/step
Epoch 5/100
286/286 - 1s - loss: 0.7595 - accuracy: 0.6913 - 598ms/epoch - 2ms/step
Epoch 6/100
286/286 - 1s - loss: 0.7277 - accuracy: 0.7044 - 572ms/epoch - 2ms/step
Epoch 7/100
286/286 - 1s - loss: 0.7101 - accuracy: 0.7053 - 584ms/epoch - 2ms/step
Epoch 8/100
286/286 - 1s - loss: 0.6864 - accuracy: 0.7088 - 583ms/epoch - 2ms/step
Epoch 9/100
286/286 - 1s - loss: 0.6740 - accuracy: 0.7144 - 610ms/epoch - 2ms/step
Epoch 10/100
286/286 - 1s - loss: 0.6604 - accuracy: 0.7218 - 590ms/epoch - 2ms/step
Epoch 11/100
286/286 - 1s - loss: 0.6438 - accuracy: 0.7251 - 584ms/epoch - 2ms/step
Epoch 12/100
286/286 - 1s - loss: 0.6311 - accuracy: 0.7274 - 625ms/epoch - 2

In [34]:
# gs_result.best_estimator_.model.__getattribute__

<method-wrapper '__getattribute__' of function object at 0x7f87440ad3b0>

## Neural Network with parameters from GS

In [35]:
model2 = Sequential()
model2.add(Dense(36, input_dim=n_input, activation='relu'))
model2.add(Dropout(.5))
model2.add(Dense(6, activation='softmax'))
# Model compile
model2.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy']
)

In [36]:
history = model2.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    callbacks=EarlyStopping(monitor='loss', patience=5),
    batch_size=50,
    verbose=2
)

Epoch 1/100
286/286 - 2s - loss: 1.4575 - accuracy: 0.5178 - val_loss: 0.9533 - val_accuracy: 0.6264 - 2s/epoch - 9ms/step
Epoch 2/100
286/286 - 1s - loss: 1.0029 - accuracy: 0.6241 - val_loss: 0.8573 - val_accuracy: 0.6624 - 1s/epoch - 4ms/step
Epoch 3/100
286/286 - 1s - loss: 0.8735 - accuracy: 0.6602 - val_loss: 0.8099 - val_accuracy: 0.6765 - 1s/epoch - 4ms/step
Epoch 4/100
286/286 - 1s - loss: 0.7928 - accuracy: 0.6793 - val_loss: 0.7870 - val_accuracy: 0.6824 - 1s/epoch - 4ms/step
Epoch 5/100
286/286 - 1s - loss: 0.7510 - accuracy: 0.6920 - val_loss: 0.7748 - val_accuracy: 0.6809 - 1s/epoch - 4ms/step
Epoch 6/100
286/286 - 1s - loss: 0.7196 - accuracy: 0.6994 - val_loss: 0.7555 - val_accuracy: 0.6845 - 1s/epoch - 4ms/step
Epoch 7/100
286/286 - 1s - loss: 0.7015 - accuracy: 0.7034 - val_loss: 0.7455 - val_accuracy: 0.6836 - 1s/epoch - 4ms/step
Epoch 8/100
286/286 - 1s - loss: 0.6729 - accuracy: 0.7131 - val_loss: 0.7411 - val_accuracy: 0.6952 - 1s/epoch - 4ms/step
Epoch 9/100
286/

## Neural Network, drop ClosedYear from X

### Transform and Scale Data


In [None]:
X = data.drop(columns=['RecordId', 'ClosedYear', 'label'])
ct = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)
X_encoded = ct.fit_transform(X)
X_encoded
ct.get_feature_names_out()
X_encoded = pd.DataFrame(X_encoded, columns=ct.get_feature_names_out())

In [None]:
X_encoded.head(2)

Unnamed: 0,County_Alameda,County_Alpine,County_Amador,County_Butte,County_Calaveras,County_Colusa,County_Contra Costa,County_Del Norte,County_El Dorado,County_Fresno,County_Glenn,County_Humboldt,County_Imperial,County_Inyo,County_Kern,County_Kings,County_Lake,County_Lassen,County_Los Angeles,County_Madera,County_Marin,County_Mariposa,County_Mendocino,County_Merced,County_Mono,County_Monterey,County_Napa,County_Nevada,County_Orange,County_Placer,County_Plumas,County_Riverside,County_Sacramento,County_San Benito,County_San Bernardino,County_San Diego,County_San Francisco,County_San Joaquin,County_San Luis Obispo,County_San Mateo,...,MostSeriousBiasType_Sexual Orientation,MostSeriousVictimType_Business,MostSeriousVictimType_Financial,MostSeriousVictimType_Government,MostSeriousVictimType_Other,MostSeriousVictimType_Person,MostSeriousVictimType_Religious,"WeaponType_Arson, fire","WeaponType_Blunt object (blugeon, club, etc.)","WeaponType_Firearm (unknown whether handgun, rifle or shotgun)",WeaponType_Handgun,WeaponType_Knife or other cutting or stabbing instrument,WeaponType_None,"WeaponType_Other ( bottle, rocks, spitting)","WeaponType_Other gun (pellet, BB, stun gun, etc.)","WeaponType_Personal weapons (hands, feet, teeth, etc.)",WeaponType_Poison,WeaponType_Rifle,WeaponType_Ropes or garrote strangulation or hanging,WeaponType_Shotgun,WeaponType_Unknown,WeaponType_Vehicle,Offensive_Act_Annoying telephone calls/fax,Offensive_Act_Bombing,Offensive_Act_Cross burning,Offensive_Act_Damage to vehicle,Offensive_Act_Daubing of swastika,Offensive_Act_Disturbing public assembly/meeting,Offensive_Act_Explosion,Offensive_Act_Graffiti,Offensive_Act_Hanging in Effigy,Offensive_Act_Other,Offensive_Act_Rock throwing,Offensive_Act_Threatening letters/flyers/email,Offensive_Act_Unknown,Offensive_Act_Verbal slurs,MonthOccurrence,TotalNumberOfVictims,TotalNumberOfIndividualVictims,TotalNumberOfSuspects
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,1.0,1.0


#### Scaling

In [None]:
X_encoded_scaled = StandardScaler().fit_transform(X_encoded)

#### Target

In [None]:
y = data['label']

In [None]:
y.unique()

array(['4.0', '2.0', '0.0', '1.0', '3.0', 'None'], dtype=object)

In [None]:
y_mapped = y.map(
    {'4.0': 4,
     '2.0': 2,
     '0.0': 0,
     '1.0': 1, 
     '3.0': 3,
     'None': 5}
)

In [None]:
y_categorical = to_categorical(y_mapped, 6)

### Baseline

In [None]:
y_mapped.value_counts(normalize=True)

4    0.521314
2    0.338187
0    0.095769
5    0.038265
1    0.005204
3    0.001261
Name: label, dtype: float64

### Test/Train Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded_scaled, y_categorical, stratify=y, random_state=13)

In [None]:
X_train.shape

(14268, 744)

In [None]:
y_train.shape

(14268, 6)

### Neural Network Models with Grid Searching

With help from Lab 7.01

In [None]:
# Got input shape error in grid search. Using this from lesson with Chuck given: "Model <keras.engine.sequential.Sequential object at 0x7fdf22f98410> cannot be saved because the input shapes have not been set. Usually, input shapes are automatically determined when calling `.fit()` or `.predict()`. To manually set the shapes, call `model.build(input_shape)"
n_input = X_train.shape[1]

In [None]:
# Create model function. Required by KerasClassifier
def create_model(dropout_rate, neurons):
  model = Sequential()
  model.add(Dense(neurons, input_dim=n_input, activation='relu'))
  model.add(Dropout(dropout_rate))
  model.add(Dense(6, activation='softmax'))
  # Model compile
  model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
  )
  return model


In [None]:
# Create/Instantiate? model.
model = KerasClassifier(model=create_model, verbose=2)


In [None]:
# Define grid search parameters. I'm using the same from my tutorial to start it off.
params= {
    'batch_size': [50],
    'epochs': [100],
    'callbacks': [EarlyStopping(monitor='loss', patience=5)],
    'model__dropout_rate': [.5, .75, .9],
    'model__neurons': [12, 24, 36, 48],
}
# Grid Search
gs = GridSearchCV(
    estimator=model,
    param_grid=params,
    n_jobs=-1,
)
gs_result = gs.fit(X_train, y_train)

# Result summary
print(f"Best score: {gs_result.best_score_}. Used these parameters: {gs_result.best_params_}")

# This part copied from machine learning mastery prints out all results to check where improvements can be made
means = gs_result.cv_results_['mean_test_score']
stds = gs_result.cv_results_['std_test_score']
params = gs_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Epoch 1/100
286/286 - 1s - loss: 1.4771 - accuracy: 0.5222 - 1s/epoch - 5ms/step
Epoch 2/100
286/286 - 1s - loss: 1.0522 - accuracy: 0.6126 - 562ms/epoch - 2ms/step
Epoch 3/100
286/286 - 1s - loss: 0.9345 - accuracy: 0.6278 - 556ms/epoch - 2ms/step
Epoch 4/100
286/286 - 1s - loss: 0.8920 - accuracy: 0.6323 - 563ms/epoch - 2ms/step
Epoch 5/100
286/286 - 1s - loss: 0.8639 - accuracy: 0.6424 - 561ms/epoch - 2ms/step
Epoch 6/100
286/286 - 1s - loss: 0.8324 - accuracy: 0.6480 - 553ms/epoch - 2ms/step
Epoch 7/100
286/286 - 1s - loss: 0.8245 - accuracy: 0.6484 - 559ms/epoch - 2ms/step
Epoch 8/100
286/286 - 1s - loss: 0.8078 - accuracy: 0.6575 - 549ms/epoch - 2ms/step
Epoch 9/100
286/286 - 1s - loss: 0.7960 - accuracy: 0.6571 - 557ms/epoch - 2ms/step
Epoch 10/100
286/286 - 1s - loss: 0.7885 - accuracy: 0.6606 - 569ms/epoch - 2ms/step
Epoch 11/100
286/286 - 1s - loss: 0.7751 - accuracy: 0.6613 - 546ms/epoch - 2ms/step
Epoch 12/100
286/286 - 1s - loss: 0.7734 - accuracy: 0.6642 - 581ms/epoch - 2

### Save Model

In [None]:
# gs_result.best_estimator_.model_.save('/content/drive/MyDrive/Colab Notebooks/4-project/model/')

INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/4-project/model/assets


In [None]:
gs_result.best_estimator_.model_.evaluate(X_test, y_test)



[2.2183337211608887, 0.6256043910980225]