In [None]:
# If scikit-learn is not installed in this kernel's environment, run: %pip install scikit-learn

In [None]:
import tensorflow as tf # Import Tensorflow
import numpy as np  # Import NumPy
import pandas as pd # Import Pandas
import matplotlib.pyplot as plt # Matplotlib Data Visualization Library
import seaborn as sns # Seaborn Data Visualization Library

from sklearn.model_selection import train_test_split # Import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler # Import LabelEncoder and MinMaxScaler


In [None]:
# Read the crime CSV into a pandas DataFrame, indexed by the 'Community Name' column
df = pd.read_csv(
    filepath_or_buffer='./reduced_version_data_ENEL_645.csv',
    index_col='Community Name',
)
df.head(n=5)  # Preview the first five rows

Unnamed: 0_level_0,Sector,Group Category,Category,Crime Count,Resident Count,Year,Month
Community Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
WHITEHORN,NORTHEAST,Crime,Street Robbery,1,12019,2019,SEP
FOOTHILLS,EAST,Crime,Theft OF Vehicle,10,317,2019,NOV
ACADIA,SOUTH,Crime,Theft FROM Vehicle,13,10520,2019,SEP
MAHOGANY,SOUTHEAST,Crime,Theft OF Vehicle,1,11784,2019,NOV
LINCOLN PARK,WEST,Crime,Commercial Break & Enter,5,2617,2019,NOV


In [None]:
# Inspect the DataFrame dimensions as a (rows, columns) tuple
df.shape

(100000, 7)

In [None]:
# Count the missing values in each column (isna is the canonical name for isnull)
df.isna().sum()

Sector            0
Group Category    0
Category          0
Crime Count       0
Resident Count    0
Year              0
Month             0
dtype: int64

In [None]:
# List the distinct labels present in the Category column (the prediction target).
# NOTE(review): the recorded output includes '1320.131', which does not look like a
# crime category -- presumably a corrupted row; confirm and clean it before modelling.
df.loc[:, 'Category'].unique()

array(['Street Robbery', 'Theft OF Vehicle', 'Theft FROM Vehicle',
       'Commercial Break & Enter', 'Social Disorder',
       'Assault (Non-domestic)', 'Residential Break & Enter',
       'Physical Disorder', 'Violence Other (Non-domestic)',
       'Commercial Robbery', '1320.131'], dtype=object)

In [None]:
# Take the Category column as the output vector y.
# The column is copied instead of popped so that df is never mutated: re-running this
# cell on a live kernel no longer raises KeyError, and cells that read df['Category']
# keep working. The feature-building cells select their columns by name, so leaving
# Category in df does not leak the target into the features matrix X.
y = df['Category'].copy()
y

Community Name
WHITEHORN                        Street Robbery
FOOTHILLS                      Theft OF Vehicle
ACADIA                       Theft FROM Vehicle
MAHOGANY                       Theft OF Vehicle
LINCOLN PARK           Commercial Break & Enter
                                 ...           
WOODBINE                      Physical Disorder
NORTH GLENMORE PARK      Assault (Non-domestic)
HAYSBORO                      Physical Disorder
FAIRVIEW INDUSTRIAL             Social Disorder
KILLARNEY/GLENGARRY            Theft OF Vehicle
Name: Category, Length: 100000, dtype: object

In [None]:
# Gather the three categorical feature columns of df into their own DataFrame
categorical_columns_df = df.loc[:, ['Sector', 'Month', 'Group Category']]
categorical_columns_df

Unnamed: 0_level_0,Sector,Month,Group Category
Community Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
WHITEHORN,NORTHEAST,SEP,Crime
FOOTHILLS,EAST,NOV,Crime
ACADIA,SOUTH,SEP,Crime
MAHOGANY,SOUTHEAST,NOV,Crime
LINCOLN PARK,WEST,NOV,Crime
...,...,...,...
WOODBINE,SOUTH,JAN,Disorder
NORTH GLENMORE PARK,WEST,MAR,Crime
HAYSBORO,SOUTH,SEP,Disorder
FAIRVIEW INDUSTRIAL,SOUTH,MAR,Disorder


In [None]:
# One-hot encode every categorical column; each new column is named <column>_<value>
categorical_columns_df_enc = pd.get_dummies(categorical_columns_df)
categorical_columns_df_enc

Unnamed: 0_level_0,Sector_CENTRE,Sector_EAST,Sector_NORTH,Sector_NORTHEAST,Sector_NORTHWEST,Sector_SOUTH,Sector_SOUTHEAST,Sector_WEST,Month_APR,Month_AUG,Month_DEC,Month_FEB,Month_JAN,Month_JUL,Month_JUN,Month_MAR,Month_MAY,Month_NOV,Month_OCT,Month_SEP,Group Category_Crime,Group Category_Disorder
Community Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
WHITEHORN,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
FOOTHILLS,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
ACADIA,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
MAHOGANY,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
LINCOLN PARK,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WOODBINE,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
NORTH GLENMORE PARK,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
HAYSBORO,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
FAIRVIEW INDUSTRIAL,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [None]:
# Build the features matrix X: the numerical columns first, followed by the
# one-hot encoded categorical columns, placed side by side
X = pd.concat(
    objs=[df.loc[:, ['Crime Count', 'Resident Count', 'Year']], categorical_columns_df_enc],
    axis='columns',
)
X

Unnamed: 0_level_0,Crime Count,Resident Count,Year,Sector_CENTRE,Sector_EAST,Sector_NORTH,Sector_NORTHEAST,Sector_NORTHWEST,Sector_SOUTH,Sector_SOUTHEAST,Sector_WEST,Month_APR,Month_AUG,Month_DEC,Month_FEB,Month_JAN,Month_JUL,Month_JUN,Month_MAR,Month_MAY,Month_NOV,Month_OCT,Month_SEP,Group Category_Crime,Group Category_Disorder
Community Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
WHITEHORN,1,12019,2019,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
FOOTHILLS,10,317,2019,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
ACADIA,13,10520,2019,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
MAHOGANY,1,11784,2019,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
LINCOLN PARK,5,2617,2019,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WOODBINE,2,9131,2013,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
NORTH GLENMORE PARK,2,2333,2014,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
HAYSBORO,5,6943,2012,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
FAIRVIEW INDUSTRIAL,7,0,2013,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [None]:
# Rescale every column of X with a MinMaxScaler (default feature range).
# NOTE(review): the scaler is fitted on the full features matrix before the train/test
# split, so the test rows influence the learned min/max values -- consider fitting on
# the training rows only.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)  # Fit the scaler to X and transform X in one step

In [None]:
# Encode the string labels in y as integer class ids
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)  # Fit the encoder to y and encode y in one step

print(f"Shape of y_enc: {y_enc.shape}")  # Confirm there is one encoded label per row
y_enc

Shape of y_enc: (100000,)


array([7, 9, 8, ..., 4, 6, 9])

In [None]:
X_scaled # Preview the scaled features matrix produced by the MinMaxScaler

array([[0.        , 0.4624471 , 1.        , ..., 1.        , 1.        ,
        0.        ],
       [0.01153846, 0.012197  , 1.        , ..., 0.        , 1.        ,
        0.        ],
       [0.01538462, 0.40477107, 1.        , ..., 1.        , 1.        ,
        0.        ],
       ...,
       [0.00512821, 0.26714121, 0.        , ..., 1.        , 0.        ,
        1.        ],
       [0.00769231, 0.        , 0.14285714, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.26433244, 0.14285714, ..., 0.        , 1.        ,
        0.        ]])

In [None]:
# Randomly partition the scaled features and encoded labels into a 70% training
# subset and a 30% test subset; random_state pins the shuffle so reruns match
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_enc,
    train_size=0.7,
    random_state=39,
)

In [None]:
# Report the shapes of the training and testing features / labels
print(f"Dimensions of X_train: {X_train.shape}")
print(f"Dimensions of X_test: {X_test.shape}")
print(f"Dimensions of y_train: {y_train.shape}")
print(f"Dimensions of y_test: {y_test.shape}")

# Cast the training features and labels to float32 arrays before handing them
# to the model's fit method
X_train = np.asarray(X_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.float32)

Dimensions of X_train: (70000, 25)
Dimensions of X_test: (30000, 25)
Dimensions of y_train: (70000,)
Dimensions of y_test: (30000,)


In [None]:
# Seed TensorFlow's global random generator before any layer is created so that the
# weight initialisation (and the shuffling done during training) starts from the same
# state on every Restart & Run All. 39 matches the random_state used for the
# train/test split. NOTE(review): some GPU kernels may still be non-deterministic.
tf.random.set_seed(39)

number_of_output_labels = y.nunique() # Count the number of unique labels in y
print(number_of_output_labels) # Print the number of unique labels in y

# Fully connected classifier with 6 Dense layers: five ReLU hidden layers that narrow
# from 512 to 32 units, then a softmax layer with one unit per output label
model = tf.keras.Sequential ([
    tf.keras.layers.Dense (512, input_shape = (X_train.shape[1], ), activation = 'relu'),
    tf.keras.layers.Dense (256, activation = 'relu'),
    tf.keras.layers.Dense (128, activation = 'relu'),
    tf.keras.layers.Dense (64, activation = 'relu'),
    tf.keras.layers.Dense (32, activation = 'relu'),
    tf.keras.layers.Dense (number_of_output_labels, activation = 'softmax')
])

11


In [None]:
# Configure training:
# - optimizer: Adam with a learning rate of 1e-4
# - loss: sparse categorical cross-entropy, which takes integer class ids as labels
#   (as produced by the LabelEncoder) against the model's softmax probabilities
# - metric: accuracy, i.e. how often the predicted class equals the label
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
model.summary() # Print each layer's output shape and parameter count, plus the totals

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               13312     
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 11)                363       
                                                                 
Total params: 188,235
Trainable params: 188,235
Non-trai

In [None]:
# Run tf.functions eagerly; the original author needed this to avoid an error here.
# NOTE(review): eager execution is presumably slower than graph mode -- TODO confirm
# whether the error still occurs without it.
tf.config.run_functions_eagerly(True)
# Train on the training split for 15 epochs; the returned History object is displayed
model.fit(x=X_train, y=y_train, epochs=15)

Epoch 1/15




Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f8b6043acb0>

In [None]:
# Cast the held-out features and labels to float32 arrays, mirroring the training data,
# then score the model on them; evaluate returns [loss, accuracy]
X_test = np.asarray(X_test, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.float32)
model.evaluate(x=X_test, y=y_test)



[1.217498779296875, 0.49316665530204773]