In [None]:
# Prerequisite: scikit-learn must be installed in the kernel's environment, e.g. run `%pip install scikit-learn` once

In [None]:
import tensorflow as tf # Import Tensorflow
import numpy as np  # Import NumPy
import pandas as pd # Import Pandas
import matplotlib.pyplot as plt # Matplotlib Data Visualization Library
import seaborn as sns # Seaborn Data Visualization Library

from sklearn.model_selection import train_test_split # Import train_test_split 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler # Import Label Encode and MinMaxScaler

In [None]:
# Read the CSV file from the working directory into a Pandas DataFrame;
# the 'Community Name' column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='./reduced_version_data_ENEL_645.csv',
    index_col='Community Name',
)
df.head()  # Display the first rows as a quick sanity check

Unnamed: 0_level_0,Sector,Group Category,Category,Crime Count,Resident Count,Year,Month
Community Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
WHITEHORN,NORTHEAST,Crime,Street Robbery,1,12019,2019,SEP
FOOTHILLS,EAST,Crime,Theft OF Vehicle,10,317,2019,NOV
ACADIA,SOUTH,Crime,Theft FROM Vehicle,13,10520,2019,SEP
MAHOGANY,SOUTHEAST,Crime,Theft OF Vehicle,1,11784,2019,NOV
LINCOLN PARK,WEST,Crime,Commercial Break & Enter,5,2617,2019,NOV


In [None]:
# View dimensions of the DataFrame as a (number of rows, number of columns) tuple
df.shape

(100000, 7)

In [None]:
# Count the missing values in each column (a total of 0 means the column has no nulls)
df.isnull().sum(axis=0)

Sector            0
Group Category    0
Category          0
Crime Count       0
Resident Count    0
Year              0
Month             0
dtype: int64

In [None]:
# Extract the Crime Count column from df and assign it to output vector y.
# The column is copied instead of popped: DataFrame.pop removes the column from
# df in place, so running this cell a second time would raise a KeyError.
# The later cells select their feature columns by name, so leaving
# 'Crime Count' in df does not add it to the features matrix.
y = df['Crime Count'].copy()
y

Community Name
WHITEHORN               1
FOOTHILLS              10
ACADIA                 13
MAHOGANY                1
LINCOLN PARK            5
                       ..
WOODBINE                2
NORTH GLENMORE PARK     2
HAYSBORO                5
FAIRVIEW INDUSTRIAL     7
KILLARNEY/GLENGARRY     1
Name: Crime Count, Length: 100000, dtype: int64

In [None]:
# Pull the four categorical (text-valued) feature columns of df into their own DataFrame
categorical_columns_df = df.loc[:, ['Sector', 'Month', 'Group Category', 'Category']]
categorical_columns_df

Unnamed: 0_level_0,Sector,Month,Group Category,Category
Community Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WHITEHORN,NORTHEAST,SEP,Crime,Street Robbery
FOOTHILLS,EAST,NOV,Crime,Theft OF Vehicle
ACADIA,SOUTH,SEP,Crime,Theft FROM Vehicle
MAHOGANY,SOUTHEAST,NOV,Crime,Theft OF Vehicle
LINCOLN PARK,WEST,NOV,Crime,Commercial Break & Enter
...,...,...,...,...
WOODBINE,SOUTH,JAN,Disorder,Physical Disorder
NORTH GLENMORE PARK,WEST,MAR,Crime,Assault (Non-domestic)
HAYSBORO,SOUTH,SEP,Disorder,Physical Disorder
FAIRVIEW INDUSTRIAL,SOUTH,MAR,Disorder,Social Disorder


In [None]:
# One Hot Encode categorical_columns_df with get_dummies: every distinct value of
# each column gets its own indicator column named '<column>_<value>'
categorical_columns_df_enc = pd.get_dummies(categorical_columns_df)
categorical_columns_df_enc

Unnamed: 0_level_0,Sector_CENTRE,Sector_EAST,Sector_NORTH,Sector_NORTHEAST,Sector_NORTHWEST,Sector_SOUTH,Sector_SOUTHEAST,Sector_WEST,Month_APR,Month_AUG,...,Category_Assault (Non-domestic),Category_Commercial Break & Enter,Category_Commercial Robbery,Category_Physical Disorder,Category_Residential Break & Enter,Category_Social Disorder,Category_Street Robbery,Category_Theft FROM Vehicle,Category_Theft OF Vehicle,Category_Violence Other (Non-domestic)
Community Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
WHITEHORN,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
FOOTHILLS,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
ACADIA,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
MAHOGANY,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
LINCOLN PARK,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WOODBINE,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
NORTH GLENMORE PARK,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
HAYSBORO,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
FAIRVIEW INDUSTRIAL,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Build the features matrix X: the two numerical columns followed by the
# One Hot Encoded categorical columns, placed side by side
X = pd.concat(
    objs=[df[['Resident Count', 'Year']], categorical_columns_df_enc],
    axis='columns',
)
X

Unnamed: 0_level_0,Resident Count,Year,Sector_CENTRE,Sector_EAST,Sector_NORTH,Sector_NORTHEAST,Sector_NORTHWEST,Sector_SOUTH,Sector_SOUTHEAST,Sector_WEST,...,Category_Assault (Non-domestic),Category_Commercial Break & Enter,Category_Commercial Robbery,Category_Physical Disorder,Category_Residential Break & Enter,Category_Social Disorder,Category_Street Robbery,Category_Theft FROM Vehicle,Category_Theft OF Vehicle,Category_Violence Other (Non-domestic)
Community Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
WHITEHORN,12019,2019,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
FOOTHILLS,317,2019,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
ACADIA,10520,2019,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
MAHOGANY,11784,2019,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
LINCOLN PARK,2617,2019,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WOODBINE,9131,2013,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
NORTH GLENMORE PARK,2333,2014,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
HAYSBORO,6943,2012,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
FAIRVIEW INDUSTRIAL,0,2013,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Create a MinMaxScaler and fit it to the features matrix X
# (fit returns the fitted scaler itself, so both steps are chained)
# NOTE(review): the scaler is fitted on every row of X, i.e. before any train/test split is made
scaler = MinMaxScaler().fit(X)
# Rescale X with the fitted scaler to obtain X_scaled
X_scaled = scaler.transform(X)

In [None]:
X_scaled # View preview of the scaled features matrix X_scaled (a NumPy array)

array([[0.4624471 , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.012197  , 1.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.40477107, 1.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.26714121, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.14285714, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.26433244, 0.14285714, 1.        , ..., 0.        , 1.        ,
        0.        ]])

In [None]:
# Split the unscaled features X and the target y into random train and test subsets. Use 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=45)

# Fit the scaler on the training rows only and reuse it for the test rows, so the
# value range of the test set does not influence preprocessing (X_scaled was
# fitted on all rows and is therefore not used for modelling). Test values may
# fall slightly outside [0, 1] when they exceed the range seen in training.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# View dimensions of the Features Matrix X and Output Column y for both training and testing sets
print(f"Dimensions of X_train: {X_train.shape}")
print(f"Dimensions of X_test: {X_test.shape}")
print(f"Dimensions of y_train: {y_train.shape}")
print(f"Dimensions of y_test: {y_test.shape}")

# Convert all four splits to float32 NumPy arrays. The test split is converted
# as well, so that model.evaluate receives the same array type and dtype as model.fit.
# Conversion needed to pass the features to the model's fit/evaluate methods
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
# Same conversion for the targets (y_test would otherwise remain a pandas Series)
y_train = np.asarray(y_train).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

Dimensions of X_train: (70000, 35)
Dimensions of X_test: (30000, 35)
Dimensions of y_train: (70000,)
Dimensions of y_test: (30000,)


In [None]:
# Create Neural Network with 7 fully connected layers: a 1024-unit input-facing
# layer, five hidden layers that halve in width (512 -> 32), and a single output unit.
# Every layer, including the output, uses ReLU, so predictions cannot be negative.
model = tf.keras.Sequential(
    # First layer declares the input width (number of feature columns)
    [tf.keras.layers.Dense(1024, input_shape=(X_train.shape[1],), activation='relu')]
    # Hidden layers, created in order of decreasing width
    + [tf.keras.layers.Dense(width, activation='relu') for width in (512, 256, 128, 64, 32)]
    # Single-unit regression output
    + [tf.keras.layers.Dense(1, activation='relu')]
)

In [None]:
# Optimizer - Adam: a gradient descent method that is "computationally efficient, has little memory requirement,
# invariant to diagonal rescaling of gradients, and is well suited for problems that are large in terms of data/parameters"
#
# Loss and metric - mean absolute error (MAE) between the labels and predictions.
# MAE is passed both as the training loss and as a reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.MeanAbsoluteError(),
    metrics=['mae'],
)

In [None]:
model.summary() # Print model summary: one row per layer with its output shape and parameter count

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1024)              36864     
                                                                 
 dense_1 (Dense)             (None, 512)               524800    
                                                                 
 dense_2 (Dense)             (None, 256)               131328    
                                                                 
 dense_3 (Dense)             (None, 128)               32896     
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dense_5 (Dense)             (None, 32)                2080      
                                                                 
 dense_6 (Dense)             (None, 1)                 3

In [None]:
# Train model for 30 epochs and observe the mean absolute error.
# The History object returned by fit is kept so the per-epoch loss/metric values
# (history.history) can be inspected or plotted afterwards; assigning it also
# stops the cell from displaying the bare History object repr.
history = model.fit(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f25b883d2d0>

In [None]:
model.evaluate(X_test, y_test) # Evaluate the mean absolute error on the testing data; returns [loss, mae] (both are MAE here)



[3.7756664752960205, 3.7756664752960205]