In [1]:
# Import Modules
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import tensorflow as tf

Import the data using psycopg2

In [2]:
import psycopg2
from Resources.config import PostgrePw

In [3]:
# Create the connection engine
conn_string = f"host='localhost' dbname='crime_data_db' user='postgres' password='{PostgrePw}'"
conn = psycopg2.connect(conn_string)

In [4]:
# establish a connection to Postgre
try:
    conn = psycopg2.connect(conn_string)
    print("Connection successful")
    
    # query and load the table
    query = "SELECT * FROM crime_data"
    
    # load into pandas DF
    CMPD_df = pd.read_sql_query(query,conn)
    
    # Display Df
    print(CMPD_df.head())
    
except Exception as e:
    print(f"Error connecting to the database: {e}")
# since the data is loaded into pandas, we can close the connection!
finally:
    if 'conn' in locals():
        conn.close()
        print("Connection closed")

Connection successful


  CMPD_df = pd.read_sql_query(query,conn)


              x              y  year incident_report_id             location  \
0  1.448107e+06  543688.000087  2021   20210807-1032-01         500 W 4TH ST   
1  1.406833e+06  499666.999875  2021   20210714-0714-02     14200 PERUGIA WY   
2  1.439801e+06  548658.999988  2017   20170521-1457-02         3000 RUSH AV   
3  1.449394e+06  572029.000163  2022   20221028-1713-02       4900 SUNSET RD   
4  1.432559e+06  550084.000090  2020   20201214-0857-00  4600 TUCKASEEGEE RD   

        city state    zip  x_coord_public  y_coord_public  ...  \
0  CHARLOTTE    NC  28202         1448107          543688  ...   
1  CHARLOTTE    NC  28273         1406833          499667  ...   
2  CHARLOTTE    NC  28208         1439801          548659  ...   
3  CHARLOTTE    NC  28269         1449394          572029  ...   
4  CHARLOTTE    NC   None         1432559          550084  ...   

   location_type_description  place_type_description  \
0                   Outdoors  Public/Non-Residential   
1         

In [5]:
# Find the names of the columns so we can remove some of the unimportant ones
CMPD_df.columns

Index(['x', 'y', 'year', 'incident_report_id', 'location', 'city', 'state',
       'zip', 'x_coord_public', 'y_coord_public', 'latitude_public',
       'longitude_public', 'division_id', 'cmpd_patrol_division', 'npa',
       'date_reported', 'date_incident_began', 'date_incident_end',
       'address_description', 'location_type_description',
       'place_type_description', 'place_detail_description',
       'clearance_status', 'clearance_detail_status', 'clearance_date',
       'highest_nibrs_code', 'highest_nibrs_description', 'objectid',
       'globalid'],
      dtype='object')

In [6]:
#Check the data types
CMPD_df.dtypes

x                                   float64
y                                   float64
year                                  int64
incident_report_id                   object
location                             object
city                                 object
state                                object
zip                                  object
x_coord_public                        int64
y_coord_public                        int64
latitude_public                     float64
longitude_public                    float64
division_id                          object
cmpd_patrol_division                 object
npa                                   int64
date_reported                datetime64[ns]
date_incident_began                  object
date_incident_end                    object
address_description                  object
location_type_description            object
place_type_description               object
place_detail_description             object
clearance_status                

In [9]:
# Create a new dataframe with the data you want to train
# There were many good variables, but many were repetitive or superfluous for the amount of data that needs to be trained
cmpd_df = CMPD_df[['year', 'zip', 'division_id', 'npa', 'date_reported', 'place_detail_description', 'highest_nibrs_code','clearance_status']]
print(cmpd_df.head())

   year    zip division_id  npa date_reported      place_detail_description  \
0  2021  28202          01  476    2021-08-07        Air/Bus/Train Terminal   
1  2021  28273          21   82    2021-07-14  Apartment/Duplex Private Res   
2  2017  28208          02  293    2017-05-21             Private Residence   
3  2022  28269          11  125    2022-10-28                   Hotel/Motel   
4  2020   None          27  199    2020-12-14                      Day Care   

  highest_nibrs_code clearance_status  
0                23H             Open  
1                23F             Open  
2                290             Open  
3                11D             Open  
4                220             Open  


In [10]:
# Find the minimum and maximum date so we know how much time is covered in the dataset
min_date = cmpd_df['date_reported'].min()
max_date = cmpd_df['date_reported'].max()
print(f'Earliest date in dataset: {min_date}\nLatest date in dataset: {max_date}')

Earliest date in dataset: 2017-01-01 00:00:00
Latest date in dataset: 2024-05-20 00:00:00


In [11]:
#Check the data types
cmpd_df.dtypes

year                                 int64
zip                                 object
division_id                         object
npa                                  int64
date_reported               datetime64[ns]
place_detail_description            object
highest_nibrs_code                  object
clearance_status                    object
dtype: object

In [12]:
#Change the values in the 'CLEARANCE STATUS' to 0 or 1
#Define the mapping 
mapping = {'Open': True, 'Cleared by Arrest': False, 'Cleared by Arrest by Another Agency': False, 'Exceptionally Cleared': False, 'Unfounded': False}

# Apply mapping to the 'CLEARANCE_STATUS' column and fill NaN values with a default value (e.g., False)
CMPD_df['clearance_status'] = CMPD_df['clearance_status'].map(mapping).fillna(False)

#check to make sure the above worked
CMPD_df.head(5)

#Convert the 'CLEARANCE_STATUS' column into an array 

status_array = CMPD_df['clearance_status'].values
print(status_array)


[ True  True  True ...  True False False]


In [13]:
# Change DATE_REPORTED date time to only the year and month so there are less columns to train. 
# This could decrease the amount of columns from potentially 2,500+ to 84 for just dates. 
# This can be optimized if we want to add more specific dates later.
cmpd_df['date_reported'] = pd.to_datetime(cmpd_df['date_reported'])

# Change the datetime format to display only the year and month
cmpd_df['date_reported_y/m'] = cmpd_df['date_reported'].dt.strftime('%Y-%m')

# Display the DataFrame
print(cmpd_df.head())
print(cmpd_df.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmpd_df['date_reported'] = pd.to_datetime(cmpd_df['date_reported'])


   year    zip division_id  npa date_reported      place_detail_description  \
0  2021  28202          01  476    2021-08-07        Air/Bus/Train Terminal   
1  2021  28273          21   82    2021-07-14  Apartment/Duplex Private Res   
2  2017  28208          02  293    2017-05-21             Private Residence   
3  2022  28269          11  125    2022-10-28                   Hotel/Motel   
4  2020   None          27  199    2020-12-14                      Day Care   

  highest_nibrs_code clearance_status date_reported_y/m  
0                23H             Open           2021-08  
1                23F             Open           2021-07  
2                290             Open           2017-05  
3                11D             Open           2022-10  
4                220             Open           2020-12  
(659886, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmpd_df['date_reported_y/m'] = cmpd_df['date_reported'].dt.strftime('%Y-%m')


In [14]:
# Drop the original DATE_REPORTED column and null values

# Declare new df with dropped nulls 
cleanedish_cmpd_df = cmpd_df.dropna()

# Declare final df after dropping DATE_REPORTED column 
cleaned_cmpd_df = cleanedish_cmpd_df.drop(columns=['date_reported'])

# Verify column was dropped
print(cleaned_cmpd_df.head())
# Verify that null rows were dropped
print(cleaned_cmpd_df.shape)

   year    zip division_id  npa      place_detail_description  \
0  2021  28202          01  476        Air/Bus/Train Terminal   
1  2021  28273          21   82  Apartment/Duplex Private Res   
2  2017  28208          02  293             Private Residence   
3  2022  28269          11  125                   Hotel/Motel   
6  2017  28215          07  271             Private Residence   

  highest_nibrs_code clearance_status date_reported_y/m  
0                23H             Open           2021-08  
1                23F             Open           2021-07  
2                290             Open           2017-05  
3                11D             Open           2022-10  
6                220             Open           2017-08  
(448619, 8)


In [15]:
# Encode categorical variables
#needs_encoding = ['PLACE_DETAIL_DESCRIPTION','HIGHEST_NIBRS_CODE','DATE_REPORTED_Y/M']
final_cmpd_df = pd.get_dummies(cleaned_cmpd_df, columns= ['clearance_status'])
final_cmpd_df.head()

Unnamed: 0,year,zip,division_id,npa,place_detail_description,highest_nibrs_code,date_reported_y/m,clearance_status_Cleared by Arrest,clearance_status_Cleared by Arrest by Another Agency,clearance_status_Exceptionally Cleared,clearance_status_Open,clearance_status_Unfounded
0,2021,28202,1,476,Air/Bus/Train Terminal,23H,2021-08,False,False,False,True,False
1,2021,28273,21,82,Apartment/Duplex Private Res,23F,2021-07,False,False,False,True,False
2,2017,28208,2,293,Private Residence,290,2017-05,False,False,False,True,False
3,2022,28269,11,125,Hotel/Motel,11D,2022-10,False,False,False,True,False
6,2017,28215,7,271,Private Residence,220,2017-08,False,False,False,True,False


In [16]:
final_cmpd_df.dtypes

year                                                     int64
zip                                                     object
division_id                                             object
npa                                                      int64
place_detail_description                                object
highest_nibrs_code                                      object
date_reported_y/m                                       object
clearance_status_Cleared by Arrest                        bool
clearance_status_Cleared by Arrest by Another Agency      bool
clearance_status_Exceptionally Cleared                    bool
clearance_status_Open                                     bool
clearance_status_Unfounded                                bool
dtype: object

In [17]:
# Split data into features and target variable


#X = final_cmpd_df.copy()
#X.drop(['CLEARANCE_STATUS'], axis=1)

y = final_cmpd_df['clearance_status_open'].astype(int)


KeyError: 'clearance_status_open'

In [None]:
#final_cmpd_df.head()
X

In [None]:
# Set the data rate limit directly
#import os
#os.environ['NotebookApp.iopub_data_rate_limit'] = '1000000000'

#def find_string_values(final_cmpd_df):
    #string_values = []
    #for col in final_cmpd_df.columns:
       # for i, val in enumerate(final_cmpd_df[col]):
            #if isinstance(val, str):
                #string_values.append((col, i, val))
    #return string_values

# Call the function on your DataFrame
#result = find_string_values(final_cmpd_df)
#print(result)

In [None]:
# Import sklearn to spilt the dataset into training and testing data
from sklearn.model_selection import train_test_split

# Assuming X and y are your features and target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)


In [None]:
# Create scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()