# In [None]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the NIBRS lookup table and the merged incident records.
NIBRS = pd.read_csv('NIBRS_Code.csv')

df = pd.read_csv('merged_datasets_FIXED.csv')

# Keep only rows whose address description is not the officer-report
# placeholder and whose clearance status is not 'Unfounded', then discard the
# columns that are not used further down in this script.
filtered_df = df[
    df['ADDRESS_DESCRIPTION'].ne('Location where officer took report')
    & df['CLEARANCE_STATUS'].ne('Unfounded')
].drop(
    columns=[
        'DATE_INCIDENT_END',
        'ADDRESS_DESCRIPTION',
        'YEAR',
        'LOCATION',
        'DATE_REPORTED',
        'X_COORD_PUBLIC',
        'Y_COORD_PUBLIC',
        'HIGHEST_NIBRS_CODE',
        'DIVISION_ID',
        'CLEARANCE_DETAIL_STATUS',
        'CLEARANCE_STATUS',
        'CLEARANCE_DATE',
    ]
)

# Encode 'Vulnerable' as 0/1 in a new column and drop the original.
# Any value other than 'No' or 'Yes' maps to NaN.
filtered_df['VulnerableBinary'] = filtered_df['Vulnerable'].map({'No': 0, 'Yes': 1})
filtered_df = filtered_df.drop(columns=['Vulnerable'])

# Left-join the NIBRS lookup on 'HIGHEST_NIBRS_DESCRIPTION'; incidents with no
# matching description are kept and get NaN in the lookup columns.
filtered_df = pd.merge(filtered_df, NIBRS, on='HIGHEST_NIBRS_DESCRIPTION', how='left')

# Binary target: 1 when 'Crime Against' equals 'Person', otherwise 0.
# Comparing NaN with 'Person' is False, so rows without a lookup match are
# already 0 here and need no separate reset step.
filtered_df['Violent-Crime'] = (filtered_df['Crime Against'] == 'Person').astype(int)

# Categorical columns to replace with integer codes.
columns_to_enum = ['PLACE_DETAIL_DESCRIPTION', 'LOCATION_TYPE_DESCRIPTION', 'PLACE_TYPE_DESCRIPTION', 'HIGHEST_NIBRS_DESCRIPTION', 'CMPD_PATROL_DIVISION', 'Crime Against']

# Per-column lookup tables: enum_mapping[column][original_value] -> integer code.
# Stored so the encoding can be inspected or inverted later, not just printed.
enum_mapping = {}

# Integer-encode each column in place and record/print its value -> code table.
# NOTE: pd.factorize encodes missing values as -1 by default, so they are
# absent from the mapping and are no longer NaN in the encoded column.
for column in columns_to_enum:
    # unique_codes holds the distinct original values; a value's position in
    # it is the integer code written into the column.
    filtered_df[column], unique_codes = pd.factorize(filtered_df[column])
    enum_mapping[column] = {value: code for code, value in enumerate(unique_codes)}

    # Print the mapping of original values to codes
    print(f"Mapping of {column} codes:")
    print(enum_mapping[column])

# Strip every '%' character from string cells across the whole DataFrame.
df_clean = filtered_df.replace('%', '', regex=True)

# Cleaning pipeline; the order of the steps matters:
#   1. parse 'DATE_INCIDENT_BEGAN' to a date (unparseable values become missing)
#   2. drop every row that has a missing value in ANY column
#   3. render the date as a string with the dashes removed
#   4. drop 'Offense Code' (only after step 2, so rows missing it are gone)
#   5. coerce 'ZIP' to numeric (non-numeric values become NaN)
#   6. drop the rows that step 5 left with a missing value
df_clean = (
    df_clean
    .assign(
        DATE_INCIDENT_BEGAN=lambda frame: pd.to_datetime(
            frame['DATE_INCIDENT_BEGAN'], errors='coerce'
        ).dt.date
    )
    .dropna()
    .assign(
        DATE_INCIDENT_BEGAN=lambda frame: frame['DATE_INCIDENT_BEGAN']
        .astype(str)
        .str.replace('-', '')
    )
    .drop(columns=['Offense Code'])
    .assign(ZIP=lambda frame: pd.to_numeric(frame['ZIP'], errors='coerce'))
    .dropna()
)

# Preview the result and report its row count (this is the count after the
# cleaning steps above, despite the label).
print(df_clean.head())
print("Original number of rows:", len(df_clean))