In [None]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Clean the raw incident report and write an integer-encoded copy of it.
# Reads 'incident_report_w_patrol.csv' and writes 'merged_dataset.csv'.
df = pd.read_csv('incident_report_w_patrol.csv')
df.drop(columns=['CLEARANCE_DATE'], inplace=True)

# Display the original number of rows
print("Original number of rows:", df.shape[0])

# Drop every row that has a missing value in ANY remaining column.
# NOTE: this is broader than filtering on 'Crime Against' alone; it also
# removes rows whose only gap is in a column that is discarded just below.
df_cleaned = df.dropna()

# Display the new number of rows
print("Number of rows after dropping rows with missing values:", df_cleaned.shape[0])

# Drop additional columns that are not needed
df_cleaned = df_cleaned.drop(columns=['DATE_REPORTED', 'HIGHEST_NIBRS_CODE'])

# Remove percentage signs from the string values of the DataFrame
df_clean = df_cleaned.replace('%', '', regex=True)

# Parse the incident date and store it as a 'YYYYMMDD' string. Values that
# cannot be parsed become missing (errors='coerce') and stay missing through
# strftime, so the dropna() below removes exactly those rows.
df_clean['DATE_INCIDENT_BEGAN'] = pd.to_datetime(
    df_clean['DATE_INCIDENT_BEGAN'], errors='coerce'
).dt.strftime('%Y%m%d')
df_clean = df_clean.dropna()

# Convert specific columns to enumerated values
columns_to_enum = ['HIGHEST_NIBRS_DESCRIPTION', 'CMPD_PATROL_DIVISION', 'Crime Against']

# For each encoded column, remember which original label every integer code
# stands for; without this the codes written to the CSV cannot be decoded.
category_mappings = {}

for column in columns_to_enum:
    # pd.factorize assigns codes 0, 1, 2, ... in order of first appearance
    codes, uniques = pd.factorize(df_clean[column])
    df_clean[column] = codes
    category_mappings[column] = dict(enumerate(uniques))

# Persist the cleaned, encoded DataFrame
df_clean.to_csv('merged_dataset.csv', index=False)