In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [2]:
# Load the cleaned crime records from disk, then preview the first five rows
# as the cell's rich output.
crime_df = pd.read_csv(filepath_or_buffer='Resources/cleaned_crime_df.csv')
crime_df.head(n=5)

Unnamed: 0,year,zip,division_id,npa,date_reported_y/m,place_detail_description,highest_nibrs_code,location_type_description,clearance_status
0,2021,28202,1,476,2021-08,Air/Bus/Train Terminal,23H,Outdoors,Open
1,2021,28273,21,82,2021-07,Apartment/Duplex Private Res,23F,Parking Lot,Open
2,2017,28208,2,293,2017-05,Private Residence,290,Outdoors,Open
3,2022,28269,11,125,2022-10,Hotel/Motel,11D,Indoors,Open
4,2017,28215,7,271,2017-08,Private Residence,220,Indoors,Open


In [3]:
# Build the model-ready table: one-hot encode the categorical predictors,
# label-encode the target, and write the result to CSV.

# Columns treated as categorical. Every other column of crime_df is carried
# through unchanged (the target is label-encoded further down).
# NOTE(review): the preview of crime_df shows an 'Unnamed: 0' column, which
# looks like a row index saved by an earlier to_csv call. It is carried into
# the output as-is -- confirm whether it should be dropped before modelling.
categorical_features = ['zip', 'division_id', 'date_reported_y/m', 'place_detail_description', 'highest_nibrs_code', 'location_type_description']

# Replace NaN with an explicit 'missing' category. Whole-column assignment is
# used rather than `.loc[:, cols] = ...` because the latter writes in place
# and warns when a string is pushed into a numeric column.
crime_df[categorical_features] = crime_df[categorical_features].fillna('missing')

# OneHotEncoder rejects a column that mixes numbers and strings, which is
# exactly what a numeric column becomes once 'missing' has been filled in.
# Cast only such mixed columns to str, on a copy so crime_df itself is not
# altered any further; uniformly typed columns keep their dtype so their
# category order and feature names stay the same.
encoder_input = crime_df[categorical_features].copy()
for column in categorical_features:
    holds_text = encoder_input[column].map(lambda value: isinstance(value, str))
    if holds_text.any() and not holds_text.all():
        encoder_input[column] = encoder_input[column].astype(str)

# drop='first' removes one level per feature to avoid perfectly collinear
# dummy columns. The `sparse=` keyword was removed from recent scikit-learn
# releases (replaced by `sparse_output=`), so rely on the default sparse
# output and densify explicitly; this works on old and new versions alike.
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(encoder_input).toarray()

# Wrap the dense matrix in a DataFrame that shares crime_df's index so the
# column-wise concat below aligns row for row.
encoded_features_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_features),
    index=crime_df.index,
)

# Swap the raw categorical columns for their one-hot expansion.
final_encoded_df = pd.concat([crime_df.drop(columns=categorical_features), encoded_features_df], axis=1)

# Label encode the target variable (replaces the text labels with integers).
label_encoder = LabelEncoder()
final_encoded_df['clearance_status'] = label_encoder.fit_transform(crime_df['clearance_status'])

# Save the prepared DataFrame to a new CSV file (row index is not written).
final_encoded_df.to_csv('Resources/encoded_crime_data.csv', index=False)

