### Training a Random Forest Algorithm Model

In [29]:
# Data Processing
import pandas as pd

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from collections import Counter


df = pd.read_csv("./CT-Combined/combined_data.csv")

In [30]:
# Drop rare offense categories (< 50 rows)
classes = df['offense_category_name'].value_counts()
print(df['offense_category_name'].value_counts())

common_classes = classes[classes >= 100].index
df = df[df['offense_category_name'].isin(common_classes)]

offense_category_name
Drug/Narcotic Offenses          21347
Weapon Law Violations           11277
Assault Offenses                 9041
Counterfeiting/Forgery           4569
Stolen Property Offenses         3889
Pornography/Obscene Material      982
Sex Offenses                      553
Robbery                           435
Animal Cruelty                    391
Kidnapping/Abduction              193
Homicide Offenses                  27
Gambling Offenses                   8
Name: count, dtype: int64


In [31]:
print(df['offense_category_name'].value_counts())


offense_category_name
Drug/Narcotic Offenses          21347
Weapon Law Violations           11277
Assault Offenses                 9041
Counterfeiting/Forgery           4569
Stolen Property Offenses         3889
Pornography/Obscene Material      982
Sex Offenses                      553
Robbery                           435
Animal Cruelty                    391
Kidnapping/Abduction              193
Name: count, dtype: int64


##### One Hot Encoder for X

In [32]:
# Here, what I am doing is that I'm turning all features that aren't numerical into numerical format
# One hot encoder is best for nominal data where there's no ranks
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output= False).set_output(transform='pandas')
ohetransform = ohe.fit_transform(df[['city', 'criminal_act_name', 'location_area']])

# X is the input features (what the model learns from)
numerical_features = df[['year', 'hour']]
X = pd.concat([numerical_features, ohetransform], axis = 1)

##### Label Encoder for y

In [33]:
le = LabelEncoder()
y = le.fit_transform(df["offense_category_name"])

##### Train Test Split for both X and y

In [34]:
#random_state is keeping the the result exact same everytime you run it, likes like a random_seed
#test_size is to give the model 20% of the dataset to test on
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22, test_size=0.2, stratify=y)

##### Train the model on the data

In [None]:
# rf2 = RandomForestClassifier(
#     n_estimators=1500,        # More trees
#     class_weight='balanced',
#     criterion='entropy',
#     min_samples_split=5,      # Lower to allow more splits
#     min_samples_leaf=2,       # Lower to allow smaller leaves
#     max_depth=20,             # Deeper trees
#     max_features='sqrt',      # Better feature selection
#     random_state=42
# )

rf2 = XGBClassifier(
    n_estimators=1500,
    max_depth=10,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    use_label_encoder=False
)

In [36]:
rf2.fit(X_train, y_train)

0,1,2
,n_estimators,1500
,criterion,'entropy'
,max_depth,20
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [37]:
# Make predictions
y_pred = rf2.predict(X_test)

In [38]:
# Evaluate model
accuracy = rf2.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.6238

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        78
           1       0.90      0.36      0.51      1808
           2       0.72      0.84      0.77       914
           3       0.79      0.71      0.75      4270
           4       0.03      0.33      0.06        39
           5       0.29      0.80      0.42       196
           6       0.13      0.57      0.21        87
           7       0.10      0.50      0.16       111
           8       0.49      0.75      0.60       778
           9       0.67      0.52      0.58      2255

    accuracy                           0.62     10536
   macro avg       0.51      0.64      0.51     10536
weighted avg       0.73      0.62      0.65     10536



In [39]:
# Feature importance analysis
features_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf2.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(features_importance.head(10))


Top 10 Most Important Features:
                                              feature  importance
104                    criminal_act_name_None/Unknown    0.216265
109           criminal_act_name_Possessing/Concealing    0.096477
110            criminal_act_name_Simple/Gross Neglect    0.073871
137  location_area_Highway/Road/Alley/Street/Sidewalk    0.057098
1                                                hour    0.056014
73                                      city_Stamford    0.052711
148                      location_area_Residence/Home    0.045824
47                                   city_New Britain    0.040222
49                                     city_New Haven    0.033265
102   criminal_act_name_Intentional Abuse and Torture    0.032845


In [40]:
print("\nClass Labels:")
for i, class_name in enumerate(le.classes_):
    print(f"{i}: {class_name}")


Class Labels:
0: Animal Cruelty
1: Assault Offenses
2: Counterfeiting/Forgery
3: Drug/Narcotic Offenses
4: Kidnapping/Abduction
5: Pornography/Obscene Material
6: Robbery
7: Sex Offenses
8: Stolen Property Offenses
9: Weapon Law Violations
