### Training a Random Forest Algorithm Model

In [141]:
# Data Processing
import pandas as pd

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from collections import Counter


df = pd.read_csv("./CT-Combined/combined_data.csv")

In [None]:
# Drop rare offense categories (< 50 rows)
classes = df['offense_category_name'].value_counts()
print(df['offense_category_name'].value_counts())

common_classes = classes[classes >= 100].index
df = df[df['offense_category_name'].isin(common_classes)]

offense_category_name
Drug/Narcotic Offenses          12127
Weapon Law Violations            7639
Assault Offenses                 5973
Counterfeiting/Forgery           3398
Stolen Property Offenses         2528
Pornography/Obscene Material      655
Sex Offenses                      347
Robbery                           258
Animal Cruelty                    219
Kidnapping/Abduction              139
Homicide Offenses                  15
Gambling Offenses                   4
Name: count, dtype: int64


In [143]:
print(df['offense_category_name'].value_counts())


offense_category_name
Drug/Narcotic Offenses          12127
Weapon Law Violations            7639
Assault Offenses                 5973
Counterfeiting/Forgery           3398
Stolen Property Offenses         2528
Pornography/Obscene Material      655
Sex Offenses                      347
Robbery                           258
Animal Cruelty                    219
Kidnapping/Abduction              139
Name: count, dtype: int64


##### One Hot Encoder for X

In [144]:
# Here, what I am doing is that I'm turning all features that aren't numerical into numerical format
# One hot encoder is best for nominal data where there's no ranks
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output= False).set_output(transform='pandas')
ohetransform = ohe.fit_transform(df[['city', 'criminal_act_name', 'location_area']])

# X is the input features (what the model learns from)
numerical_features = df[['year', 'incident_hour']]
X = pd.concat([numerical_features, ohetransform], axis = 1)

##### Label Encoder for y

In [None]:
le = LabelEncoder()
y = le.fit_transform(df["offense_category_name"])

##### Train Test Split for both X and y

In [146]:
#random_state is keeping the the result exact same everytime you run it, likes like a random_seed
#test_size is to give the model 20% of the dataset to test on
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22, test_size=0.2, stratify=y)

##### Train the model on the data

In [147]:
# rf2 = RandomForestClassifier(
#     n_estimators=1500,        # More trees
#     class_weight='balanced',
#     criterion='entropy',
#     min_samples_split=5,      # Lower to allow more splits
#     min_samples_leaf=2,       # Lower to allow smaller leaves
#     max_depth=20,             # Deeper trees
#     max_features='sqrt',      # Better feature selection
#     random_state=42
# )

rf2 = XGBClassifier(
    n_estimators=1500,
    max_depth=10,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    use_label_encoder=False
)

In [148]:
rf2.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [149]:
# Make predictions
y_pred = rf2.predict(X_test)

In [150]:
# Evaluate model
accuracy = rf2.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.7490

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        44
           1       0.89      0.97      0.93      1195
           2       0.86      0.82      0.84       680
           3       0.76      0.78      0.77      2425
           4       0.00      0.00      0.00        28
           5       0.73      0.63      0.67       131
           6       0.11      0.04      0.06        51
           7       0.08      0.01      0.02        69
           8       0.57      0.57      0.57       506
           9       0.63      0.62      0.63      1528

    accuracy                           0.75      6657
   macro avg       0.56      0.54      0.55      6657
weighted avg       0.74      0.75      0.74      6657



In [151]:
# Feature importance analysis
features_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf2.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(features_importance.head(10))


Top 10 Most Important Features:
                                               feature  importance
112                     criminal_act_name_None/Unknown    0.253573
118             criminal_act_name_Simple/Gross Neglect    0.115238
110    criminal_act_name_Intentional Abuse and Torture    0.084018
127                location_area_Bank/Savings and Loan    0.063870
109              criminal_act_name_Exploiting Children    0.034870
107  criminal_act_name_Cultivating/Manufacturing/Pu...    0.021159
135                           location_area_Cyberspace    0.017913
119  criminal_act_name_Transporting/Transmitting/Im...    0.016867
37                                       city_Hartford    0.014679
53                                      city_New Haven    0.010310


In [152]:
print("\nClass Labels:")
for i, class_name in enumerate(le.classes_):
    print(f"{i}: {class_name}")


Class Labels:
0: Animal Cruelty
1: Assault Offenses
2: Counterfeiting/Forgery
3: Drug/Narcotic Offenses
4: Kidnapping/Abduction
5: Pornography/Obscene Material
6: Robbery
7: Sex Offenses
8: Stolen Property Offenses
9: Weapon Law Violations
