### Training a Random Forest Algorithm Model

In [179]:
# Data Processing
import pandas as pd

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier


df = pd.read_csv("./CT-Combined/combined_data.csv")

In [180]:
# Drop rare offense categories (< 50 rows)
classes = df['offense_category_name'].value_counts()
print(df['offense_category_name'].value_counts())

common_classes = classes[classes >= 300].index
df = df[df['offense_category_name'].isin(common_classes)]

offense_category_name
Drug/Narcotic Offenses          21347
Weapon Law Violations           11277
Assault Offenses                 9041
Counterfeiting/Forgery           4569
Stolen Property Offenses         3889
Pornography/Obscene Material      982
Sex Offenses                      553
Robbery                           435
Animal Cruelty                    391
Kidnapping/Abduction              193
Homicide Offenses                  27
Gambling Offenses                   8
Name: count, dtype: int64


In [181]:
print(df['offense_category_name'].value_counts())


offense_category_name
Drug/Narcotic Offenses          21347
Weapon Law Violations           11277
Assault Offenses                 9041
Counterfeiting/Forgery           4569
Stolen Property Offenses         3889
Pornography/Obscene Material      982
Sex Offenses                      553
Robbery                           435
Animal Cruelty                    391
Name: count, dtype: int64


##### One Hot Encoder for X

In [182]:
# Here, what I am doing is that I'm turning all features that aren't numerical into numerical format
# One hot encoder is best for nominal data where there's no ranks
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output= False).set_output(transform='pandas')
ohetransform = ohe.fit_transform(df[['city', 'criminal_act_name', 'location_area']])

# X is the input features (what the model learns from)
numerical_features = df[['year', 'hour']]
X = pd.concat([numerical_features, ohetransform], axis = 1)

##### Label Encoder for y

In [183]:
le = LabelEncoder()
y = le.fit_transform(df["offense_category_name"])

##### Train Test Split for both X and y

In [184]:
#random_state is keeping the the result exact same everytime you run it, likes like a random_seed
#test_size is to give the model 20% of the dataset to test on
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22, test_size=0.2, stratify=y)

In [185]:
print(X.shape, y.shape)

(52484, 159) (52484,)


##### Train the model on the data

In [186]:
# model = RandomForestClassifier(
#     n_estimators=1500,        # More trees
#     class_weight='balanced',
#     criterion='entropy',
#     min_samples_split=5,      # Lower to allow more splits
#     min_samples_leaf=2,       # Lower to allow smaller leaves
#     max_depth=20,             # Deeper trees
#     max_features='sqrt',      # Better feature selection
#     random_state=42
# )


#Using early stop to prevent overfitting
#Basically stops training before the model starts fitting noise
model = XGBClassifier(
    n_estimators=1500, #Max number of tree; More trees
    max_depth=10, #Max depth of each treee
    learning_rate=0.1, #Shrinks how much each tree contributes
    subsample=0.8, #Randomly samples 80% of rows per tree — prevents overfitting
    colsample_bytree=0.8, #Uses 80% of columns per tree — improves generalization 
    eval_metric='mlogloss', #Tells XGBoost to minimize multiclass log-loss (probabilistic loss)
    use_label_encoder=False,
    early_stopping_rounds=10 #Stop training if eval score doesn’t improve for 10 rounds
)

In [188]:
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,10
,enable_categorical,False


In [189]:
# Make predictions
y_pred = model.predict(X_test)

In [190]:
# Evaluate model
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 78.26%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        78
           1       0.90      0.99      0.94      1808
           2       0.87      0.81      0.84       914
           3       0.76      0.88      0.81      4270
           4       0.89      0.60      0.72       196
           5       0.26      0.06      0.09        87
           6       0.18      0.02      0.03       111
           7       0.69      0.61      0.65       778
           8       0.70      0.56      0.62      2255

    accuracy                           0.78     10497
   macro avg       0.70      0.61      0.63     10497
weighted avg       0.77      0.78      0.77     10497



In [191]:
# Feature importance analysis
features_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(features_importance.head(10))


Top 10 Most Important Features:
                                               feature  importance
104                     criminal_act_name_None/Unknown    0.163349
119                location_area_Bank/Savings and Loan    0.085279
110             criminal_act_name_Simple/Gross Neglect    0.057935
101              criminal_act_name_Exploiting Children    0.044999
111  criminal_act_name_Transporting/Transmitting/Im...    0.042138
49                                      city_New Haven    0.038174
102    criminal_act_name_Intentional Abuse and Torture    0.031116
99   criminal_act_name_Cultivating/Manufacturing/Pu...    0.029479
100             criminal_act_name_Distributing/Selling    0.025250
36                                       city_Hartford    0.025021


In [192]:
print("\nClass Labels:")
for i, class_name in enumerate(le.classes_):
    print(f"{i}: {class_name}")


Class Labels:
0: Animal Cruelty
1: Assault Offenses
2: Counterfeiting/Forgery
3: Drug/Narcotic Offenses
4: Pornography/Obscene Material
5: Robbery
6: Sex Offenses
7: Stolen Property Offenses
8: Weapon Law Violations
