### Training a Random Forest Algorithm Model

In [97]:
# Data Processing
import pandas as pd

#Model Algorithms
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier, early_stopping
from xgboost import XGBClassifier

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score


df = pd.read_csv("./Datasets/CT-Combined/combined_data.csv")

In [98]:
# Drop rare offense categories (< 50 rows)
classes = df['offense_category_name'].value_counts()
print(df['offense_category_name'].value_counts())

common_classes = classes[classes >= 300].index
df = df[df['offense_category_name'].isin(common_classes)]

offense_category_name
Larceny/Theft Offenses                      124962
Assault Offenses                             72796
Destruction/Damage/Vandalism of Property     52252
Fraud Offenses                               35492
Motor Vehicle Theft                          24639
Drug/Narcotic Offenses                       21347
Burglary/Breaking & Entering                 14212
Weapon Law Violations                        11277
Robbery                                       5108
Sex Offenses                                  4716
Counterfeiting/Forgery                        4569
Stolen Property Offenses                      3889
Pornography/Obscene Material                   982
Kidnapping/Abduction                           871
Extortion/Blackmail                            686
Embezzlement                                   546
Arson                                          536
Homicide Offenses                              466
Animal Cruelty                                 391
Sex Offen

In [99]:
print(df['offense_category_name'].value_counts())


offense_category_name
Larceny/Theft Offenses                      124962
Assault Offenses                             72796
Destruction/Damage/Vandalism of Property     52252
Fraud Offenses                               35492
Motor Vehicle Theft                          24639
Drug/Narcotic Offenses                       21347
Burglary/Breaking & Entering                 14212
Weapon Law Violations                        11277
Robbery                                       5108
Sex Offenses                                  4716
Counterfeiting/Forgery                        4569
Stolen Property Offenses                      3889
Pornography/Obscene Material                   982
Kidnapping/Abduction                           871
Extortion/Blackmail                            686
Embezzlement                                   546
Arson                                          536
Homicide Offenses                              466
Animal Cruelty                                 391
Sex Offen

##### One Hot Encoder for X

In [None]:
# Here, what I am doing is that I'm turning all features that aren't numerical into numerical format
# One hot encoder is best for nominal data where there's no ranks
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output= False).set_output(transform='pandas')
ohetransform = ohe.fit_transform(df[['city', 'location_area']])

# X is the input features (what the model learns from)
numerical_features = df[['year', 'hour']]
X = pd.concat([numerical_features, ohetransform], axis = 1)

KeyError: "['criminal_act_name'] not in index"

##### Label Encoder for y

In [None]:
le = LabelEncoder()
y = le.fit_transform(df["offense_category_name"])

##### Train Test Split for both X and y

In [None]:
#random_state is keeping the the result exact same everytime you run it, likes like a random_seed
#test_size is to give the model 20% of the dataset to test on
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22, test_size=0.2, stratify=y)

In [None]:
print(X.shape, y.shape)

(52484, 159) (52484,)


##### Train the model on the data

In [None]:
# model = RandomForestClassifier(
#     n_estimators=1500,        # More trees
#     class_weight='balanced',
#     criterion='entropy',
#     min_samples_split=5,      # Lower to allow more splits
#     min_samples_leaf=2,       # Lower to allow smaller leaves
#     max_depth=20,             # Deeper trees
#     max_features='sqrt',      # Better feature selection
#     random_state=42
# )


#Using early stop to prevent overfitting
#Basically stops training before the model starts fitting noise
# model = XGBClassifier(
#     n_estimators=1500, #Max number of tree; More trees
#     max_depth=10, #Max depth of each treee
#     learning_rate=0.1, #Shrinks how much each tree contributes
#     subsample=0.8, #Randomly samples 80% of rows per tree — prevents overfitting
#     colsample_bytree=0.8, #Uses 80% of columns per tree — improves generalization 
#     eval_metric='mlogloss', #Tells XGBoost to minimize multiclass log-loss (probabilistic loss)
#     use_label_encoder=False,
#     early_stopping_rounds=10 #Stop training if eval score doesn’t improve for 10 rounds
# )

model = LGBMClassifier(
    objective='multiclass',
    num_class=len(le.classes_),
    n_estimators=1500,
    learning_rate=0.1,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

In [None]:
#model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

model.fit(X_train, y_train, 
          eval_set=[(X_test, y_test)], 
          callbacks=[early_stopping(10)])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 284
[LightGBM] [Info] Number of data points in the train set: 41987, number of used features: 130
[LightGBM] [Info] Start training from score -4.898912
[LightGBM] [Info] Start training from score -1.758706
[LightGBM] [Info] Start training from score -2.441264
[LightGBM] [Info] Start training from score -0.899628
[LightGBM] [Info] Start training from score -3.978159
[LightGBM] [Info] Start training from score -4.792913
[LightGBM] [Info] Start training from score -4.553805
[LightGBM] [Info] Start training from score -2.602416
[LightGBM] [Info] Start training from score -1.537694
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[108]	valid_0's multi_logloss: 0.574988


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,10
,learning_rate,0.1
,n_estimators,1500
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
# Make predictions
y_pred = model.predict(X_test)

In [None]:
# Evaluate model
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 78.06%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        78
           1       0.90      0.99      0.94      1808
           2       0.86      0.81      0.83       914
           3       0.75      0.88      0.81      4270
           4       0.89      0.60      0.72       196
           5       0.18      0.03      0.06        87
           6       0.18      0.02      0.03       111
           7       0.70      0.61      0.65       778
           8       0.71      0.55      0.62      2255

    accuracy                           0.78     10497
   macro avg       0.69      0.61      0.63     10497
weighted avg       0.77      0.78      0.77     10497



In [None]:
# Feature importance analysis
features_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(features_importance.head(10))


Top 10 Most Important Features:
                                              feature  importance
1                                                hour        6857
0                                                year        2697
148                      location_area_Residence/Home         981
109           criminal_act_name_Possessing/Concealing         892
137  location_area_Highway/Road/Alley/Street/Sidewalk         847
47                                   city_New Britain         839
104                    criminal_act_name_None/Unknown         677
73                                      city_Stamford         659
112                 criminal_act_name_Using/Consuming         569
49                                     city_New Haven         564


In [None]:
print("\nClass Labels:")
for i, class_name in enumerate(le.classes_):
    print(f"{i}: {class_name}")


Class Labels:
0: Animal Cruelty
1: Assault Offenses
2: Counterfeiting/Forgery
3: Drug/Narcotic Offenses
4: Pornography/Obscene Material
5: Robbery
6: Sex Offenses
7: Stolen Property Offenses
8: Weapon Law Violations
