Crime Type Prediction: LightGBM

Sections:

1. Imports
2. Data Loading
3. Feature Engineering
4. Train/Test Split & Scaling
5. LightGBM Model Training & Evaluation
6. Confusion Matrix

1. Imports

In [26]:
!pip install lightgbm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import lightgbm as lgb
from lightgbm import LGBMClassifier



2. Data Loading

In [27]:
DATA_PATH = '/Users/minthawzin/Downloads/combined_data.csv'
df = pd.read_csv(DATA_PATH)

# Encode target
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['offense_category_name'])

In [28]:
df


Unnamed: 0,year,month,day,hour,date,dayofweek,city,population,offense_category_name,location_area,crime_rate_per_1000_people,target
0,2022,9,22,13,2022-09-22,3,Berlin,20109,Drug/Narcotic Offenses,Abandoned/Condemned Structure,120,7
1,2022,6,4,20,2022-06-04,5,Norwich,40096,Weapon Law Violations,Abandoned/Condemned Structure,166,23
2,2022,7,10,20,2022-07-10,6,Norwich,40096,Drug/Narcotic Offenses,Abandoned/Condemned Structure,166,7
3,2022,7,10,20,2022-07-10,6,Norwich,40096,Drug/Narcotic Offenses,Abandoned/Condemned Structure,166,7
4,2022,10,14,13,2022-10-14,4,Bridgeport,148395,Drug/Narcotic Offenses,Abandoned/Condemned Structure,106,7
...,...,...,...,...,...,...,...,...,...,...,...,...
366985,2021,3,13,16,2021-03-13,5,Stratford,51683,Assault Offenses,Other/Unknown,99,2
366986,2021,9,5,17,2021-09-05,6,Stratford,51683,Assault Offenses,Other/Unknown,99,2
366987,2021,12,3,14,2021-12-03,4,Stratford,51683,Larceny/Theft Offenses,Other/Unknown,99,15
366988,2021,10,27,11,2021-10-27,2,Stratford,51683,Larceny/Theft Offenses,Other/Unknown,99,15


3. Feature Engineering

In [29]:
# 3.1 Cyclical time features
def add_cyclical_features(df, col, period):
    df[f'{col}_sin'] = np.sin(2 * np.pi * df[col]/period)
    df[f'{col}_cos'] = np.cos(2 * np.pi * df[col]/period)
    return df

for col, p in [('hour',24), ('dayofweek',7), ('month',12)]:
    df = add_cyclical_features(df, col, p)

# 3.2 Weekend Flag
df['is_weekend'] = df['dayofweek'].isin([5,6]).astype(int)

# 3.3 One-hot encode categorical features
cat_cols = ['city', 'location_area']
df = pd.get_dummies(
    df.drop(columns=['offense_category_name','date','offense_name'], errors='ignore'),
    columns=['city','location_area'],
    drop_first=True
)

4. Train/Test Split & Scaling

In [30]:
features = [c for c in df.columns if c not in ['offense_category_name','date','offense_name','target']]
X = df[features]
y = df['target']

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Identify numeric columns for scaling
num_cols = ['year','month','day','hour','population','crime_rate_per_1000_people'] + \
           [c for c in X.columns if '_sin' in c or '_cos' in c] + ['is_weekend']

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols]  = scaler.transform(X_test[num_cols])

5. LightGBM Model Training & Evaluation

In [32]:
# Prepare dataset for LightGBM
lgb_train = lgb.Dataset(X_train, label=y_train)

params = {
    'objective': 'multiclass',
    'num_class': len(encoder.classes_),
    'metric': 'multi_logloss',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbose': -1
}

from lightgbm import LGBMClassifier, early_stopping, log_evaluation

model = LGBMClassifier(
    objective='multiclass',
    num_class=len(encoder.classes_),
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    n_estimators=500,
    verbosity=-1,
    random_state=42
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='multi_logloss',
    callbacks=[
        early_stopping(stopping_rounds=50, first_metric_only=True),
        log_evaluation(period=20)
    ]
)

y_pred = model.predict(X_test)
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))


Training until validation scores don't improve for 50 rounds
[20]	valid_0's multi_logloss: 1.64255
[40]	valid_0's multi_logloss: 1.57846
[60]	valid_0's multi_logloss: 1.55721
[80]	valid_0's multi_logloss: 1.54838
[100]	valid_0's multi_logloss: 1.54433
[120]	valid_0's multi_logloss: 1.54254
[140]	valid_0's multi_logloss: 1.5414
[160]	valid_0's multi_logloss: 1.54079
[180]	valid_0's multi_logloss: 1.54068
[200]	valid_0's multi_logloss: 1.54071
[220]	valid_0's multi_logloss: 1.54083
[240]	valid_0's multi_logloss: 1.54103
Early stopping, best iteration is:
[195]	valid_0's multi_logloss: 1.54064
Evaluated only: multi_logloss
Accuracy: 47.40%


6. Confusion Matrix

In [None]:
y_pred = model.predict(X_test)

print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=encoder.classes_))

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Accuracy: 47.40%
Classification Report:
                                           precision    recall  f1-score   support

                          Animal Cruelty       0.17      0.01      0.03        71
                                   Arson       0.00      0.00      0.00       103
                        Assault Offenses       0.42      0.61      0.50     13999
                                 Bribery       0.00      0.00      0.00         3
            Burglary/Breaking & Entering       0.37      0.10      0.16      2721
                  Counterfeiting/Forgery       0.43      0.19      0.26       894
Destruction/Damage/Vandalism of Property       0.34      0.19      0.25     10194
                  Drug/Narcotic Offenses       0.45      0.35      0.39      3749
                            Embezzlement       0.08      0.01      0.02       104
                     Extortion/Blackmail       0.11      0.01      0.03       136
                          Fraud Offenses       0.62     