In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
import json
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.pipeline import make_pipeline
import numpy as np
import xgboost as xgb
import pickle
import os

In [2]:
pd.set_option('display.max_columns', None)

TRAIN_DATA_PATH = '../data/train_2022_2023.csv.gz'
VAL_DATA_PATH = '../data/val_2024.csv.gz'

In [3]:
temp_df = pd.read_csv(TRAIN_DATA_PATH, compression='gzip', nrows=0)
all_cols = temp_df.columns.tolist()

remove_cols = ['id', 'updated_on', 'block', 'iucr', 'beat', 'description', 'latitude', 'longitude', 'location', 'year', 'y_coordinate', 'x_coordinate', 'case_number', 'id']
include_cols = [inc_col for inc_col in all_cols if inc_col not in remove_cols]

In [4]:
df = pd.read_csv(TRAIN_DATA_PATH, compression='gzip', usecols=include_cols, parse_dates=['date'])
df.head()

Unnamed: 0,date,primary_type,location_description,arrest,domestic,district,ward,community_area,fbi_code
0,2022-01-01,SEX OFFENSE,ATHLETIC CLUB,False,False,22,19.0,75.0,17
1,2022-01-01,DECEPTIVE PRACTICE,RESIDENCE,False,False,11,37.0,23.0,11
2,2022-01-01,BATTERY,APARTMENT,False,True,16,45.0,11.0,08B
3,2022-01-01,CRIMINAL SEXUAL ASSAULT,RESIDENCE,False,False,19,44.0,6.0,02
4,2022-01-01,OFFENSE INVOLVING CHILDREN,RESIDENCE,False,True,8,14.0,63.0,17


In [5]:
for col in df.columns.tolist():
    print(f'{col}: {df[col].nunique()}')

date: 236566
primary_type: 31
location_description: 143
arrest: 2
domestic: 2
district: 23
ward: 50
community_area: 77
fbi_code: 26


In [6]:
# Extract temporal features
df['hour'] = df['date'].dt.hour  # 0-23
df['day_of_week'] = df['date'].dt.weekday  # 0=Monday to 6=Sunday
df['month'] = df['date'].dt.month  # 1-12
df['quarter'] = df['date'].dt.quarter  # 1-4

# Binary flags
df['is_night'] = ((df['hour'] >= 18) | (df['hour'] < 6)).astype(int)  # 1 if True, 0 else
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

In [7]:
# countries and their regions
with open ('./location_description.json', 'r') as file:
    map_loc_desc = json.load(file)
    
df['location_group'] = df['location_description'].map(map_loc_desc).fillna("Unknown/Other")
df.drop(columns=['date', 'location_description'], inplace=True)


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503052 entries, 0 to 503051
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   primary_type    503052 non-null  object 
 1   arrest          503052 non-null  bool   
 2   domestic        503052 non-null  bool   
 3   district        503052 non-null  int64  
 4   ward            503039 non-null  float64
 5   community_area  503013 non-null  float64
 6   fbi_code        503052 non-null  object 
 7   hour            503052 non-null  int32  
 8   day_of_week     503052 non-null  int32  
 9   month           503052 non-null  int32  
 10  quarter         503052 non-null  int32  
 11  is_night        503052 non-null  int64  
 12  is_weekend      503052 non-null  int64  
 13  location_group  503052 non-null  object 
dtypes: bool(2), float64(2), int32(4), int64(3), object(3)
memory usage: 39.3+ MB


In [9]:
df = df.dropna()

In [10]:
feature_cols = [
    'primary_type', 'domestic', 'district', 'ward', 'community_area', 'fbi_code',
    'hour', 'day_of_week', 'month', 'quarter', 'is_night', 'is_weekend', 'location_group'
]

In [11]:
for col in df.columns.tolist():
    print(f'{col}: {df[col].nunique()}')

primary_type: 31
arrest: 2
domestic: 2
district: 23
ward: 50
community_area: 77
fbi_code: 26
hour: 24
day_of_week: 7
month: 12
quarter: 4
is_night: 2
is_weekend: 2
location_group: 15


In [12]:
for c in df.columns.tolist():
    print(df[c].value_counts())
    print()

primary_type
THEFT                                112383
BATTERY                               85200
CRIMINAL DAMAGE                       57337
MOTOR VEHICLE THEFT                   50722
ASSAULT                               43440
DECEPTIVE PRACTICE                    34495
OTHER OFFENSE                         30338
ROBBERY                               20016
WEAPONS VIOLATION                     17388
BURGLARY                              15081
NARCOTICS                             10210
CRIMINAL TRESPASS                      8941
OFFENSE INVOLVING CHILDREN             3630
CRIMINAL SEXUAL ASSAULT                3279
SEX OFFENSE                            2594
PUBLIC PEACE VIOLATION                 1574
HOMICIDE                               1375
STALKING                                982
INTERFERENCE WITH PUBLIC OFFICER        978
ARSON                                   933
PROSTITUTION                            492
INTIMIDATION                            420
LIQUOR LAW VIOLATIO

In [13]:
df.isna().sum()

primary_type      0
arrest            0
domestic          0
district          0
ward              0
community_area    0
fbi_code          0
hour              0
day_of_week       0
month             0
quarter           0
is_night          0
is_weekend        0
location_group    0
dtype: int64

In [14]:
X_train = df[feature_cols]
y_train = df['arrest'].astype('int')

In [15]:
# # Class weights for imbalance (fit on train)
# classes = np.unique(y_train)
# class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
# sample_weights = {cls: w for cls, w in zip(classes, class_weights)}

In [16]:
# class_weights

In [17]:
# sample_weights

In [18]:
X_train_dict = X_train.to_dict(orient='records')

In [None]:
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    # scale_pos_weight=sample_weights[1]
)

In [20]:
pipeline = make_pipeline(
    DictVectorizer(sparse=True, dtype=np.float32),
    model
)

pipeline.fit(X_train_dict, y_train)

0,1,2
,steps,"[('dictvectorizer', ...), ('xgbclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,dtype,<class 'numpy.float32'>
,separator,'='
,sparse,True
,sort,True

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [21]:
# Evaluate on val
y_val_pred_proba = pipeline.predict_proba(X_train_dict)[:, 1]
y_val_pred = pipeline.predict(X_train_dict)

In [22]:
auc_val = roc_auc_score(y_train, y_val_pred_proba)
print(f"Validation AUC-ROC: {auc_val:.4f}")
print("\nClassification Report (Val):\n", classification_report(y_train, y_val_pred))

Validation AUC-ROC: 0.8724

Classification Report (Val):
               precision    recall  f1-score   support

           0       0.92      0.99      0.95    442673
           1       0.83      0.39      0.54     60327

    accuracy                           0.92    503000
   macro avg       0.88      0.69      0.75    503000
weighted avg       0.91      0.92      0.90    503000



In [None]:
# Ensure model directory exists
os.makedirs('../models', exist_ok=True)

with open('../models/xgb_model.pkl', 'wb') as f_out:
    pickle.dump(pipeline, f_out)

In [24]:
with open('../models/xgb_model.pkl', 'rb') as f_in:
    pipeline = pickle.load(f_in)