In [1]:
import pandas as pd
import geopandas as gpd

### Import Data

In [11]:
gdf = gpd.read_file('../GENERATED-DATA/census_crime_clustered.geojson')

In [12]:
gdf.columns

Index(['NM_DISTRIT', 'CRIMES', 'V001_ENTORNO01', 'V002_ENTORNO01',
       'V003_ENTORNO01', 'V004_ENTORNO01', 'V001_DOMICILIORENDA',
       'V002_DOMICILIORENDA', 'V003_DOMICILIORENDA', 'V004_DOMICILIORENDA',
       'V001_BASICO', 'V002_BASICO', 'V003_BASICO', 'V004_BASICO',
       'V005_BASICO', 'V006_BASICO', 'V007_BASICO', 'V008_BASICO',
       'V009_BASICO', 'V010_BASICO', 'V011_BASICO', 'V012_BASICO',
       'V001_DOMICILIO02', 'V002_DOMICILIO02', 'V001_DOMICILIO01',
       'V002_DOMICILIO01', 'V001_PESSOA01', 'V086_PESSOA02', 'V001_PESSOA03',
       'V002_PESSOA03', 'V003_PESSOA03', 'V004_PESSOA03', 'V005_PESSOA03',
       'V006_PESSOA03', 'V001_PESSOA12', 'V001_PESSOA11', 'V001_RESPONSAVEL01',
       'V001_RESPONSAVEL02', 'cluster', 'geometry'],
      dtype='object')

### Create Classification Problem Target

In [13]:
gdf['CRIMES'].quantile([0.95])

0.95    82.0
Name: CRIMES, dtype: float64

anything above the 95th quantile is going to be considered a hotspot

In [14]:
def bin_target(y):
    # bins = y.quantile([0.25, 0.5, 0.75])
    bins = y.quantile([0.95])
    return y.apply(lambda x: sum(x > bins))

y = bin_target(gdf['CRIMES'])

In [15]:
y.value_counts()

CRIMES
0    17274
1      908
Name: count, dtype: int64

In [16]:
gdf['hotspot'] = y

In [38]:
df = gdf.drop(columns=['geometry', 'CRIMES', 'NM_DISTRIT'])
df.columns

Index(['V001_ENTORNO01', 'V002_ENTORNO01', 'V003_ENTORNO01', 'V004_ENTORNO01',
       'V001_DOMICILIORENDA', 'V002_DOMICILIORENDA', 'V003_DOMICILIORENDA',
       'V004_DOMICILIORENDA', 'V001_BASICO', 'V002_BASICO', 'V003_BASICO',
       'V004_BASICO', 'V005_BASICO', 'V006_BASICO', 'V007_BASICO',
       'V008_BASICO', 'V009_BASICO', 'V010_BASICO', 'V011_BASICO',
       'V012_BASICO', 'V001_DOMICILIO02', 'V002_DOMICILIO02',
       'V001_DOMICILIO01', 'V002_DOMICILIO01', 'V001_PESSOA01',
       'V086_PESSOA02', 'V001_PESSOA03', 'V002_PESSOA03', 'V003_PESSOA03',
       'V004_PESSOA03', 'V005_PESSOA03', 'V006_PESSOA03', 'V001_PESSOA12',
       'V001_PESSOA11', 'V001_RESPONSAVEL01', 'V001_RESPONSAVEL02', 'cluster',
       'hotspot'],
      dtype='object')

### Classification Problem

In [39]:
from imblearn.over_sampling import SMOTE

def balance_classes(X, y):
    smote = SMOTE(sampling_strategy='auto', random_state=0)
    X_res, y_res = smote.fit_resample(X, y)
    return X_res, y_res

In [92]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

def predict_func(X_train, X_test, y_train, y_test):
    X_train, y_train = balance_classes(X_train, y_train)

    xgb = XGBClassifier(
        scale_pos_weight=1,  # Adjust based on the imbalance ratio
        eval_metric='aucpr',
        objective='binary:logistic',
    )

    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    result = classification_report(y_test, y_pred, output_dict=True)
    return result


In [41]:
# function to split train and test data
# based on the cluster column
def split_train_test(df, cv_column, k):
    train = df[df[cv_column] != k]
    test = df[df[cv_column] == k]
    return train, test

def cross_validate(df, target, cv_column, predict_func):
    results = []

    for k in df[cv_column].unique():
        train, test = split_train_test(df, cv_column, k)

        X_train = train.drop(columns=[target, cv_column])
        X_test = test.drop(columns=[target, cv_column])
        y_train = train[target]
        y_test = test[target]

        result = predict_func(X_train, X_test, y_train, y_test)
        results.append(result)

    return results

results = cross_validate(df, 'hotspot', 'cluster', predict_func)

In [91]:
for i, result in enumerate(results):
    print(f'Cluster [{i}] - Class 1 - precision: {result['1']['precision']:.2f}, recall: {result['1']['recall']:.2f}')

Cluster [0] - Class 1 - precision: 0.09, recall: 0.13
Cluster [1] - Class 1 - precision: 0.12, recall: 0.08
Cluster [2] - Class 1 - precision: 0.23, recall: 0.10
Cluster [3] - Class 1 - precision: 0.00, recall: 0.00
Cluster [4] - Class 1 - precision: 0.55, recall: 0.16


In [85]:
for i, result in enumerate(results):
    print('Cluster', i)
    print(pd.DataFrame(result).T)
    print()

Cluster 0
              precision    recall  f1-score      support
0              0.963377  0.947872  0.955562  11184.00000
1              0.094720  0.131466  0.110108    464.00000
accuracy       0.915350  0.915350  0.915350      0.91535
macro avg      0.529049  0.539669  0.532835  11648.00000
weighted avg   0.928774  0.915350  0.921883  11648.00000

Cluster 1
              precision    recall  f1-score      support
0              0.950960  0.970504  0.960633  5255.000000
1              0.124294  0.077193  0.095238   285.000000
accuracy       0.924549  0.924549  0.924549     0.924549
macro avg      0.537627  0.523849  0.527935  5540.000000
weighted avg   0.908433  0.924549  0.916113  5540.000000

Cluster 2
              precision    recall  f1-score     support
0              0.835031  0.929705  0.879828  441.000000
1              0.225000  0.100000  0.138462   90.000000
accuracy       0.789077  0.789077  0.789077    0.789077
macro avg      0.530015  0.514853  0.509145  531.000000
weig

### Testing With no Clustered CV

In [67]:
# import train_test_split
from sklearn.model_selection import train_test_split

X = df.drop(columns=['hotspot', 'cluster'])
y = df['hotspot']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [75]:
y_test.value_counts()

hotspot
0    3464
1     173
Name: count, dtype: int64

In [69]:
X_train, y_train = balance_classes(X_train, y_train)
y_train.value_counts()

hotspot
0    13810
1    13810
Name: count, dtype: int64

In [74]:
from sklearn.model_selection import cross_val_predict

xgb = XGBClassifier(
    scale_pos_weight=1,  # Adjust based on the imbalance ratio
    eval_metric='aucpr',
    objective='binary:logistic',
)

xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      3464
           1       0.16      0.13      0.14       173

    accuracy                           0.93      3637
   macro avg       0.56      0.55      0.55      3637
weighted avg       0.92      0.93      0.92      3637



### Testing with CV score

In [61]:
X, y = balance_classes(X, y)

In [73]:
from sklearn.model_selection import cross_val_score

xgb = XGBClassifier(
    scale_pos_weight=1,  # Adjust based on the imbalance ratio
    eval_metric='aucpr',
    objective='binary:logistic',
)

cross_val_score(xgb, X, y, cv=5, scoring='f1_weighted')

array([0.91465607, 0.92377169, 0.91551763, 0.91199051, 0.90922982])