In [3]:
# NOTE(review): `Block` is not referenced by any code in the visible notebook ('Block' only
# appears as a column name); this import looks accidental — confirm and remove it.
from turtledemo.sorting_animate import Block
import pandas as pd

# Załadowanie i wstępna analiza zbioru

W projekcie wykorzystany jest zbiór Crimes in Chicago (https://www.kaggle.com/datasets/currie32/crimes-in-chicago/data), a dokładniej część zbioru zawierająca dane od 2012 roku do 2017 roku, z ostatnią aktualizacją przypadającą na dzień 25 stycznia 2017 roku. Zbiór ten składa się z 23 kolumn oraz 1456714 wierszy.

In [5]:
path = '../Chicago_Crimes_2012_to_2017.csv' # Path to the local copy of the dataset.

# on_bad_lines='skip' makes the parser silently discard rows it cannot read instead of raising.
df = pd.read_csv(path, on_bad_lines='skip')

In [6]:
df.shape # Rozmiar zbioru

(1456714, 23)

In [7]:
list(df.columns) # Lista kolumn w zbiorze

['Unnamed: 0',
 'ID',
 'Case Number',
 'Date',
 'Block',
 'IUCR',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location']

Poniżej można zaobserwować wygląd poszczególnych kolumn w zbiorze, a także ich typ.

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,3,10508693,HZ250496,05/03/2016 11:40:00 PM,013XX S SAWYER AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,...,24.0,29.0,08B,1154907.0,1893681.0,2016,05/10/2016 03:56:50 PM,41.864073,-87.706819,"(41.864073157, -87.706818608)"
1,89,10508695,HZ250409,05/03/2016 09:40:00 PM,061XX S DREXEL AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,...,20.0,42.0,08B,1183066.0,1864330.0,2016,05/10/2016 03:56:50 PM,41.782922,-87.604363,"(41.782921527, -87.60436317)"
2,197,10508697,HZ250503,05/03/2016 11:31:00 PM,053XX W CHICAGO AVE,470,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,STREET,False,...,37.0,25.0,24,1140789.0,1904819.0,2016,05/10/2016 03:56:50 PM,41.894908,-87.758372,"(41.894908283, -87.758371958)"
3,673,10508698,HZ250424,05/03/2016 10:10:00 PM,049XX W FULTON ST,460,BATTERY,SIMPLE,SIDEWALK,False,...,28.0,25.0,08B,1143223.0,1901475.0,2016,05/10/2016 03:56:50 PM,41.885687,-87.749516,"(41.885686845, -87.749515983)"
4,911,10508699,HZ250455,05/03/2016 10:00:00 PM,003XX N LOTUS AVE,820,THEFT,$500 AND UNDER,RESIDENCE,False,...,28.0,25.0,06,1139890.0,1901675.0,2016,05/10/2016 03:56:50 PM,41.886297,-87.761751,"(41.886297242, -87.761750709)"


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456714 entries, 0 to 1456713
Data columns (total 23 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Unnamed: 0            1456714 non-null  int64  
 1   ID                    1456714 non-null  int64  
 2   Case Number           1456713 non-null  object 
 3   Date                  1456714 non-null  object 
 4   Block                 1456714 non-null  object 
 5   IUCR                  1456714 non-null  object 
 6   Primary Type          1456714 non-null  object 
 7   Description           1456714 non-null  object 
 8   Location Description  1455056 non-null  object 
 9   Arrest                1456714 non-null  bool   
 10  Domestic              1456714 non-null  bool   
 11  Beat                  1456714 non-null  int64  
 12  District              1456713 non-null  float64
 13  Ward                  1456700 non-null  float64
 14  Community Area        1456674 non-

Ważnym krokiem jest również sprawdzenie ilości wybrakowanych wierszy w zbiorze, co zostało wykonane poniżej.

In [10]:
df.isnull().sum()

Unnamed: 0                  0
ID                          0
Case Number                 1
Date                        0
Block                       0
IUCR                        0
Primary Type                0
Description                 0
Location Description     1658
Arrest                      0
Domestic                    0
Beat                        0
District                    1
Ward                       14
Community Area             40
FBI Code                    0
X Coordinate            37083
Y Coordinate            37083
Year                        0
Updated On                  0
Latitude                37083
Longitude               37083
Location                37083
dtype: int64

Można zauważyć, że w zbiorze brakuje wielu wierszy zawierających koordynaty zgłoszenia.

# Preprocessing

W poniższej sekcji znajduje się kod, dzięki któremu zbiór został przetworzony do formy, która użyta będzie w tworzeniu modeli klasyfikacji.

In [11]:
# Import bibliotek używanych podczas przetwarzania zbioru.

from sklearn.model_selection import train_test_split
from datetime import datetime as dt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import re
#import category_encoders as ce

In [12]:
import warnings
# Suppresses every warning for the rest of the session.
# NOTE(review): this also hides any warnings emitted by pandas and scikit-learn in the cells
# below; consider narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')

Ze względu na ich niewielką liczbę w skali rozmiaru zbioru, pierwszym krokiem w przetworzeniu zbioru będzie usunięcie wierszy zawierających wybrakowane pola. 

In [13]:
# Remove every row that has a missing value in at least one column; `df` is rebound to the result.
df = df.dropna()

Podział zbioru na jego cechy (zbiór X) oraz klasy ("target", zbiór y).

In [14]:
# Features: every column except the crime category.
X = df.drop(columns=['Primary Type'])
# Target: the crime category itself.
y = df['Primary Type']

Podział wcześniej określonych zbiorów na zbiór treningowy, testowy oraz walidacyjny.

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [16]:
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

Zakodowanie zbiorów zawierających klasy w poszczególnych zbiorach: treningowym, testowym oraz walidacyjnym.

In [17]:
le_y = LabelEncoder() # Encoder for the target labels.

y_train_le = le_y.fit_transform(y_train) # Fit on the training labels and encode them.
y_test_le = le_y.transform(y_test) # Encode the test labels with the already fitted encoder.
#y_val_le = le_y.transform(y_val) # Encode the validation labels.

### Klasy
Klasa BoolTransformer jest użyta do przetworzenia kolumn zawierających dane True/False.


In [18]:
class BoolTransformer(BaseEstimator, TransformerMixin):
    """Encode True/False columns as integer codes using one shared ``LabelEncoder``.

    Parameters
    ----------
    bool_cols : list of str
        Names of the DataFrame columns to encode.
    """

    def __init__(self, bool_cols):
        self.bool_cols = bool_cols
        self.le = LabelEncoder()

    def fit(self, X, y=None):
        """Learn the set of labels from all configured columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column named in ``bool_cols``.
        y : ignored

        Returns
        -------
        self
        """
        # Fit on the values of every configured column rather than only the first one,
        # so a value that never occurs in the first column is still known at transform time.
        observed = pd.concat([X[col] for col in self.bool_cols], ignore_index=True)
        self.le.fit(observed)
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns hold the encoded labels.

        Raises whatever ``LabelEncoder.transform`` raises for labels not seen in ``fit``.
        """
        # Work on a copy so the caller's frame is never modified in place.
        X = X.copy()
        for col in self.bool_cols:
            X[col] = self.le.transform(X[col])
        return X

Klasa DateTimeTransformer jest wykorzystana do przetworzenia kolumny "Date", która zawiera w sobie datę oraz godzinę zdarzenia.

In [19]:
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Replace the 'Date' column with numeric 'Year', 'Month', 'Day' and 'Hour' columns."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame without 'Date' and with the four derived columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a 'Date' column.

        Returns
        -------
        pandas.DataFrame
            ``X`` minus 'Date', plus 'Year', 'Month', 'Day' and 'Hour'. The input
            frame itself is left unchanged.
        """
        # errors='coerce' turns values that cannot be parsed into NaT instead of raising.
        parsed = pd.to_datetime(X['Date'], errors='coerce')

        # drop() returns a new frame, so the assignments below never touch the caller's data.
        out = X.drop(columns=['Date'])
        out['Year'] = parsed.dt.year
        out['Month'] = parsed.dt.month
        out['Day'] = parsed.dt.day
        out['Hour'] = parsed.dt.hour
        return out

Klasa AddressTransformer jest wykorzystana na kolumnie "Block", usuwa ona zredagowaną część adresu zdarzenia, zostawiając jedynie nazwę ulicy.

In [20]:
class AddressTranformer(BaseEstimator, TransformerMixin):
    """Reduce redacted block addresses to the part that follows the direction letter.

    Parameters
    ----------
    block_cols : list of str
        Names of the DataFrame columns that hold the addresses.
    """

    # Three digits, 'XX', a single direction letter (S/N/E/W), then the captured remainder.
    _STREET_PATTERN = re.compile(r'\d{3}XX\s+[SNEW]\s+(.+)')

    def __init__(self, block_cols):
        self.block_cols = block_cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every configured column mapped through
        ``extract_street_name``."""
        # Work on a copy so the caller's frame is never modified in place.
        X = X.copy()
        for col in self.block_cols:
            X[col] = X[col].apply(self.extract_street_name)
        return X

    def extract_street_name(self, address):
        """Return the stripped text after the direction letter, or ``None``.

        ``None`` is returned when ``address`` does not match the expected pattern
        or is not a string at all.
        """
        # A non-string value (e.g. NaN for a missing address) cannot be searched with a regex.
        if not isinstance(address, str):
            return None
        match = self._STREET_PATTERN.search(address)
        if match:
            return match.group(1).strip()
        return None

Klasa ColumnDrop usuwa wcześniej określone kolumny ze zbioru.

In [21]:
class ColumnDrop(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed list of columns from a DataFrame.

    Parameters
    ----------
    drop_cols : list of str
        Names of the columns to remove.
    """

    def __init__(self, drop_cols):
        self.drop_cols = drop_cols

    def fit(self, X, y=None):
        """No fitting is required; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the columns listed in ``drop_cols``."""
        to_remove = self.drop_cols
        return X.drop(labels=to_remove, axis='columns')

### Zadeklarowanie poszczególnych podzbiorów.
Poniższy kod deklaruje, które kolumny będą podlegać konkretnym metodom przetwarzania.

In [22]:
# Column groups consumed by the preprocessing steps.
# Each name in drop_cols is listed exactly once ('FBI Code' used to appear twice).
# NOTE(review): 'Unnamed: 0' belongs to none of these groups, so with remainder='passthrough'
# in the preprocessor it is forwarded to the models as a feature — confirm this is intended.
# NOTE(review): 'Description' looks closely tied to the target 'Primary Type' in the sample
# rows shown earlier — verify that keeping it as a feature is intended.
drop_cols = ['ID', 'Case Number', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Location', 'IUCR'] #ColumnDrop
bool_cols = ['Arrest', 'Domestic'] #BoolTransformer
date_cols = ['Date'] #DateTimeTransformer
block_cols = ['Block'] #AddressTransformer
cat_cols = ['Location Description', 'Beat', 'District', 'Ward', 'Community Area', 'Description'] #cat_pipeline
num_cols = ['Longitude', 'Latitude'] #num_pipeline

Potok transformujący *num_pipeline* przyjmuje kolumny zadeklarowane jako num_cols, a następnie dzięki klasie SimpleImputer() uzupełnia brakujące wartości (używając średniej arytmetycznej) oraz standaryzuje wartości poprzez klasę StandardScaler().

In [23]:
# Numeric features: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler()),
])

Potok transformujący *cat_pipeline* przetwarza kolumny zawierające wartości kategoryczne, a następnie przy użyciu klasy SimpleImputer() uzupełnia brakujące wartości oraz koduje je przez klasę OrdinalEncoder().

In [24]:
# Categorical features: fill missing values with the most frequent category, then map each
# category to an ordinal code; categories unseen during fit are encoded as -1.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
])

Potok transformujący *block_pipeline* przyjmuje kolumnę "Block", a następnie, poprzez klasę AddressTransformer() przetwarza adres zawarty w kolumnie usuwając zredagowaną część adresu oraz koduje ją poprzez klasę OrdinalEncoder().

In [25]:
# Address feature: strip the redacted prefix from the block address, then map each resulting
# value to an ordinal code; values unseen during fit are encoded as -1.
block_pipeline = Pipeline(steps=[
    ('adresser', AddressTranformer(block_cols)),
    ('encoder', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
])

Za pomocą klasy ColumnTransformer, wcześniej zadeklarowane kolumny są poddane przetworzeniu.

In [26]:
# Route each column group to its transformer. The transformed groups are concatenated in the
# order listed here, followed by the remainder columns.
preprocessor = ColumnTransformer(
    transformers=[
        # The built-in 'drop' marker discards these columns directly, instead of running a
        # custom transformer whose output is a frame with zero columns.
        ('dropper', 'drop', drop_cols),
        ('datetimer', DateTimeTransformer(), date_cols),
        ('adresser', block_pipeline, block_cols),
        ('binary_encoder', BoolTransformer(bool_cols), bool_cols),
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols)
    ],
    # Columns that belong to no group above are appended unchanged.
    # NOTE(review): per the column list printed earlier this applies to 'Unnamed: 0' —
    # confirm that forwarding it to the models is intended.
    remainder='passthrough'
)

Poniższy kod przedstawia przykład ramki danych, która powstała podczas przetworzenia oryginalnej ramki danych X_train przez ColumnTransformer() *preprocessor*.

In [27]:
preprocessed_array = preprocessor.fit_transform(X_train)

In [28]:
preprocessed_df = pd.DataFrame(preprocessed_array)

In [29]:
preprocessed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,2013.0,1.0,6.0,5.0,486.0,0.0,0.0,-0.731152,0.576168,99.0,130.0,10.0,26.0,23.0,307.0,2746233.0
1,2012.0,9.0,30.0,11.0,741.0,0.0,0.0,-1.405425,0.881023,111.0,291.0,22.0,29.0,19.0,0.0,2662655.0
2,2016.0,10.0,24.0,1.0,1777.0,0.0,0.0,0.718002,0.122058,119.0,8.0,0.0,1.0,33.0,280.0,6174428.0
3,2016.0,4.0,7.0,5.0,468.0,1.0,0.0,-1.424288,1.002573,111.0,290.0,22.0,30.0,19.0,302.0,4082605.0
4,2015.0,9.0,26.0,1.0,1407.0,0.0,1.0,0.071623,-0.336618,108.0,110.0,8.0,19.0,61.0,118.0,3465911.0


# Wybrane algorytmy klasyfikacji
Poniższa sekcja przedstawia implementację wybranych algorytmów klasyfikujących.

In [30]:
# Biblioteki używane do analizy wyników:

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score

# Biblioteki zawierające wybrane algorytmy klasyfikujące:

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Import poszczególnych metod służących do dostrojenia hiperparametrów modeli:

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [31]:
target_names = le_y.classes_

#### RandomForestClassifier

In [77]:
# Random forest with 100 trees; class weights are recomputed on each tree's bootstrap sample.
rf_clf = RandomForestClassifier(
    n_estimators = 100,
    max_features = 'log2',
    class_weight = 'balanced_subsample',
    # Fixed seed (same value as the train/test split) so repeated runs build the same forest.
    random_state = 42
)

In [78]:
# parameters_rf = {
#     'classifier__n_estimators': [10, 100, 500],
#     'classifier__max_features': ['auto', 'sqrt', 'log2'],
#     'classifier__class_weight': ['balanced', 'balanced_subsample']
# }

In [79]:
rf_clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_clf)
])

In [80]:
# NOTE(review): this pipeline is fitted on the raw string labels (y_train), whereas the other
# pipelines in this notebook are fitted on the encoded labels (y_train_le); its predictions
# and reports therefore use string labels too.
rf_clf_pipeline.fit(X_train, y_train)

In [81]:
y_pred_rf = rf_clf_pipeline.predict(X_test)

In [82]:
report_rf = classification_report(y_test, y_pred_rf, target_names = target_names)

In [83]:
print(report_rf)

                                   precision    recall  f1-score   support

                            ARSON       0.99      0.67      0.80       435
                          ASSAULT       0.68      0.43      0.52     17901
                          BATTERY       0.80      0.93      0.86     51788
                         BURGLARY       0.99      0.98      0.99     16334
CONCEALED CARRY LICENSE VIOLATION       1.00      0.18      0.30        17
              CRIM SEXUAL ASSAULT       0.99      0.56      0.72      1260
                  CRIMINAL DAMAGE       0.98      0.99      0.98     30562
                CRIMINAL TRESPASS       0.99      0.92      0.95      7286
               DECEPTIVE PRACTICE       0.96      0.95      0.95     13522
                         GAMBLING       1.00      0.96      0.98       442
                         HOMICIDE       1.00      0.97      0.99       518
                HUMAN TRAFFICKING       0.00      0.00      0.00         4
 INTERFERENCE WITH PUBLI

In [84]:
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)

In [85]:
conf_matrix_rf

array([[  291,     5,    64, ...,     0,    17,     0],
       [    1,  7617, 10095, ...,     0,   125,     0],
       [    1,  3242, 48209, ...,     0,   189,     0],
       ...,
       [    0,    64,    58, ...,     4,     1,     0],
       [    0,     4,    87, ...,     0, 63834,     0],
       [    0,     1,     2, ...,     0,   124,  3188]])

#### KNeighborsClassifier

In [86]:
# k-nearest neighbours with k=3; closer neighbours get a larger vote (weights='distance').
# NOTE(review): the preprocessor standardizes only num_cols; the other features are ordinal
# codes or passed through unscaled — confirm this is acceptable for a distance-based model.
kn_clf = KNeighborsClassifier(
    n_neighbors=3,
    weights='distance'
)

In [87]:
# parameters_kn = {
#     'classifier__n_neighbors': [1, 3, 5, 7, 9],
#     'classifier__weights': ['uniform', 'distance'],
#     'classifier__p': [1, 2, 3, 4]
# }

In [88]:
kn_clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', kn_clf)
])

In [89]:
kn_clf_pipeline.fit(X_train, y_train_le)

In [90]:
y_pred_kn = kn_clf_pipeline.predict(X_test)

In [91]:
report_kn = classification_report(y_test_le, y_pred_kn, target_names = target_names)

In [92]:
print(report_kn)

                                   precision    recall  f1-score   support

                            ARSON       0.02      0.01      0.02       435
                          ASSAULT       0.09      0.09      0.09     17901
                          BATTERY       0.29      0.32      0.30     51788
                         BURGLARY       0.15      0.13      0.14     16334
CONCEALED CARRY LICENSE VIOLATION       0.00      0.00      0.00        17
              CRIM SEXUAL ASSAULT       0.02      0.01      0.02      1260
                  CRIMINAL DAMAGE       0.31      0.31      0.31     30562
                CRIMINAL TRESPASS       0.09      0.08      0.08      7286
               DECEPTIVE PRACTICE       0.18      0.17      0.18     13522
                         GAMBLING       0.02      0.01      0.01       442
                         HOMICIDE       0.60      0.62      0.61       518
                HUMAN TRAFFICKING       0.00      0.00      0.00         4
 INTERFERENCE WITH PUBLI

In [93]:
conf_matrix_kn = confusion_matrix(y_test_le, y_pred_kn)

In [94]:
conf_matrix_kn

array([[    6,    25,   147, ...,     0,    93,     0],
       [   25,  1665,  3340, ...,    10,  3630,   233],
       [  108,  2752, 16364, ...,    17,  8721,   346],
       ...,
       [    0,    24,    19, ...,     2,    19,     1],
       [   76,  3679,  9618, ...,    25, 26741,   165],
       [    0,   318,   516, ...,     2,   267,   180]])

#### GradientBoostingClassifier

In [95]:
# gb_clf = GradientBoostingClassifier(
#     n_estimators = 100,
#     learning_rate = 0.1,
#     max_depth = 3
# )

In [96]:
# parameters_gbc = {
#     'classifier__learning_rate': [0.01, 0.1, 1],
#     'classifier__n_estimators': [10, 100, 1000],
# }

In [97]:
# gb_clf_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', gb_clf)
# ])

In [98]:
# gb_clf_pipeline.fit(X_train, y_train_le)

In [99]:
# y_pred_gb = gb_clf_pipeline.predict(X_test)

In [100]:
# report_gb = classification_report(y_test_le, y_pred_gb, target_names = target_names)

In [101]:
# print(report_gb)

In [102]:
# conf_matrix_gb = confusion_matrix(y_test_le, y_pred_gb)

In [103]:
# conf_matrix_gb

#### AdaBoost

In [68]:
# AdaBoost with its default base estimator, 50 boosting rounds and learning rate 1.
abc = AdaBoostClassifier(
    n_estimators = 50,
    learning_rate = 1,
    # Fixed seed (same value as the train/test split) so repeated runs give the same model.
    random_state = 42
)

In [69]:
abc_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', abc)
])

In [70]:
# parameters_ab = {
#     'classifier__learning_rate': [0.01, 0.1, 1],
#     'classifier__n_estimators': [10, 50, 100]
# }

In [71]:
abc_pipeline.fit(X_train, y_train_le)

In [72]:
y_pred_abc = abc_pipeline.predict(X_test)

In [73]:
report_abc = classification_report(y_test_le, y_pred_abc, target_names = target_names)

In [74]:
print(report_abc)

                                   precision    recall  f1-score   support

                            ARSON       0.00      0.00      0.00       435
                          ASSAULT       0.00      0.00      0.00     17901
                          BATTERY       0.74      0.50      0.59     51788
                         BURGLARY       0.00      0.00      0.00     16334
CONCEALED CARRY LICENSE VIOLATION       0.00      0.00      0.00        17
              CRIM SEXUAL ASSAULT       0.00      0.00      0.00      1260
                  CRIMINAL DAMAGE       0.50      0.95      0.65     30562
                CRIMINAL TRESPASS       0.00      0.00      0.00      7286
               DECEPTIVE PRACTICE       0.00      0.00      0.00     13522
                         GAMBLING       0.00      0.00      0.00       442
                         HOMICIDE       0.00      0.00      0.00       518
                HUMAN TRAFFICKING       0.00      0.00      0.00         4
 INTERFERENCE WITH PUBLI

In [75]:
conf_matrix_abc = confusion_matrix(y_test_le, y_pred_abc)

In [76]:
conf_matrix_abc

array([[    0,     0,   304, ...,     0,   126,     0],
       [    0,     0,     0, ...,     0,  5096,     0],
       [    0,     0, 25642, ...,     0,  8095,     0],
       ...,
       [    0,     0,    13, ...,     0,     3,     0],
       [    0,     0,    49, ...,     0, 38528,     0],
       [    0,     0,     3, ...,     0,     0,     0]])

#### AdaBoost with DecisionTreeClassifier

In [104]:
dct = DecisionTreeClassifier()

In [105]:
# AdaBoost that boosts the unconstrained decision tree `dct` instead of the default estimator.
abc_dct = AdaBoostClassifier(
    estimator=dct,
    # Fixed seed (same value as the train/test split) so repeated runs give the same model.
    random_state=42
)

In [106]:
abc_dct_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', abc_dct)
])

In [107]:
abc_dct_pipeline.fit(X_train, y_train_le)

In [108]:
y_pred_abc_dct = abc_dct_pipeline.predict(X_test)

In [109]:
report_abc_dct = classification_report(y_test_le, y_pred_abc_dct, target_names = target_names)

In [110]:
print(report_abc_dct)

                                   precision    recall  f1-score   support

                            ARSON       0.93      0.95      0.94       435
                          ASSAULT       0.55      0.56      0.55     17901
                          BATTERY       0.85      0.84      0.85     51788
                         BURGLARY       1.00      1.00      1.00     16334
CONCEALED CARRY LICENSE VIOLATION       0.86      0.71      0.77        17
              CRIM SEXUAL ASSAULT       0.93      0.94      0.94      1260
                  CRIMINAL DAMAGE       0.98      0.98      0.98     30562
                CRIMINAL TRESPASS       0.92      0.93      0.93      7286
               DECEPTIVE PRACTICE       1.00      1.00      1.00     13522
                         GAMBLING       0.99      0.99      0.99       442
                         HOMICIDE       1.00      1.00      1.00       518
                HUMAN TRAFFICKING       1.00      1.00      1.00         4
 INTERFERENCE WITH PUBLI

In [111]:
conf_matrix_abc_dct = confusion_matrix(y_test_le, y_pred_abc_dct)

In [112]:
conf_matrix_abc_dct

array([[  413,     0,     0, ...,     0,     1,     0],
       [    0,  9965,  7824, ...,    71,     0,     0],
       [    0,  8016, 43682, ...,    44,     0,     0],
       ...,
       [    0,    68,    45, ...,    40,     0,     0],
       [    0,     0,     0, ...,     0, 64390,     0],
       [    0,     0,     0, ...,     0,     0,  3397]])

#### BalancedBaggingClassifier

In [113]:
# Balanced bagging with default settings; the seed makes the per-estimator resampling
# repeatable across runs (same value as the train/test split).
bb_clf = BalancedBaggingClassifier(random_state=42)

In [114]:
bb_clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', bb_clf)
])

In [115]:
bb_clf_pipeline.fit(X_train, y_train_le)

In [116]:
y_pred_bb_clf = bb_clf_pipeline.predict(X_test)

In [117]:
report_bb_clf = classification_report(y_test_le, y_pred_bb_clf, target_names=target_names)

In [118]:
print(report_bb_clf)

                                   precision    recall  f1-score   support

                            ARSON       0.05      0.77      0.09       435
                          ASSAULT       0.38      0.48      0.43     17901
                          BATTERY       0.86      0.57      0.68     51788
                         BURGLARY       0.76      0.91      0.83     16334
CONCEALED CARRY LICENSE VIOLATION       0.00      0.71      0.00        17
              CRIM SEXUAL ASSAULT       0.17      0.47      0.25      1260
                  CRIMINAL DAMAGE       0.88      0.90      0.89     30562
                CRIMINAL TRESPASS       0.72      0.56      0.63      7286
               DECEPTIVE PRACTICE       0.54      0.52      0.53     13522
                         GAMBLING       0.10      0.97      0.18       442
                         HOMICIDE       0.23      0.99      0.38       518
                HUMAN TRAFFICKING       0.00      0.75      0.01         4
 INTERFERENCE WITH PUBLI

In [119]:
conf_matrix_bb_clf = confusion_matrix(y_test_le, y_pred_bb_clf)

In [120]:
conf_matrix_bb_clf

array([[  333,     4,     5, ...,     0,     5,     0],
       [ 1077,  8652,  2459, ...,  2889,    36,     0],
       [ 1814, 10957, 29388, ...,  4884,   102,     0],
       ...,
       [    2,    74,    15, ...,    30,     1,    12],
       [  452,  2067,   437, ...,   237, 30433,     0],
       [    5,    12,    10, ...,     1,     4,  2906]])

#### BalancedBaggingClassifier with DecisionTreeClassifier

In [121]:
dt_clf = DecisionTreeClassifier()

In [122]:
# Balanced bagging over the explicitly supplied decision tree `dt_clf`.
bbc_dct = BalancedBaggingClassifier(
    estimator=dt_clf,
    # Fixed seed (same value as the train/test split) so repeated runs give the same model.
    random_state=42
)

In [123]:
bbc_dct_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', bbc_dct)
])

In [124]:
bbc_dct_pipeline.fit(X_train, y_train_le)

In [125]:
y_pred_bbc_dct = bbc_dct_pipeline.predict(X_test)

In [126]:
report_bbc_dct = classification_report(y_test_le, y_pred_bbc_dct, target_names=target_names)

In [127]:
print(report_bbc_dct)

                                   precision    recall  f1-score   support

                            ARSON       0.06      0.76      0.11       435
                          ASSAULT       0.36      0.57      0.44     17901
                          BATTERY       0.84      0.42      0.56     51788
                         BURGLARY       0.69      0.68      0.68     16334
CONCEALED CARRY LICENSE VIOLATION       0.00      0.59      0.00        17
              CRIM SEXUAL ASSAULT       0.32      0.60      0.42      1260
                  CRIMINAL DAMAGE       0.60      0.51      0.55     30562
                CRIMINAL TRESPASS       0.19      0.49      0.27      7286
               DECEPTIVE PRACTICE       0.42      0.17      0.25     13522
                         GAMBLING       0.06      0.97      0.11       442
                         HOMICIDE       0.14      0.99      0.25       518
                HUMAN TRAFFICKING       0.00      1.00      0.00         4
 INTERFERENCE WITH PUBLI

In [128]:
conf_matrix_bbc_dct = confusion_matrix(y_test_le, y_pred_bbc_dct) 

In [129]:
conf_matrix_bbc_dct

array([[  332,    17,     1, ...,     0,     2,     0],
       [  684, 10140,   523, ...,  3197,    72,    10],
       [ 1481, 16178, 21509, ...,  3277,   115,    22],
       ...,
       [    2,    63,     9, ...,    58,     0,     1],
       [  617,   450,   616, ...,     0, 27706,    16],
       [    0,     1,     2, ...,     5,     1,  2135]])

#### BalancedRandomForestClassifier

In [130]:
# Balanced random forest with 100 trees; class weights are recomputed on each tree's sample.
brfc = BalancedRandomForestClassifier(
    n_estimators = 100,
    max_features = 'log2',
    class_weight = 'balanced_subsample',
    # Fixed seed (same value as the train/test split) so repeated runs build the same forest.
    random_state = 42
)

In [131]:
brfc_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', brfc)
])

In [132]:
brfc_pipeline.fit(X_train, y_train_le)

In [133]:
y_pred_brfc = brfc_pipeline.predict(X_test)

In [134]:
report_brfc = classification_report(y_test_le, y_pred_brfc, target_names = target_names)

In [135]:
print(report_brfc)

                                   precision    recall  f1-score   support

                            ARSON       0.03      0.65      0.05       435
                          ASSAULT       0.26      0.12      0.16     17901
                          BATTERY       0.73      0.23      0.35     51788
                         BURGLARY       0.35      0.08      0.14     16334
CONCEALED CARRY LICENSE VIOLATION       0.00      0.47      0.00        17
              CRIM SEXUAL ASSAULT       0.06      0.24      0.09      1260
                  CRIMINAL DAMAGE       0.81      0.72      0.76     30562
                CRIMINAL TRESPASS       0.34      0.68      0.45      7286
               DECEPTIVE PRACTICE       0.17      0.11      0.13     13522
                         GAMBLING       0.06      0.90      0.12       442
                         HOMICIDE       0.04      0.81      0.08       518
                HUMAN TRAFFICKING       0.00      0.00      0.00         4
 INTERFERENCE WITH PUBLI

In [136]:
conf_matrix_brfc = confusion_matrix(y_test_le, y_pred_brfc)

In [137]:
conf_matrix_brfc

array([[  283,     0,     3, ...,     0,     6,     0],
       [  933,  2117,  1477, ...,  1282,   308,     1],
       [ 1915,  3570, 11953, ...,  1654,   660,     1],
       ...,
       [    1,    13,    17, ...,    21,     1,    24],
       [ 1180,   594,   775, ...,   721, 26481,     4],
       [    1,     4,     0, ...,     4,     3,  3171]])