In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from catboost import CatBoostClassifier

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Both competition files are read with the same recipe: parse the "Dates"
# column as datetimes and keep only the first 20000 rows of each file.
read_opts = {"parse_dates": ["Dates"], "nrows": 20000}

df = pd.read_csv("../data/San Francisco Crime Classification/train.csv", **read_opts)
df_test = pd.read_csv("../data/San Francisco Crime Classification/test.csv", **read_opts)

In [3]:
# Columns present in the training frame but absent from the test frame.
set(df.columns).difference(df_test.columns)

{'Category', 'Descript', 'Resolution'}

In [4]:
# Sanity check: show (rows, columns) of the training frame.
df.shape

(20000, 9)

In [5]:
def add_dates(df):
    """Expand the ``Dates`` timestamp column into calendar features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a datetime-typed ``Dates`` column (the ``.dt``
        accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``Dates`` has been removed and ``Day``,
        ``Month`` and ``Year`` have been appended after the remaining
        columns. The input frame is not modified, so the function can be
        called repeatedly on the same raw frame.

    Raises
    ------
    KeyError
        If ``df`` has no ``Dates`` column.
    """
    dates = df["Dates"]
    # drop() and assign() both return copies, leaving the caller's frame intact.
    return df.drop(columns="Dates").assign(
        Day=dates.dt.day,
        Month=dates.dt.month,
        Year=dates.dt.year,
    )

In [6]:
# Give the training and test frames the same calendar-feature expansion.
df, df_test = add_dates(df), add_dates(df_test)

In [7]:
# Label encoder for the target column; it is fitted further down, once the
# target has been separated from the features.
enc = LabelEncoder()

In [8]:
# "Category" and "Descript" are not available in the test frame and are not
# the target modelled below, so they are removed from the features.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["Category", "Descript"], errors="ignore")

In [9]:
# Non-numeric feature columns that need encoding; the "Resolution" target is
# excluded because it is handled separately.
cat_labels = [
    name
    for name in df.columns
    if name != "Resolution" and not pd.api.types.is_numeric_dtype(df[name])
]

In [10]:
# Display the selected categorical column names for inspection.
cat_labels

['DayOfWeek', 'PdDistrict', 'Address']

In [13]:
# Dense one-hot encoding for the categorical columns. handle_unknown="ignore"
# lets transform() accept categories that were not present at fit time
# instead of raising.
# NOTE(review): dense output can get wide for high-cardinality columns;
# confirm memory use is acceptable on the full data.
one_hot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
cat_encoder = Pipeline(steps=[("ohe", one_hot)])

In [14]:
# Route the categorical columns through the one-hot pipeline and pass every
# other column through unchanged.
preprocessor = ColumnTransformer(
    transformers=[("cat_encoder", cat_encoder, cat_labels)],
    remainder="passthrough",
)

In [15]:
# Separate the "Resolution" target from the feature columns.
y = df["Resolution"]
X = df.drop(columns=["Resolution"])

In [16]:
# Hold out 30% of the rows for validation. A fixed random_state makes the
# split, and therefore every downstream result, reproducible under
# Restart & Run All.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [17]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer for the validation split (the tuple is evaluated left to right,
# so fitting happens before the validation transform).
X_train_encoded, X_valid_encoded = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_valid),
)

In [18]:
# Fit the label encoder on the full target column so that every class gets an
# id. Fitting on y_train alone makes enc.transform(y_valid) raise ValueError
# whenever a rare class ends up only in the validation split. Learning the
# label vocabulary from all rows uses no feature information.
enc.fit(y)
y_train_encoded = enc.transform(y_train)
y_valid_encoded = enc.transform(y_valid)

In [19]:
# CatBoost classifier with every hyperparameter left at the library default.
model = CatBoostClassifier()

In [None]:
# Train on the encoded training split; per-iteration logging is silenced and
# the interactive training plot is enabled. The trailing semicolon hides the
# returned object's repr.
model.fit(
    X=X_train_encoded,
    y=y_train_encoded,
    verbose=False,
    plot=True,
);

In [None]:
# Ask the preprocessor to return pandas DataFrames from subsequent transform
# calls; results computed before this line are unaffected. The trailing
# semicolon hides the returned object's repr.
preprocessor.set_output(transform="pandas");

In [None]:
# Encode the test frame with the preprocessor fitted on the training split.
# NOTE(review): df_test's exact columns are not shown here; confirm it contains
# every column the preprocessor was fitted on.
# NOTE(review): this output is requested as a DataFrame while the model above
# was fitted on the earlier transform output; confirm the column order matches
# before predicting.
df_test_encoded = preprocessor.transform(df_test)