# Modeling

## Library import

In [12]:
# Import libraries
## Basic libs
import pandas as pd
import numpy as np
import warnings
## Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

from feature_engine.encoding import OrdinalEncoder, RareLabelEncoder
from feature_engine.imputation import MeanMedianImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, log_loss

# Configure libraries
warnings.filterwarnings('ignore')
# Apply the style sheet first: it carries its own figure size, so an rcParams
# value set before it would be overwritten. The 'seaborn' style was renamed
# 'seaborn-v0_8' in Matplotlib 3.6 (old name removed in 3.8); use whichever
# name the installed version provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.rcParams['figure.figsize'] = (10, 10)

## Parameter definition

In [2]:
# Folders, relative to the notebook. Each path keeps its trailing '/' because
# later cells build file names by plain string concatenation.
RAW_DATA       = '../data/raw/'
EXTERNAL_DATA  = '../data/external/'
INTERIM_DATA   = '../data/interim/'
PROCESSED_DATA = '../data/processed/'
REFERENCES     = '../references/'

# Seed shared by the train/test splits and the classifiers.
RANDOM_STATE = 14

## Importing data

In [None]:
# df = pd.read_parquet(INTERIM_DATA + 'train.pqt')
# df.describe()

## First attempt (baseline)

In [None]:
# Load processed training file #1 and show its summary statistics.
df = pd.read_parquet(f'{PROCESSED_DATA}train_1.pqt')
df.describe()

In [None]:
# Predictors: every column except the target and the Descript, Resolution
# and Address columns.
X = df.drop(['Category', 'Descript', 'Resolution', 'Address'], axis=1).copy()
# Target kept as a one-column DataFrame so the encoder can address it by name.
y = df.loc[:, ['Category']].copy()
X

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [None]:
# Ordinal encoders turn the listed string columns into integer codes.
# The RareLabelEncoder step for 'Address' stays disabled; 'Address' is not
# among the predictors in X.
encoder_predictors = OrdinalEncoder(
    encoding_method='arbitrary',
    variables=['DayOfWeek', 'PdDistrict'],
)
encoder_target = OrdinalEncoder(
    encoding_method='arbitrary',
    variables=['Category'],
)

# Learn the mappings on the training split only and encode it in the same step.
X_train_transformed = encoder_predictors.fit_transform(X_train)
y_train_transformed = encoder_target.fit_transform(y_train)



In [None]:
#encoder_predictors.encoder_dict_

In [None]:
# Encode the held-out split with the encoders already fitted on the training
# split (transform only, nothing is refit here).
#X_test_transformed = encoder_rare_predictors.transform(X_test)

X_test_transformed = encoder_predictors.transform(X_test)
y_test_transformed = encoder_target.transform(y_test)

In [None]:
# Baseline random forest: library-default hyper-parameters, fixed seed,
# trained with two parallel jobs.
model = RandomForestClassifier(
    n_jobs=2,
    random_state=RANDOM_STATE,
)
model.fit(X_train_transformed, y_train_transformed)

In [None]:
# Hard class predictions for the held-out split.
y_pred = model.predict(X_test_transformed)

In [None]:
# Mean accuracy of the model on the held-out split.
model.score(X_test_transformed, y_test_transformed)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test_transformed, y_pred))

## Second attempt

For this attempt I've:
 - Dropped all duplicated rows
 - Created the column "Weekend", which flags whether the crime happened on a weekend (1) or on a weekday (0)

In [3]:
# Load processed training file #2 and show its summary statistics.
df = pd.read_parquet(f'{PROCESSED_DATA}train_2.pqt')
df.describe()

Unnamed: 0,X,Y,month,day,year,hour,Weekend
count,873391.0,873391.0,873391.0,873391.0,873391.0,873391.0,873391.0
mean,-122.422639,37.771028,6.436067,15.573364,2008.71293,13.409799,0.277364
std,0.030384,0.458107,3.429044,8.783104,3.630581,6.549561,0.447698
min,-122.513642,37.707879,1.0,1.0,2003.0,0.0,0.0
25%,-122.432957,37.752409,3.0,8.0,2006.0,9.0,0.0
50%,-122.416446,37.775421,6.0,16.0,2009.0,14.0,0.0
75%,-122.40697,37.784372,9.0,23.0,2012.0,19.0,1.0
max,-120.5,90.0,12.0,31.0,2015.0,23.0,1.0


In [4]:
# Predictors: every column except the target and the Descript, Resolution
# and Address columns.
X = df.drop(['Category', 'Descript', 'Resolution', 'Address'], axis=1).copy()
# Target kept as a one-column DataFrame so the encoder can address it by name.
y = df.loc[:, ['Category']].copy()
X

Unnamed: 0,DayOfWeek,PdDistrict,X,Y,month,day,year,hour,Weekend
0,Wednesday,NORTHERN,-122.425892,37.774599,5,13,2015,23,0
1,Wednesday,NORTHERN,-122.425892,37.774599,5,13,2015,23,0
2,Wednesday,NORTHERN,-122.424363,37.800414,5,13,2015,23,0
3,Wednesday,NORTHERN,-122.426995,37.800873,5,13,2015,23,0
4,Wednesday,PARK,-122.438738,37.771541,5,13,2015,23,0
...,...,...,...,...,...,...,...,...,...
873386,Monday,TARAVAL,-122.459033,37.714056,1,6,2003,0,0
873387,Monday,INGLESIDE,-122.447364,37.731948,1,6,2003,0,0
873388,Monday,SOUTHERN,-122.403390,37.780266,1,6,2003,0,0
873389,Monday,SOUTHERN,-122.390531,37.780607,1,6,2003,0,0


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [6]:
# Ordinal encoders turn the listed string columns into integer codes.
# The RareLabelEncoder step for 'Address' stays disabled; 'Address' is not
# among the predictors in X.
encoder_predictors = OrdinalEncoder(
    encoding_method='arbitrary',
    variables=['DayOfWeek', 'PdDistrict'],
)
encoder_target = OrdinalEncoder(
    encoding_method='arbitrary',
    variables=['Category'],
)

# Learn the mappings on the training split only and encode it in the same step.
X_train_transformed = encoder_predictors.fit_transform(X_train)
y_train_transformed = encoder_target.fit_transform(y_train)

In [7]:
# Encode the held-out split with the encoders already fitted on the training
# split (transform only, nothing is refit here).
#X_test_transformed = encoder_rare_predictors.transform(X_test)

X_test_transformed = encoder_predictors.transform(X_test)
y_test_transformed = encoder_target.transform(y_test)

In [8]:
# Constrained random forest: trees at most 4 levels deep with at least 100
# samples per leaf, fixed seed, all available cores.
model = RandomForestClassifier(
    max_depth=4,
    min_samples_leaf=100,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
model.fit(X_train_transformed, y_train_transformed)

In [13]:
# Per-class probabilities for the held-out split (one column per class in
# model.classes_); consumed by the log-loss cell further down.
y_pred_proba = model.predict_proba(X_test_transformed)

In [10]:
# Mean accuracy of the model on the held-out split.
model.score(X_test_transformed, y_test_transformed)

0.2315733431036358

In [11]:
# This attempt never computed hard predictions: only predict_proba is called
# above, so `y_pred` came from leftover kernel state. Compute it from the
# current model so the report matches this attempt and survives Run All.
y_pred = model.predict(X_test_transformed)
print(classification_report(y_test_transformed, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      6274
           1       0.00      0.00      0.00     18504
           2       0.00      0.00      0.00      5039
           3       0.19      0.36      0.24     25226
           4       0.24      0.84      0.38     34801
           5       0.28      0.02      0.04     10721
           6       0.00      0.00      0.00      8903
           7       0.00      0.00      0.00      7227
           8       0.00      0.00      0.00      8350
           9       0.00      0.00      0.00     15350
          10       0.00      0.00      0.00      4503
          11       0.00      0.00      0.00       900
          12       0.00      0.00      0.00      2085
          13       0.00      0.00      0.00        35
          14       0.35      0.18      0.24     10708
          15       0.00      0.00      0.00      3327
          16       0.00      0.00      0.00       851
          17       0.00    

In [15]:
# predict_proba columns follow model.classes_, so take the label list from the
# fitted model instead of hard-coding 39 classes. The result is identical when
# the model saw classes 0..38, and stays correct if the class count changes.
lst_labels = model.classes_.tolist()

In [17]:
# Multi-class log loss of the predicted probabilities; `labels` spells out
# which class ids the probability columns refer to.
log_loss(y_test_transformed, y_pred_proba, labels=lst_labels)

2.5841613172554823

## Third attempt

For this attempt I've:
  - Created column "day_period" that shows if the crime happened during dawn, morning, afternoon or night

In [None]:
# Load processed training file #3 and show its summary statistics.
df = pd.read_parquet(f'{PROCESSED_DATA}train_3.pqt')
df.describe()

In [None]:
# Predictors: every column except the target and the Descript, Resolution
# and Address columns.
X = df.drop(['Category', 'Descript', 'Resolution', 'Address'], axis=1).copy()
# Target kept as a one-column DataFrame so the encoder can address it by name.
y = df.loc[:, ['Category']].copy()
X

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [None]:
# Ordinal encoders turn the listed string columns into integer codes; this
# attempt also encodes 'day_period'.
# The RareLabelEncoder step for 'Address' stays disabled; 'Address' is not
# among the predictors in X.
encoder_predictors = OrdinalEncoder(
    encoding_method='arbitrary',
    variables=['DayOfWeek', 'PdDistrict', 'day_period'],
)
encoder_target = OrdinalEncoder(
    encoding_method='arbitrary',
    variables=['Category'],
)

# Learn the mappings on the training split only and encode it in the same step.
X_train_transformed = encoder_predictors.fit_transform(X_train)
y_train_transformed = encoder_target.fit_transform(y_train)

In [None]:
# Encode the held-out split with the encoders already fitted on the training
# split (transform only, nothing is refit here).
#X_test_transformed = encoder_rare_predictors.transform(X_test)

X_test_transformed = encoder_predictors.transform(X_test)
y_test_transformed = encoder_target.transform(y_test)

In [None]:
# Random forest with library-default hyper-parameters, fixed seed, trained
# with two parallel jobs.
model = RandomForestClassifier(
    n_jobs=2,
    random_state=RANDOM_STATE,
)
model.fit(X_train_transformed, y_train_transformed)

In [None]:
# Hard class predictions for the held-out split.
y_pred = model.predict(X_test_transformed)

In [None]:
# Mean accuracy of the model on the held-out split.
model.score(X_test_transformed, y_test_transformed)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test_transformed, y_pred))

## Fourth attempt

For this attempt I've:
  - Replaced outlier longitude and latitude values with the mode of each column

In [None]:
# Load processed training file #4 and list its column dtypes / non-null counts.
df = pd.read_parquet(f'{PROCESSED_DATA}train_4.pqt')
df.info()

In [None]:
# Fill missing coordinates with the most frequent value of the SAME column.
# (mode_Y was previously taken from column 'X', which would have written a
# longitude into the latitude column.)
mode_X = df['X'].mode().iloc[0]
mode_Y = df['Y'].mode().iloc[0]

# Assign the result back instead of fillna(inplace=True) on a column
# selection: the in-place form is chained assignment and is not guaranteed to
# update `df` (it does not under pandas copy-on-write).
df['X'] = df['X'].fillna(mode_X)
df['Y'] = df['Y'].fillna(mode_Y)
df.info()

In [None]:
# Predictors: every column except the target and the Descript, Resolution
# and Address columns.
X = df.drop(['Category', 'Descript', 'Resolution', 'Address'], axis=1).copy()
# Target kept as a one-column DataFrame so the encoder can address it by name.
y = df.loc[:, ['Category']].copy()
X

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [None]:
# Column dtypes and non-null counts of the full frame.
df.info()

In [None]:
# Ordinal encoders turn the listed string columns into integer codes.
# The RareLabelEncoder step for 'Address' stays disabled; 'Address' is not
# among the predictors in X.
encoder_predictors = OrdinalEncoder(
    encoding_method='arbitrary',
    variables=['DayOfWeek', 'PdDistrict', 'day_period'],
)
encoder_target = OrdinalEncoder(
    encoding_method='arbitrary',
    variables=['Category'],
)

# Learn the mappings on the training split only and encode it in the same step.
X_train_transformed = encoder_predictors.fit_transform(X_train)
y_train_transformed = encoder_target.fit_transform(y_train)

In [None]:
# Encode the held-out split with the encoders already fitted on the training
# split (transform only, nothing is refit here).
#X_test_transformed = encoder_rare_predictors.transform(X_test)

X_test_transformed = encoder_predictors.transform(X_test)
y_test_transformed = encoder_target.transform(y_test)

In [None]:
# Random forest with library-default hyper-parameters, fixed seed, trained
# with two parallel jobs.
model = RandomForestClassifier(
    n_jobs=2,
    random_state=RANDOM_STATE,
)
model.fit(X_train_transformed, y_train_transformed)

In [None]:
# Hard class predictions for the held-out split.
y_pred = model.predict(X_test_transformed)

In [None]:
# Mean accuracy of the model on the held-out split.
model.score(X_test_transformed, y_test_transformed)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test_transformed, y_pred))

## Fifth attempt

In [None]:
from sklearn import preprocessing

# Integer-encode the target column of the current `df`, overwriting it in
# place. `le` keeps the fitted mapping (le.classes_) for decoding later.
le = preprocessing.LabelEncoder()
le.fit(df['Category'])
df['Category'] = le.transform(df['Category'])

df.info()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense output is required to build a DataFrame from the result. The keyword
# was renamed from `sparse` to `sparse_output` in scikit-learn 1.2, so try the
# new spelling first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# NOTE(review): 'Address' is presumably high-cardinality; a dense one-hot
# matrix of it may not fit in memory - confirm before running on the full data.
# NOTE(review): if `df` still carries the text column 'day_period' from the
# previous attempt, it must be encoded or dropped as well - confirm.
cat_cols = ['DayOfWeek', 'PdDistrict', 'Address']

# Encode Categorical Data
# Reuse df's index so the column-wise concat below aligns row for row even if
# df does not have a default 0..n-1 index.
df_encoded = pd.DataFrame(encoder.fit_transform(df[cat_cols]), index=df.index)
# get_feature_names was replaced by get_feature_names_out (scikit-learn 1.0+).
feature_namer = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
df_encoded.columns = feature_namer(cat_cols)

# Replace Categorical Data with Encoded Data
df = df.drop(cat_cols, axis=1)
df = pd.concat([df_encoded, df], axis=1)

# The target is left untouched. The previous `1 if x == 'yes' else 0` mapping
# came from a binary problem and turned every (already label-encoded) crime
# category into 0, leaving a single-class target.

print('Shape of dataframe:', df.shape)
df.head()

In [None]:
# Select Features: everything except the target and the Descript / Resolution
# columns.
feature = df.drop(['Category', 'Descript', 'Resolution'], axis=1)

# Select Target
target = df['Category']

# Set Training and Testing Data
# train_test_split is already imported in the notebook's import cell. The seed
# now uses the shared RANDOM_STATE constant, like every other split in this
# notebook, instead of a one-off literal.
X_train, X_test, y_train, y_test = train_test_split(feature, target,
                                                    shuffle=True,
                                                    test_size=0.2,
                                                    random_state=RANDOM_STATE)

# Show the Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
# This line reports y_test, so label it as the testing label.
print('Shape of testing label:', y_test.shape)

In [None]:
def evaluate_model(model, x_test, y_test, average=None):
    """Score a fitted classifier on a held-out set.

    Works for binary and multi-class targets. The original version used the
    binary-only defaults of precision/recall/F1 and column 1 of
    ``predict_proba``, which raises on a multi-class target.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``.
    x_test : feature matrix of the held-out set.
    y_test : true labels of the held-out set.
    average : str or None, optional
        Averaging mode passed to precision/recall/F1. ``None`` (default)
        selects ``'binary'`` when the model outputs two probability columns
        (the original behaviour) and ``'weighted'`` otherwise.

    Returns
    -------
    dict with keys ``acc``, ``prec``, ``rec``, ``f1``, ``kappa``, ``fpr``,
    ``tpr``, ``auc`` and ``cm``. ``fpr``/``tpr`` are ``None`` for non-binary
    targets (no single ROC curve exists). ``auc`` is then the weighted
    one-vs-rest ROC AUC, or NaN when scikit-learn cannot compute it (for
    example when a class is missing from ``y_test``).
    """
    from sklearn import metrics

    # Predict Test Data
    y_pred = model.predict(x_test)
    y_proba = model.predict_proba(x_test)

    # Two probability columns means a binary problem; keep the original path.
    is_binary = y_proba.shape[1] == 2
    if average is None:
        average = 'binary' if is_binary else 'weighted'

    # Calculate accuracy, precision, recall, f1-score, and kappa score
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred, average=average)
    rec = metrics.recall_score(y_test, y_pred, average=average)
    f1 = metrics.f1_score(y_test, y_pred, average=average)
    kappa = metrics.cohen_kappa_score(y_test, y_pred)

    # Calculate area under curve (AUC)
    fpr = tpr = None
    if is_binary:
        y_pred_proba = y_proba[:, 1]
        fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
        auc = metrics.roc_auc_score(y_test, y_pred_proba)
    else:
        try:
            auc = metrics.roc_auc_score(y_test, y_proba,
                                        multi_class='ovr',
                                        average='weighted',
                                        labels=getattr(model, 'classes_', None))
        except ValueError:
            # AUC is undefined for this split; report NaN instead of
            # discarding the other metrics.
            auc = float('nan')

    # Confusion matrix
    cm = metrics.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'kappa': kappa,
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm}

In [None]:
from sklearn import tree

# Fit a seeded decision tree on the training split (fit returns the estimator).
dtc = tree.DecisionTreeClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)

# Score it on the held-out split.
dtc_eval = evaluate_model(dtc, X_test, y_test)

# Report each metric, one per line, in a fixed order.
for caption, key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Cohens Kappa Score:', 'kappa'),
    ('Area Under Curve:', 'auc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(caption, dtc_eval[key])