# Modeling

## Library import

In [15]:
# Import libraries
## Basic libs
import pandas as pd
import numpy as np
import warnings
## Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

from feature_engine.encoding import OrdinalEncoder, RareLabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Configure libraries
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (10, 10)
plt.style.use('seaborn')

## Parameter definition

In [3]:
RAW_DATA = '../data/raw/' 
EXTERNAL_DATA = '../data/external/' 
INTERIM_DATA = '../data/interim/' 
PROCESSED_DATA = '../data/processed/'
REFERENCES = '../references/'
RANDOM_STATE = 14

## Importing data

In [4]:
df = pd.read_parquet(INTERIM_DATA + 'train.pqt')
df

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,month,day,year,hour
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,5,13,2015,23
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,5,13,2015,23
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,5,13,2015,23
...,...,...,...,...,...,...,...,...,...,...,...,...
878044,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056,1,6,2003,0
878045,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948,1,6,2003,0
878046,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266,1,6,2003,0
878047,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,1,6,2003,0


## First attempt (baseline)

In [5]:
X = df.drop(columns=['Category', 'Descript', 'Resolution', 'Address']).copy()
y = df[['Category']].copy()
X

Unnamed: 0,DayOfWeek,PdDistrict,X,Y,month,day,year,hour
0,Wednesday,NORTHERN,-122.425892,37.774599,5,13,2015,23
1,Wednesday,NORTHERN,-122.425892,37.774599,5,13,2015,23
2,Wednesday,NORTHERN,-122.424363,37.800414,5,13,2015,23
3,Wednesday,NORTHERN,-122.426995,37.800873,5,13,2015,23
4,Wednesday,PARK,-122.438738,37.771541,5,13,2015,23
...,...,...,...,...,...,...,...,...
878044,Monday,TARAVAL,-122.459033,37.714056,1,6,2003,0
878045,Monday,INGLESIDE,-122.447364,37.731948,1,6,2003,0
878046,Monday,SOUTHERN,-122.403390,37.780266,1,6,2003,0
878047,Monday,SOUTHERN,-122.390531,37.780607,1,6,2003,0


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    shuffle = True, 
                                                    test_size=0.2, 
                                                    random_state=RANDOM_STATE)

In [9]:
# set up the encoder
#encoder_rare_predictors = RareLabelEncoder(n_categories=100, variables=['Address'])

encoder_predictors = OrdinalEncoder(encoding_method='arbitrary', variables=['DayOfWeek', 'PdDistrict'])
encoder_target = OrdinalEncoder(encoding_method='arbitrary', variables=['Category'])

# fit the encoder
#X_train_transformed = encoder_rare_predictors.fit_transform(X_train)

X_train_transformed = encoder_predictors.fit_transform(X_train)
y_train_transformed = encoder_target.fit_transform(y_train)


In [35]:
#encoder_predictors.encoder_dict_

In [10]:
#X_test_transformed = encoder_rare_predictors.transform(X_test)

X_test_transformed = encoder_predictors.transform(X_test)
y_test_transformed = encoder_target.transform(y_test)

In [11]:
model = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=2)
model.fit(X_train_transformed, y_train_transformed)

In [12]:
y_pred = model.predict(X_test_transformed)

In [14]:
model.score(X_test_transformed, y_test_transformed)

0.3014748590626957

In [17]:
print(classification_report(y_test_transformed, y_pred))

              precision    recall  f1-score   support

           0       0.25      0.32      0.28     25298
           1       0.16      0.12      0.14      1655
           2       0.38      0.61      0.47     35285
           3       0.14      0.11      0.12      8374
           4       0.09      0.04      0.05      6316
           5       0.24      0.21      0.22     15528
           6       0.24      0.20      0.21     18276
           7       0.61      0.49      0.54      5084
           8       0.17      0.08      0.11      8854
           9       0.37      0.43      0.40     10737
          10       0.18      0.09      0.12      7340
          11       0.48      0.43      0.45     10846
          12       0.28      0.20      0.24       361
          13       0.21      0.10      0.13      2178
          14       0.05      0.03      0.04      1366
          15       0.01      0.00      0.01      2035
          16       0.15      0.09      0.12       412
          17       0.23    

## Second attempt

In [20]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

df['Category'] = le.fit_transform(df['Category'])

#df['Category'] = df['Category'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   Category    878049 non-null  category
 1   Descript    878049 non-null  object  
 2   DayOfWeek   878049 non-null  object  
 3   PdDistrict  878049 non-null  object  
 4   Resolution  878049 non-null  object  
 5   Address     878049 non-null  object  
 6   X           878049 non-null  float64 
 7   Y           878049 non-null  float64 
 8   month       878049 non-null  int64   
 9   day         878049 non-null  int64   
 10  year        878049 non-null  int64   
 11  time        878049 non-null  object  
dtypes: category(1), float64(2), int64(3), object(6)
memory usage: 74.5+ MB


In [18]:
rom sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
cat_cols = ['DayOfWeek', 'PdDistrict', 'Address']

# Encode Categorical Data
df_encoded = pd.DataFrame(encoder.fit_transform(df[cat_cols]))
df_encoded.columns = encoder.get_feature_names(cat_cols)

# Replace Categotical Data with Encoded Data
df = df.drop(cat_cols ,axis=1)
df = pd.concat([df_encoded, df], axis=1)

# Encode target value
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'yes' else 0)

print('Shape of dataframe:', df.shape)
df.head()

SyntaxError: invalid syntax (1332060275.py, line 1)

In [None]:
# Select Features
feature = df.drop(['Category', 'Descript', 'Resolution'], axis=1)

# Select Target
target = df['Category']

# Set Training and Testing Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature , target, 
                                                    shuffle = True, 
                                                    test_size=0.2, 
                                                    random_state=1)

# Show the Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
print('Shape of training label:', y_test.shape)

In [None]:
def evaluate_model(model, x_test, y_test):
    from sklearn import metrics

    # Predict Test Data 
    y_pred = model.predict(x_test)

    # Calculate accuracy, precision, recall, f1-score, and kappa score
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    kappa = metrics.cohen_kappa_score(y_test, y_pred)

    # Calculate area under curve (AUC)
    y_pred_proba = model.predict_proba(x_test)[::,1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)

    # Display confussion matrix
    cm = metrics.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'kappa': kappa, 
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm}

In [None]:
from sklearn import tree

# Building Decision Tree model 
dtc = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
dtc.fit(X_train, y_train)

# Evaluate Model
dtc_eval = evaluate_model(dtc, X_test, y_test)

# Print result
print('Accuracy:', dtc_eval['acc'])
print('Precision:', dtc_eval['prec'])
print('Recall:', dtc_eval['rec'])
print('F1 Score:', dtc_eval['f1'])
print('Cohens Kappa Score:', dtc_eval['kappa'])
print('Area Under Curve:', dtc_eval['auc'])
print('Confusion Matrix:\n', dtc_eval['cm'])