# Petty Theft experiment: base model

This notebook contains the data processing, model building, and training steps used for the base iteration of the petty theft experiment.

In [None]:
import shap
import random
import joblib
import witwidget
import numpy as np
import pandas as pd
import lightgbm as lgb
from keras import layers
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

## Functions and Utils

In [None]:
def minmax_scaler(data):
    """Rescale ``data`` with a freshly fitted ``MinMaxScaler``.

    A new scaler is created and fitted on ``data`` on every call, so the
    fitted parameters are not kept for reuse on another dataset.

    Returns whatever ``MinMaxScaler.fit_transform`` produces for ``data``.
    """
    return MinMaxScaler().fit_transform(data)

def process_data(data):
    """Build a model-ready feature frame from a raw dataframe.

    The 'Tipo salida 2' column is dropped, the categorical columns
    ('Region', 'Defensor', 'Desarrollo', 'extranjero') are label-encoded
    with the notebook-level ``label_encoder``, every remaining column is
    min-max scaled, and the scaled columns followed by the encoded ones are
    returned with a fresh 0..n-1 index.

    Raises:
        KeyError: if 'Tipo salida 2' or any categorical column is missing.
    """
    cat_cols = ['Region', 'Defensor', 'Desarrollo', 'extranjero']

    # drop() raises KeyError when the column is absent, matching the
    # original direct lookup of data['Tipo salida 2'].
    x = data.drop(columns=['Tipo salida 2'])

    # Work on an explicit copy: assigning into a slice of ``x`` triggers
    # SettingWithCopyWarning and is not guaranteed to write through.
    x_cat = x[cat_cols].copy()
    for col in cat_cols:
        # The shared encoder is refitted for each column, so the codes are
        # only meaningful within this call.
        x_cat[col] = label_encoder.fit_transform(x_cat[col])

    x_num = x.drop(columns=cat_cols)
    x_norm = pd.DataFrame(minmax_scaler(x_num), columns=x_num.columns)

    # Align both parts on a positional index before joining them side by side.
    x_cat = x_cat.reset_index(drop=True)
    return pd.concat([x_norm, x_cat], axis=1)

def custom_predict(examples_to_infer):
    """Prediction callback that scores the examples it is given.

    Args:
        examples_to_infer: list of example rows laid out like the
            notebook-level ``examples_wit`` (features plus label column).

    Returns:
        A list of ``[1 - p, p]`` pairs, one per example.
    """
    # Score the rows passed in by the caller instead of the global
    # ``model_inputs``; the label column is stripped first because the
    # model was trained without it.
    model_ins = np.delete(
        np.array(examples_to_infer), columns_not_for_model_input, axis=1
    )
    preds = model.predict(model_ins)
    # NOTE(review): assumes every prediction is a one-element sequence
    # holding the positive-class score (e.g. a sigmoid output) — confirm
    # this matches the model bound to ``model``.
    return [[1 - pred[0], pred[0]] for pred in preds]

## Importing data and pre-processing

In [None]:
# Load the source CSV into a dataframe; the path is relative to the
# current working directory.
path = r'hurtoFalta.csv'

data = pd.read_csv(path)

In [None]:
## Merge the two Metropolitana sub-regions into a single region

data['Región (tribunal)'] = data['Región (tribunal)'].replace(
    {'Metropolitana Sur': 'Metropolitana', 'Metropolitana Norte': 'Metropolitana'}
)

## Add a synthetic age variable

mu = 31  ## Mean age of the 18-44 range (which concentrates most theft offences)
sigma = 8
random.seed(23)  # kept from the original cell; it does not seed NumPy

# The ages are drawn with NumPy, so NumPy needs its own seeded generator for
# the column to be reproducible between runs.
rng = np.random.default_rng(23)

# Draw every age at once, clamp to [18, 65] and round. This replaces the
# per-row chained assignment (data['edad'][i] = ...), which raises
# SettingWithCopyWarning and does not write through under copy-on-write.
data['edad'] = np.clip(rng.normal(mu, sigma, size=len(data)), 18, 65).round()

data

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) for inspection.
data.info()

In [None]:
# Discard the unused columns (addressed by position) and shorten two names.
# NOTE(review): the positions depend on the CSV column order — confirm if the
# input file changes.
unused_columns = data.columns[[1, 3, 4, 5, 6, 8, 10, 11, 12]]
data.drop(columns=unused_columns, inplace=True)
data.rename(
    columns={
        'Región (tribunal)': 'Region',
        'Grado desarrollo': 'Desarrollo',
    },
    inplace=True,
)

## Data Transformations

In [None]:
# Separate the frame into features (x1) and a one-column target frame (y1).
is_target = data.columns.isin(['Tipo salida 1'])
x1 = data.loc[:, ~is_target]
y1 = data.loc[:, is_target]

In [None]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x1, y1['Tipo salida 1'], test_size=0.3, random_state=23
)

## Split the last 1000 training rows off into a validation dataset

x_val1 = x_train1[-1000:]
y_val1 = y_train1[-1000:]

# Remove those rows from the training split as well; otherwise the
# validation data is also trained on and the two sets overlap.
x_train1 = x_train1[:-1000]
y_train1 = y_train1[:-1000]

In [None]:
# Take explicit copies of the categorical columns: these frames are assigned
# into later, and writing into a slice of the parent frame triggers
# SettingWithCopyWarning.
cat_columns = ['Region', 'Defensor', 'Desarrollo']
x_cat_train = x_train1[cat_columns].copy()
x_cat_test = x_test1[cat_columns].copy()
x_cat_val = x_val1[cat_columns].copy()

In [None]:
label_encoder = preprocessing.LabelEncoder()

# Encode all splits with ONE mapping per column. Refitting the encoder on
# each split separately gives the same category different integer codes
# whenever a split is missing a category, which silently corrupts the
# features seen at test time.
# 'Region' is processed last so that label_encoder.classes_ still holds the
# region names afterwards, as it did before.
# NOTE(review): the hard-coded region codes used to build 'Region_alt' must be
# checked against this shared mapping.
for col in ['Defensor', 'Desarrollo', 'Region']:
    label_encoder.fit(pd.concat([x_cat_train[col], x_cat_test[col], x_cat_val[col]]))
    x_cat_train[col] = label_encoder.transform(x_cat_train[col])
    x_cat_test[col] = label_encoder.transform(x_cat_test[col])
    x_cat_val[col] = label_encoder.transform(x_cat_val[col])

In [None]:
# Every column that is not in the matching categorical frame is kept as a
# numeric feature.
x_num_train, x_num_test, x_num_val = (
    frame.loc[:, ~frame.columns.isin(cat_frame.columns)]
    for frame, cat_frame in (
        (x_train1, x_cat_train),
        (x_test1, x_cat_test),
        (x_val1, x_cat_val),
    )
)

In [None]:
# Fit the scaler on the training split only and reuse it for the other
# splits. Refitting on test/validation would scale each split by its own
# min/max, so the same raw value would map to different model inputs.
# Test/validation values outside the training range fall outside [0, 1].
scaler = MinMaxScaler()

x_norm_train = pd.DataFrame(scaler.fit_transform(x_num_train), columns=x_num_train.columns)
x_norm_test = pd.DataFrame(scaler.transform(x_num_test), columns=x_num_test.columns)
x_norm_val = pd.DataFrame(scaler.transform(x_num_val), columns=x_num_val.columns)

In [None]:
# List each integer code next to the class currently stored in the encoder.
for code, class_name in enumerate(label_encoder.classes_):
    print(code, class_name)

In [None]:
# Binary flag: 1 when the encoded region is one of the listed codes, else 0.
# NOTE(review): the codes are hard-coded and depend on the label encoding of
# 'Region' — confirm they still point at the intended regions.
region_alt_codes = {14, 12, 11, 8, 7, 6, 5, 4}
for cat_frame in (x_cat_train, x_cat_test, x_cat_val):
    cat_frame['Region_alt'] = cat_frame['Region'].apply(
        lambda code: 1 if code in region_alt_codes else 0
    )

In [None]:
# Give every frame a fresh positional index, in place, so the numeric and
# categorical parts line up row by row when they are joined.
for frame in (
    x_norm_train, x_cat_train,
    x_norm_test, x_cat_test,
    x_norm_val, x_cat_val,
):
    frame.reset_index(drop=True, inplace=True)

In [None]:
# Join scaled numeric columns and encoded categorical columns side by side.
x_train_fin, x_test_fin, x_val_fin = (
    pd.concat([numeric_part, categorical_part], axis=1)
    for numeric_part, categorical_part in (
        (x_norm_train, x_cat_train),
        (x_norm_test, x_cat_test),
        (x_norm_val, x_cat_val),
    )
)

In [None]:
# Learn the label -> integer mapping from the training target only and apply
# that same mapping to the other splits, so a class always gets the same code.
# transform() raises ValueError for a label that never appears in training,
# which is preferable to silently assigning it a mismatched code.
y_train_fin = label_encoder.fit_transform(y_train1)
y_test_fin = label_encoder.transform(y_test1)
y_val_fin = label_encoder.transform(y_val1)

## Building the model

In [None]:
# Report the dimensions of the train and test features/labels.
for caption, array_like in (
    ('Training Features Shape:', x_train_fin),
    ('Training Labels Shape:', y_train_fin),
    ('Testing Features Shape:', x_test_fin),
    ('Testing Labels Shape:', y_test_fin),
):
    print(caption, array_like.shape)

In [None]:
# Wrap each split in a LightGBM Dataset (features plus encoded labels).
train_data, val_data, test_data = (
    lgb.Dataset(features, label=labels)
    for features, labels in (
        (x_train_fin, y_train_fin),
        (x_val_fin, y_val_fin),
        (x_test_fin, y_test_fin),
    )
)

In [None]:
## Parameters
# Hyperparameter values that are passed to lgb.LGBMClassifier when the model
# is built.

boosting_type = 'gbdt'
num_leaves = 63
max_depth = -1
learning_rate = 0.01
n_estimators = 100
objective = 'binary'
# class_weight is currently not set

In [None]:
## Parameter dictionary

# Collect the notebook-level hyperparameters and build the classifier from them.
lgbm_params = {
    'boosting_type': boosting_type,
    'num_leaves': num_leaves,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'objective': objective,
}
model = lgb.LGBMClassifier(**lgbm_params)

In [None]:
# Train the classifier on the processed training features and encoded labels.
model.fit(x_train_fin, y_train_fin)

## Model Evaluation

In [None]:
## Generate predictions

# predict_proba returns one row per sample and one column per class. Take the
# positive-class column for every sample ([:, 1]); indexing with [0] would
# select only the first sample's two probabilities and yield a length-2 array
# that does not match the labels.
predict_train1 = (model.predict_proba(x_train_fin)[:, 1] > 0.5).astype(int)
predict_test1 = (model.predict_proba(x_test_fin)[:, 1] > 0.5).astype(int)

In [None]:
## Confusion matrix for the training dataset

train_confusion = confusion_matrix(y_train_fin, predict_train1)
print(train_confusion)
train_report = classification_report(y_train_fin, predict_train1)
print(train_report)

In [None]:
## Confusion matrix for the test dataset

test_confusion = confusion_matrix(y_test_fin, predict_test1)
print(test_confusion)
test_report = classification_report(y_test_fin, predict_test1)
print(test_report)

## Saving results and model

In [None]:
# Put the raw test features, the thresholded prediction ('score') and the
# encoded true label side by side, then export the table as CSV.
x_test1_alt = x_test1.reset_index(drop=True)
y_test1_alt = y_test1.reset_index(drop=True)

test_results = pd.DataFrame({'score': predict_test1})
encoded_labels = pd.DataFrame(y_test_fin)

test = pd.concat([x_test1_alt, test_results, encoded_labels], axis=1)

path2 = r'preds_pt_base.csv'
test.to_csv(path2)

In [None]:
# Persist the trained model to 'pt_base.pkl' in the working directory.
joblib.dump(model, 'pt_base.pkl')

## SHAP Values

In [None]:
# Build the SHAP explainer with the training matrix as its data argument and
# explain that same matrix.
train_matrix = x_train_fin.values[:]
explainer = shap.Explainer(model, train_matrix)
shap_values = explainer(train_matrix)
shap_values

In [None]:
# The explainer was given a bare array, so attach the column names for
# labelling.
shap_values.feature_names = list(x_train_fin.columns)

In [None]:
shap_df = pd.DataFrame(shap_values.values, columns=shap_values.feature_names)

# Per-feature summary statistics of the SHAP values
# (mean of absolute values, mean, max, min, median).
shap_abs_avg = shap_df.abs().mean()
shap_avg = shap_df.mean()
shap_max = shap_df.max()
shap_min = shap_df.min()
shap_median = shap_df.median()

shap_summaries = (
    ('Media absoluta: ', shap_abs_avg),
    ('Media: ', shap_avg),
    ('Máximo: ', shap_max),
    ('Mínimo: ', shap_min),
    ('Mediana: ', shap_median),
)
for position, (title, stats) in enumerate(shap_summaries):
    if position:
        # Separator line between consecutive summaries.
        print('----------------------------------')
    print(title, '\n\n', stats)

In [None]:
fig, ax = plt.subplots()
# show=False stops shap from calling plt.show() itself, so the figure stays
# open and can still be saved through `fig` afterwards. With the default
# show=True the figure may already be closed (or blank) when it is saved.
shap.plots.beeswarm(shap_values, show=False)

In [None]:
# Export the SHAP figure as a tightly cropped PDF.
fig.savefig("pt_base_shap.pdf", bbox_inches="tight", format="pdf")  # or "shap_plot.svg" for SVG format

## What if Tool

In [None]:
num_datapoints = len(y_test_fin)
tool_height_in_px = 750

# Test features with the encoded target appended as a final 'Tipo salida 1'
# column.
test_features = x_test_fin.reset_index(drop=True)
test_labels = pd.DataFrame(y_test_fin, columns=['Tipo salida 1']).reset_index(drop=True)
examples_labels = pd.concat([test_features, test_labels], axis=1)

column_names = examples_labels.columns.tolist()
examples_wit = examples_labels.values.tolist()

# The label column is part of the examples but is removed before the rows
# are handed to the model.
label_position = examples_labels.columns.get_loc('Tipo salida 1')
columns_not_for_model_input = [label_position]

shown_examples = np.array(examples_wit[:num_datapoints])
model_inputs = np.delete(shown_examples, columns_not_for_model_input, axis=1)

def custom_predict_shap(examples_to_infer):
    """Prediction callback returning class probabilities and SHAP attributions.

    Args:
        examples_to_infer: list of example rows laid out like the
            notebook-level ``examples_wit`` (features plus label column).

    Returns:
        dict with two lists of equal length:
        ``'predictions'`` — the ``model.predict_proba`` row for each example;
        ``'attributions'`` — one ``{feature name: SHAP value}`` dict per
        example, keyed by the columns of ``x_train_fin``.
    """
    if not examples_to_infer:
        # Nothing to score; np.delete on an empty 1-D array would fail.
        return {'predictions': [], 'attributions': []}

    # Score the rows passed in by the caller instead of the global
    # ``model_inputs``, so the output always matches the requested examples
    # (e.g. a subset or an edited datapoint). The label column is stripped
    # because the model was trained without it.
    model_ins = np.delete(
        np.array(examples_to_infer), columns_not_for_model_input, axis=1
    )

    preds = model.predict_proba(model_ins).tolist()

    shap_output = explainer(model_ins)
    feature_names = list(x_train_fin.columns)
    # One attribution dict per example, pairing each feature with its value.
    attributions = [dict(zip(feature_names, row)) for row in shap_output.values]

    return {'predictions': preds, 'attributions': attributions}

In [None]:
# Configure the What-If Tool with the test examples, the prediction callback
# and the target column, then render the widget.

config_builder = WitConfigBuilder(examples_wit[:num_datapoints], column_names)
config_builder = config_builder.set_custom_predict_fn(custom_predict_shap)
config_builder = config_builder.set_target_feature('Tipo salida 1')
WitWidget(config_builder, height=tool_height_in_px)