# Connecting to GitHub
If running on Google Colab, the following will connect to GitHub and clone the repository.

In [None]:
git_username = ''
git_token =  ''

if git_username == '':
  print('Github username:')
  git_username = %sx read -p ''
  git_username = git_username[0]

if git_token == '':
  print('Github access token (https://github.com/settings/tokens):')
  print('Github Token:')
  git_token = %sx read -p ''
  git_token = git_token[0]

In [None]:
# Clone the entire repo.
%cd /content
!git clone -l -s https://$git_username:$git_token@github.com/lougau92/MA1-Development-of-XAI-based-framework-to-Understand-Predict-and-Link-Homicides.git research-project
%cd research-project
!ls
!git init

In [None]:
# Update repo.
%cd /content
%cd research-project
!git pull
!ls

# Training Decision Trees

In [1]:
from sklearn.model_selection import cross_val_score, cross_val_predict 
from sklearn.metrics import confusion_matrix, make_scorer, classification_report, accuracy_score, balanced_accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid, GridSearchCV, StratifiedKFold, train_test_split
import pandas as pd
import numpy as np
import itertools
from typing import List
from preprocess import clean_dataframe, split_stratify, bin_age, to_numeric

random_state = 1
input_features = ['County', 'State', 'Area', 'VicAge', 'VicSex', 'VicRace', 'VicEthnic', 'VicCount', 'Weapon', 'Subcircum', 'Agency', 'Agentype', 'Circumstance', 'Homicide']
output_features = ['OffAge', 'OffSex', 'OffRace', 'OffEthnic', 'OffCount']
non_numeric_inputs = ['County', 'State', 'Area', 'VicSex', 'VicRace', 'VicEthnic', 'Weapon', 'Subcircum', 'Agency', 'Agentype', 'Circumstance', 'Homicide']

In [7]:
raw_data = pd.read_csv('Murder_Data.zip', index_col=0, compression='zip')
cleaned_data = clean_dataframe(raw_data)
cleaned_data['VicAge'].replace(to_replace='Unknown', value = 999, inplace = True)
cleaned_data['OffAge'] = bin_age(cleaned_data, 'OffAge')
cleaned_data['OffAge'] = cleaned_data['OffAge'].astype(str)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [8]:
numeric_data, ordinal_encoder = to_numeric(cleaned_data, use_ordinal_encoder=True, non_numeric_features=['County', 'State', 'Area', 'VicSex', 'VicRace', 'VicEthnic', 'Weapon', 'Subcircum', 'Agency', 'Agentype', 'Circumstance', 'Homicide'])

In [9]:
train_sample, test_sample = split_stratify(numeric_data, ['OffAge', 'OffSex', 'OffRace', 'OffEthnic'], 1.0, 0.0)
print(train_sample.info)

<bound method DataFrame.info of           Area  County FileMonth FileDay FileYear  State  Agency  Agentype  \
0        347.0  1507.0         3      22       85   25.0   811.0       1.0   
1        347.0  1507.0         3      22       85   25.0   811.0       1.0   
2        347.0  1507.0         3      22       85   25.0   811.0       1.0   
3        228.0  1194.0        10      26       93    9.0  5774.0       4.0   
4        228.0  1194.0        10      26       93    9.0  5774.0       4.0   
...        ...     ...       ...     ...      ...    ...     ...       ...   
1711516  197.0   203.0         U      nk       no    9.0  2669.0       5.0   
1711517  228.0  1188.0         U      nk       no    9.0  5754.0       1.0   
1711518  232.0   192.0         U      nk       no    9.0   823.0       4.0   
1711519  232.0   192.0         U      nk       no    9.0  5843.0       1.0   
1711520  221.0   993.0         U      nk       no    9.0  5621.0       1.0   

        Source Solved  ...  Off

In [9]:
def improved_tree_analysis(X_train:pd.DataFrame, y_train:pd.DataFrame, X_test:pd.DataFrame, y_test:pd.DataFrame, criterion:str = 'gini', random_state:int = 1, num_crossval:int = 5, verbosity: int = 0):
    assert criterion in ['entropy', 'gini'], 'invalid choice of criterion. Needs to be entropy or gini.'
    # only use cost_complexity_pruning_path if you have a lot of time and computational ressources
    # full_tree = DecisionTreeClassifier(random_state=random_state)
    # full_tree.fit(X_train,y_train)
    # ccp_alphas = full_tree.cost_complexity_pruning_path(X_train,y_train)['ccp_alphas']
    
    ccp_alphas = [0.00001, 0.001, 0.005, 0.01, 0.05, 0.1]
    alpha_grid_search = GridSearchCV(
                            estimator=DecisionTreeClassifier(random_state=random_state),
                            scoring=make_scorer(accuracy_score),
                            param_grid=ParameterGrid({"ccp_alpha": [[alpha] for alpha in ccp_alphas]}),
                            n_jobs=-1,
                            cv=num_crossval,
                            verbose=verbosity
                        )
    alpha_grid_search.fit(X_train, y_train)
    report = classification_report(y_test, alpha_grid_search.best_estimator_.predict(X_test))
    print(report)
    print(f'Best parameters: {alpha_grid_search.best_params_}')
    return alpha_grid_search, report

In [10]:
subset = numeric_data.sample(n=50000, random_state=random_state)
train, test = train_test_split(subset)
alpha_grid, report = improved_tree_analysis(train[input_features], train['OffAge'], test[input_features], test['OffAge'], verbosity=1)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       15-18       0.34      0.29      0.31      1260
       19-22       0.32      0.24      0.28      2202
       23-30       0.32      0.66      0.43      3399
        3-14       0.00      0.00      0.00       143
         30s       0.28      0.27      0.27      2484
         40s       0.30      0.12      0.17      1357
         50s       0.00      0.00      0.00       634
         60s       0.00      0.00      0.00       246
         70s       0.00      0.00      0.00       107
         80+       0.00      0.00      0.00        38
     Unknown       0.00      0.00      0.00       629
         nan       0.00      0.00      0.00         1

    accuracy                           0.32     12500
   macro avg       0.13      0.13      0.12     12500
weighted avg       0.27      0.32      0.27     12500

Best parameters: {'ccp_alpha': 0.001}


  _warn_prf(average, modifier, msg_start, len(result))
