# Connecting to GitHub
If running on Google Colab, the following will connect to GitHub and clone the repository.

In [None]:
git_username = ''
git_token =  ''

if git_username == '':
  print('Github username:')
  git_username = %sx read -p ''
  git_username = git_username[0]

if git_token == '':
  print('Github access token (https://github.com/settings/tokens):')
  print('Github Token:')
  git_token = %sx read -p ''
  git_token = git_token[0]

In [None]:
# Clone the entire repo.
%cd /content
!git clone -l -s https://$git_username:$git_token@github.com/lougau92/MA1-Development-of-XAI-based-framework-to-Understand-Predict-and-Link-Homicides.git research-project
%cd machine-learning-and-reasoning
!ls
!git init
%cd ..

In [None]:
# Update the entire repo.
%cd /content
%cd machine-learning-and-reasoning
!git pull
!ls
%cd ..

# Training Decision Trees

In [71]:
from sklearn.model_selection import cross_val_score, cross_val_predict 
from sklearn.metrics import confusion_matrix, make_scorer, classification_report, accuracy_score, balanced_accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid, GridSearchCV, StratifiedKFold, train_test_split
import pandas as pd
import numpy as np
import itertools
from typing import List
from preprocess import clean_dataframe, split_stratify, bin_age

random_state = 1
input_features = ['County', 'State', 'Area', 'VicAge', 'VicSex', 'VicRace', 'VicEthnic', 'VicCount', 'Weapon', 'Subcircum', 'Agency', 'Agentype', 'Circumstance', 'Homicide']
output_features = ['OffAge', 'OffSex', 'OffRace', 'OffEthnic', 'OffCount']
non_numeric_inputs = ['County', 'State', 'Area', 'VicSex', 'VicRace', 'VicEthnic', 'Weapon', 'Subcircum', 'Agency', 'Agentype', 'Circumstance', 'Homicide']

In [80]:
raw_data = pd.read_csv('Murder_Data.zip', index_col=0, compression='zip')
cleaned_data = clean_dataframe(raw_data)
cleaned_data['VicAge'].replace(to_replace='Unknown', value = 999, inplace = True)
cleaned_data['OffAge'] = bin_age(cleaned_data, 'OffAge')
cleaned_data['OffAge'] = cleaned_data['OffAge'].astype(str)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [82]:
def to_numeric(data: pd.DataFrame, non_numeric_features: List[str]):
    df = data.copy()
    encoder = OrdinalEncoder()
    encoder.fit(df[non_numeric_features])
    df[non_numeric_features] = encoder.transform(df[non_numeric_features])    
    return df, encoder

numeric_data, ordinal_encoder = to_numeric(cleaned_data, ['County', 'State', 'Area', 'VicSex', 'VicRace', 'VicEthnic', 'Weapon', 'Subcircum', 'Agency', 'Agentype', 'Circumstance', 'Homicide'])

In [4]:
train_sample, test_sample = split_stratify(cleaned_data, ['OffAge', 'OffSex', 'OffRace', 'OffEthnic'], 1.0, 0.0)

In [17]:
print(train_sample.info)

<bound method DataFrame.info of          Area  County FileMonth FileDay FileYear  State  Agency  Agentype  \
0         NaN     NaN         3      22       85    NaN     NaN       NaN   
1         NaN     NaN         3      22       85    NaN     NaN       NaN   
2         NaN     NaN         3      22       85    NaN     NaN       NaN   
3         NaN     NaN        10      26       93    NaN     NaN       NaN   
4         NaN     NaN        10      26       93    NaN     NaN       NaN   
...       ...     ...       ...     ...      ...    ...     ...       ...   
1711516   NaN     NaN         U      nk       no    NaN     NaN       NaN   
1711517   NaN     NaN         U      nk       no    NaN     NaN       NaN   
1711518   NaN     NaN         U      nk       no    NaN     NaN       NaN   
1711519   NaN     NaN         U      nk       no    NaN     NaN       NaN   
1711520   NaN     NaN         U      nk       no    NaN     NaN       NaN   

        Source Solved  ...  OffAge  OffSex 

In [103]:
def improved_tree_analysis(X_train:pd.DataFrame, y_train:pd.DataFrame, X_test:pd.DataFrame, y_test:pd.DataFrame, criterion:str = 'gini', random_state:int = 1, num_crossval:int = 5, verbosity: int = 0):
    assert criterion in ['entropy', 'gini'], 'invalid choice of criterion. Needs to be entropy or gini.'
    full_tree = DecisionTreeClassifier(random_state=random_state)
    full_tree.fit(X_train,y_train)
    ccp_alphas = full_tree.cost_complexity_pruning_path(X_train,y_train)['ccp_alphas']
    # ccp_alphas = [0.00001, 0.001, 0.005, 0.01, 0.05, 0.1]
    alpha_grid_search = GridSearchCV(
                            estimator=DecisionTreeClassifier(random_state=random_state),
                            scoring=make_scorer(accuracy_score),
                            param_grid=ParameterGrid({"ccp_alpha": [[alpha] for alpha in ccp_alphas]}),
                            n_jobs=-1,
                            cv=num_crossval,
                            verbose=verbosity
                        )
    alpha_grid_search.fit(X_train, y_train)
    report = classification_report(y_test, alpha_grid_search.best_estimator_.predict(X_test))
    print(report)
    return alpha_grid_search, report
    

def tree_analysis(X: pd.core.frame.DataFrame, y: pd.core.frame.DataFrame, n_crossval: int = 10, criterion: str = 'entropy', scoring = 'balanced_accuracy', random_state: int = 1, ccp_alpha: float = 0, verbosity: bool = 0, n_jobs = -1):
    assert criterion in ['entropy', 'gini'], 'invalid choice of criterion. Needs to be entropy or gini.'
    classifier = DecisionTreeClassifier(criterion = criterion, random_state=random_state, ccp_alpha=ccp_alpha)
    #y_pred = cross_val_predict(classifier, X = X, y = y, cv = n_crossval)
    cross_val_scores = cross_val_score(classifier, X = X, y = y, cv = n_crossval, scoring = scoring, verbose = 1, n_jobs = -1)
    #conf_matrix = confusion_matrix(y, y_pred)
    return(np.mean(cross_val_scores), np.std(cross_val_scores))#, conf_matrix)

In [104]:
subset = numeric_data.sample(n=50000)
train, test = train_test_split(subset)
alpha_grid, report = improved_tree_analysis(train[input_features], train['OffAge'], test[input_features], test['OffAge'], verbosity=2)

Fitting 5 folds for each of 11203 candidates, totalling 56015 fits




In [None]:
print(alpha_grid.best_params_)

{'ccp_alpha': 0.001}


In [None]:
# trying if training with dummies is feasible
X = pd.get_dummies(X, columns=['County', 'State', 'Area', 'VicSex', 'VicRace', 'VicEthnic', 'VicCount', 'Weapon', 'Subcircum', 'Agency', 'Agentype', 'Circumstance', 'Homicide'])
mean, std, conf_matrix = tree_analysis(X = X, y = y)
# result: training this one decision tree takes 12 hours with dummy variables