In [1]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import svm

from pmlb import fetch_data

## Read Data
You can change the code below to read your own data.

In [2]:
# Dataset identifier; output_data() uses it to build the output directory path.
data_name = "crime"
file_path = "input/crime/communities.data"

# The file has no header row, so pandas numbers the columns 0..N-1.
# Missing entries show up as "?" in the preview, so the affected columns are
# loaded as strings (object dtype) rather than numbers.
df = pd.read_csv(file_path, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.0,?,0.67
2,24,?,?,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,?,?,?,?,0.0,?,0.43
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,?,?,?,?,0.0,?,0.12
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,?,?,?,?,0.0,?,0.03


In [4]:
# Column metadata file: each line is split on ":" into column 0 (feature name)
# and column 1 (free-text description).
cols = pd.read_csv("input/crime/columns.csv", sep=":", header=None)
cols.head(2)

Unnamed: 0,0,1
0,state,US state (by number) - not counted as predict...
1,county,"numeric code for county - not predictive, and..."


In [5]:
# Replace the positional column labels with the names from column 0 of the metadata table.
df.columns = cols[0].tolist()
df.head(2)

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.0,?,0.67


In [6]:
# Inspect the inferred dtypes; columns containing "?" placeholders (e.g. county) come out as object.
df.dtypes

state                    int64
county                  object
community               object
communityname           object
fold                     int64
                        ...   
LemasPctPolicOnPatr     object
LemasGangUnitDeploy     object
LemasPctOfficDrugUn    float64
PolicBudgPerPop         object
ViolentCrimesPerPop    float64
Length: 128, dtype: object

In [7]:
# Names of the columns pandas parsed as numeric; object (string) columns are excluded.
numerical_cols = df.select_dtypes(include="number").columns
numerical_cols

Index(['state', 'fold', 'population', 'householdsize', 'racepctblack',
       'racePctWhite', 'racePctAsian', 'racePctHisp', 'agePct12t21',
       'agePct12t29',
       ...
       'PctForeignBorn', 'PctBornSameState', 'PctSameHouse85', 'PctSameCity85',
       'PctSameState85', 'LandArea', 'PopDens', 'PctUsePubTrans',
       'LemasPctOfficDrugUn', 'ViolentCrimesPerPop'],
      dtype='object', length=102)

In [8]:
# Number of numeric columns; this count still includes the raw target 'ViolentCrimesPerPop'.
numerical_cols.values.shape

(102,)

In [9]:
# (rows, columns) of the full table, numeric and non-numeric columns alike.
df.shape

(1994, 128)

In [36]:
# Frequency of each distinct value in the raw target column.
df['ViolentCrimesPerPop'].value_counts()

0.03    104
0.04     92
0.06     86
0.05     80
0.02     74
       ... 
0.78      2
0.77      1
0.94      1
0.89      1
0.96      1
Name: ViolentCrimesPerPop, Length: 98, dtype: int64

In [10]:
# Median of the raw target; the next cell uses it as the threshold for the binary label.
df['ViolentCrimesPerPop'].median()

0.15

In [11]:
# Binary label: 1 when the rate is strictly above the column median, otherwise 0.
gt = df['ViolentCrimesPerPop'].gt(df['ViolentCrimesPerPop'].median()).astype(int)
df['gt'] = gt

# Check how evenly the median threshold splits the rows.
df['gt'].value_counts()

0    1001
1     993
Name: gt, dtype: int64

## Train a classification model
In this step, you can either train a model in the cell below, or skip training and load your own model's predictions for the training data.

In [14]:
# --- Prepare data ---
# One seed for both the split and the classifier, so re-running this cell
# reproduces the same split, the same accuracy and the same exported data.
SEED = 1
target_col = 'gt'

# Exclude the label and the raw column it was derived from *by name*.
# A positional `[:-1]` slice is only correct while 'ViolentCrimesPerPop' is the last
# numeric column: if the dtype-selection cell is re-run after 'gt' has been added,
# the slice would drop 'gt' instead and leak the raw target into the features.
# NOTE(review): 'state' and 'fold' stay in the feature set; the column metadata
# appears to mark 'state' as non-predictive -- confirm whether they should be dropped.
leaky_cols = ['ViolentCrimesPerPop', target_col]
to_keep = numerical_cols.drop(leaky_cols, errors='ignore').values
X = df[to_keep].values
y = df[target_col].values  # already 1-D, so no reshape is needed

# 80/20 split with a fixed random_state (an unseeded split changes on every run).
train, test, train_labels, test_labels = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# DataFrame views of the two splits, labelled with the feature names.
train_df = pd.DataFrame(train, columns=to_keep)
test_df = pd.DataFrame(test, columns=to_keep)


# --- Train the model ---
clf = MLPClassifier(random_state=SEED, max_iter=350)
clf.fit(train, train_labels)

# --- Report accuracy on the held-out split ---
print(clf.score(test, test_labels))

0.8370927318295739


## Output Training Data

In [15]:
def output_data(cols, data, target_names, real_min, real_max, y_pred, y_gt):
    '''
    Serialize a dataset and a model's predictions to ./output/<data_name>/test.json.

    The directory name comes from the notebook-level variable `data_name`; the
    directory is created if it does not exist and an existing file is overwritten.
    All arguments must already be JSON-serializable (e.g. plain lists, not arrays).

    cols: the column names of the input data.
    data: the training input.
    target_names: the names for the target classes (ground truth).
    real_min: minimal values for columns of the input data.
    real_max: maximal values for columns of the input data.
    y_pred: the prediction of the input data from the model you want to explain.
    y_gt: the ground truth of the input data.
    '''
    filename = "./output/"+data_name+"/test.json"
    # exist_ok=True replaces the check-then-create pattern and is safe to re-run.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    to_output = {
        'columns': cols,
        'data': data,
        'target_names': target_names,
        'real_min': real_min,
        'real_max': real_max,
        'y_pred': y_pred,
        'y_gt': y_gt,
        # Derived from the class names instead of a hard-coded 2, so the same
        # function works when the notebook is adapted to a multi-class dataset.
        'n_cls': len(target_names),
    }
    with open(filename, 'w') as output:
        json.dump(to_output, output)

In [16]:
# Predict on the training split: these predictions are exported together with the training rows.
y_pred = clf.predict(train)

In [17]:
# Display names for the two classes (presumably index 0 for gt == 0, index 1 for gt == 1).
target_names = ["crime_rate<15%", "crime_rate>15%"]

# Per-feature value range, taken over the full feature matrix X (train and test rows together).
min_val = X.min(axis=0)
max_val = X.max(axis=0)

# Convert every array to plain lists so the payload is JSON-serializable, then write it out.
output_data(
    cols=to_keep.tolist(),
    data=train.tolist(),
    target_names=target_names,
    real_min=min_val.tolist(),
    real_max=max_val.tolist(),
    y_pred=y_pred.tolist(),
    y_gt=train_labels.tolist(),
)