In [56]:
from __future__ import absolute_import, division, print_function

import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, accuracy_score

import torch
# from torch.distributions import constraints

import pyro
import pyro.distributions as dist
from pyro.distributions import Normal, Uniform, Delta
from pyro.infer import EmpiricalMarginal, SVI, Trace_ELBO, JitTrace_ELBO, TracePredictive
from pyro.contrib.autoguide import AutoMultivariateNormal
from pyro.infer.mcmc.api import MCMC
from pyro.infer.mcmc import NUTS
from pyro.infer.mcmc.util import diagnostics
import pyro.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from pyro.ops.stats import waic
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

from copy import deepcopy
from imblearn.under_sampling import RandomUnderSampler


# Seed the random number generators through Pyro so repeated runs start from the same state.
pyro.set_rng_seed(1)
# Fail fast unless the installed Pyro release string starts with '0.4.1'.
assert pyro.__version__.startswith('0.4.1')

In [55]:
# Shell escape: installs `imblearn`, which the import cell needs for
# `from imblearn.under_sampling import RandomUnderSampler`.
# NOTE(review): this cell sits after the import cell, so on a fresh environment
# it has to be run first (or the package installed beforehand).
!pip install imblearn



In [6]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Emit log records at INFO level and above, printing only the message text.
logging.basicConfig(format='%(message)s', level=logging.INFO)
# Turn on Pyro's validation checks.
pyro.enable_validation(True)
# True when a `CI` variable is present in the environment.
# NOTE(review): not referenced by any of the cells visible here.
smoke_test = ('CI' in os.environ)
# Same seed as in the import cell; re-seeding here resets the RNG state again.
pyro.set_rng_seed(1)

In [57]:
# Load the raw table; the path is relative to the notebook's working directory.
data = pd.read_csv('data/crap.csv')

In [58]:
# Total incident count per row (FELONY + MISDEMEANOR + VIOLATION).
# Column-wise addition gives the same values as the former row-by-row
# `apply(lambda row: ...)`, including NaN propagation, without a Python-level
# call per row.
data['target'] = data['FELONY'] + data['MISDEMEANOR'] + data['VIOLATION']

In [59]:
# Store the combined count as integers (raises if any value is missing).
data['target'] = data['target'].astype(int)

In [72]:
def get_data(data, features, target, test_size=0.2, random_state=None):
    """Split ``data`` into train/test tensors with a class-balanced training set.

    The rows are split with ``train_test_split``. Both feature matrices are
    passed through ``sklearn.preprocessing.normalize`` and the training part
    is then balanced with ``RandomUnderSampler(random_state=42)``. The test
    part is left as-is (not under-sampled).

    Args:
        data: DataFrame holding the ``features`` columns and the ``target`` column.
        features: list of feature column names; must contain ``'TotalPop'``,
            which is also returned separately.
        target: name of the label column.
        test_size: forwarded to ``train_test_split`` (default ``0.2``, the
            value that used to be hard-coded).
        random_state: forwarded to ``train_test_split``; ``None`` (default)
            keeps the previous behaviour of using the global NumPy RNG.

    Returns:
        Tuple of six ``torch.float32`` tensors:
        ``(X_train, y_train, population, X_test, y_test, test_population)``.
        ``population`` holds the raw ``'TotalPop'`` values of exactly the
        training rows kept by the under-sampler, in the same order as
        ``X_train`` / ``y_train``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data[features], data[target], test_size=test_size, random_state=random_state
    )

    # Train data
    # NOTE(review): sklearn's `normalize` defaults to scaling each *row* to
    # unit norm, not each feature column - confirm that this is intended.
    X_np_train = normalize(np.array(X_train))
    y_np_train = np.array(y_train)

    rus = RandomUnderSampler(random_state=42)
    X_np_train, y_np_train = rus.fit_resample(X_np_train, y_np_train)

    # Positive count vs. negative count after balancing.
    print(y_np_train.sum(), len(y_np_train)-y_np_train.sum())

    X_nuts_train = torch.from_numpy(X_np_train).type(torch.float32)
    y_nuts_train = torch.from_numpy(y_np_train).type(torch.float32)

    # Select the population of the rows the sampler kept. Previously this used
    # every pre-sampling training row, so it did not line up with X/y.
    kept_population = np.array(X_train['TotalPop'])[rus.sample_indices_]
    population = torch.from_numpy(kept_population).type(torch.float32)

    # Test data
    X_np_test = normalize(np.array(X_test))
    y_np_test = np.array(y_test)

    X_nuts_test = torch.from_numpy(X_np_test).type(torch.float32)
    y_nuts_test = torch.from_numpy(y_np_test).type(torch.float32)

    test_population = torch.from_numpy(np.array(X_test['TotalPop'])).type(torch.float32)
    return X_nuts_train, y_nuts_train, population, X_nuts_test, y_nuts_test, test_population


In [73]:
# Module-level labelling thresholds read by sep_data:
# class 1 when target >= bool_split + offset, class 0 when target <= bool_split - offset.
bool_split, offset = 3, 3

In [74]:
def sep_data(row, split=None, margin=None):
    """Label one row as 1 (high), 0 (low) or NaN (ambiguous middle band).

    Args:
        row: mapping-like object with a ``'target'`` entry (e.g. a DataFrame
            row passed by ``DataFrame.apply(..., axis=1)``).
        split: centre of the band; defaults to the module-level ``bool_split``.
        margin: half-width of the band; defaults to the module-level ``offset``.

    Returns:
        ``1`` if ``row['target'] >= split + margin``,
        ``0`` if ``row['target'] <= split - margin``,
        otherwise ``float('nan')``.
    """
    # Fall back to the notebook globals only when no explicit value is given,
    # so existing `apply(sep_data, axis=1)` calls keep working unchanged.
    split = bool_split if split is None else split
    margin = offset if margin is None else margin

    count = row['target']
    if count >= split + margin:
        return 1
    if count <= split - margin:
        return 0
    return float('nan')

In [75]:
# Display the available column names (the `features` list is chosen from these).
data.columns

Index(['Unnamed: 0', 'month', 'CensusTract', 'TotalPop', 'Income', 'IncomeErr',
       'IncomePerCap', 'IncomePerCapErr', 'Poverty', 'ChildPoverty',
       'Professional', 'Service', 'Office', 'Construction', 'Production',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment', 'FELONY', 'MISDEMEANOR',
       'VIOLATION', 'restaurants', 'bar', 'park', 'subway_station', 'target'],
      dtype='object')

In [76]:
# Predictor columns: every column of `data` except 'Unnamed: 0', 'month',
# 'CensusTract', the three incident counts and the derived 'target'.
features = [
    'TotalPop',
    'Income',
    'IncomeErr',
    'IncomePerCap',
    'IncomePerCapErr',
    'Poverty',
    'ChildPoverty',
    'Professional',
    'Service',
    'Office',
    'Construction',
    'Production',
    'WorkAtHome',
    'MeanCommute',
    'Employed',
    'PrivateWork',
    'PublicWork',
    'SelfEmployed',
    'FamilyWork',
    'Unemployment',
    'restaurants',
    'bar',
    'park',
    'subway_station',
]
# Label column; it is created on a copy of `data` inside the threshold sweep.
target = 'binary_target'

In [None]:
# Sweep the labelling thresholds. For each (bool_split, offset) pair, rows with
# target >= bool_split + offset become class 1, rows with
# target <= bool_split - offset become class 0, and the band in between is
# dropped. A logistic regression is then fit on the balanced training split
# and scored on both splits.
#
# NOTE: sep_data reads the module-level names `bool_split` and `offset`, so the
# loop variables must keep exactly these names. They remain bound to the last
# pair once the loop finishes.
for bool_split in range(2,20):
    for offset in range(1,10):
        # Skip pairs whose lower cut-off would be zero or negative.
        if bool_split - offset <= 0:
            continue
        print(bool_split, offset)

        # `assign` returns a new frame, so `data` itself is never modified.
        # dropna() removes the unlabelled middle band (NaN label) and any row
        # with a missing value in another column.
        data_new = data.assign(binary_target=data.apply(sep_data, axis=1)).dropna()

        # After dropna the label is only ever 0 or 1; fewer than two distinct
        # values means one class is empty and nothing can be fit.
        if data_new['binary_target'].nunique() < 2:
            print('-'*30)
            continue

        X_nuts_train, y_nuts_train, population, X_nuts_test, y_nuts_test, test_population = get_data(data_new, features, target)
        reg = LogisticRegression(C=0.8).fit(X_nuts_train, y_nuts_train)
        print('Train Score : ',reg.score(X_nuts_train, y_nuts_train))
        print('Test Score :', reg.score(X_nuts_test, y_nuts_test))
        print('-'*30)

        

2 1
3513.0 3513.0
Train Score :  0.6827497865072587
Test Score : 0.6382252559726962
------------------------------
3 1




6588.0 6588.0
Train Score :  0.6955828779599271
Test Score : 0.6735807860262009
------------------------------
3 2




3507.0 3507.0
Train Score :  0.7392358140861135
Test Score : 0.7255389718076285
------------------------------
4 1




6143.0 6143.0
Train Score :  0.6974605241738564
Test Score : 0.7155555555555555
------------------------------
4 2




4775.0 4775.0
Train Score :  0.7358115183246073
Test Score : 0.7548228691687128
------------------------------
4 3




3485.0 3485.0
Train Score :  0.7793400286944046
Test Score : 0.7619577308120133
------------------------------
5 1




4824.0 4824.0
Train Score :  0.697139303482587
Test Score : 0.7261694058154236
------------------------------
5 2




3696.0 3696.0
Train Score :  0.7372835497835498
Test Score : 0.7517133956386293
------------------------------
5 3




2904.0 2904.0
Train Score :  0.7601584022038568
Test Score : 0.7882749894559258
------------------------------
5 4




2225.0 2225.0
Train Score :  0.7986516853932585
Test Score : 0.8019662921348315
------------------------------
6 1




3725.0 3725.0
Train Score :  0.7013422818791947
Test Score : 0.7357693263733532
------------------------------
6 2




2862.0 2862.0
Train Score :  0.7337526205450734
Test Score : 0.7734599884858953
------------------------------
6 3




2265.0 2265.0
Train Score :  0.7562913907284768
Test Score : 0.7930183356840621
------------------------------
6 4




1703.0 1703.0
Train Score :  0.8000587199060482
Test Score : 0.7898027898027898
------------------------------
6 5




1326.0 1326.0
Train Score :  0.8076923076923077
Test Score : 0.8309741881765196
------------------------------
7 1




2858.0 2858.0
Train Score :  0.7130860741777467
Test Score : 0.7407679139153828
------------------------------
7 2




2248.0 2248.0
Train Score :  0.7442170818505338
Test Score : 0.7736366127706221
------------------------------
7 3




1730.0 1730.0
Train Score :  0.7595375722543353
Test Score : 0.7832233741753063
------------------------------
7 4




1315.0 1315.0
Train Score :  0.7897338403041825
Test Score : 0.8075745983167559
------------------------------
7 5




1053.0 1053.0
Train Score :  0.798670465337132
Test Score : 0.8240837696335078
------------------------------
7 6




811.0 811.0
Train Score :  0.8255240443896424
Test Score : 0.8401869158878504
------------------------------
8 1




2244.0 2244.0
Train Score :  0.7210338680926917
Test Score : 0.757326229111165
------------------------------
8 2




1722.0 1722.0
Train Score :  0.7456445993031359
Test Score : 0.7621806689491704
------------------------------
8 3




1330.0 1330.0
Train Score :  0.762406015037594
Test Score : 0.776707530647986
------------------------------
8 4




1051.0 1051.0
Train Score :  0.7645099904852521
Test Score : 0.8002654280026543
------------------------------
8 5




800.0 800.0
Train Score :  0.78375
Test Score : 0.8122481869460113
------------------------------
8 6




621.0 621.0
Train Score :  0.7818035426731079
Test Score : 0.8419889502762431
------------------------------
8 7




495.0 495.0
Train Score :  0.804040404040404
Test Score : 0.8397177419354839
------------------------------
9 1




1721.0 1721.0
Train Score :  0.7315514235909355
Test Score : 0.7506593143131144
------------------------------
9 2




1343.0 1343.0
Train Score :  0.7446016381236039
Test Score : 0.7650371128743281
------------------------------
9 3




1050.0 1050.0
Train Score :  0.7585714285714286
Test Score : 0.788037486218302
------------------------------
9 4




793.0 793.0
Train Score :  0.7723833543505675
Test Score : 0.7881638846737481
------------------------------
9 5




643.0 643.0
Train Score :  0.7783825816485226
Test Score : 0.8057652711050103
------------------------------
9 6




477.0 477.0
Train Score :  0.7756813417190775
Test Score : 0.81738768718802
------------------------------
9 7




418.0 418.0
Train Score :  0.8038277511961722
Test Score : 0.8310502283105022
------------------------------
9 8




347.0 347.0
Train Score :  0.8256484149855908
Test Score : 0.8532494758909853
------------------------------
10 1




1316.0 1316.0
Train Score :  0.7291033434650456
Test Score : 0.7546450690805145
------------------------------
10 2




1015.0 1015.0
Train Score :  0.7502463054187192
Test Score : 0.7636181909045477
------------------------------
10 3




805.0 805.0
Train Score :  0.7652173913043478
Test Score : 0.7594701986754967
------------------------------
10 4




650.0 650.0
Train Score :  0.7661538461538462
Test Score : 0.786281179138322
------------------------------
10 5




492.0 492.0
Train Score :  0.7489837398373984
Test Score : 0.8032328256139261
------------------------------
10 6




401.0 401.0
Train Score :  0.7680798004987531
Test Score : 0.7982486865148861
------------------------------
10 7




321.0 321.0
Train Score :  0.7990654205607477
Test Score : 0.8131868131868132
------------------------------
10 8




270.0 270.0
Train Score :  0.8240740740740741
Test Score : 0.818975552968568
------------------------------
10 9




222.0 222.0
Train Score :  0.777027027027027
Test Score : 0.8506493506493507
------------------------------
11 1




1031.0 1031.0
Train Score :  0.7366634335596508
Test Score : 0.7450887573964498
------------------------------
11 2




794.0 794.0
Train Score :  0.7455919395465995
Test Score : 0.7565773297270716
------------------------------
11 3




642.0 642.0
Train Score :  0.7398753894080997
Test Score : 0.7616606868272681
------------------------------
11 4




475.0 475.0
Train Score :  0.7684210526315789
Test Score : 0.7798214768731404
------------------------------
11 5




399.0 399.0
Train Score :  0.7506265664160401
Test Score : 0.7847262247838617
------------------------------
11 6




342.0 342.0
Train Score :  0.7821637426900585
Test Score : 0.7700534759358288
------------------------------
11 7




255.0 255.0
Train Score :  0.7843137254901961
Test Score : 0.792981212336051
------------------------------
11 8




204.0 204.0
Train Score :  0.8063725490196079
Test Score : 0.8172871202396235
------------------------------
11 9




168.0 168.0
Train Score :  0.7827380952380952
Test Score : 0.8371681415929203
------------------------------
12 1




811.0 811.0
Train Score :  0.7385943279901356
Test Score : 0.733947119924457
------------------------------
12 2




631.0 631.0
Train Score :  0.7305863708399366
Test Score : 0.7476363636363637
------------------------------
12 3




472.0 472.0
Train Score :  0.7277542372881356
Test Score : 0.7540737026823765
------------------------------
12 4




420.0 420.0
Train Score :  0.7107142857142857
Test Score : 0.7686703096539163
------------------------------
12 5




323.0 323.0
Train Score :  0.7631578947368421
Test Score : 0.7586772342169992
------------------------------
12 6




271.0 271.0
Train Score :  0.7767527675276753
Test Score : 0.7836972343522561
------------------------------
12 7




222.0 222.0
Train Score :  0.777027027027027
Test Score : 0.7954906319466497
------------------------------
12 8




176.0 176.0
Train Score :  0.7755681818181818
Test Score : 0.8048606147248034
------------------------------
12 9




148.0 148.0
Train Score :  0.8108108108108109
Test Score : 0.8063820612332903
------------------------------
13 1




645.0 645.0
Train Score :  0.7286821705426356
Test Score : 0.7343199436222692
------------------------------
13 2




493.0 493.0
Train Score :  0.7363083164300203
Test Score : 0.733044733044733
------------------------------
13 3




405.0 405.0
Train Score :  0.7148148148148148
Test Score : 0.749631087063453
------------------------------
13 4




348.0 348.0
Train Score :  0.7471264367816092
Test Score : 0.7405720070868135
------------------------------
13 5




270.0 270.0
Train Score :  0.7277777777777777
Test Score : 0.7860330795484379
------------------------------
13 6




218.0 218.0
Train Score :  0.7568807339449541
Test Score : 0.7683195592286501
------------------------------
13 7




173.0 173.0
Train Score :  0.7745664739884393
Test Score : 0.7825322391559203
------------------------------
13 8




139.0 139.0
Train Score :  0.7697841726618705
Test Score : 0.8007662835249042
------------------------------
13 9




119.0 119.0
Train Score :  0.773109243697479
Test Score : 0.8185411426518145
------------------------------
14 1




481.0 481.0
Train Score :  0.7255717255717256
Test Score : 0.7402536402066698
------------------------------
14 2




401.0 401.0
Train Score :  0.7394014962593516
Test Score : 0.7339209147212958
------------------------------
14 3




337.0 337.0
Train Score :  0.7403560830860534
Test Score : 0.7473300970873786
------------------------------
14 4




271.0 271.0
Train Score :  0.7232472324723247
Test Score : 0.7537202380952381
------------------------------
14 5




206.0 206.0
Train Score :  0.6868932038834952
Test Score : 0.774292272379495
------------------------------
14 6




165.0 165.0
Train Score :  0.7484848484848485
Test Score : 0.7551505546751188
------------------------------
14 7




144.0 144.0
Train Score :  0.7395833333333334
Test Score : 0.780454042081949
------------------------------
14 8




121.0 121.0
Train Score :  0.7644628099173554
Test Score : 0.7875220718069452
------------------------------
14 9




92.0 92.0
Train Score :  0.7282608695652174
Test Score : 0.8378205128205128
------------------------------
15 1




413.0 413.0
Train Score :  0.698547215496368
Test Score : 0.7247427502338635
------------------------------
15 2




332.0 332.0
Train Score :  0.7018072289156626
Test Score : 0.7549763033175355
------------------------------
15 3




268.0 268.0
Train Score :  0.7593283582089553
Test Score : 0.7379923150816523
------------------------------
15 4




219.0 219.0
Train Score :  0.7168949771689498
Test Score : 0.7339853300733497
------------------------------
15 5




179.0 179.0
Train Score :  0.7430167597765364
Test Score : 0.7627837365926665
------------------------------
15 6




133.0 133.0
Train Score :  0.7556390977443609
Test Score : 0.7443647540983607
------------------------------
15 7




118.0 118.0
Train Score :  0.7288135593220338
Test Score : 0.7932131495227995
------------------------------
15 8




96.0 96.0
Train Score :  0.7552083333333334
Test Score : 0.8077777777777778
------------------------------
15 9




86.0 86.0
Train Score :  0.75
Test Score : 0.8129241664207731
------------------------------
16 1




327.0 327.0
Train Score :  0.746177370030581
Test Score : 0.7012622720897616
------------------------------
16 2




263.0 263.0
Train Score :  0.7262357414448669
Test Score : 0.7488799811365244
------------------------------
16 3




219.0 219.0
Train Score :  0.7009132420091324
Test Score : 0.7496420047732697
------------------------------
16 4




167.0 167.0
Train Score :  0.7664670658682635
Test Score : 0.7307413668196088
------------------------------
16 5




146.0 146.0
Train Score :  0.7123287671232876
Test Score : 0.7390130125214829
------------------------------
16 6




114.0 114.0
Train Score :  0.7412280701754386
Test Score : 0.7941912869303956
------------------------------
16 7


