<h1>Crime prediction</h1>

<h3>Data merging, cleaning and completing - R script (main.R)</h3>
1. Select the date range (from date, to date), extended by the number of past days needed to extract information
2. Select the area (from rectangle to rectangle, or better from latitude/longitude to latitude/longitude), extended by an extra margin so that information can be extracted at the border

<h3>Generate numpy arrays - IPython notebook (preprocess_neighbour_arrays.ipynb)</h3>
1. Go through rectangles
2. Go through dates
3. Select the surroundings within the defined distance for the day before -> 2D array
4. Save array and result into result array -> 3D array

<h3>Extract features using CNN - IPython notebook (crime_feature_extraction_CNN.ipynb)</h3>
1. Train convolution net
2. Remove last layer
3. Generate feature vector for all data

<h3>Extract features using timeseries and RNN - IPython notebook (crime_feature_extraction_RNN.ipynb) </h3>
1. Train RNN net
2. Generate feature vector for all data

<h3>Features selection - IPython notebook (crime_feature_selection.ipynb)</h3>
1. Load and select data
2. Join together with the tm_data_complete dataset
3. Train tree model
4. Visualize feature importance
5. Select the most important features
6. Save data

<h2><font color='green'>Crime prediction - IPython notebook (crime_prediction.ipynb)</font></h2>
<b>
1. Create models (Deep learning, Random Forest, Gradient Boosting Machines)
2. Train models
3. Test models
4. Collect and visualize the results
5. Do it for various features
6. Compare all models and select best results
</b>

In [None]:
import numpy as np
from numpy.ma import masked_array

import pandas as pd

import time
from tqdm import tqdm

import copy

import math

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import matplotlib.gridspec as gridspec
import matplotlib.colors as col

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import seaborn as sns

import h2o

from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

from h2o.grid.grid_search import H2OGridSearch

import os

# --- Experiment configuration -------------------------------------------
# raster, crime_type, date_from, date_to and dist are substituted into the
# name of the input CSV that the data-loading cell reads.
raster = 200
crime_type = "E05"
date_from = "2013-06-16"
date_to = "2017-03-08"
dist = 5
# presumably the number of lagged-day columns (d1..d21) - TODO confirm
days = 21

# Project root; every input and output path in this notebook is built from it.
# The cell used to assign two machine-specific paths in a row, so only the
# second one ever took effect. That one is kept as the default; any other
# machine can point the notebook elsewhere through the DIP_MAIN_DIR
# environment variable instead of editing the cell.
main_dir = os.environ.get("DIP_MAIN_DIR", "/home/elita/keras/DIP")

In [None]:
# Load the merged table (preprocessed data + time series + neighbours) and
# order it chronologically; the later train/validation/test split slices
# this frame by row position.
data_dir = "%s/output/python/complet_tm_neigh_%d_%s_%s_%s_x_%d_s_select_pca.csv" % (
    main_dir, raster, crime_type, date_from, date_to, dist,
)
data = pd.read_csv(data_dir).sort_values("date")
print(data.shape)
data.head()

In [None]:
# Rescale the lagged-day columns d1..d21 by 1/7.
# NOTE: this overwrites the columns of `data`, so running the cell a second
# time divides them again - reload `data` before re-running.
for i in range(1, 22):
    name = "d%d" % i
    data[name] = data[name] / 7

# Target column. With binary=True every positive count is collapsed to 1,
# which turns the task into a binary classification.
y = 'crimecount_x'

binary = True
if binary:
    # A single .loc assignment writes into `data` itself. The former chained
    # form `data.crimecount_x[mask] = 1` can end up writing into a temporary
    # copy (pandas SettingWithCopy) and leave `data` unchanged.
    data.loc[data[y] > 0, y] = 1

max_y = int(data[y].max())

# Distribution of the target (np.histogram default binning).
hist, bin_edges = np.histogram(data[y])
plt.bar(bin_edges[:-1], hist, width=1)
plt.xlim(min(bin_edges), max(bin_edges))
plt.show()

if binary:
    # For a 0/1 target the first and last bins hold the two class counts;
    # hist[-1] does not depend on the number of bins the way hist[9] did.
    print([hist[0], hist[-1]])
else:
    print(hist)

In [None]:
# Most recent date present in the data set.
data["date"].max()

In [None]:
# Chronological train / validation / test split.
# `data` is sorted by date, so slicing by row position keeps the oldest rows
# for training and the newest for testing.

train_days_percent = 0.7
validation_days_percent = 0.2

# One row per distinct id -> number of ids in the data set.
ids_count = len(data.drop_duplicates("id"))
rows = len(data)

total_days = rows/ids_count

train_rows = int(ids_count * total_days * train_days_percent)
validation_rows = int(ids_count * total_days * validation_days_percent)
test_rows = rows - train_rows - validation_rows

print("Train rows: %d, validation rows: %d, test_rows: %d, all: %d < %d" % (train_rows, validation_rows, test_rows, (train_rows+validation_rows+test_rows), rows))

validation_end = train_rows + validation_rows
train = data.iloc[:train_rows]
validation = data.iloc[train_rows:validation_end]
test = data.iloc[validation_end:]

n_train = train.shape[0]
n_validation = validation.shape[0]
n_test = test.shape[0]


# Summary shown as the cell output: split sizes, their sum, total rows,
# maximum target value and the first date of each split.
n_train, n_validation, n_test, (n_train + n_validation + n_test), rows, max_y, train.date.min(), validation.date.min(), test.date.min()

In [None]:
# Figure A: absolute class counts for the training / validation / test splits.
ind = np.arange(2)
width = 0.25
fig, ax = plt.subplots()

# First and last histogram bin of each split; for a 0/1 target with both
# classes present these are the class-0 and class-1 counts.
hist, _ = np.histogram(train[y])
train_c = (hist[0], hist[-1])
hist, _ = np.histogram(validation[y])
validation_c = (hist[0], hist[-1])
hist, _ = np.histogram(test[y])
test_c = (hist[0], hist[-1])

# One bar group per split, shifted by one bar width each.
rects1, rects2, rects3 = [
    ax.bar(ind + shift * width, counts, width)
    for shift, counts in enumerate((train_c, validation_c, test_c))
]

axis_font = dict(size='20')
font_prop = font_manager.FontProperties(size=16)

ax.set_xticks(ind + width)
ax.set_xticklabels(('0', '1'))
ax.set_ylim(0,1300000)
ax.set_xlabel('Class', **axis_font)
ax.set_ylabel('Count', **axis_font)

for label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
    label.set_fontsize(16)

ax.legend((rects1[0], rects2[0], rects3[0]), ('Training', 'Validation', 'Test'), prop= font_prop)

def autolabel(rects, ax, decimal = False):
    """Write the height of every bar just above the bar.

    Args:
        rects: iterable of bar objects exposing get_x(), get_width() and
            get_height().
        ax: axes the bars belong to; its y-limits define the vertical gap
            (1 % of the y-range) between a bar and its label.
        decimal: when True the height is printed with three decimals,
            otherwise it is truncated to an integer.
    """
    y_bottom, y_top = ax.get_ylim()
    gap = (y_top - y_bottom) * 0.01

    if decimal:
        as_text = lambda value: '%0.3f' % value
    else:
        as_text = lambda value: '%d' % int(value)

    for bar in rects:
        bar_height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2.
        ax.text(x_center, bar_height + gap, as_text(bar_height), ha='center', va='bottom', size = 16)
            

# Finish figure A: annotate the bars with their counts, then save and show.
for bars in (rects1, rects2, rects3):
    autolabel(bars, ax)

plt.savefig("%s/images/data_spliting_a.png" % main_dir)
plt.show()


# Figure B: the same comparison with each split normalised to sum to 1.
fig, ax = plt.subplots()

hist, _ = np.histogram(train[y])
train_c = np.array((hist[0], hist[-1]))
hist, _ = np.histogram(validation[y])
validation_c = np.array((hist[0], hist[-1]))
hist, _ = np.histogram(test[y])
test_c = np.array((hist[0], hist[-1]))

train_cn, validation_cn, test_cn = [
    (counts / counts.sum()).tolist()
    for counts in (train_c, validation_c, test_c)
]

rects1 = ax.bar(ind, train_cn, width)
rects2 = ax.bar(ind+width, validation_cn, width)
rects3 = ax.bar(ind+2*width, test_cn, width)

# Axis labels, ticks and limits
ax.set_ylabel('Normalised count', **axis_font)
ax.set_xlabel('Class', **axis_font)
ax.set_xticks(ind + width)
ax.set_xticklabels(('0', '1'))
ax.set_ylim(0,1.1)

for label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
    label.set_fontsize(16)

ax.legend((rects1[0], rects2[0], rects3[0]), ('Training', 'Validation', 'Test'), prop= font_prop)

for bars in (rects1, rects2, rects3):
    autolabel(bars, ax, True)

plt.savefig("%s/images/data_spliting_n.png" % main_dir)

plt.show()

In [None]:
# Write the three splits, all columns, as comma-separated files.
# NOTE: np.savetxt prefixes the header line with "# " by default (its
# `comments` argument), so the first column name in the files reads "# id".
header = ",".join(data.columns.values)
for path_template, split in (
    ("%s/output/python/train.csv", train),
    ("%s/output/python/validation.csv", validation),
    ("%s/output/python/test.csv", test),
):
    np.savetxt(path_template % main_dir, split, fmt='%s', header=header, delimiter=',')

In [None]:
# Largest target value in the training split.
train.loc[:, y].max()

In [None]:
# Write the reduced splits (columns from position 182 onwards) as
# semicolon-separated files.
pca_start = 182
header = ",".join(data.columns.values[pca_start:])
for path_template, split in (
    ("%s/output/python/train_pca_only.csv", train),
    ("%s/output/python/validation_pca_only.csv", validation),
    ("%s/output/python/test_pca_only.csv", test),
):
    np.savetxt(path_template % main_dir, split.iloc[:, pca_start:], fmt='%s', header=header, delimiter=';')

In [None]:
# Preview the first training rows, columns from position 183 onwards.
train.head().iloc[:, 183:]

In [None]:
# Connect to a running H2O cluster or start a local one with at least 10 GB
# of memory. To restart a cluster, call h2o.cluster().shutdown() beforehand.
h2o.init(
    min_mem_size="10G",
)

In [None]:
# Schema of the *_pca_only.csv files: one categorical target column followed
# by the numeric PCA components PCA_0 .. PCA_162.
n_pca_components = 163
column_names = ["crimecount"] + ["PCA_%d" % k for k in range(n_pca_components)]
column_types = ["enum"] + ["numeric"] * n_pca_components

# Parsing options shared by the three frames.
h2o_schema = dict(header=1, col_names=column_names, col_types=column_types)

trainH2o = h2o.upload_file(
    path="%s/output/python/train_pca_only.csv"%main_dir,
    destination_frame="train_data",
    **h2o_schema
)
validationH2o = h2o.import_file(
    path="%s/output/python/validation_pca_only.csv"%main_dir,
    destination_frame="validation_data",
    **h2o_schema
)
testH2o = h2o.import_file(
    path="%s/output/python/test_pca_only.csv"%main_dir,
    destination_frame="test_data",
    **h2o_schema
)

# Ground-truth labels of the test split, read back from the full test file.
true_values = pd.read_csv(
    "%s/output/python/test.csv"%main_dir,
    skipinitialspace=True,
    usecols=["crimecount_x"],
).iloc[:,0]

In [None]:
# Schema of the full train/validation/test CSV files (188 columns).
column_names = (
    ["id", "date"]
    + ["d%d" % k for k in range(1, 22)]
    + ["praha", "idx200", "x200", "idy200", "y200", "Y",
       "Mo", "W", "WD", "D", "MoCat", "lokalitarelevance", "vzdbcs", "vzdpostabanka",
       "vzdprodejna", "vzdrestaurace"]
    + ["datDoOdDiffDf_%d" % k for k in range(8)]
    + ["crimecount_x"]
    + ["stavobj_%s" % suffix for suffix in "ABCDFG"]
    + ["datodHCat_%d" % k for k in range(1, 6)]
    + [str(k) for k in range(121)]
    + ["x", "y", "crimecount_y"]
    + ["PCA_%d" % k for k in range(5)]
)

# Columns parsed as numbers; "date" is a string and everything else is a
# categorical ("enum") column.
numeric_columns = {
    "id", "idx200", "x200", "idy200", "y200",
    "lokalitarelevance", "vzdbcs", "vzdpostabanka", "vzdprodejna", "vzdrestaurace",
    "0", "1", "x", "y",
    "PCA_0", "PCA_1", "PCA_2", "PCA_3", "PCA_4",
}
column_types = [
    "string" if column == "date" else "numeric" if column in numeric_columns else "enum"
    for column in column_names
]

# Parsing options shared by the three frames.
h2o_schema = dict(header=1, col_names=column_names, col_types=column_types)

trainH2o = h2o.upload_file(
    path="%s/output/python/train.csv"%main_dir,
    destination_frame="train_data",
    **h2o_schema
)
validationH2o = h2o.import_file(
    path="%s/output/python/validation.csv"%main_dir,
    destination_frame="validation_data",
    **h2o_schema
)
testH2o = h2o.import_file(
    path="%s/output/python/test.csv"%main_dir,
    destination_frame="test_data",
    **h2o_schema
)

# Ground-truth labels of the test split.
true_values = pd.read_csv(
    "%s/output/python/test.csv"%main_dir,
    skipinitialspace=True,
    usecols=["crimecount_x"],
).iloc[:,0]

In [None]:
# Custom three-stop colormaps looked up by name in the confusion-matrix plots:
# 'wrong' runs from cream to red, 'correct' from pale green to dark green.
startcolor, midcolor, endcolor = '#fff2e5', '#f2a285', '#c40d21'
cmap1 = col.LinearSegmentedColormap.from_list('own1', [startcolor, midcolor, endcolor])
plt.cm.register_cmap(name='wrong', cmap=cmap1)

startcolor, midcolor, endcolor = '#f3ffe5', '#b4f185', 'darkgreen'
cmap2 = col.LinearSegmentedColormap.from_list('own2', [startcolor, midcolor, endcolor])
plt.cm.register_cmap(name='correct', cmap=cmap2)

# Class labels used for the confusion-matrix tick labels.
classes = [0, 1]

def print_save_single_roc(true_values, predictions, color, title, fig_name):
    """Plot, save and show the ROC curve of a single model.

    Args:
        true_values: ground-truth labels passed to sklearn's roc_curve.
        predictions: predicted scores passed to roc_curve.
        color: matplotlib colour of the curve.
        title: plot title.
        fig_name: file name (without extension) of the PNG written to
            <main_dir>/images/.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(true_values, predictions)
    area = auc(false_pos_rate, true_pos_rate)
    line_width = 2

    fig = plt.figure(figsize=(5,5))
    ax = fig.add_subplot(111)
    ax.plot(false_pos_rate, true_pos_rate, color=color, lw=line_width, label='ROC curve (area = %0.4f)' % area)
    # Diagonal = chance level
    ax.plot([0, 1], [0, 1], color='gray', lw=line_width, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend(loc="lower right")
    plt.savefig("%s/images/%s.png" % (main_dir, fig_name))
    plt.show()


def print_save_multy_roc(true_values, predictions, n, model_names, colors, title, fig_name):
    """Plot, save and show the ROC curves of several models in one figure.

    Args:
        true_values: ground-truth labels shared by all models.
        predictions: indexable collection; predictions[i] holds the scores
            of model i.
        n: number of models to draw (indices 0 .. n-1).
        model_names: model_names[i] is used in the legend entry of model i.
        colors: colors[i] is the line colour of model i.
        title: plot title.
        fig_name: file name (without extension) of the PNG written to
            <main_dir>/images/.
    """
    # (fpr, tpr) pair and area under the curve for every model
    curves = [roc_curve(true_values, predictions[k])[:2] for k in range(n)]
    areas = [auc(fpr_k, tpr_k) for fpr_k, tpr_k in curves]

    title_font = {'size':'30', 'color':'black', 'weight':'normal', 'verticalalignment':'bottom'}
    axis_font = {'size':'25'}
    legend_font = font_manager.FontProperties(size=20)

    fig = plt.figure(figsize=(15,15))
    ax = fig.add_subplot(111)
    for tick_label in (ax.get_xticklabels() + ax.get_yticklabels()):
        tick_label.set_fontsize(16)

    line_width = 2
    for k, (fpr_k, tpr_k) in enumerate(curves):
        legend_text = 'ROC curve of {0} model (area = {1:0.4f})'.format(model_names[k], areas[k])
        plt.plot(fpr_k, tpr_k, color=colors[k], lw=line_width, label=legend_text)

    # Diagonal = chance level
    plt.plot([0, 1], [0, 1], color='gray', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', **axis_font)
    plt.ylabel('True Positive Rate', **axis_font)
    plt.title(title, **title_font)
    plt.legend(loc="lower right", prop=legend_font)
    plt.savefig("%s/images/%s.png" % (main_dir, fig_name))
    plt.show()
    
def print_save_multy_prc(true_values, predictions, n, model_names, colors, title, fig_name):
    """Plot, save and show the precision-recall curves of several models.

    Args:
        true_values: ground-truth binary labels (0/1) shared by all models.
        predictions: indexable collection; predictions[i] holds the scores
            of model i.
        n: number of models to draw (indices 0 .. n-1).
        model_names: model_names[i] is used in the legend entry of model i.
        colors: colors[i] is the line colour of model i.
        title: plot title.
        fig_name: file name (without extension) of the PNG written to
            <main_dir>/images/.
    """
    precision = dict()
    recall = dict()
    average_precision = dict()
    for i in range(n):
        precision[i], recall[i], _ = precision_recall_curve(true_values, predictions[i])
        average_precision[i] = average_precision_score(true_values, predictions[i])

    title_font = {'size':'30', 'color':'black', 'weight':'normal', 'verticalalignment':'bottom'}
    axis_font = {'size':'25'}
    font_prop = font_manager.FontProperties(size=20)

    fig = plt.figure(figsize=(15,15))
    ax = fig.add_subplot(111)
    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
        label.set_fontsize(16)

    lw = 2
    for j in range(n):
        # Recall on x and precision on y, so the curve matches the axis
        # labels set below (the arguments used to be passed the other way
        # round, which drew the curve transposed).
        plt.plot(recall[j], precision[j], color=colors[j], lw=lw, label='PRC curve of {0} model (area = {1:0.4f})'.format(model_names[j], average_precision[j]))

    # Baseline: precision of a classifier that flags samples at random, i.e.
    # the share of positives among all samples (not positives / negatives).
    labels = np.asarray(true_values)
    prevalence = float(np.mean(labels == 1)) if labels.size else 0.0
    plt.plot([0, 1], [prevalence, prevalence], color='gray', lw=lw, linestyle='--')
    plt.xlim([0.0, 0.5])
    plt.ylim([0.0, 0.2])
    plt.xlabel('Recall',  **axis_font)
    plt.ylabel('Precision', **axis_font)
    plt.title(title, **title_font)
    plt.legend(loc="upper right", prop=font_prop)
    plt.savefig("%s/images/%s.png" % (main_dir, fig_name))
    plt.show()
    
def _find_best_threshold(true_values, predictions, thresholds, score_fn, patience=None):
    """Return the cut-off from `thresholds` that maximises `score_fn`.

    Every candidate cut-off t turns the scores into labels with
    ``predictions >= t`` and is rated with ``score_fn(true_values, labels)``.
    The first candidate reaching the highest score wins.

    Args:
        true_values: ground-truth labels.
        predictions: numpy array or pandas Series of scores (lists and
            tuples are converted to an array). The input is not modified.
        thresholds: non-empty sequence of candidate cut-offs.
        score_fn: callable(true_values, labels) -> comparable score.
        patience: stop after this many consecutive candidates without
            improvement; None scans every candidate.

    Returns:
        (best_score, best_threshold, best_labels) where best_labels is the
        integer 0/1 labelling produced by best_threshold.

    Raises:
        ValueError: if `thresholds` is empty.
    """
    if len(thresholds) == 0:
        raise ValueError("thresholds must contain at least one value")
    if isinstance(predictions, (list, tuple)):
        predictions = np.asarray(predictions)

    # The first threshold is skipped as it always was (for ROC thresholds it
    # is the artificial "predict nothing" cut-off). It is used only when it
    # is the sole candidate, so the result is always a proper labelling.
    candidates = thresholds[1:] if len(thresholds) > 1 else thresholds

    best_score = None
    best_threshold = None
    best_labels = None
    stale = 0
    for threshold in candidates:
        # One vectorised comparison; unlike the former in-place
        # "set >= t to 1, then set < 1 to 0" it is also correct for scores
        # outside [0, 1] and needs no deep copy per candidate.
        labels = (predictions >= threshold).astype(int)
        score = score_fn(true_values, labels)
        if best_score is None or score > best_score:
            best_score, best_threshold, best_labels = score, threshold, labels
            stale = 0
        else:
            stale += 1
            if patience is not None and stale >= patience:
                break
    return best_score, best_threshold, best_labels


def find_max_mcc_threshold(true_values, predictions, thresholds, patience=3000):
    """Find the cut-off that maximises the Matthews correlation coefficient.

    Args:
        true_values: ground-truth binary labels.
        predictions: predicted scores (numpy array or pandas Series).
        thresholds: candidate cut-offs, e.g. as returned by roc_curve.
        patience: the scan stops after this many consecutive candidates
            that do not improve the MCC (default 3000, the former
            hard-coded value); None disables early stopping.

    Returns:
        (max_mcc, threshold, binarised_predictions)
    """
    return _find_best_threshold(true_values, predictions, thresholds, matthews_corrcoef, patience)


def find_max_f1_threshold(true_values, predictions, thresholds):
    """Find the cut-off that maximises the F1 score (every candidate is scanned).

    Args:
        true_values: ground-truth binary labels.
        predictions: predicted scores (numpy array or pandas Series).
        thresholds: candidate cut-offs, e.g. as returned by roc_curve.

    Returns:
        (max_f1, threshold, binarised_predictions)
    """
    return _find_best_threshold(true_values, predictions, thresholds, f1_score, None)

def plot_and_save_bi_confusion_matrix(cm, fig_name, title='Confusion matrix', classes = [0,1], cmap1=plt.cm.get_cmap("correct"), cmap2=plt.cm.get_cmap("wrong")):
    """Draw a two-class confusion matrix, save it as a PNG and show it.

    Diagonal cells (correct predictions) are coloured with `cmap1`,
    off-diagonal cells (errors) with `cmap2`. Every cell is annotated with
    its absolute count and, in brackets, its row-normalised share.

    NOTE: the default colormaps are looked up once, when this `def` is
    executed, so the "correct" and "wrong" colormaps must already be
    registered at that point.

    Args:
        cm: 2x2 confusion matrix; indexed as cm[i, j] and summed with
            cm.sum(axis=1), so presumably a numpy array as returned by
            sklearn's confusion_matrix - TODO confirm.
        fig_name: file name (without extension) of the PNG written to
            <main_dir>/output/python/h2o/images/.
        title: plot title.
        classes: tick labels of both axes.
        cmap1: colormap of the diagonal cells.
        cmap2: colormap of the off-diagonal cells.
    """

    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(111)
    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
        label.set_fontsize(15)
    
    title_font = {'size':'20', 'color':'black', 'weight':'normal', 'verticalalignment':'bottom'}
    axis_font = {'size':'15'}
    cb_font = {'size':'15', 'horizontalalignment':'left'}
    # NOTE(review): font_path is assigned but never used in this function.
    font_path = "/usr/share/fonts/truetype/msttcorefonts/Arial.ttf"
    font_prop = font_manager.FontProperties(size=20)
    
    # Row-normalised matrix: every row (true class) sums to 1.
    cm_n = cm / cm.sum(axis=1)[:, np.newaxis]
    # Constant helper matrix; the images drawn from it are the ones the
    # visible colorbars are attached to further down.
    cm_f = [[1,0],[1,0]]
    
    # A mask value of 1 hides a cell: mask1 hides the off-diagonal cells,
    # mask2 hides the diagonal cells.
    mask1 = [[0,  1], [1, 0]]
    mask2 = [[1,  0], [0, 1]]
    cm1 = masked_array(cm_n,mask1)
    cm2 = masked_array(cm_n,mask2)
    
    cm1f = masked_array(cm_f,mask1)
    cm2f = masked_array(cm_f,mask2)
    
    # Helper images first, then the real data images on top of them.
    p2f = ax.imshow(cm2f,interpolation='nearest',cmap=cmap2)
    p1f = ax.imshow(cm1f,interpolation='nearest',cmap=cmap1)
    
    p2 = ax.imshow(cm2,interpolation='nearest',cmap=cmap2)
    p1 = ax.imshow(cm1,interpolation='nearest',cmap=cmap1)
    
    # Colorbars for the data images are created, given limits 0..1 and
    # removed again straight away.
    cb2 = plt.colorbar(p2,shrink=0.5)
    cb2.set_clim(0, 1)
    cb2.remove()
    
    cb1 = plt.colorbar(p1,shrink=0.5)
    cb1.set_clim(0, 1)
    cb1.remove()
    
    # Visible colorbars, one per colormap, labelled 'False' and 'True'.
    cb2 = plt.colorbar(p2f,shrink=0.5)
    cb2.set_clim(0, 1)
    cb2.ax.get_xaxis().labelpad = 10
    cb2.ax.set_xlabel('False', **cb_font)
    cb2.ax.tick_params(labelsize=15)
    
    
    cb1 = plt.colorbar(p1f,shrink=0.5)
    cb1.set_clim(0, 1)
    cb1.ax.get_xaxis().labelpad = 10
    cb1.ax.set_xlabel('True', **cb_font)
    cb1.ax.tick_params(labelsize=15)

    plt.grid(False)

    plt.title(title, **title_font)

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Cells whose normalised value exceeds this get white text, others black.
    thresh = 0.6

    # Text position is (x, y) = (predicted class, true class); cm is indexed
    # as cm[true, predicted].
    plt.text(0, 0, "%d\n(%0.2f)" % (cm[0,0], cm_n[0,0]), horizontalalignment="center", color="white" if cm_n[0, 0] > thresh else "black", fontproperties=font_prop)
    plt.text(1, 1, "%d\n(%0.2f)" % (cm[1,1], cm_n[1,1]), horizontalalignment="center", color="white" if cm_n[1, 1] > thresh else "black", fontproperties=font_prop)
    plt.text(1, 0, "%d\n(%0.2f)" % (cm[0,1], cm_n[0,1]), horizontalalignment="center", color="white" if cm_n[0, 1] > thresh else "black", fontproperties=font_prop)
    plt.text(0, 1, "%d\n(%0.2f)" % (cm[1,0], cm_n[1,0]), horizontalalignment="center", color="white" if cm_n[1, 0] > thresh else "black", fontproperties=font_prop)

    plt.tight_layout()
    plt.ylabel('True label', **axis_font)
    plt.xlabel('Predicted label', **axis_font)    
    plt.savefig("%s/output/python/h2o/images/%s.png" % (main_dir, fig_name))

    plt.show()

    
def print_save_single_roc_threshold(true_values, predictions, title, fig_name):
    """Plot a ROC curve with the MCC-optimal cut-off next to its confusion matrix.

    The cut-off is chosen by find_max_mcc_threshold from the thresholds that
    roc_curve returns. The left panel shows the ROC curve with that cut-off
    marked, the right panel the confusion matrix of the predictions
    binarised at that cut-off. The figure is saved to
    <main_dir>/output/python/h2o/images/<fig_name>.png and shown.

    Relies on the module-level names `main_dir` and `classes` and on the
    registered colormaps "correct" and "wrong".

    Args:
        true_values: ground-truth labels (assumed binary 0/1 - TODO confirm).
        predictions: predicted scores passed to roc_curve.
        title: figure-level title.
        fig_name: file name (without extension) of the saved PNG.

    Returns:
        A comma-separated string with: AUC, accuracy, F1, MCC, threshold,
        the counts cm[0,0], cm[1,1], cm[0,1], cm[1,0], the same four cells
        row-normalised, and finally the binarised predictions joined by
        commas and wrapped in '*'.
    """
    fpr, tpr, thresholds = roc_curve(true_values, predictions)
    prediction_auc = auc(fpr, tpr)
    
    # From here on `predictions` holds the 0/1 labels at the chosen cut-off,
    # no longer the raw scores.
    mcc_max, threshold, predictions = find_max_mcc_threshold(true_values, predictions, thresholds)
    acc = accuracy_score(true_values, predictions)
    f1 = f1_score(true_values, predictions)
    # Position of the chosen cut-off on the ROC curve.
    th_index = thresholds.tolist().index(threshold)

    title_font = {'size':'17', 'color':'black', 'weight':'normal', 'verticalalignment':'bottom'}
    axis_font = {'size':'15'}
    cb_font = {'size':'15', 'horizontalalignment':'left'}
    # NOTE(review): font_path is assigned but never used in this function.
    font_path = "/usr/share/fonts/truetype/msttcorefonts/Arial.ttf"
    font_prop = font_manager.FontProperties(size=15)
    
    
    # Two panels side by side: ax1 = ROC curve, ax2 = confusion matrix.
    fig = plt.figure(figsize=(15,7))
    gs1 = gridspec.GridSpec(1, 2)
    ax_list = [fig.add_subplot(ss) for ss in gs1]
    ax1 =  ax_list[0]
    ax2 =  ax_list[1]
    lw = 2

    ax1.scatter(fpr[th_index], tpr[th_index], s=75, c="red", label='Optimal threshold \n(cutoff = %0.4f)' % (threshold))
    ax1.plot(fpr, tpr, color="black", lw=lw, label='ROC curve (area = %0.4f)' % prediction_auc)
    ax1.plot([0, 1], [0, 1], color='gray', lw=lw, linestyle='--')
    ax1.set_xlim([0.0, 1.0])
    ax1.set_ylim([0.0, 1.05])
    ax1.set_xlabel('False Positive Rate', **axis_font)
    ax1.set_ylabel('True Positive Rate', **axis_font)
    ax1.set_title("Roc analysis",  **title_font)
    ax1.legend(loc="lower right", prop= font_prop)
    
    cm = confusion_matrix(true_values, predictions)
    # Row-normalised matrix: every row (true class) sums to 1.
    cm_n = cm / cm.sum(axis=1)[:, np.newaxis]
    # Constant helper matrix; the images drawn from it are the ones the
    # visible colorbars are attached to further down.
    cm_f = [[1,0],[1,0]]
    
    # A mask value of 1 hides a cell: mask1 hides the off-diagonal cells,
    # mask2 hides the diagonal cells.
    mask1 = [[0,  1], [1, 0]]
    mask2 = [[1,  0], [0, 1]]
    cm1 = masked_array(cm_n,mask1)
    cm2 = masked_array(cm_n,mask2)
    cm1f = masked_array(cm_f,mask1)
    cm2f = masked_array(cm_f,mask2)
    
    cmap1=plt.cm.get_cmap("correct")
    cmap2=plt.cm.get_cmap("wrong")
    
    # Helper images first, then the real data images on top of them.
    p2f = ax2.imshow(cm2f,interpolation='nearest',cmap=cmap2)
    p1f = ax2.imshow(cm1f,interpolation='nearest',cmap=cmap1)
   
    p2 = ax2.imshow(cm2,interpolation='nearest',cmap=cmap2)
    p1 = ax2.imshow(cm1,interpolation='nearest',cmap=cmap1)
    
    # Colorbars for the data images are created, given limits 0..1 and
    # removed again straight away.
    cb2 = plt.colorbar(p2,shrink=0.5)
    cb2.set_clim(0, 1)
    cb2.remove()
    
    cb1 = plt.colorbar(p1,shrink=0.5)
    cb1.set_clim(0, 1)
    cb1.remove()
    
    # Visible colorbars, one per colormap; the upper limit is the sum of the
    # two normalised diagonal cells.
    cb2 = plt.colorbar(p2f,shrink=0.5)
    cb2.set_clim(0, cm1.sum())
    cb2.ax.get_xaxis().labelpad = 10
    cb2.ax.set_xlabel('False', **cb_font)
    cb2.ax.tick_params(labelsize=15)

    cb1 = plt.colorbar(p1f,shrink=0.5)
    cb1.set_clim(0, cm1.sum())
    cb1.ax.get_xaxis().labelpad = 10
    cb1.ax.set_xlabel('True', **cb_font)
    cb1.ax.tick_params(labelsize=15)

    ax2.grid(False)

    ax2.set_title("Confusion matrix\n\nACC: %0.3f   F1: %0.3f   MCC: %0.3f" % (acc, f1, mcc_max), **title_font)

    # `classes` is the module-level list of class labels.
    tick_marks = np.arange(len(classes))
    ax2.set_xticks(tick_marks)
    ax2.set_yticks(tick_marks)
    ax2.set_xticklabels(classes)
    ax2.set_yticklabels(classes)
    
    # Cells whose normalised value exceeds this get white text, others black.
    thresh = 0.6

    # Text position is (x, y) = (predicted class, true class); cm is indexed
    # as cm[true, predicted].
    ax2.text(0, 0, "%d\n(%0.2f)" % (cm[0,0], cm_n[0,0]), horizontalalignment="center", color="white" if cm_n[0, 0] > thresh else "black", fontproperties=font_prop)
    ax2.text(1, 1, "%d\n(%0.2f)" % (cm[1,1], cm_n[1,1]), horizontalalignment="center", color="white" if cm_n[1, 1] > thresh else "black", fontproperties=font_prop)
    ax2.text(1, 0, "%d\n(%0.2f)" % (cm[0,1], cm_n[0,1]), horizontalalignment="center", color="white" if cm_n[0, 1] > thresh else "black", fontproperties=font_prop)
    ax2.text(0, 1, "%d\n(%0.2f)" % (cm[1,0], cm_n[1,0]), horizontalalignment="center", color="white" if cm_n[1, 0] > thresh else "black", fontproperties=font_prop)

    ax2.set_ylabel('True label', **axis_font)
    ax2.set_xlabel('Predicted label', **axis_font)
    
    for label in (ax2.get_xticklabels() + ax2.get_yticklabels() + ax1.get_xticklabels() + ax1.get_yticklabels()):
        label.set_fontsize(13)
    
    plt.suptitle(title, fontsize=20)   
    
    plt.tight_layout(pad=1, w_pad=1, h_pad=1)
    plt.savefig("%s/output/python/h2o/images/%s.png" % (main_dir, fig_name))
    plt.show()
    # Result row: metrics, confusion-matrix cells and the predicted labels
    # (the labels are wrapped in '*').
    prediction_string = ",".join(str(x) for x in predictions)
    return "%0.5f,%0.5f,%0.5f,%0.5f,%0.5f,%d,%d,%d,%d,%0.5f,%0.5f,%0.5f,%0.5f,*%s*" % (prediction_auc,acc,f1, mcc_max,threshold,cm[0,0],cm[1,1],cm[0,1],cm[1,0], cm_n[0,0],cm_n[1,1],cm_n[0,1],cm_n[1,0], prediction_string)
    
def print_save_single_roc_threshold_f1(true_values, predictions, title, fig_name):
    """Plot a ROC curve with the F1-optimal cut-off next to its confusion matrix.

    The cut-off is chosen by find_max_f1_threshold from the thresholds that
    roc_curve returns. The left panel shows the ROC curve with that cut-off
    marked, the right panel the confusion matrix of the predictions
    binarised at that cut-off. The figure is saved to
    <main_dir>/output/python/h2o/images/<fig_name>.png and shown.

    Relies on the module-level names `main_dir` and `classes` and on the
    registered colormaps "correct" and "wrong".

    Args:
        true_values: ground-truth labels (assumed binary 0/1 - TODO confirm).
        predictions: predicted scores passed to roc_curve.
        title: figure-level title.
        fig_name: file name (without extension) of the saved PNG.

    Returns:
        A comma-separated string with: AUC, accuracy, F1, MCC, threshold,
        the counts cm[0,0], cm[1,1], cm[0,1], cm[1,0], the same four cells
        row-normalised, and finally the binarised predictions joined by
        commas (not wrapped in '*', unlike print_save_single_roc_threshold).
    """
    fpr, tpr, thresholds = roc_curve(true_values, predictions)
    prediction_auc = auc(fpr, tpr)
    
    # From here on `predictions` holds the 0/1 labels at the chosen cut-off,
    # no longer the raw scores.
    f1_max, threshold, predictions = find_max_f1_threshold(true_values, predictions, thresholds)
    acc = accuracy_score(true_values, predictions)
    mcc = matthews_corrcoef(true_values, predictions)
    # Position of the chosen cut-off on the ROC curve.
    th_index = thresholds.tolist().index(threshold)

    title_font = {'size':'17', 'color':'black', 'weight':'normal', 'verticalalignment':'bottom'}
    axis_font = {'size':'15'}
    cb_font = {'size':'15', 'horizontalalignment':'left'}
    # NOTE(review): font_path is assigned but never used in this function.
    font_path = "/usr/share/fonts/truetype/msttcorefonts/Arial.ttf"
    font_prop = font_manager.FontProperties(size=15)
    
    
    # Two panels side by side: ax1 = ROC curve, ax2 = confusion matrix.
    fig = plt.figure(figsize=(15,7))
    gs1 = gridspec.GridSpec(1, 2)
    ax_list = [fig.add_subplot(ss) for ss in gs1]
    ax1 =  ax_list[0]
    ax2 =  ax_list[1]
    lw = 2

    ax1.scatter(fpr[th_index], tpr[th_index], s=75, c="red", label='Optimal threshold \n(cutoff = %0.4f)' % (threshold))
    ax1.plot(fpr, tpr, color="black", lw=lw, label='ROC curve (area = %0.4f)' % prediction_auc)
    ax1.plot([0, 1], [0, 1], color='gray', lw=lw, linestyle='--')
    ax1.set_xlim([0.0, 1.0])
    ax1.set_ylim([0.0, 1.05])
    ax1.set_xlabel('False Positive Rate', **axis_font)
    ax1.set_ylabel('True Positive Rate', **axis_font)
    ax1.set_title("Roc analysis",  **title_font)
    ax1.legend(loc="lower right", prop= font_prop)
    
    cm = confusion_matrix(true_values, predictions)
    # Row-normalised matrix: every row (true class) sums to 1.
    cm_n = cm / cm.sum(axis=1)[:, np.newaxis]
    # Constant helper matrix; the images drawn from it are the ones the
    # visible colorbars are attached to further down.
    cm_f = [[1,0],[1,0]]
    
    # A mask value of 1 hides a cell: mask1 hides the off-diagonal cells,
    # mask2 hides the diagonal cells.
    mask1 = [[0,  1], [1, 0]]
    mask2 = [[1,  0], [0, 1]]
    cm1 = masked_array(cm_n,mask1)
    cm2 = masked_array(cm_n,mask2)
    cm1f = masked_array(cm_f,mask1)
    cm2f = masked_array(cm_f,mask2)
    
    cmap1=plt.cm.get_cmap("correct")
    cmap2=plt.cm.get_cmap("wrong")
    
    # Helper images first, then the real data images on top of them.
    p2f = ax2.imshow(cm2f,interpolation='nearest',cmap=cmap2)
    p1f = ax2.imshow(cm1f,interpolation='nearest',cmap=cmap1)
   
    p2 = ax2.imshow(cm2,interpolation='nearest',cmap=cmap2)
    p1 = ax2.imshow(cm1,interpolation='nearest',cmap=cmap1)
    
    # Colorbars for the data images are created, given limits 0..1 and
    # removed again straight away.
    cb2 = plt.colorbar(p2,shrink=0.5)
    cb2.set_clim(0, 1)
    cb2.remove()
    
    cb1 = plt.colorbar(p1,shrink=0.5)
    cb1.set_clim(0, 1)
    cb1.remove()
    
    # Visible colorbars, one per colormap; the upper limit is the sum of the
    # two normalised diagonal cells.
    cb2 = plt.colorbar(p2f,shrink=0.5)
    cb2.set_clim(0, cm1.sum())
    cb2.ax.get_xaxis().labelpad = 10
    cb2.ax.set_xlabel('False', **cb_font)
    cb2.ax.tick_params(labelsize=15)

    cb1 = plt.colorbar(p1f,shrink=0.5)
    cb1.set_clim(0, cm1.sum())
    cb1.ax.get_xaxis().labelpad = 10
    cb1.ax.set_xlabel('True', **cb_font)
    cb1.ax.tick_params(labelsize=15)

    ax2.grid(False)

    ax2.set_title("Confusion matrix\n\nACC: %0.3f   F1: %0.3f   MCC: %0.3f" % (acc, f1_max, mcc), **title_font)

    # `classes` is the module-level list of class labels.
    tick_marks = np.arange(len(classes))
    ax2.set_xticks(tick_marks)
    ax2.set_yticks(tick_marks)
    ax2.set_xticklabels(classes)
    ax2.set_yticklabels(classes)
    
    # Cells whose normalised value exceeds this get white text, others black.
    thresh = 0.6

    # Text position is (x, y) = (predicted class, true class); cm is indexed
    # as cm[true, predicted].
    ax2.text(0, 0, "%d\n(%0.2f)" % (cm[0,0], cm_n[0,0]), horizontalalignment="center", color="white" if cm_n[0, 0] > thresh else "black", fontproperties=font_prop)
    ax2.text(1, 1, "%d\n(%0.2f)" % (cm[1,1], cm_n[1,1]), horizontalalignment="center", color="white" if cm_n[1, 1] > thresh else "black", fontproperties=font_prop)
    ax2.text(1, 0, "%d\n(%0.2f)" % (cm[0,1], cm_n[0,1]), horizontalalignment="center", color="white" if cm_n[0, 1] > thresh else "black", fontproperties=font_prop)
    ax2.text(0, 1, "%d\n(%0.2f)" % (cm[1,0], cm_n[1,0]), horizontalalignment="center", color="white" if cm_n[1, 0] > thresh else "black", fontproperties=font_prop)

    ax2.set_ylabel('True label', **axis_font)
    ax2.set_xlabel('Predicted label', **axis_font)
    
    for label in (ax2.get_xticklabels() + ax2.get_yticklabels() + ax1.get_xticklabels() + ax1.get_yticklabels()):
        label.set_fontsize(13)
    
    plt.suptitle(title, fontsize=20)   
    
    plt.tight_layout(pad=1, w_pad=1, h_pad=1)
    plt.savefig("%s/output/python/h2o/images/%s.png" % (main_dir, fig_name))
    plt.show()
    # Result row: metrics, confusion-matrix cells and the predicted labels.
    prediction_string = ",".join(str(x) for x in predictions)
    return "%0.5f,%0.5f,%0.5f,%0.5f,%0.5f,%d,%d,%d,%d,%0.5f,%0.5f,%0.5f,%0.5f,%s" % (prediction_auc,acc,f1_max,mcc,threshold,cm[0,0],cm[1,1],cm[0,1],cm[1,0], cm_n[0,0],cm_n[1,1],cm_n[0,1],cm_n[1,0], prediction_string)
    

def print_save_single_prc_threshold(true_values, predictions, title, fig_name):
    """Plot the precision-recall curve and the confusion matrix at the MCC-optimal cutoff.

    The left panel shows the PRC curve with the selected cutoff marked in red;
    the right panel shows the confusion matrix (raw counts and row-normalised
    rates) obtained with that cutoff. The figure is written to
    ``<main_dir>/images/<fig_name>.png`` and then displayed.

    Parameters
    ----------
    true_values : array-like supporting boolean-mask indexing
        Ground-truth binary labels (0 / 1).
    predictions : array-like
        Predicted scores for the positive class.
    title : str
        Figure super-title.
    fig_name : str
        File name (without extension) of the saved image.

    Returns
    -------
    None

    Notes
    -----
    Relies on the module-level names ``find_max_mcc_threshold``, ``classes``
    and ``main_dir`` and on colormaps registered as "correct" and "wrong".
    """
    precision, recall, thresholds = precision_recall_curve(true_values, predictions)
    average_precision = average_precision_score(true_values, predictions)

    # From here on `predictions` is the third value returned by
    # find_max_mcc_threshold; it is fed to label-based metrics below, so it is
    # presumably the scores binarised at `threshold` (verify against the helper).
    mcc_max, threshold, predictions = find_max_mcc_threshold(true_values, predictions, thresholds)
    acc = accuracy_score(true_values, predictions)
    f1 = f1_score(true_values, predictions)
    # Position of the chosen cutoff on the curve returned by precision_recall_curve.
    th_index = thresholds.tolist().index(threshold)

    title_font = {'size':'17', 'color':'black', 'weight':'normal', 'verticalalignment':'bottom'}
    axis_font = {'size':'15'}
    cb_font = {'size':'15', 'horizontalalignment':'left'}
    font_prop = font_manager.FontProperties(size=15)

    fig = plt.figure(figsize=(15,7))
    gs1 = gridspec.GridSpec(1, 2)
    ax_list = [fig.add_subplot(ss) for ss in gs1]
    ax1 =  ax_list[0]
    ax2 =  ax_list[1]
    lw = 2

    # --- left panel: precision-recall curve (x = recall, y = precision) ---
    ax1.scatter(recall[th_index], precision[th_index], s=75, c="red", label='Optimal threshold \n(cutoff = %0.4f)' % (threshold))
    ax1.plot(recall, precision, color='black', lw=lw, label='PRC curve (area = {0:0.4f})'.format(average_precision))
    # Dashed reference line: a two-element y list without x values is drawn at
    # x = 0 and x = 1, i.e. a horizontal line at y = ratio.
    # NOTE(review): ratio is positives / negatives; the usual no-skill precision
    # is positives / all samples - confirm which one is intended.
    ratio = len(true_values[true_values == 1])/len(true_values[true_values == 0])
    ax1.plot([ratio, ratio], color='gray', lw=lw, linestyle='--')
    ax1.set_xlim([0.0, 0.5])
    ax1.set_ylim([0.0, 0.2])
    ax1.set_xlabel('Recall',  **axis_font)
    ax1.set_ylabel('Precision', **axis_font)
    ax1.set_title("PRC analysis", **title_font)
    ax1.legend(loc="upper right", prop=font_prop)

    # --- right panel: confusion matrix ---
    cm = confusion_matrix(true_values, predictions)
    cm_n = cm / cm.sum(axis=1)[:, np.newaxis]  # each row divided by its own total
    cm_f = [[1,0],[1,0]]

    # mask1 hides the off-diagonal cells, mask2 hides the diagonal, so the two
    # masked copies can be drawn with different colormaps on the same axes.
    mask1 = [[0,  1], [1, 0]]
    mask2 = [[1,  0], [0, 1]]
    cm1 = masked_array(cm_n,mask1)
    cm2 = masked_array(cm_n,mask2)
    cm1f = masked_array(cm_f,mask1)
    cm2f = masked_array(cm_f,mask2)

    cmap1=plt.cm.get_cmap("correct")
    cmap2=plt.cm.get_cmap("wrong")

    p2f = ax2.imshow(cm2f,interpolation='nearest',cmap=cmap2)
    p1f = ax2.imshow(cm1f,interpolation='nearest',cmap=cmap1)

    p2 = ax2.imshow(cm2,interpolation='nearest',cmap=cmap2)
    p1 = ax2.imshow(cm1,interpolation='nearest',cmap=cmap1)

    # NOTE(review): Colorbar.set_clim may be unavailable in recent matplotlib
    # releases - confirm against the installed version.
    cb2 = plt.colorbar(p2,shrink=0.5)
    cb2.set_clim(0, 1)
    cb2.remove()
    cb1 = plt.colorbar(p1,shrink=0.5)
    cb1.set_clim(0, 1)
    cb1.remove()

    cb2 = plt.colorbar(p2,shrink=0.5)
    cb2.set_clim(0, cm.sum())
    cb2.ax.get_xaxis().labelpad = 10
    cb2.ax.set_xlabel('False', **cb_font)
    cb2.ax.tick_params(labelsize=15)

    cb1 = plt.colorbar(p1,shrink=0.5)
    cb1.set_clim(0, cm.sum())
    cb1.ax.get_xaxis().labelpad = 10
    cb1.ax.set_xlabel('True', **cb_font)
    cb1.ax.tick_params(labelsize=15)

    ax2.grid(False)

    ax2.set_title("Confusion matrix\n\nACC: %0.3f   F1: %0.3f   MCC: %0.3f" % (acc, f1, mcc_max), **title_font)

    tick_marks = np.arange(len(classes))
    ax2.set_xticks(tick_marks)
    ax2.set_yticks(tick_marks)
    ax2.set_xticklabels(classes)
    ax2.set_yticklabels(classes)

    # Cells whose normalised value exceeds this get white text for contrast.
    thresh = 0.6

    # imshow puts the column index on x and the row index on y, hence text(col, row, ...).
    ax2.text(0, 0, "%d\n(%0.2f)" % (cm[0,0], cm_n[0,0]), horizontalalignment="center", color="white" if cm_n[0, 0] > thresh else "black", fontproperties=font_prop)
    ax2.text(1, 1, "%d\n(%0.2f)" % (cm[1,1], cm_n[1,1]), horizontalalignment="center", color="white" if cm_n[1, 1] > thresh else "black", fontproperties=font_prop)
    ax2.text(1, 0, "%d\n(%0.2f)" % (cm[0,1], cm_n[0,1]), horizontalalignment="center", color="white" if cm_n[0, 1] > thresh else "black", fontproperties=font_prop)
    ax2.text(0, 1, "%d\n(%0.2f)" % (cm[1,0], cm_n[1,0]), horizontalalignment="center", color="white" if cm_n[1, 0] > thresh else "black", fontproperties=font_prop)

    ax2.set_ylabel('True label', **axis_font)
    ax2.set_xlabel('Predicted label', **axis_font)

    for label in (ax2.get_xticklabels() + ax2.get_yticklabels() + ax1.get_xticklabels() + ax1.get_yticklabels()):
        label.set_fontsize(13)

    plt.suptitle(title, fontsize=20)

    plt.tight_layout(pad=4, w_pad=2, h_pad=1)
    plt.savefig("%s/images/%s.png" % (main_dir, fig_name))
    plt.show()
    
def plot_variable_importance(variables, scaled_importance, fig_name):
    """Draw a horizontal bar chart of scaled variable importances and save it.

    Writes ``<fig_name>.png`` and a two-column ``<fig_name>_table.csv``
    (variable, scaled importance) under ``<main_dir>/output/python/h2o/images``
    and shows the figure.

    Parameters
    ----------
    variables : mutable sequence of variable names
    scaled_importance : sequence of numbers, aligned with ``variables``
    fig_name : str
        Base name (without extension) of the files to write.
    """
    var_n = len(variables)

    # Tick labels show the importance value next to each variable name.
    tick_labels = copy.deepcopy(variables)
    for pos in range(var_n):
        tick_labels[pos] = "%s (%0.2f)" % (variables[pos], scaled_importance[pos])

    # Long variable lists get a taller canvas so the labels stay readable.
    fig_height = 17 if var_n > 30 else 10
    fig, ax = plt.subplots(figsize=(7, fig_height))

    bar_positions = np.arange(var_n)
    ax.barh(bar_positions, scaled_importance, align='center', color='black', ecolor='black')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(tick_labels)
    ax.invert_yaxis()  # first variable at the top
    ax.set_xlabel('Scaled Importance')
    ax.set_ylabel('Variable')

    plt.tight_layout(pad=1, w_pad=1, h_pad=1)
    plt.savefig("%s/output/python/h2o/images/%s.png" % (main_dir, fig_name))
    plt.show()

    importance_table = np.stack([variables, scaled_importance], axis=1)
    np.savetxt("%s/output/python/h2o/images/%s_table.csv" % (main_dir, fig_name), importance_table, fmt="%s", delimiter=",")


In [None]:
# Feature sets, target column and shared train / evaluate helpers

# Columns common to the date, neighbourhood and "all" feature sets.
_base_columns = [
    'id', 'x200', 'y200', 'idx200', 'idy200',
    'Y', 'Mo', 'W', 'WD', 'D', 'MoCat',
    'vzdbcs', 'vzdpostabanka', 'vzdprodejna', 'vzdrestaurace',
    'lokalitarelevance',
    'datodHCat_1', 'datodHCat_2', 'datodHCat_3', 'datodHCat_4', 'datodHCat_5',
    'stavobj_A', 'stavobj_B', 'stavobj_C', 'stavobj_D', 'stavobj_F', 'stavobj_G',
]

# Numerically named columns '0' .. '120' and history columns 'd1' .. 'd21'.
_neighbour_columns = [str(k) for k in range(121)]
_history_columns = ['d%d' % k for k in range(1, 22)]

X_date = list(_base_columns)

# Same as the base set without 'idx200', 'idy200' and 'lokalitarelevance',
# followed by the 21 history columns.
X_tm = [c for c in _base_columns if c not in ('idx200', 'idy200', 'lokalitarelevance')] + _history_columns

X_neigh = _base_columns + _neighbour_columns

# NOTE(review): column '60' is present in X_neigh but absent here - confirm
# that leaving it out of X_all is intentional.
X_all = (
    _base_columns
    + [c for c in _neighbour_columns if c != '60']
    + _history_columns
    + ['PCA_0', 'PCA_1', 'PCA_2', 'PCA_3', 'PCA_4']
)

# Hand-picked feature subset.
X_fs = [
    'PCA_0', 'vzdrestaurace', 'd21', 'd15', 'd7', 'd20', 'd10', 'd8', 'PCA_4', 'vzdpostabanka',
    'd19', 'd1', 'd17', 'd2', 'd4', 'd9', 'd5', 'd16', 'd6', 'd18',
    'd3', 'd13', 'stavobj_C', 'datodHCat_5', 'datodHCat_4',
    'stavobj_F', 'Y', 'D', 'vzdprodejna', 'datodHCat_3',
    'datodHCat_2', 'x200', 'PCA_1', 'PCA_3', 'd14', 'idx200',
    '71', 'datodHCat_1', '49', 'WD', '59',
]

# Name of the target column.
y = 'crimecount_x'

# Path of the CSV file (despite the "_dir" name) to which
# train_test_model_save_results_h2o appends one line per evaluated model.
result_file_dir = "%s/output/python/model_results.csv" % main_dir

def train_test_model_save_results_h2o(model, model_id, data_id, X, train = False, selection = False, n_select = 0):
    """Optionally train an H2O model, score it on the test frame and log the outcome.

    Steps: train (only when ``train`` is true), save the model to disk, predict
    on ``testH2o``, draw the ROC figure, append one CSV line to
    ``result_file_dir`` and, when the model reports variable importances, plot
    them. With ``selection`` enabled the same model is retrained on its first
    ``n_select`` listed variables and evaluated again under ``model_id + "_FS"``.

    Parameters
    ----------
    model : H2O estimator / model object
    model_id : str
        Label used in file names and in the result line.
    data_id : str
        Label of the feature set, used in file names and in the result line.
    X : list of str
        Predictor column names (only used when ``train`` is true).
    train : bool
        Train the model before evaluating it.
    selection : bool
        Run the additional feature-selection round described above.
    n_select : int
        Number of leading variables kept for that round.

    Reads the module-level names ``y``, ``trainH2o``, ``validationH2o``,
    ``testH2o``, ``true_values``, ``main_dir`` and ``result_file_dir``.
    """
    if train:
        model.train(x=X, y=y, training_frame=trainH2o, validation_frame=validationH2o)

    print("save")
    saved_path = h2o.save_model(model=model, path="%s/output/python/h2o/models" % main_dir, force=True)

    print("predict")
    # Only the "p1" column of the prediction frame is evaluated.
    positive_scores = model.predict(testH2o).as_data_frame()["p1"].values

    print("roc")
    metrics_csv = print_save_single_roc_threshold(true_values, positive_scores, "", "roc_%s_%s_th" % (data_id, model_id))

    params_repr = str(model.params)
    with open(result_file_dir, 'a') as results_file:
        results_file.write("%s,%s,%s,%s,*%s*\n" % (model_id, data_id, saved_path, metrics_csv, params_repr))

    importances = model._model_json['output']['variable_importances']
    if not importances:
        return

    variables = importances['variable']
    scaled_importance = importances['scaled_importance']
    plot_variable_importance(variables, scaled_importance, "vi_%s_%s" % (data_id, model_id))

    if selection:
        # Retrain on the leading variables only; selection stays off in the
        # nested call, so the recursion is one level deep.
        top_variables = variables[:n_select]
        train_test_model_save_results_h2o(model, model_id+"_FS", data_id, top_variables, True)
        
# Each feature-set label is kept next to its column list so the two parallel
# lists below cannot drift out of step.
_feature_sets = [
    ("tm_data", X_tm),
    ("neigh_data", X_neigh),
    ("all_data", X_all),
    ("fs_data", X_fs),
]
data_ids = [_label for _label, _columns in _feature_sets]
Xs = [_columns for _label, _columns in _feature_sets]
model_ids = ["drf", "gbm", "dl"]

def grid_search(model, hyperparameters, search_criteria, X, grid_id):
    """Train an H2O grid search and return its models ordered by decreasing AUC.

    Parameters
    ----------
    model : H2O estimator used as the grid's base model
    hyperparameters : dict
        Hyper-parameter name -> list of candidate values.
    search_criteria : dict
        Passed through to ``H2OGridSearch``.
    X : list of str
        Predictor column names.
    grid_id : str
        Identifier given to the grid.

    Trains on the module-level ``trainH2o`` / ``validationH2o`` frames with
    the module-level target ``y``.
    """
    grid = H2OGridSearch(model, hyperparameters, search_criteria=search_criteria, grid_id = grid_id)
    grid.train(x=X,y=y, training_frame=trainH2o, validation_frame=validationH2o)
    return grid.get_grid(sort_by = "AUC", decreasing=True)
    

In [None]:
# GBM grid search on the PCA components only.
# (For the random-forest variant use model_id = "drf", H2ORandomForestEstimator
#  and the grid {'ntrees':[20,40,50,70,90,100,150], 'max_depth':[5,10,30,50,100]}.)
hyperparameters = {'ntrees':[30,40,60], 'max_depth':[5,10,30,50]}
search_criteria = {'strategy': "RandomDiscrete", 'seed': 42,
            'stopping_metric': "AUC", 'stopping_tolerance': 0.01,
            'stopping_rounds': 5}
setting = "grid"
model_id = "gbm"

data_id = "pca"

# NOTE: this rebinds the module-level target name that grid_search and
# train_test_model_save_results_h2o read; every cell run afterwards uses
# 'crimecount' unless `y` is assigned again.
y = 'crimecount'

# Predictor columns PCA_0 .. PCA_162.
X = ["PCA_%d" % k for k in range(163)]

model = H2OGradientBoostingEstimator(model_id = model_id+"_"+data_id,  min_rows=20)

models = grid_search(model, hyperparameters, search_criteria, X, model_id+"_"+data_id+"_"+setting)

# Evaluate at most the three best grid models (grid_search returns them
# ordered by decreasing AUC).
models_length = min(len(models), 3)

for j in range(models_length):
    train_test_model_save_results_h2o(models[j], model_id+str(j)+"_"+setting, data_id, X)


In [None]:
# Ad-hoc check: train one random forest with default settings on the first
# feature set in Xs and display its prediction frame for the test data.
# NOTE(review): `model_id`, `data_id`, `setting` and `y` are whatever earlier
# cells assigned last - confirm the intended values before running.
model = H2ORandomForestEstimator(model_id = "%s_%s_%s" % (model_id, data_id, setting))
model.train(x=Xs[0], y=y, training_frame=trainH2o, validation_frame=validationH2o)
prediction = model.predict(testH2o).as_data_frame()
prediction

In [None]:
# Default models: every estimator type with default hyper-parameters on every feature set.
setting = "default"

_estimator_by_id = {
    "drf": H2ORandomForestEstimator,
    "gbm": H2OGradientBoostingEstimator,
    "dl": H2ODeepLearningEstimator,
}

for data_id, X in zip(data_ids, Xs):
    for model_id in model_ids:
        print("%s_%s" % (model_id, data_id))
        model = _estimator_by_id[model_id](model_id = "%s_%s_%s" % (model_id, data_id, setting))

        result_id = model_id+"_"+setting
        if data_id == "all_data":
            # The full feature set additionally gets a feature-selection round
            # on the 42 leading variables.
            train_test_model_save_results_h2o(model, result_id, data_id, X, True, True, 42)
        else:
            train_test_model_save_results_h2o(model, result_id, data_id, X, True)

In [None]:
# hyperparameters drf
hyperparameters = {
    'ntrees': [20,40,50,70,90,100,150],
    'max_depth': [5,10,30,50,100],
}
search_criteria = {
    'strategy': "RandomDiscrete",
    'seed': 42,
    'stopping_metric': "AUC",
    'stopping_tolerance': 0.01,
    'stopping_rounds': 5,
}
setting = "grid"
model_id = "drf"

for data_id, X in zip(data_ids, Xs):
    model = H2ORandomForestEstimator(model_id = model_id+"_"+data_id, min_rows=20)
    models = grid_search(model, hyperparameters, search_criteria, X, model_id+"_"+data_id+"_"+setting)

    # Evaluate at most the three best grid models (ordered by decreasing AUC).
    models_length = min(len(models), 3)

    for j in range(models_length):
        result_id = model_id+str(j)+"_"+setting
        print(result_id+"_"+data_id)
        if data_id == "all_data":
            # Already trained by the grid: no retraining, but a feature-selection
            # round on the 42 leading variables.
            train_test_model_save_results_h2o(models[j], result_id, data_id, X, False, True, 42)
        else:
            train_test_model_save_results_h2o(models[j], result_id, data_id, X)



In [None]:
# hyperparameters gbm

hyperparameters = {
    'ntrees': [20,30,40,50,60,70],
    'max_depth': [5,10,30,50],
}
search_criteria = {
    'strategy': "RandomDiscrete",
    'seed': 42,
    'stopping_metric': "AUC",
    'stopping_tolerance': 0.01,
    'stopping_rounds': 5,
}
setting = "grid"
model_id = "gbm"

for data_id, X in zip(data_ids, Xs):
    model = H2OGradientBoostingEstimator(model_id = model_id+"_"+data_id,  min_rows=20)
    models = grid_search(model, hyperparameters, search_criteria, X, model_id+"_"+data_id+"_"+setting)

    # Evaluate at most the three best grid models (ordered by decreasing AUC).
    models_length = min(len(models), 3)

    for j in range(models_length):
        result_id = model_id+str(j)+"_"+setting
        if data_id == "all_data":
            # Already trained by the grid: no retraining, but a feature-selection
            # round on the 42 leading variables.
            train_test_model_save_results_h2o(models[j], result_id, data_id, X, False,  True, 42)
        else:
            train_test_model_save_results_h2o(models[j], result_id, data_id, X)

In [None]:
#https://www.slideshare.net/0xdata/h2o-world-top-10-deep-learning-tips-tricks-arno-candel
# hyperparameters dl

hyperparameters = {
    'hidden': [[100,100], [512], [16,16,16,16,16], [32,32,32,32,32], [64,64,64]],
    'input_dropout_ratio': [0.2, 0.3, 0.4],
}
search_criteria = {
    'strategy': "RandomDiscrete",
    'seed': 42,
    'stopping_metric': "AUC",
    'stopping_tolerance': 0.01,
    'stopping_rounds': 5,
}
setting = "grid"
model_id = "dl"

for data_id, X in zip(data_ids, Xs):
    model = H2ODeepLearningEstimator(model_id = model_id+"_"+data_id,
                                      score_validation_sampling='Stratified',
                                      l1=0.0001,  l2=0.0001)
    models = grid_search(model, hyperparameters, search_criteria, X, model_id+"_"+data_id+"_"+setting)

    # Evaluate at most the three best grid models (ordered by decreasing AUC).
    models_length = min(len(models), 3)

    for j in range(models_length):
        result_id = model_id+str(j)+"_"+setting
        if data_id == "all_data":
            # Already trained by the grid: no retraining, but a feature-selection
            # round on the 30 leading variables.
            train_test_model_save_results_h2o(models[j], result_id, data_id, X, False, True, 30)
        else:
            train_test_model_save_results_h2o(models[j], result_id, data_id, X)

In [None]:
#https://www.slideshare.net/0xdata/h2o-world-top-10-deep-learning-tips-tricks-arno-candel
# hyperparameters dl - deeper networks, wider dropout range

hyperparameters = {'hidden':[[16,16,16,16,16,16], [32,32,32,32,32,32,32,32], [64,64,64,64,64], [96,96,96,96,96]], 'input_dropout_ratio': [0.2, 0.3, 0.4, 0.5, 0.6]}
# NOTE(review): confirm that "F1" is accepted as a stopping_metric by the
# installed H2O version.
search_criteria = {'strategy': "RandomDiscrete", 'seed': 42,
            'stopping_metric': "F1", 'stopping_tolerance': 0.01,
            'stopping_rounds': 2}
setting = "grid2"
model_id = "dl"

for i in range(len(data_ids)):
    data_id = data_ids[i]
    X = Xs[i]
    model = H2ODeepLearningEstimator(model_id = model_id+"_"+data_id,
                                      score_validation_sampling='Stratified', 
                                      stopping_rounds=5,  
                                      l1=0.0001,  l2=0.0001)
    models = grid_search(model, hyperparameters, search_criteria, X, model_id+"_"+data_id+"_"+setting)

    # Evaluate at most the three best grid models; bounding by len(models)
    # avoids an IndexError when the grid returns fewer than three.
    models_length = min(len(models), 3)

    for j in range(models_length):
        if data_id == "all_data":
            # Already trained by the grid: no retraining, but a feature-selection
            # round on the 42 leading variables.
            train_test_model_save_results_h2o(models[j], model_id+str(j)+"_"+setting, data_id, X, False, True, 42)
        else:
            train_test_model_save_results_h2o(models[j], model_id+str(j)+"_"+setting, data_id, X)

In [None]:
#https://www.slideshare.net/0xdata/h2o-world-top-10-deep-learning-tips-tricks-arno-candel
# hyperparameters dl - three-layer networks, looser stopping tolerance

hyperparameters = {'hidden':[[16,16,16], [32,32,32], [64,64,64], [96,96,96]], 'input_dropout_ratio': [0.2, 0.3, 0.4, 0.5, 0.6]}
# NOTE(review): confirm that "F1" is accepted as a stopping_metric by the
# installed H2O version.
search_criteria = { 'strategy': "RandomDiscrete", 'seed': 42,
            'stopping_metric': "F1", 'stopping_tolerance': 0.1,
            'stopping_rounds': 2}
# NOTE(review): setting and model_id match the preceding "grid2" cell, so grid
# ids, result labels and figure file names coincide with that run - confirm
# that reusing / overwriting them is intended.
setting = "grid2"
model_id = "dl"

for i in range(len(data_ids)):
    data_id = data_ids[i]
    X = Xs[i]
    model = H2ODeepLearningEstimator(model_id = model_id+"_"+data_id,
                                      score_validation_sampling='Stratified', 
                                      l1=0.0001,  l2=0.0001)
    models = grid_search(model, hyperparameters, search_criteria, X, model_id+"_"+data_id+"_"+setting)

    # Evaluate at most the three best grid models; bounding by len(models)
    # avoids an IndexError when the grid returns fewer than three.
    models_length = min(len(models), 3)

    for j in range(models_length):
        if data_id == "all_data":
            # Already trained by the grid: no retraining, but a feature-selection
            # round on the 42 leading variables.
            train_test_model_save_results_h2o(models[j], model_id+str(j)+"_"+setting, data_id, X, False, True, 42)
        else:
            train_test_model_save_results_h2o(models[j], model_id+str(j)+"_"+setting, data_id, X)

In [None]:
# Evaluation on test data - ROC, PRC

# NOTE: train_test_model_save_results_h2o reads `true_values` as a global, so
# this assignment has to be executed before any model is evaluated.
true_values = test["crimecount_x"]


def _evaluate_baseline(predicted, fig_name):
    """Score one baseline against `true_values` and plot its confusion matrix.

    All metrics are computed on the same 0/1 labels as `true_values`;
    relabelling only the predictions (e.g. 0 -> -1) would make
    matthews_corrcoef see more than two classes and distort the result.

    Returns (confusion matrix, MCC, accuracy, F1).
    """
    cm = confusion_matrix(true_values, predicted)
    mcc = matthews_corrcoef(true_values, predicted)
    acc = accuracy_score(true_values, predicted)
    f1 = f1_score(true_values, predicted)
    plot_and_save_bi_confusion_matrix(cm, fig_name, "ACC: %0.3f   F1: %0.3f   MCC: %0.3f" % (acc, f1, mcc))
    return cm, mcc, acc, f1


# The best case - predictions equal to the truth
cm_base_best, mcc_base_best, acc, f1 = _evaluate_baseline(true_values, "cm_base_best")
print_save_single_roc_threshold(true_values, true_values*0.5, "", "roc_base_best")

# The worst case - every 0/1 label flipped
prediction_worst = 1 - true_values
cm_base_worst, mcc_base_worst, acc, f1 = _evaluate_baseline(prediction_worst, "cm_base_worst")
#print_save_single_roc_threshold(true_values, prediction_worst/2, "", "roc_base_worst")

# Base model - predict by zero
prediction_base_zero = [0 for i in range(test.shape[0])]
cm_base_zero, mcc_base_zero, acc, f1 = _evaluate_baseline(prediction_base_zero, "cm_base_zero")
#print_save_single_roc_threshold(true_values, prediction_base_zero, "", "roc_base_zero")

# Base model - predict by one
prediction_base_one = [1 for i in range(test.shape[0])]
mse_base_one = mean_squared_error(true_values, prediction_base_one)
cm_base_one, mcc_base_one, acc, f1 = _evaluate_baseline(prediction_base_one, "cm_base_one")
#print_save_single_roc_threshold(true_values, prediction_base_one, "", "roc_base_one")

# Base model - predict by previous day: positive whenever column d21 is above zero
prediction_base_prev = copy.deepcopy(test["d21"])
prediction_base_prev[prediction_base_prev > 0] = 1
cm_base_prev, mcc_base_prev, acc, f1 = _evaluate_baseline(prediction_base_prev, "cm_base_prev")
#print_save_single_roc_threshold(true_values, prediction_base_prev, "", "roc_base_prev")

In [None]:
# Advanced models different data
# 18 hues; consecutive runs of four are reserved per model family.
colors = sns.color_palette("hls", 18)
colors_drf = [colors[k] for k in range(0, 4)]
colors_gbm = [colors[k] for k in range(4, 8)]
colors_dl = [colors[k] for k in range(8, 12)]


#thresholds_drf1 = print_save_single_roc(true_values, prediction_drf1, colors_drf[0], "", "roc_time_data_drf")
#thresholds_gbm1 = print_save_single_roc(true_values, prediction_gbm1, colors_gbm[0], "", "roc_time_data_gbm")
#thresholds_dl1 = print_save_single_roc(true_values, prediction_dl1, colors_dl[0], "", "roc_time_data_dl")


#thresholds_drf2 = print_save_single_roc(true_values, prediction_drf2, colors_drf[1], "", "roc_time_loc_data_drf")
#thresholds_gbm2 = print_save_single_roc(true_values, prediction_gbm2, colors_gbm[1], "", "roc_time_loc_data_gbm")
#thresholds_dl2 = print_save_single_roc(true_values, prediction_dl2, colors_dl[1], "", "roc_time_loc_data_dl")


#thresholds_drf3 = print_save_single_roc(true_values, prediction_drf3, colors_drf[2], "", "roc_tm_data_drf")
#thresholds_gbm3 = print_save_single_roc(true_values, prediction_gbm3, colors_gbm[2], "", "roc_tm_data_gbm")
#thresholds_dl3 = print_save_single_roc(true_values, prediction_dl3, colors_dl[2], "", "roc_tm_data_dl")


#thresholds_drf4 = print_save_single_roc(true_values, prediction_drf4, colors_drf[3], "", "roc_neigh_data_drf")
#thresholds_gbm4 = print_save_single_roc(true_values, prediction_gbm4, colors_gbm[3], "", "roc_neigh_data_gbm")
#thresholds_dl4 = print_save_single_roc(true_values, prediction_dl4, colors_dl[3], "", "roc_neigh_data_DL")


In [None]:
# ROC / PRC comparison plots: first all 18 models in one figure, then one
# figure pair per group of three models (drfN, gbmN, dlN).
model_names = ["drf1", "gbm1", "dl1", "drf2", "gbm2", "dl2", "drf3", "gbm3", "dl3",
               "drf4", "gbm4", "dl4", "drf5", "gbm5", "dl5", "drf6", "gbm6", "dl6"]
predictions = [prediction_drf1, prediction_gbm1, prediction_dl1,
               prediction_drf2, prediction_gbm2, prediction_dl2,
               prediction_drf3, prediction_gbm3, prediction_dl3,
               prediction_drf4, prediction_gbm4, prediction_dl4,
               prediction_drf5, prediction_gbm5, prediction_dl5,
               prediction_drf6, prediction_gbm6, prediction_dl6]

# Group g uses palette entries g, g + 6 and g + 12 for its drf / gbm / dl model.
colors_all = [colors[_g + 6 * _m] for _g in range(6) for _m in range(3)]

n = 18

# ROC all
title = ""
fig_name = "roc_all"
print_save_multy_roc(true_values, predictions, n, model_names, colors_all, title, fig_name)

# PRC all
title = ""
fig_name = "prc_all"
print_save_multy_prc(true_values, predictions, n, model_names, colors_all, title, fig_name)

# ROC and PRC per group; entry g holds the figure names of models drf/gbm/dl number g + 1.
_all_model_names = model_names
_all_predictions = predictions
_group_fig_names = [
    ("roc_all_time_data", "prc_all_time_data"),
    ("roc_all_time_local_data", "prc_all_time_local_data"),
    ("roc_all_tm_data", "prc_all_tm_data"),
    ("roc_all_neigh_data", "prc_all_neigh_data"),
    ("roc_all_neigh_local_data", "prc_all_neigh_local_data"),
    ("roc_all_neigh_tm_data", "prc_all_neigh_tm_data"),
]

n = 3
for _group, (_roc_fig_name, _prc_fig_name) in enumerate(_group_fig_names):
    _members = slice(3 * _group, 3 * _group + 3)
    model_names = _all_model_names[_members]
    predictions = _all_predictions[_members]
    _group_colors = colors_all[_members]

    title = ""
    fig_name = _roc_fig_name
    print_save_multy_roc(true_values, predictions, n, model_names, _group_colors, title, fig_name)

    title = ""
    fig_name = _prc_fig_name
    print_save_multy_prc(true_values, predictions, n, model_names, _group_colors, title, fig_name)


In [None]:
# Shut down H2O once the modelling cells have run.
# NOTE(review): a grid-search cell further down still calls into H2O - confirm
# the intended execution order before running this cell.
h2o.shutdown()

In [None]:
#data.columns.values
#columns= ['id', 'date', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9',
#       'd10', 'd11', 'd12', 'd13', 'd14', 'd15', 'd16', 'd17', 'd18',
#       'd19', 'd20', 'd21','datodHCat_1', 'datodHCat_2',
#       'datodHCat_3', 'datodHCat_4', 'datodHCat_5', 'crimecount_x', '60']
#data.loc[data.id==2882045,columns]

In [None]:
# Small random-forest grid on the full feature set only.
# NOTE: this rebinds the module-level `data_ids` / `Xs`, so loops in other
# cells run afterwards will only see "all_data".
data_ids = ["all_data"]
Xs = [X_all]
hyperparameters = {'ntrees':[10,30], 'max_depth':[10]}
search_criteria = { 'strategy': "RandomDiscrete", 'seed': 42,
            'stopping_metric': "AUC", 'stopping_tolerance': 0.1,
            'stopping_rounds': 2}
setting = "grid"
model_id = "drf"

for i in range(len(data_ids)):
    data_id = data_ids[i]
    X = Xs[i]
    model = H2ORandomForestEstimator(model_id = model_id+"_"+data_id)
    models = grid_search(model, hyperparameters, search_criteria, X, model_id+"_"+data_id+"_grid")

    # This grid has only two parameter combinations, so never index past the
    # models actually returned.
    models_length = min(len(models), 3)

    for j in range(models_length):
        if data_id == "all_data":
            # Positional arguments are (train, selection, n_select): the grid
            # model is already trained, and the feature-selection round keeps
            # the 30 leading variables.
            train_test_model_save_results_h2o(models[j], model_id+str(j)+"_"+setting, data_id, X, False, True, 30)
        else:
            train_test_model_save_results_h2o(models[j], model_id+str(j)+"_"+setting, data_id, X)
