In [39]:
import os
import sys
import csv
import operator
import time
import datetime
import platform
import threading

In [40]:
def writeText(text, path, mode = 'w'):
    with open (path, mode, encoding = 'utf-8') as textout:
        textout.write((text))
        
def writeJson(json, path, mode = 'w'):
    with open(path, mode) as file:
        file.write(json.dumps(json))
        
def writeCsv(listOut, outputFile):
    with open (outputFile, "w", newline='', encoding = 'utf-8') as outputfile:
        writer = csv.writer(outputfile, delimiter = ",")
        for element in listOut:
            writer.writerow(element)
            
def getTxt(path):
    return open(path, 'r').read()

def getCsv(path, delim = ','):
    list_return = []
    with open (path, encoding = 'utf-8') as file:
        csvreader = csv.reader(file, delimiter = delim)        
        for i, line in enumerate(csvreader):
            list_return.append(line)
    return list_return

In [41]:
def getFloatCsv(path, delim = ','):
    list_return = []
    with open (path, encoding = 'utf-8') as file:
        csvreader = csv.reader(file, delimiter = delim)        
        for i, line in enumerate(csvreader):
            list_return.append([float(x) for x in line])
    return list_return

### data import

In [42]:
if platform.system() == 'Windows':
    feat = getFloatCsv('..\\output\\feat.csv')
else:
    feat = getFloatCsv('../output/feat.csv')

In [43]:
if platform.system() == 'Windows':
    label = getCsv('..\\output\\labels.csv')
else:
    label = getCsv('../output/labels.csv')

In [44]:
# ll = []
# for line in label:
#     for word in line:
#         if word not in ll:
#             ll.append(word)
# len(ll)'

### numpy prep

In [45]:
import numpy as np

In [46]:
X = np.array(feat)

### sklearn prep

In [47]:
from sklearn.preprocessing import MultiLabelBinarizer

In [48]:
from sklearn import metrics

In [49]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(label)
y = multilabel_binarizer.transform(label)

In [50]:
X, y

(array([[-0.40620422,  0.37869263,  0.58084106, ...,  0.03137207,
          0.64916992, -1.32333374],
        [-0.50666809,  0.45892334,  0.73953247, ..., -0.08215332,
          1.0402832 , -1.47055054],
        [-0.44416809,  0.43939209,  0.61599731, ..., -0.01086426,
          0.69067383, -1.39877319],
        ...,
        [ 2.33901978,  7.3309021 , -1.46138   , ..., -2.83006287,
          7.28421021,  1.01919556],
        [-1.52600098,  1.57382202,  2.0201416 , ..., -1.18414307,
          1.93450928, -0.99139404],
        [-1.51477051,  1.66415405,  1.97241211, ..., -1.46881104,
          2.1239624 , -1.10540771]]), array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [51]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [52]:
print('Dims training set: ', train_X.shape, train_y.shape)
print('Dims training set: ', test_X.shape, test_y.shape)

Dims training set:  (2694, 300) (2694, 125)
Dims training set:  (674, 300) (674, 125)


## Creating own Metric

In [53]:
def getMLA(test_y, pred_labels):
    tp, fp, tn, fn = 0, 0, 0, 0
    for i, labelset in enumerate(test_y):
        for j, label in enumerate(labelset):
            if (label == 1) & (pred_labels[i][j] == 1):
                tp += 1
            if (label == 0) & (pred_labels[i][j] == 1):
                fp += 1
            if (label == 0) & (pred_labels[i][j] == 0):
                tn += 1
            if (label == 1) & (pred_labels[i][j] == 0):
                fn += 1
#     print(tp, fp)
#     print(fn, tn)

    mall = (fp + tp)/(tp+fp+tn+fn)
    mp = tp/(tp+fp)
    return ((mall**(1/9) + mp**(1/9))/2)

### RandomForest

In [54]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
def gridResultRFC(data, n_est1, n_est2, n_est_grid, depth1, depth2, depth_grid, param_3, param3_1_2, param3_grid, filename = 'log_rf_grid', thread = 1):
    train_X, test_X, train_y, test_y = data[0], data[1], data[2], data[3]
    acc_best = 0
    prec_best = 0
    mla_best = 0
    acc_params_best = [0,0,0]
    prec_params_best = [0,0,0]
    mla_params_best = [0,0,0]
    str_log = 'Started at:\n' + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S') + '\nThread: ' + str(thread) + '\n'
    print(str_log)
    est_runs = (n_est2-n_est1)/n_est_grid
    depth_runs = (depth2-depth1)/depth_grid
    param3_runs = (param3_1_2-param_3)/param3_grid
    str_tmp = 'Grid Search will test ' + str(est_runs*depth_runs*param3_runs) + ' combinations.\n'
    str_log += str_tmp
    print(str_tmp)
    
    est_act = n_est1
    depth_act = depth1
    param3_act = param_3
    
    while (est_act < n_est2):
        depth_act = depth1
        while (depth_act < depth2):
            param3_act = param_3
            while (param3_act < param3_1_2):
                rf = RandomForestClassifier(n_estimators=est_act, max_depth=depth_act, n_jobs = -1)
                rf.fit(train_X, train_y)
                pred_rf = rf.predict(test_X)        
                prec =  metrics.precision_score(test_y, pred_rf, average="samples")
                acc = metrics.accuracy_score(test_y, pred_rf)
                mla = getMLA(test_y, pred_rf)
                print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                str_tmp = '----------------------\n'+ 'Thread: ' + str(thread) + '\n' + 'Params: '+str(est_act)+','+str(depth_act)+','+str(param3_act)+'\n'+'Accuracy: '+str(acc)+';'+' Precision: '+str(prec)+';'+' Own Metric: '+str(mla)+'\n'
                print('Own Metric: ', str(mla_best), '\nParams: ', str(mla_params_best))
                print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_params_best))
                print('Precision: ', str(prec_best), '\nParams: ', str(prec_params_best), '\n')
                str_log += str_tmp
                print(str_tmp)
                if acc_best <= acc:
                    acc_best = acc
                    acc_params_best = [est_act,depth_act,param3_act]
                if prec_best <= prec:
                    prec_best = prec
                    prec_params_best = [est_act,depth_act,param3_act]
                if mla_best <= mla:
                    mla_best = mla
                    mla_params_best = [est_act,depth_act,param3_act]
                param3_act += param3_grid
            depth_act += depth_grid
        est_act += n_est_grid
        
    str_tmp = '==========================\n==========================\n\n\n'+'Accuracy: '+str(acc_best)+'\nParams: '+str(acc_params_best) + '\n' + 'Precision: '+str(prec_best)+'\nParams: '+str(prec_params_best)+ 'Own Metric: '+str(mla_best)+'\nParams: '+str(mla_params_best)+'\n'+'Ended at:\n'+datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
    str_log += str_tmp
    
    print('==========================\n==========================\n\n')
    print('Thread: ', thread)
    print('Own Metric: ', str(mla_best), '\nParams: ', str(mla_params_best))
    print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_params_best))
    print('Precision: ', str(prec_best), '\nParams: ', str(prec_params_best), '\n\n')
    print('Ended at:\n' + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    
    writeText(str_log, ('..\\output\\' + filename + '.txt'))             
    

In [56]:
test_train_set = [train_X, test_X, train_y, test_y]

In [57]:
thread_list = []

t1 = threading.Thread(target=gridResultRFC, args=(test_train_set, 100, 151, 10, 40, 45, 2, 1, 3, 3, 'log_rf_grid_1.3', 1))
t2 = threading.Thread(target=gridResultRFC, args=(test_train_set, 100, 151, 10, 46, 51, 2, 1, 3, 3, 'log_rf_grid_2.3', 2))
t3 = threading.Thread(target=gridResultRFC, args=(test_train_set, 100, 151, 10, 52, 57, 2, 1, 3, 3, 'log_rf_grid_3.3', 3))
# t4 = threading.Thread(target=gridResultRFC, args=(test_train_set, 175, 226, 25, 40, 45, 2, 1, 3, 3, 'log_rf_grid_4.2', 4))
# t5 = threading.Thread(target=gridResultRFC, args=(test_train_set, 175, 226, 25, 46, 51, 2, 1, 3, 3, 'log_rf_grid_5.2', 5))
# t6 = threading.Thread(target=gridResultRFC, args=(test_train_set, 175, 226, 25, 52, 57, 2, 1, 3, 3, 'log_rf_grid_6.2', 6))


# Sticks the thread in a list so that it remains accessible
thread_list.append(t1)
thread_list.append(t2)
thread_list.append(t3)
# thread_list.append(t4)
# thread_list.append(t5)
# thread_list.append(t6)

# for thread in thread_list:
#     thread.start()

# for thread in thread_list:
#     thread.join()

print('done')

done


In [58]:
rf = RandomForestClassifier(n_estimators=115, max_depth=42, n_jobs = -1)
rf.fit(train_X, train_y)
pred_rf = rf.predict(test_X)
prec =  metrics.precision_score(test_y, pred_rf, average="samples")
acc = metrics.accuracy_score(test_y, pred_rf)
mla = getMLA(test_y, pred_rf)
print(acc, prec, mla)

0.6008902077151336 0.8413786016159903 0.8405466748774202


In [61]:
rf2 = RandomForestClassifier(n_estimators=125, max_depth=50, n_jobs = -1)
rf2.fit(train_X, train_y)
pred_rf2 = rf2.predict(test_X)
prec =  metrics.precision_score(test_y, pred_rf2, average="samples")
acc = metrics.accuracy_score(test_y, pred_rf2)
mla = getMLA(test_y, pred_rf2)
print(acc, prec, mla)

0.5905044510385756 0.8408699114930569 0.8405806883955813


In [62]:
import _pickle
# save the classifier
with open('dumped_randomforestclassifier.pkl', 'wb') as fid:
    _pickle.dump(rf, fid)  
with open('dumped_randomforestclassifier2.pkl', 'wb') as fid:
    _pickle.dump(rf2, fid)    
with open('binarizer.pkl', 'wb') as fid:
    _pickle.dump(multilabel_binarizer, fid)

In [63]:
# load it again
with open('dumped_randomforestclassifier.pkl', 'rb') as fid:
    rf_load = _pickle.load(fid)
with open('binarizer.pkl', 'rb') as fid:
    bin_load = _pickle.load(fid)

pred_rf2 = rf_load.predict(test_X)
prec =  metrics.precision_score(test_y, pred_rf2, average="samples")
acc = metrics.accuracy_score(test_y, pred_rf2)
print(acc, prec)

0.6008902077151336 0.8413786016159903


In [64]:
pred_rf2[0]

array([0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1.,
       1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 1., 0.,
       1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
       0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.])

In [65]:
test_y[0]

array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
# cnt_lb = 0
# detrans_pred = bin_load.inverse_transform(pred_rf2[400:430])
# detrans_y = bin_load.inverse_transform(test_y[400:430])
# while cnt_lb < 30:
#     print('Predicted Labels: ', detrans_pred[cnt_lb])
#     print('Actual Labels: ', detrans_y[cnt_lb])
#     print('==================')
#     cnt_lb += 1

# kNN

In [66]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
# n = 1
# n_max =50
# l_acc, l_prec = [], []
# acc_n_best, prec_n_best, acc_best, prec_best = 0, 0, 0, 0
# while n < 50:
#     print('Evaluate n=', n)
#     knn = KNeighborsClassifier(n_neighbors=n)
#     knn.fit(train_X, train_y)
    
#     pred_knn = knn.predict(test_X)
#     prec =  metrics.precision_score(test_y, pred_knn, average="samples")
#     acc = metrics.accuracy_score(test_y, pred_knn)
    
#     l_acc.append(acc)
#     l_prec.append(prec)
    
#     if acc_best <= acc:
#         acc_best = acc
#         acc_n_best = n
#     if prec_best <= prec:
#         prec_best = prec
#         prec_n_best = n
        
#     n += 1

In [28]:
# l_prec, l_acc

In [29]:
# acc_n_best, acc_best

In [30]:
# prec_n_best, prec_best

In [31]:
# pred_knn = knn.predict(test_X)
# print('Precision: ', metrics.precision_score(test_y, pred_knn, average="samples"))

In [32]:
# print('Accuracy: ', metrics.accuracy_score(test_y, pred_knn))

# OnevsRest

In [67]:
from sklearn.multiclass import OneVsRestClassifier

# ovr = OneVsRestClassifier(RandomForestClassifier(n_estimators=120, max_depth=50, n_jobs=-1))
# ovr.fit(train_X, train_y)
# ovr_pred = ovr.predict(test_X)

# ovr = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=30))
# ovr.fit(train_X, train_y)
# ovr_pred = ovr.predict(test_X)

# metrics.precision_score(test_y, ovr_pred, average="samples")

In [34]:
def gridResultOVR_kNN(data, param_3, param3_1_2, param3_grid, filename = 'log_ovrknn_grid', thread = 1):
    train_X, test_X, train_y, test_y = data[0], data[1], data[2], data[3]
    acc_best = 0
    prec_best = 0
    mla_best = 0
    acc_params_best = 0
    prec_params_best = 0
    mla_params_best = 0
    str_log = 'Started at:\n' + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S') + '\nThread: ' + str(thread) + '\n'
    print(str_log)
    param3_runs = (param3_1_2-param_3)/param3_grid
    str_tmp = 'Grid Search will test ' + str(param3_runs) + ' combinations.\n'
    str_log += str_tmp
    print(str_tmp)

    param3_act = param_3
    
    while (param3_act < param3_1_2):
        ovr = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=param3_act))
        ovr.fit(train_X, train_y)
        pred_ovr = ovr.predict(test_X)        
        prec =  metrics.precision_score(test_y, pred_ovr, average="samples")
        acc = metrics.accuracy_score(test_y, pred_ovr)
        mla = getMLA(test_y, pred_ovr)
        print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        str_tmp = '----------------------\n'+ 'Thread: ' + str(thread) + '\n' + 'Params: '+str(param3_act)+'\n'+'Accuracy: '+str(acc)+';'+' Precision: '+str(prec)+';'+' Own Metric: '+str(mla)+'\n'
        print('Own Metric: ', str(mla_best), '\nParams: ', str(mla_params_best))
        print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_params_best))
        print('Precision: ', str(prec_best), '\nParams: ', str(prec_params_best), '\n')
        str_log += str_tmp
        print(str_tmp)
        if acc_best <= acc:
            acc_best = acc
            acc_params_best = param3_act
        if prec_best <= prec:
            prec_best = prec
            prec_params_best = param3_act
        if mla_best <= mla:
            mla_best = mla                
            mla_params_best = [param3_act]
        param3_act += param3_grid
        
    str_tmp = '==========================\n==========================\n\n\n'+'Accuracy: '+str(acc_best)+'\nParams: '+str(acc_params_best) + '\n' + 'Precision: '+str(prec_best)+'\nParams: '+str(prec_params_best)+'\n'+'Own Metric: '+str(mla_best)+'\nParams: '+str(mla_params_best)+'\n'+'Ended at:\n'+datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
    str_log += str_tmp
    
    print('==========================\n==========================\n\n')
    print('Thread: ', thread)
    print('Own Metric: ', str(mla_best), '\nParams: ', str(mla_params_best))
    print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_params_best))
    print('Precision: ', str(prec_best), '\nParams: ', str(prec_params_best), '\n\n')
    print('Ended at:\n' + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    
    writeText(str_log, ('..\\output\\' + filename + '.txt'))             
    

In [35]:
thread_list = []

t1 = threading.Thread(target=gridResultOVR_kNN, args=(test_train_set, 5, 16, 5, 'log_ovrknn_grid_1', 1))
t2 = threading.Thread(target=gridResultOVR_kNN, args=(test_train_set, 20, 31, 5, 'log_ovrknn_grid_2', 2))
t3 = threading.Thread(target=gridResultOVR_kNN, args=(test_train_set, 35, 46, 5, 'log_ovrknn_grid_3', 3))

# Sticks the thread in a list so that it remains accessible
thread_list.append(t1)
thread_list.append(t2)
thread_list.append(t3)

# for thread in thread_list:
#     thread.start()

# for thread in thread_list:
#     thread.join()

print('done')

  **self._backend_args)


Started at:
2018-12-05 00:34:15
Thread: 1
Started at:
2018-12-05 00:34:15
Thread: 2

Grid Search will test 2.2 combinations.


Grid Search will test 2.2 combinations.

Started at:
2018-12-05 00:34:15
Thread: 3

Grid Search will test 2.2 combinations.



  **self._backend_args)
  **self._backend_args)
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  'precision', 'predicted', average, warn_for)


2018-12-05 00:36:08
Own Metric:  0 
Params:  0
Accuracy:  0 
Params:  0
Precision:  0 
Params:  0 

----------------------
Thread: 1
Params: 5
Accuracy: 0.5178041543026706; Precision: 0.8059846022754034; Own Metric: 0.83953403547044



  **self._backend_args)
  str(classes[c]))
  str(classes[c]))
  'precision', 'predicted', average, warn_for)


2018-12-05 00:36:52
Own Metric:  0 
Params:  0
Accuracy:  0 
Params:  0
Precision:  0 
Params:  0 

----------------------
Thread: 2
Params: 20
Accuracy: 0.27596439169139464; Precision: 0.6698581833596671; Own Metric: 0.8329807132716607



  **self._backend_args)
  str(classes[c]))
  str(classes[c]))
  'precision', 'predicted', average, warn_for)


2018-12-05 00:37:00
Own Metric:  0 
Params:  0
Accuracy:  0 
Params:  0
Precision:  0 
Params:  0 

----------------------
Thread: 3
Params: 35
Accuracy: 0.21513353115727002; Precision: 0.6356487725923928; Own Metric: 0.8300977405906169



  **self._backend_args)
  str(classes[c]))
  str(classes[c]))
  'precision', 'predicted', average, warn_for)


2018-12-05 00:38:26
Own Metric:  0.83953403547044 
Params:  [5]
Accuracy:  0.5178041543026706 
Params:  5
Precision:  0.8059846022754034 
Params:  5 

----------------------
Thread: 1
Params: 10
Accuracy: 0.413946587537092; Precision: 0.7358077382558094; Own Metric: 0.8362038844359568



  **self._backend_args)
  str(classes[c]))
  str(classes[c]))
  'precision', 'predicted', average, warn_for)


2018-12-05 00:39:26
Own Metric:  0.8329807132716607 
Params:  [20]
Accuracy:  0.27596439169139464 
Params:  20
Precision:  0.6698581833596671 
Params:  20 

----------------------
Thread: 2
Params: 25
Accuracy: 0.258160237388724; Precision: 0.6555424422264184; Own Metric: 0.832056642076885



  **self._backend_args)
  str(classes[c]))
  str(classes[c]))
  'precision', 'predicted', average, warn_for)


2018-12-05 00:39:42
Own Metric:  0.8300977405906169 
Params:  [35]
Accuracy:  0.21513353115727002 
Params:  35
Precision:  0.6356487725923928 
Params:  35 

----------------------
Thread: 3
Params: 40
Accuracy: 0.19881305637982197; Precision: 0.6151669300037252; Own Metric: 0.8290222054092249



  **self._backend_args)
  str(classes[c]))
  str(classes[c]))
  'precision', 'predicted', average, warn_for)


2018-12-05 00:40:51
Own Metric:  0.83953403547044 
Params:  [5]
Accuracy:  0.5178041543026706 
Params:  5
Precision:  0.8059846022754034 
Params:  5 

----------------------
Thread: 1
Params: 15
Accuracy: 0.3486646884272997; Precision: 0.7077873906508921; Own Metric: 0.8350298240082841



Thread:  1
Own Metric:  0.83953403547044 
Params:  [5]
Accuracy:  0.5178041543026706 
Params:  5
Precision:  0.8059846022754034 
Params:  5 


Ended at:
2018-12-05 00:40:51
2018-12-05 00:41:54
Own Metric:  0.8329807132716607 
Params:  [20]
Accuracy:  0.27596439169139464 
Params:  20
Precision:  0.6698581833596671 
Params:  20 

----------------------
Thread: 2
Params: 30
Accuracy: 0.22106824925816024; Precision: 0.633400291597621; Own Metric: 0.8305723227031581



Thread:  2
Own Metric:  0.8329807132716607 
Params:  [20]
Accuracy:  0.27596439169139464 
Params:  20
Precision:  0.6698581833596671 
Params:  20 


Ended at:
2018-12-05 00:41:54
2018-12-05 00:42:11
Own Metric:  0.8300977405906169 
Params:  

In [68]:
ovrknn = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=20))
ovrknn.fit(train_X, train_y)
pred_ovrknn = ovrknn.predict(test_X)
prec =  metrics.precision_score(test_y, pred_ovrknn, average="samples")
acc = metrics.accuracy_score(test_y, pred_ovrknn)
mla = getMLA(test_y, pred_ovrknn)
print(acc, prec, mla)

with open('dumped_ovrknn.pkl', 'wb') as fid:
    _pickle.dump(ovrknn, fid)  

0.27596439169139464 0.6698581833596671 0.8329807132716607


In [18]:
def gridResultOVR_RFC(data, n_est1, n_est2, n_est_grid, depth1, depth2, depth_grid, param_3, param3_1_2, param3_grid, filename = 'log_ovrrfc_grid', thread = 1):
    train_X, test_X, train_y, test_y = data[0], data[1], data[2], data[3]
    acc_best = 0
    prec_best = 0
    mla_best = 0
    acc_params_best = [0,0,0]
    prec_params_best = [0,0,0]
    mla_params_best = [0,0,0]
    str_log = 'Started at:\n' + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S') + '\nThread: ' + str(thread) + '\n'
    print(str_log)
    est_runs = (n_est2-n_est1)/n_est_grid
    depth_runs = (depth2-depth1)/depth_grid
    param3_runs = (param3_1_2-param_3)/param3_grid
    str_tmp = 'Grid Search will test ' + str(est_runs*depth_runs*param3_runs) + ' combinations.\n'
    str_log += str_tmp
    print(str_tmp)
    
    est_act = n_est1
    depth_act = depth1
    param3_act = param_3
    
    while (est_act < n_est2):
        depth_act = depth1
        while (depth_act < depth2):
            param3_act = param_3
            while (param3_act < param3_1_2):
                ovr = OneVsRestClassifier(RandomForestClassifier(n_estimators=est_act, max_depth=depth_act, n_jobs = -1))
                ovr.fit(train_X, train_y)
                pred_ovr = ovr.predict(test_X)        
                prec =  metrics.precision_score(test_y, pred_ovr, average="samples")
                acc = metrics.accuracy_score(test_y, pred_ovr)
                mla = getMLA(test_y, pred_ovr)
                print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                str_tmp = '----------------------\n'+ 'Thread: ' + str(thread) + '\n' + 'Params: '+str(est_act)+','+str(depth_act)+','+str(param3_act)+'\n'+'Accuracy: '+str(acc)+';'+' Precision: '+str(prec)+';'+' Own Metric: '+str(mla)+'\n'
                print('Own Metric: ', str(mla_best), '\nParams: ', str(mla_params_best))
                print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_params_best))
                print('Precision: ', str(prec_best), '\nParams: ', str(prec_params_best), '\n')
                str_log += str_tmp
                print(str_tmp)
                if acc_best <= acc:
                    acc_best = acc
                    acc_params_best = [est_act,depth_act,param3_act]
                if prec_best <= prec:
                    prec_best = prec
                    prec_params_best = [est_act,depth_act,param3_act]
                if mla_best <= mla:
                    mla_best = mla
                    mla_params_best = [est_act,depth_act,param3_act]
                param3_act += param3_grid
            depth_act += depth_grid
        est_act += n_est_grid
        
    str_tmp = '==========================\n==========================\n\n\n'+'Accuracy: '+str(acc_best)+'\nParams: '+str(acc_params_best) + '\n' + 'Precision: '+str(prec_best)+'\nParams: '+str(prec_params_best)+'Own Metric: '+str(mla_best)+'\nParams: '+str(mla_params_best)+'\n'+'Ended at:\n'+datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
    str_log += str_tmp
    
    print('==========================\n==========================\n\n')
    print('Thread: ', thread)
    print('Own Metric: ', str(mla_best), '\nParams: ', str(mla_params_best))
    print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_params_best))
    print('Precision: ', str(prec_best), '\nParams: ', str(prec_params_best), '\n\n')
    print('Ended at:\n' + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    
    writeText(str_log, ('..\\output\\' + filename + '.txt'))             
    

In [38]:
thread_list = []

t1 = threading.Thread(target=gridResultOVR_RFC, args=(test_train_set, 100, 151, 25, 40, 45, 2, 1, 3, 3, 'log_ovrrfc_grid_1', 1))
t2 = threading.Thread(target=gridResultOVR_RFC, args=(test_train_set, 100, 151, 25, 46, 51, 2, 1, 3, 3, 'log_ovrrfc_grid_2', 2))
t3 = threading.Thread(target=gridResultOVR_RFC, args=(test_train_set, 100, 151, 25, 52, 57, 2, 1, 3, 3, 'log_ovrrfc_grid_3', 3))


# Sticks the thread in a list so that it remains accessible
thread_list.append(t1)
thread_list.append(t2)
thread_list.append(t3)

# for thread in thread_list:
#     thread.start()

# for thread in thread_list:
#     thread.join()

print('done')

Started at:
2018-12-05 01:26:49
Thread: 1
Started at:
2018-12-05 01:26:49
Thread: 2

Grid Search will test 3.3999999999999995 combinations.


Grid Search will test 3.3999999999999995 combinations.

Started at:
2018-12-05 01:26:49
Thread: 3

Grid Search will test 3.3999999999999995 combinations.



  **self._backend_args)
  **self._backend_args)
Exception in thread Thread-6:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-18-0c6ac8e2e054>", line 28, in gridResultOVR_RFC
    ovr.fit(train_X, train_y)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multiclass.py", line 215, in fit
    for i, column in enumerate(columns))
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
    job = self._backen

2018-12-05 01:29:19
Own Metric:  0 
Params:  [0, 0, 0]
Accuracy:  0 
Params:  [0, 0, 0]
Precision:  0 
Params:  [0, 0, 0] 

----------------------
Thread: 3
Params: 100,52,1
Accuracy: 0.5534124629080118; Precision: 0.8368522921193545; Own Metric: 0.8399656168281626

2018-12-05 01:30:36
Own Metric:  0.8399656168281626 
Params:  [100, 52, 1]
Accuracy:  0.5534124629080118 
Params:  [100, 52, 1]
Precision:  0.8368522921193545 
Params:  [100, 52, 1] 

----------------------
Thread: 3
Params: 100,54,1
Accuracy: 0.56973293768546; Precision: 0.8372031035235783; Own Metric: 0.8398930845586363

2018-12-05 01:31:56
Own Metric:  0.8399656168281626 
Params:  [100, 52, 1]
Accuracy:  0.56973293768546 
Params:  [100, 54, 1]
Precision:  0.8372031035235783 
Params:  [100, 54, 1] 

----------------------
Thread: 3
Params: 100,56,1
Accuracy: 0.5637982195845698; Precision: 0.8311094197592713; Own Metric: 0.839998000918867

2018-12-05 01:33:22
Own Metric:  0.839998000918867 
Params:  [100, 56, 1]
Accuracy: 

In [69]:
ovrrfc = OneVsRestClassifier(RandomForestClassifier(n_estimators=130, max_depth=43, n_jobs = -1))
ovrrfc.fit(train_X, train_y)
pred_ovrrfc = ovrrfc.predict(test_X)
prec =  metrics.precision_score(test_y, pred_ovrrfc, average="samples")
acc = metrics.accuracy_score(test_y, pred_ovrrfc)
mla = getMLA(test_y, pred_ovrrfc)
print(acc, prec, mla)

with open('dumped_ovrrfc.pkl', 'wb') as fid:
    _pickle.dump(ovrrfc, fid)  

0.5578635014836796 0.828568414770195 0.8398930845586363
