In [1]:
import os
import sys
import csv
import operator
import time
import datetime
import platform
import threading

In [2]:
def writeText(text, path, mode = 'w'):
    with open (path, mode, encoding = 'utf-8') as textout:
        textout.write((text))
        
def writeJson(json, path, mode = 'w'):
    with open(path, mode) as file:
        file.write(json.dumps(json))
        
def writeCsv(listOut, outputFile):
    with open (outputFile, "w", newline='', encoding = 'utf-8') as outputfile:
        writer = csv.writer(outputfile, delimiter = ",")
        for element in listOut:
            writer.writerow(element)
            
def getTxt(path):
    return open(path, 'r').read()

def getCsv(path, delim = ','):
    list_return = []
    with open (path, encoding = 'utf-8') as file:
        csvreader = csv.reader(file, delimiter = delim)        
        for i, line in enumerate(csvreader):
            list_return.append(line)
    return list_return

In [3]:
def getFloatCsv(path, delim = ','):
    list_return = []
    with open (path, encoding = 'utf-8') as file:
        csvreader = csv.reader(file, delimiter = delim)        
        for i, line in enumerate(csvreader):
            list_return.append([float(x) for x in line])
    return list_return

### data import

In [4]:
if platform.system() == 'Windows':
    feat = getFloatCsv('..\\output\\feat.csv')
else:
    feat = getFloatCsv('../output/feat.csv')

In [5]:
if platform.system() == 'Windows':
    label = getCsv('..\\output\\labels.csv')
else:
    label = getCsv('../output/labels.csv')

### numpy prep

In [6]:
import numpy as np

In [7]:
X = np.array(feat)

### sklearn prep

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

In [9]:
from sklearn import metrics

In [10]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(label)
y = multilabel_binarizer.transform(label)

In [11]:
X, y

(array([[-0.40620422,  0.37869263,  0.58084106, ...,  0.03137207,
          0.64916992, -1.32333374],
        [-0.50666809,  0.45892334,  0.73953247, ..., -0.08215332,
          1.0402832 , -1.47055054],
        [-0.44416809,  0.43939209,  0.61599731, ..., -0.01086426,
          0.69067383, -1.39877319],
        ...,
        [-0.03445435,  0.78988647, -0.72647095, ...,  0.727005  ,
          1.64715576,  1.22302246],
        [-0.02322388,  0.88021851, -0.77420044, ...,  0.44233704,
          1.83660889,  1.10900879],
        [-0.13491821,  0.87011719, -0.56777954, ...,  0.61347961,
          2.03826904,  1.07580566]]), array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [12]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [13]:
print('Dims training set: ', train_X.shape, train_y.shape)
print('Dims training set: ', test_X.shape, test_y.shape)

Dims training set:  (2830, 300) (2830, 352)
Dims training set:  (708, 300) (708, 352)


### RandomForest

In [14]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


# Grid Search

In [15]:
def gridResultRFC(data, n_est1, n_est2, n_est_grid, depth1, depth2, depth_grid, min_leaf1=0, min_leaf2=100, min_leaf_grid=50, filename = 'log_rf_grid', thread = 1):
    train_X, test_X, train_y, test_y = data[0], data[1], data[2], data[3]
    acc_best = 0
    prec_best = 0
    acc_params_best = [0,0,0]
    prec_params_best = [0,0,0]
    str_log = 'Started at:\n' + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S') + '\nThread: ' + str(thread) + '\n'
    print(str_log)
    est_runs = (n_est2-n_est1)/n_est_grid
    depth_runs = (depth2-depth1)/depth_grid
    leaf_runs = (min_leaf2-min_leaf1)/min_leaf_grid
    str_tmp = 'Grid Search will test ' + str(est_runs*depth_runs*leaf_runs) + ' combinations.\n'
    str_log += str_tmp
    print(str_tmp)
    
    est_act = n_est1
    depth_act = depth1
    leaf_act = min_leaf1
    
    while (est_act < n_est2):
        depth_act = depth1
        while (depth_act < depth2):
            leaf_act = min_leaf1
            while (leaf_act < min_leaf2):
                rf = RandomForestClassifier(n_estimators=est_act, max_depth=depth_act, min_samples_leaf=leaf_act)
                rf.fit(train_X, train_y)
                pred_rf = rf.predict(test_X)        
                prec =  metrics.precision_score(test_y, pred_rf, average="samples")
                acc = metrics.accuracy_score(test_y, pred_rf)
                print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                str_tmp = '----------------------\n'+ 'Thread: ' + str(thread) + '\n' + 'Params: '+str(est_act)+','+str(depth_act)+','+str(leaf_act)+'\n'+'Accuracy: '+str(acc)+';'+' Precision: '+str(prec)+'\n'
                print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_params_best))
                print('Precision: ', str(prec_best), '\nParams: ', str(prec_params_best), '\n')
                str_log += str_tmp
                print(str_tmp)
                if acc_best <= acc:
                    acc_best = acc
                    acc_params_best = [est_act,depth_act,leaf_act]
                if prec_best <= prec:
                    prec_best = prec
                    prec_params_best = [est_act,depth_act,leaf_act]
                leaf_act += min_leaf_grid
            depth_act += depth_grid
        est_act += n_est_grid
        
    str_tmp = '==========================\n==========================\n\n\n'+'Accuracy: '+str(acc_best)+'\nParams: '+str(acc_params_best) + '\n' + 'Precision: '+str(prec_best)+'\nParams: '+str(prec_params_best)+'\n'+'Ended at:\n'+datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
    str_log += str_tmp
    
    print('==========================\n==========================\n\n')
    print('Thread: ', thread)
    print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_params_best))
    print('Precision: ', str(prec_best), '\nParams: ', str(prec_params_best), '\n\n')
    print('Ended at:\n' + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    
    writeText(str_log, ('../output/' + filename + '.txt'))             
    

In [17]:
test_train_set = [train_X, test_X, train_y, test_y]

In [None]:
thread_list = []

t1 = threading.Thread(target=gridResultRFC, args=(test_train_set, 100, 151, 25, 20, 36, 3, 1, 52, 25, 'log_rf_grid_1', 1))
t2 = threading.Thread(target=gridResultRFC, args=(test_train_set, 150, 201, 25, 20, 36, 3, 1, 52, 25, 'log_rf_grid_2', 2))
t3 = threading.Thread(target=gridResultRFC, args=(test_train_set, 200, 251, 25, 20, 36, 3, 1, 52, 25, 'log_rf_grid_3', 3))
t4 = threading.Thread(target=gridResultRFC, args=(test_train_set, 100, 151, 25, 35, 51, 3, 1, 52, 25, 'log_rf_grid_4', 4))
t5 = threading.Thread(target=gridResultRFC, args=(test_train_set, 150, 201, 25, 35, 51, 3, 1, 52, 25, 'log_rf_grid_5', 5))
t6 = threading.Thread(target=gridResultRFC, args=(test_train_set, 200, 251, 25, 35, 51, 3, 1, 52, 25, 'log_rf_grid_6', 6))


# Sticks the thread in a list so that it remains accessible
thread_list.append(t1)
thread_list.append(t2)
thread_list.append(t3)
thread_list.append(t4)
thread_list.append(t5)
thread_list.append(t6)

# Starts threads
for thread in thread_list:
    thread.start()

# This blocks the calling thread until the thread whose join() method is called is terminated.
# From http://docs.python.org/2/library/threading.html#thread-objects
for thread in thread_list:
    thread.join()

# Demonstrates that the main process waited for threads to complete
print('done')

Started at:
2018-11-09 23:37:11
Thread: 1

Grid Search will test 18.651428571428568 combinations.

Started at:
2018-11-09 23:37:11
Thread: 2
Started at:
2018-11-09 23:37:11
Thread: 3


Grid Search will test 18.651428571428568 combinations.

Grid Search will test 18.651428571428568 combinations.

Started at:
2018-11-09 23:37:11
Thread: 4

Grid Search will test 18.651428571428568 combinations.

Started at:
2018-11-09 23:37:11
Thread: 5

Grid Search will test 18.651428571428568 combinations.

Started at:
2018-11-09 23:37:11
Thread: 6

Grid Search will test 18.651428571428568 combinations.



  'precision', 'predicted', average, warn_for)


2018-11-09 23:44:22
Accuracy:  0 
Params:  [0, 0, 0]
Precision:  0 
Params:  [0, 0, 0] 

----------------------
Thread: 1
Params: 100,20,40
Accuracy: 0.3983050847457627; Precision: 0.8010356992348519



  'precision', 'predicted', average, warn_for)


2018-11-09 23:44:44
Accuracy:  0 
Params:  [0, 0, 0]
Precision:  0 
Params:  [0, 0, 0] 

----------------------
Thread: 4
Params: 100,35,40
Accuracy: 0.4576271186440678; Precision: 0.8559760429463821



  'precision', 'predicted', average, warn_for)


2018-11-09 23:47:59
Accuracy:  0 
Params:  [0, 0, 0]
Precision:  0 
Params:  [0, 0, 0] 

----------------------
Thread: 2
Params: 150,20,40
Accuracy: 0.3898305084745763; Precision: 0.8027530340030341



  'precision', 'predicted', average, warn_for)


2018-11-09 23:48:35
Accuracy:  0 
Params:  [0, 0, 0]
Precision:  0 
Params:  [0, 0, 0] 

----------------------
Thread: 5
Params: 150,35,40
Accuracy: 0.4562146892655367; Precision: 0.8535384812927186



  'precision', 'predicted', average, warn_for)


2018-11-09 23:51:30
Accuracy:  0 
Params:  [0, 0, 0]
Precision:  0 
Params:  [0, 0, 0] 

----------------------
Thread: 3
Params: 200,20,40
Accuracy: 0.3997175141242938; Precision: 0.8103639494529324



  'precision', 'predicted', average, warn_for)


2018-11-09 23:51:31
Accuracy:  0.3983050847457627 
Params:  [100, 20, 40]
Precision:  0.8010356992348519 
Params:  [100, 20, 40] 

----------------------
Thread: 1
Params: 100,20,75
Accuracy: 0.3983050847457627; Precision: 0.8010356992348519



  'precision', 'predicted', average, warn_for)


2018-11-09 23:52:19
Accuracy:  0 
Params:  [0, 0, 0]
Precision:  0 
Params:  [0, 0, 0] 

----------------------
Thread: 6
Params: 200,35,40
Accuracy: 0.4632768361581921; Precision: 0.8552429248403826



  'precision', 'predicted', average, warn_for)


2018-11-09 23:52:20
Accuracy:  0.4576271186440678 
Params:  [100, 35, 40]
Precision:  0.8559760429463821 
Params:  [100, 35, 40] 

----------------------
Thread: 4
Params: 100,35,75
Accuracy: 0.4576271186440678; Precision: 0.8559760429463821



  'precision', 'predicted', average, warn_for)


2018-11-09 23:58:20
Accuracy:  0.3898305084745763 
Params:  [150, 20, 40]
Precision:  0.8027530340030341 
Params:  [150, 20, 40] 

----------------------
Thread: 2
Params: 150,20,75
Accuracy: 0.3898305084745763; Precision: 0.8027530340030341



  'precision', 'predicted', average, warn_for)


2018-11-09 23:58:31
Accuracy:  0.3983050847457627 
Params:  [100, 20, 75]
Precision:  0.8010356992348519 
Params:  [100, 20, 75] 

----------------------
Thread: 1
Params: 100,23,40
Accuracy: 0.423728813559322; Precision: 0.8283271264203468



  'precision', 'predicted', average, warn_for)


2018-11-09 23:59:34
Accuracy:  0.4562146892655367 
Params:  [150, 35, 40]
Precision:  0.8535384812927186 
Params:  [150, 35, 40] 

----------------------
Thread: 5
Params: 150,35,75
Accuracy: 0.4562146892655367; Precision: 0.8535384812927186



  'precision', 'predicted', average, warn_for)


2018-11-09 23:59:41
Accuracy:  0.4576271186440678 
Params:  [100, 35, 75]
Precision:  0.8559760429463821 
Params:  [100, 35, 75] 

----------------------
Thread: 4
Params: 100,38,40
Accuracy: 0.4505649717514124; Precision: 0.8592083254159526



<br><br><br><br>

In [None]:
# rf = RandomForestClassifier(n_estimators=100, max_depth=15,random_state=0)

In [None]:
# rf.fit(train_X, train_y)
# pred_rf = rf.predict(test_X)

# kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train_X, train_y)

In [None]:
pred_knn = knn.predict(test_X)
print('Precision: ', metrics.precision_score(test_y, pred_knn, average="samples"))

In [None]:
print('Accuracy: ', metrics.accuracy_score(test_y, pred_knn))

# OnevsRest

In [None]:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from numpy import array

clf = OneVsRestClassifier(SVC(probability=True, gamma='auto'))
clf.fit(train_X, train_y)
predictions = clf.predict(test_X)

my_metrics = metrics.classification_report(test_y, predictions)

# print(my_metrics)

In [None]:
print(clf.score(test_X, test_y, sample_weight=None))

# Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mclf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)

In [None]:
mclf.fit(train_X, train_y)
predictionsm = mclf.predict(test_X)

In [None]:
print('Precision: ', metrics.precision_score(test_y, predictionsm,average='samples'))

In [None]:
print('Accuracy: ', metrics.accuracy_score(test_y, predictionsm))