In [1]:
import os
import sys
import csv
import operator
import time
import datetime
import platform
import threading

In [2]:
def writeText(text, path, mode = 'w'):
    """Write ``text`` to the file at ``path`` using UTF-8 encoding.

    ``mode`` is handed to ``open`` unchanged; the default ``'w'`` replaces
    any existing content, ``'a'`` appends to it.
    """
    with open(path, mode, encoding='utf-8') as sink:
        sink.write(text)
        
def writeJson(json, path, mode = 'w'):
    """Serialize the object ``json`` as JSON text and write it to ``path``.

    The parameter is called ``json`` (kept for backward compatibility), which
    hides the standard-library module of the same name inside this function.
    The module is therefore imported under a different local name.

    ``mode`` is handed to ``open`` unchanged (default ``'w'`` overwrites).
    Raises ``TypeError`` if the object is not JSON-serializable.
    """
    import json as json_module

    # Serialize before opening so that a non-serializable object does not
    # truncate an existing file.
    payload = json_module.dumps(json)
    with open(path, mode, encoding='utf-8') as file:
        file.write(payload)
        
def writeCsv(listOut, outputFile):
    """Write the rows in ``listOut`` to ``outputFile`` as comma-separated UTF-8.

    Each element of ``listOut`` is one row (an iterable of cell values).
    An existing file is overwritten.
    """
    with open(outputFile, "w", newline='', encoding='utf-8') as sink:
        csv.writer(sink, delimiter=",").writerows(listOut)
            
def getTxt(path):
    """Return the full content of the text file at ``path``.

    The file is decoded as UTF-8, matching what ``writeText`` writes, and is
    closed before returning instead of leaving the handle to the garbage
    collector.
    """
    with open(path, 'r', encoding='utf-8') as textin:
        return textin.read()

def getCsv(path, delim = ','):
    """Read the UTF-8 CSV file at ``path`` and return its rows.

    ``delim`` is the field delimiter. The result is a list with one entry per
    record, each entry being a list of the record's fields as strings.

    The file is opened with ``newline=''`` as the ``csv`` module requires, so
    line breaks inside quoted fields reach the reader untranslated.
    """
    with open(path, newline='', encoding='utf-8') as file:
        return list(csv.reader(file, delimiter=delim))

In [3]:
def getFloatCsv(path, delim = ','):
    """Read the UTF-8 CSV file at ``path`` and return its rows as floats.

    ``delim`` is the field delimiter. Every field of every record is passed
    through ``float``; a field that cannot be converted raises ``ValueError``.
    """
    with open(path, encoding='utf-8') as source:
        records = csv.reader(source, delimiter=delim)
        return [[float(cell) for cell in record] for record in records]

### data import

In [4]:
# Feature matrix rows, one list of floats per record.
# os.path.join uses the platform's own separator, so it produces the same
# path the explicit Windows / non-Windows branches spelled out by hand.
feat = getFloatCsv(os.path.join('..', 'output', 'feat.csv'))

In [5]:
# Label rows, one list of label strings per record (same row order as feat.csv
# is assumed -- TODO confirm against the script that produced both files).
# os.path.join picks the platform's separator, matching the original branches.
label = getCsv(os.path.join('..', 'output', 'labels.csv'))

In [6]:
# ll = []
# for line in label:
#     for word in line:
#         if word not in ll:
#             ll.append(word)
# len(ll)

### numpy prep

In [7]:
import numpy as np

In [8]:
# Turn the parsed feature rows (lists of floats from getFloatCsv) into a NumPy array.
X = np.array(feat)

### sklearn prep

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

In [10]:
from sklearn import metrics

In [11]:
# Learn the label vocabulary and build the binary indicator matrix in one step:
# one row per record, one column per distinct label string.
# NOTE(review): the decoded label tuples printed later in this notebook contain
# both ' social issues' and 'social issues', so labels seem to keep leading
# whitespace from the CSV and are treated as different classes -- confirm
# whether they should be stripped before fitting.
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(label)

In [12]:
X, y

(array([[-0.40620422,  0.37869263,  0.58084106, ...,  0.03137207,
          0.64916992, -1.32333374],
        [-0.50666809,  0.45892334,  0.73953247, ..., -0.08215332,
          1.0402832 , -1.47055054],
        [-0.44416809,  0.43939209,  0.61599731, ..., -0.01086426,
          0.69067383, -1.39877319],
        ...,
        [ 2.33901978,  7.3309021 , -1.46138   , ..., -2.83006287,
          7.28421021,  1.01919556],
        [-1.52600098,  1.57382202,  2.0201416 , ..., -1.18414307,
          1.93450928, -0.99139404],
        [-1.51477051,  1.66415405,  1.97241211, ..., -1.46881104,
          2.1239624 , -1.10540771]]), array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for evaluation; the fixed seed keeps the split
# identical between runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Report the shapes of both partitions. The second line used to be labelled
# "training set" although it prints the test-set shapes.
print('Dims training set: ', train_X.shape, train_y.shape)
print('Dims test set: ', test_X.shape, test_y.shape)

Dims training set:  (2694, 300) (2694, 125)
Dims training set:  (674, 300) (674, 125)


### RandomForest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Grid Search

In [16]:
def gridResultRFC(data, n_est1, n_est2, n_est_grid, depth1, depth2, depth_grid, min_leaf1, min_leaf2, min_leaf_grid, filename = 'log_rf_grid', thread = 1, random_state = None):
    """Grid-search a RandomForestClassifier and log every run to a text file.

    Parameters
    ----------
    data : sequence
        ``[train_X, test_X, train_y, test_y]`` in exactly this order.
    n_est1, n_est2, n_est_grid
        Start (inclusive), stop (exclusive) and step for ``n_estimators``.
    depth1, depth2, depth_grid
        Start (inclusive), stop (exclusive) and step for ``max_depth``.
    min_leaf1, min_leaf2, min_leaf_grid
        Start (inclusive), stop (exclusive) and step for ``min_samples_leaf``.
    filename : str
        Base name (without extension) of the log written to ``../output``.
    thread : any
        Label written to the console and the log to tell workers apart.
    random_state : optional
        Forwarded to every RandomForestClassifier. The default ``None`` keeps
        the previous unseeded behaviour; pass an int for repeatable runs.

    Every combination is fitted on the training data and scored on the test
    data with accuracy and sample-averaged precision. The best value of each
    metric and its ``[n_estimators, max_depth, min_samples_leaf]`` are tracked;
    on ties the later combination wins. Nothing is returned: results are
    printed and written to the log file.

    Raises
    ------
    ValueError
        If any step is not positive (the search would never terminate).
    """
    def _now():
        # Current local time in the format used throughout the log.
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def _grid_values(start, stop, step):
        # Values start, start + step, ... that are strictly below stop.
        if step <= 0:
            raise ValueError('grid step must be positive, got ' + str(step))
        values = []
        current = start
        while current < stop:
            values.append(current)
            current += step
        return values

    train_X, test_X, train_y, test_y = data[0], data[1], data[2], data[3]
    acc_best = 0
    prec_best = 0
    acc_params_best = [0,0,0]
    prec_params_best = [0,0,0]

    str_log = 'Started at:\n' + _now() + '\nThread: ' + str(thread) + '\n'
    print(str_log)

    est_values = _grid_values(n_est1, n_est2, n_est_grid)
    depth_values = _grid_values(depth1, depth2, depth_grid)
    leaf_values = _grid_values(min_leaf1, min_leaf2, min_leaf_grid)

    # Count the combinations that are really visited. The earlier
    # (stop - start) / step formula returned a fractional, too-small number.
    n_combinations = len(est_values) * len(depth_values) * len(leaf_values)
    str_tmp = 'Grid Search will test ' + str(n_combinations) + ' combinations.\n'
    str_log += str_tmp
    print(str_tmp)

    for est_act in est_values:
        for depth_act in depth_values:
            for leaf_act in leaf_values:
                rf = RandomForestClassifier(n_estimators=est_act, max_depth=depth_act, min_samples_leaf=leaf_act, random_state=random_state)
                rf.fit(train_X, train_y)
                pred_rf = rf.predict(test_X)
                prec = metrics.precision_score(test_y, pred_rf, average="samples")
                acc = metrics.accuracy_score(test_y, pred_rf)

                # Update the running best first, so the "best so far" lines
                # printed below already include the current combination.
                if acc_best <= acc:
                    acc_best = acc
                    acc_params_best = [est_act,depth_act,leaf_act]
                if prec_best <= prec:
                    prec_best = prec
                    prec_params_best = [est_act,depth_act,leaf_act]

                print(_now())
                str_tmp = '----------------------\n'+ 'Thread: ' + str(thread) + '\n' + 'Params: '+str(est_act)+','+str(depth_act)+','+str(leaf_act)+'\n'+'Accuracy: '+str(acc)+';'+' Precision: '+str(prec)+'\n'
                print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_params_best))
                print('Precision: ', str(prec_best), '\nParams: ', str(prec_params_best), '\n')
                str_log += str_tmp
                print(str_tmp)

    str_tmp = '==========================\n==========================\n\n\n'+'Accuracy: '+str(acc_best)+'\nParams: '+str(acc_params_best) + '\n' + 'Precision: '+str(prec_best)+'\nParams: '+str(prec_params_best)+'\n'+'Ended at:\n'+_now()
    str_log += str_tmp

    print('==========================\n==========================\n\n')
    print('Thread: ', thread)
    print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_params_best))
    print('Precision: ', str(prec_best), '\nParams: ', str(prec_params_best), '\n\n')
    print('Ended at:\n' + _now())

    # Build the path with the platform's separator. The hard-coded
    # '..\\output\\' prefix only pointed into ../output on Windows.
    writeText(str_log, os.path.join('..', 'output', filename + '.txt'))
    

In [17]:
# Bundle the split in the order the grid-search functions unpack it:
# data[0]=train_X, data[1]=test_X, data[2]=train_y, data[3]=test_y.
test_train_set = [train_X, test_X, train_y, test_y]

In [18]:
# One (n_est1, n_est2, n_est_grid, depth1, depth2, depth_grid) slice of the
# random-forest grid per worker. The min_samples_leaf range (1, 3, step 3)
# is the same for all of them.
rf_grid_slices = [
    (100, 151, 25, 40, 45, 2),
    (100, 151, 25, 46, 51, 2),
    (100, 151, 25, 52, 57, 2),
    (175, 226, 25, 40, 45, 2),
    (175, 226, 25, 46, 51, 2),
    (175, 226, 25, 52, 57, 2),
]

# Keep the thread objects in a list so they remain accessible.
thread_list = [
    threading.Thread(
        target=gridResultRFC,
        args=(test_train_set,) + grid_slice + (1, 3, 3, 'log_rf_grid_' + str(number) + '.2', number),
    )
    for number, grid_slice in enumerate(rf_grid_slices, start=1)
]
t1, t2, t3, t4, t5, t6 = thread_list

# The search is disabled; uncomment both loops to start the workers and wait
# for all of them to finish.
# for thread in thread_list:
#     thread.start()

# for thread in thread_list:
#     thread.join()

print('done')

done


In [19]:
# Fit the final random forest and score it on the held-out split.
# random_state is fixed so that the fitted model, the metrics printed here and
# the pickled classifier saved afterwards can be reproduced on a re-run.
rf = RandomForestClassifier(n_estimators=120, max_depth=50, min_samples_leaf=1, random_state=42)
rf.fit(train_X, train_y)
pred_rf = rf.predict(test_X)
# average="samples": precision is computed per record and then averaged.
prec =  metrics.precision_score(test_y, pred_rf, average="samples")
acc = metrics.accuracy_score(test_y, pred_rf)
print(acc, prec)

0.6068249258160238 0.8416859352319291


  'precision', 'predicted', average, warn_for)


In [20]:
# Use the documented public module. `_pickle` is CPython's private accelerator,
# which `pickle` already uses automatically when it is available.
import pickle
_pickle = pickle  # keeps the old name bound for any later cell that still refers to `_pickle`

# Save the classifier together with the binarizer needed to decode its output.
with open('my_dumped_classifier.pkl', 'wb') as fid:
    pickle.dump(rf, fid)
with open('my_dumped_binarizer.pkl', 'wb') as fid:
    pickle.dump(multilabel_binarizer, fid)

In [22]:
# load it again
with open('my_dumped_classifier.pkl', 'rb') as fid:
    rf_load = _pickle.load(fid)
with open('my_dumped_binarizer.pkl', 'rb') as fid:
    bin_load = _pickle.load(fid)

In [23]:
# Score the reloaded model on the same held-out split; matching numbers show
# that the pickle round trip preserved the classifier.
pred_rf2 = rf_load.predict(test_X)
acc = metrics.accuracy_score(test_y, pred_rf2)
prec = metrics.precision_score(test_y, pred_rf2, average="samples")
print(acc, prec)

0.6068249258160238 0.8416859352319291


  'precision', 'predicted', average, warn_for)


In [24]:
pred_rf2[0]

array([0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1.,
       1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 1., 0.,
       1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
       0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.])

In [42]:
# Decode a slice of predictions and ground truth back into label tuples and
# print them side by side for a visual check.
sample_rows = slice(400, 430)
detrans_pred = bin_load.inverse_transform(pred_rf2[sample_rows])
detrans_y = bin_load.inverse_transform(test_y[sample_rows])

# Iterate over the decoded rows themselves rather than counting to a
# hard-coded 30, so a shorter slice cannot raise an IndexError.
for predicted_labels, actual_labels in zip(detrans_pred, detrans_y):
    print('Predicted Labels: ', predicted_labels)
    print('Actual Labels: ', actual_labels)
    print('==================')

Predicted Labels:  (' health', ' social issues', 'health_medical_pharma')
Actual Labels:  (' health', ' social issues', 'health_medical_pharma')
Predicted Labels:  (' environment', 'disaster_accident')
Actual Labels:  (' environment', 'disaster_accident')
Predicted Labels:  (' unemployment', ' workforce', 'labor')
Actual Labels:  (' unemployment', ' workforce', 'labor')
Predicted Labels:  (' female labor force in the muslim world', ' workforce', ' world bank population', 'social issues')
Actual Labels:  (' female labor force in the muslim world', ' world bank population', 'social issues')
Predicted Labels:  (' world bank population', 'social issues')
Actual Labels:  (' law_crime', ' world bank population', 'social issues')
Predicted Labels:  (' demography', 'social issues')
Actual Labels:  (' demography', 'social issues')
Predicted Labels:  (' international standard industrial classification', 'national accounts')
Actual Labels:  (' international standard industrial classification', 'n

# kNN

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
# n = 1
# n_max =50
# l_acc, l_prec = [], []
# acc_n_best, prec_n_best, acc_best, prec_best = 0, 0, 0, 0
# while n < 50:
#     print('Evaluate n=', n)
#     knn = KNeighborsClassifier(n_neighbors=n)
#     knn.fit(train_X, train_y)
    
#     pred_knn = knn.predict(test_X)
#     prec =  metrics.precision_score(test_y, pred_knn, average="samples")
#     acc = metrics.accuracy_score(test_y, pred_knn)
    
#     l_acc.append(acc)
#     l_prec.append(prec)
    
#     if acc_best <= acc:
#         acc_best = acc
#         acc_n_best = n
#     if prec_best <= prec:
#         prec_best = prec
#         prec_n_best = n
        
#     n += 1

In [28]:
# l_prec, l_acc

In [29]:
# acc_n_best, acc_best

In [30]:
# prec_n_best, prec_best

In [31]:
# pred_knn = knn.predict(test_X)
# print('Precision: ', metrics.precision_score(test_y, pred_knn, average="samples"))

In [32]:
# print('Accuracy: ', metrics.accuracy_score(test_y, pred_knn))

# OnevsRest

In [33]:
# from sklearn import metrics
# from sklearn.preprocessing import MultiLabelBinarizer
# from sklearn.multiclass import OneVsRestClassifier
# from sklearn.svm import SVC
# from numpy import array

# clf = OneVsRestClassifier(SVC(probability=True, gamma='auto'))
# clf.fit(train_X, train_y)
# predictions = clf.predict(test_X)

# my_metrics = metrics.classification_report(test_y, predictions)

# # print(my_metrics)

In [34]:
# print(clf.score(test_X, test_y, sample_weight=None))

# Neural Network

In [35]:
from sklearn.neural_network import MLPClassifier

In [36]:
# mclf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)

In [37]:
# mclf.fit(train_X, train_y)
# predictionsm = mclf.predict(test_X)

In [38]:
# print('Precision: ', metrics.precision_score(test_y, predictionsm,average='samples'))

In [39]:
# print('Accuracy: ', metrics.accuracy_score(test_y, predictionsm))

In [40]:
def gridResultNN(data, hl1, hl2, hlg, filename, thread):
    """Search over the hidden-layer width of an MLPClassifier and log each run.

    Parameters
    ----------
    data : sequence
        ``[train_X, test_X, train_y, test_y]`` in exactly this order.
    hl1, hl2, hlg
        Start (inclusive), stop (exclusive) and step of the value passed as
        ``hidden_layer_sizes=(value,)``, i.e. the size of one hidden layer
        (the log messages call this "Hidden Layers").
    filename : str
        Base name (without extension) of the log written to ``../output``.
    thread : any
        Label written to the console and the log to tell workers apart.

    Each candidate is fitted on the training data and scored on the test data
    with accuracy and sample-averaged precision. The best value of each metric
    and the width that produced it are tracked; on ties the later width wins.
    Nothing is returned: results are printed and written to the log file.

    Raises
    ------
    ValueError
        If ``hlg`` is not positive (the search would never terminate).
    """
    def _now():
        # Current local time in the format used throughout the log.
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    if hlg <= 0:
        raise ValueError('grid step must be positive, got ' + str(hlg))

    train_X, test_X, train_y, test_y = data[0], data[1], data[2], data[3]
    acc_n_best, prec_n_best, acc_best, prec_best = 0, 0, 0, 0
    str_log = 'Started at:\n' + _now() + '\nThread: ' + str(thread) + '\n'
    print(str_log)

    hla = hl1
    while hla < hl2:
        print('Evaluate Hidden Layers = ', hla)
        mclf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(hla,), random_state=1)
        mclf.fit(train_X, train_y)
        predictionsm = mclf.predict(test_X)
        prec =  metrics.precision_score(test_y, predictionsm, average="samples")
        acc = metrics.accuracy_score(test_y, predictionsm)

        if acc_best <= acc:
            acc_best = acc
            acc_n_best = hla
        if prec_best <= prec:
            prec_best = prec
            prec_n_best = hla

        # Assign instead of "+=": the old accumulation re-printed every earlier
        # run and appended all of them to the log again on each iteration.
        str_tmp = 'Evaluate Hidden Layers = ' + str(hla) + '\n' + 'Acc: ' + str(acc) + ', Prec: ' + str(prec) + '\n'
        print(str_tmp)
        str_log += str_tmp
        hla += hlg

    print('==========================\n==========================\n\n')
    print('Thread: ', thread)
    print('Accuracy: ', str(acc_best), '\nParams: ', str(acc_n_best))
    print('Precision: ', str(prec_best), '\nParams: ', str(prec_n_best), '\n\n')

    # Record the best results in the log too (they were only printed before),
    # using the same layout as the random-forest grid-search log.
    str_log += '==========================\n==========================\n\n\n'+'Accuracy: '+str(acc_best)+'\nParams: '+str(acc_n_best) + '\n' + 'Precision: '+str(prec_best)+'\nParams: '+str(prec_n_best)+'\n'

    str_tmp = ('Ended at:\n' + _now())
    print(str_tmp)
    str_log += '\n' + str_tmp

    # Build the path with the platform's separator. The hard-coded
    # '..\\output\\' prefix only pointed into ../output on Windows.
    writeText(str_log, os.path.join('..', 'output', filename + '.txt'))

In [41]:
# One (hl1, hl2, hlg) range of hidden-layer widths per worker.
nn_grid_slices = [
    (2, 51, 2),
    (52, 200, 10),
    (220, 500, 30),
    (520, 1000, 50),
    (1000, 1800, 75),
    (1800, 3000, 180),
]

# Keep the thread objects in a list so they remain accessible.
thread_list = [
    threading.Thread(
        target=gridResultNN,
        args=(test_train_set,) + grid_slice + ('log_nn_grid_' + str(number), number),
    )
    for number, grid_slice in enumerate(nn_grid_slices, start=1)
]
t1, t2, t3, t4, t5, t6 = thread_list

# The search is disabled; uncomment both loops to run it.
# # Starts threads
# for thread in thread_list:
#     thread.start()

# # join() blocks the calling thread until the thread it is called on terminates.
# # From http://docs.python.org/2/library/threading.html#thread-objects
# for thread in thread_list:
#     thread.join()

# Marks the end of the cell. It only shows that the workers have finished
# when the start()/join() loops above are enabled.
print('done')

done
