In [2]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from metrics.Multilabel_classification_metrics import Precision, Accuracy, Recall, F1Measure, Hamming_Loss
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import ClassifierChain
import pickle
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, accuracy_score

### Load Data

In [4]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code while
    deserialising, so this must only be pointed at pickle files produced by
    this project, never at files from an untrusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Label binarizer (not a dataframe, despite the name of the path variable);
# a later cell calls its inverse_transform to turn prediction rows into labels.
path_df = "../Pickles/multilabel_binarizer.pickle"
multilabel_binarizer = _load_pickle(path_df)

# features_train
path_features_train = "../Pickles/features_train.pickle"
features_train = _load_pickle(path_features_train)

# labels_train
path_labels_train = "../Pickles/labels_train.pickle"
labels_train = _load_pickle(path_labels_train)

# features_test
path_features_test = "../Pickles/features_test.pickle"
features_test = _load_pickle(path_features_test)

# labels_test
path_labels_test = "../Pickles/labels_test.pickle"
labels_test = _load_pickle(path_labels_test)

### Train model

In [5]:
# Wrap MultinomialNB in a ClassifierChain, fit it on features_train/labels_train,
# then predict label assignments for features_test.
clf = ClassifierChain(MultinomialNB())
clf.fit(features_train,labels_train)
y_predictions = clf.predict(features_test)

In [6]:
# Map the predictions back to label names with the loaded binarizer and display
# the entry at index 95 (the index looks like an arbitrary spot check).
multilabel_binarizer.inverse_transform(y_predictions)[95]

('Fiction', 'Novel')

In [7]:
# Baseline: project-local F1Measure on the predictions returned by clf.predict,
# i.e. before any threshold tuning.
F1Measure(labels_test, y_predictions)

0.28958805744519683

In [8]:
# Per-label scores from predict_proba for features_test; the threshold tuning
# below works on these scores.
# NOTE: a later cell overwrites this array in place with 0/1 decisions.
y_pred_prob = clf.predict_proba(features_test)
y_pred_prob

array([[0.19997835, 0.0358366 , 0.02673108, ..., 0.02620826, 0.0219563 ,
        0.04747359],
       [0.127521  , 0.03953723, 0.05731229, ..., 0.06813338, 0.02401308,
        0.00853458],
       [0.14026314, 0.02803743, 0.11264337, ..., 0.2891638 , 0.01636124,
        0.14843827],
       ...,
       [0.10952384, 0.02176672, 0.08624203, ..., 0.25775045, 0.01720869,
        0.07862722],
       [0.10066258, 0.07224459, 0.09508248, ..., 0.1842166 , 0.05624017,
        0.05701813],
       [0.21828009, 0.07378297, 0.08263376, ..., 0.2160141 , 0.05508707,
        0.26089134]])

In [9]:
def findBestThresholdForLabel(predicted, expected, thresholds=None):
    """Pick the decision threshold that maximises accuracy for a single label.

    Each candidate threshold ``t`` turns the scores into 0/1 predictions via
    ``predicted >= t``; those predictions are scored against ``expected`` with
    ``accuracy_score``.

    Parameters
    ----------
    predicted : sequence or 1-D array of float
        Scores for one label, one entry per sample. Plain lists are accepted;
        they are converted to an array before comparing.
    expected : sequence or 1-D array
        Reference values for the same samples, in the same order.
    thresholds : iterable of float, optional
        Candidate thresholds, tried in the order given. Defaults to
        ``np.arange(0.0, 1.0, 0.005)``.

    Returns
    -------
    float
        The best-scoring candidate. Ties go to the candidate tried last, so
        with the default ascending grid the highest tied threshold wins.
        ``0.0`` is returned when there are no candidates.
    """
    # Convert once so the comparison below does not depend on ``t`` being a
    # NumPy scalar when ``predicted`` is a plain Python list.
    scores = np.asarray(predicted)
    if thresholds is None:
        thresholds = np.arange(0.0, 1.0, 0.005)

    best_thresh = 0.0
    best_accuracy = 0.0
    for t in thresholds:
        y_pred_new = (scores >= t).astype(int)
        accuracy = accuracy_score(expected, y_pred_new)
        # ">=" (not ">") is deliberate: it keeps the original tie-breaking.
        if accuracy >= best_accuracy:
            best_thresh = t
            best_accuracy = accuracy
    return best_thresh
       
def getBestThresholdsForLables(predicted_list, expected_list):
    """Tune one decision threshold per label column.

    ``predicted_list`` and ``expected_list`` are row-per-sample tables
    (anything indexable as ``table[row][col]``). The number of columns is
    taken from the first row of ``predicted_list``; for each column the
    matching scores and expected values are passed to
    ``findBestThresholdForLabel``.

    Returns a list holding one threshold per column, in column order.
    """
    label_count = len(predicted_list[0])
    return [
        findBestThresholdForLabel(
            getColumn(predicted_list, col),
            getColumn(expected_list, col),
        )
        for col in range(label_count)
    ]

def getColumn(list_of_labels, index):
    """Return element ``index`` of every row in ``list_of_labels`` as a list."""
    column = []
    for row in list_of_labels:
        column.append(row[index])
    return column

In [10]:
# One threshold per label column, each chosen to maximise accuracy_score
# against the matching column of labels_test.
# NOTE(review): the thresholds are selected on the same test labels that the
# metric cells below score against, so those scores are likely optimistic.
# Consider tuning on a held-out validation split instead.
list_of_thresholds = getBestThresholdsForLables( y_pred_prob, labels_test)

In [13]:
# Apply one global cut-off to every label.
# NOTE(review): 0.2485 is not derived anywhere in the visible cells, and
# y_pred_new is not read by any later visible cell (the metric cells score
# y_pred_prob). Confirm whether this cell is still needed.
t = 0.2485 # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)

In [11]:
# Binarise every score against its own label's tuned threshold: column j is
# compared with list_of_thresholds[j] in a single broadcast comparison rather
# than an element-by-element Python loop.
# The result is written back into y_pred_prob in place because the metric cells
# below read their predictions from that name; from here on the array holds
# 0.0/1.0 decisions, not probabilities.
y_pred_prob[:] = y_pred_prob >= np.asarray(list_of_thresholds)

In [14]:
# Project-local F1Measure after per-label thresholding; y_pred_prob holds 0/1
# decisions at this point, not probabilities.
F1Measure(labels_test, y_pred_prob)

0.4069765217472451

In [13]:
# scikit-learn accuracy_score on the thresholded predictions. For multilabel
# indicator input scikit-learn documents this as subset accuracy (a sample
# counts only if every label matches); assumes labels_test is such a matrix.
accuracy_score(labels_test, y_pred_prob)

0.12962962962962962

In [15]:
# Project-local Accuracy from metrics.Multilabel_classification_metrics on the
# thresholded predictions; its definition is not visible in this notebook.
Accuracy(labels_test, y_pred_prob)

0.3379678620419364

In [16]:
# Project-local Precision from metrics.Multilabel_classification_metrics on the
# thresholded predictions (y_pred_prob holds 0/1 decisions here).
Precision(labels_test, y_pred_prob)

0.42202625906329666

In [17]:
# Project-local Recall from metrics.Multilabel_classification_metrics on the
# thresholded predictions (y_pred_prob holds 0/1 decisions here).
Recall(labels_test, y_pred_prob)

0.4646531452087007