In [1]:
from MultiEvalDataset import MultiEurlexDataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_recall_fscore_support, multilabel_confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
import itertools as it
import warnings
from sklearn.exceptions import ConvergenceWarning

In [10]:
# Silence solver-convergence chatter and generic user warnings so the
# grid-search output below stays readable.
for _ignored_category in (ConvergenceWarning, UserWarning):
    warnings.simplefilter('ignore', category=_ignored_category)

In [None]:
# Load the three splits for a single language; the training split relies on
# the dataset class's default `split` argument.
LANGUAGE = 'hr'
train_dataset = MultiEurlexDataset(languages=LANGUAGE)
validation_dataset = MultiEurlexDataset(languages=LANGUAGE, split='validation')
test_dataset = MultiEurlexDataset(languages=LANGUAGE, split='test')

In [4]:
# Feature extractor: TF-IDF with a minimum document frequency of 1%.
vectorizer = TfidfVectorizer(min_df=0.01)

# Target encoder: one indicator column per label id in 0..20.
NUM_LABELS = 21
mlb = MultiLabelBinarizer(classes=range(NUM_LABELS))

In [5]:
# Fit the vectorizer and the label binarizer on the training split and
# encode that split in the same pass.
train_frame = train_dataset.data
X_train = vectorizer.fit_transform(train_frame.document)
y_train = mlb.fit_transform(train_frame.labels)

In [6]:
# Encode the validation split with the already-fitted transformers
# (transform only, no refitting).
validation_frame = validation_dataset.data
X_validation = vectorizer.transform(validation_frame.document)
y_validation = mlb.transform(validation_frame.labels)

In [7]:
# Encode the test split with the already-fitted transformers
# (transform only, no refitting).
test_frame = test_dataset.data
X_test = vectorizer.transform(test_frame.document)
y_test = mlb.transform(test_frame.labels)

In [8]:
def train_and_validate(X_train, y_train, X_validation, y_validation, model, configs):
    """Fit one multi-output classifier per configuration and keep the best one.

    Every entry of ``configs`` is unpacked as keyword arguments into ``model``;
    the resulting estimator is wrapped in ``MultiOutputClassifier``, fitted on
    the training data and scored on the validation data. The candidate with
    the highest samples-averaged F1 is selected; on ties the earlier
    candidate is kept.

    Parameters
    ----------
    X_train, y_train
        Training features and label matrix passed to ``fit``.
    X_validation, y_validation
        Validation features and label matrix. ``y_validation`` and the
        predictions must support ``.ravel()`` (used for the accuracy score).
    model
        Callable returning an unfitted estimator, e.g. an estimator class.
    configs
        Iterable of dicts of keyword arguments for ``model``.

    Returns
    -------
    tuple
        ``(best_model, best_config)``: the fitted ``MultiOutputClassifier``
        with the highest validation F1 and the config that produced it.
        ``(None, {})`` only when ``configs`` is empty.
    """
    best_accuracy = 0
    best_f1 = 0
    best_precision = 0
    best_recall = 0
    best_model = None
    best_config = {}

    for index, config in enumerate(configs):
        # Progress indicator: index of the configuration being fitted.
        print(index)
        clf = MultiOutputClassifier(model(**config)).fit(X_train, y_train)
        y_pred = clf.predict(X_validation)

        f1 = f1_score(y_true=y_validation, y_pred=y_pred, average='samples')
        # Flattening both matrices scores every (sample, label) cell on its own.
        acc = accuracy_score(y_true=y_validation.ravel(), y_pred=y_pred.ravel())
        prec = precision_score(y_true=y_validation, y_pred=y_pred, average='samples')
        recall = recall_score(y_true=y_validation, y_pred=y_pred, average='samples')

        # Always accept the first candidate: otherwise a search in which every
        # configuration scores F1 == 0 would return None and break callers
        # that immediately call ``best_model.predict``.
        if best_model is None or f1 > best_f1:
            best_f1 = f1
            best_accuracy = acc
            best_recall = recall
            best_precision = prec
            best_model = clf
            best_config = config

    print(f'Validation \n'
      f'f1: {best_f1} \n'
      f'accuracy: {best_accuracy} \n'
      f'recall: {best_recall} \n'
      f'precision: {best_precision} \n'
      f'best configuration: {best_config}')

    return best_model, best_config

In [None]:
# Grid search over logistic-regression hyper-parameters.
model = LogisticRegression
# NOTE(review): newer scikit-learn releases deprecate the string 'none' in
# favour of penalty=None — confirm against the installed version before
# upgrading.
penalties = ['none', 'l2']
Cs = [1e-3, 1e-2, 1e-1, 1.]
max_iters = [100, 1000]
# C is ignored when no penalty is applied, so only the first C value is tried
# for 'none' instead of refitting the same unregularised model once per C.
# Ordering matches the original product(penalties, Cs, max_iters).
LogReg_configs = [{'penalty': penalty, 'C': C, 'max_iter': max_iter}
                  for penalty in penalties
                  for C in (Cs if penalty != 'none' else Cs[:1])
                  for max_iter in max_iters]

best_model, best_config = train_and_validate(X_train=X_train,
                                             y_train=y_train,
                                             X_validation=X_validation,
                                             y_validation=y_validation,
                                             model=model,
                                             configs=LogReg_configs)
print(best_config)

In [None]:
# Per-label precision / recall / F1 of the selected model on the test split.
y_test_pred = best_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

In [None]:
# Grid search over SVM hyper-parameters.
model = SVC
Cs = [1e-3, 1e-2, 1e-1, 1.]
kernels = ['poly', 'rbf', 'sigmoid']
degrees = range(1, 4)
max_iters = [100, 1000]
# `degree` only affects the 'poly' kernel; for the other kernels only the
# first degree value is kept so the same model is not refitted once per
# degree. Ordering matches the original product(Cs, kernels, degrees, max_iters).
SVM_configs = [{'C': C,
                'kernel': kernel,
                'degree': degree,
                'max_iter': max_iter}
               for C in Cs
               for kernel in kernels
               for degree in (degrees if kernel == 'poly' else degrees[:1])
               for max_iter in max_iters]

best_model, best_config = train_and_validate(X_train=X_train,
                                             y_train=y_train,
                                             X_validation=X_validation,
                                             y_validation=y_validation,
                                             model=model,
                                             configs=SVM_configs)
print(best_config)

0


  _warn_prf(average, modifier, msg_start, len(result))


1




In [None]:
# Per-label precision / recall / F1 of the selected model on the test split.
y_test_pred = best_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))