In [1]:
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random
import numpy as np

In [2]:
def load_data(file_path):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(file_path)
    data.class_is_last()
    return data


def check_class_distribution(data):
    class_attribute = data.class_attribute
    counts = [data.attribute_stats(data.class_index).nominal_counts[i] for i in range(class_attribute.num_values)]
    class_labels = [class_attribute.value(i) for i in range(class_attribute.num_values)]
    distribution = dict(zip(class_labels, counts))
    print("Dystrybucja klas w zbiorze danych:", distribution)


def perform_cross_validation(classifier_name, data, folds, repeats, seed=1):
    cls = Classifier(classname=classifier_name)

    summed_confusion_matrix = None
    accuracy_scores = []
    tp_rates = []
    tn_rates = []
    gm_scores = []
    auc_scores = []

    random_instance = Random(seed)

    for _ in range(repeats):
        eval = Evaluation(data)
        eval.crossvalidate_model(cls, data, folds, random_instance)

        if summed_confusion_matrix is None:
            summed_confusion_matrix = np.array(eval.confusion_matrix)
        else:
            summed_confusion_matrix += eval.confusion_matrix

        accuracy_scores.append(eval.percent_correct)
        tp_rates.append(eval.true_positive_rate(1))
        tn_rates.append(eval.true_negative_rate(1))
        gm_scores.append(eval.kappa)
        auc_scores.append(eval.area_under_roc(1))

    summed_confusion_matrix = np.round(summed_confusion_matrix / repeats, 4)  # [0.0] - TN, [0.1] - FP, [1.0] - FN, [1.1] - TP
    mean_accuracy = np.round(np.mean(accuracy_scores), 4)
    mean_tprate = np.round(np.mean(tp_rates), 4)
    mean_tnrate = np.round(np.mean(tn_rates), 4)
    mean_gmean = np.round(np.mean(gm_scores), 4)
    mean_auc = np.round(np.mean(auc_scores), 4)

    return summed_confusion_matrix, mean_accuracy, mean_tprate, mean_tnrate, mean_gmean, mean_auc

In [3]:
jvm.start()

file_path = "input/L4._1_po_weka.arff"
data = load_data(file_path)
check_class_distribution(data)

classifier_name = "weka.classifiers.trees.J48"
folds = 10
repeats = 3

results = perform_cross_validation(classifier_name, data, folds, repeats)

print("Macierz błędów:")
print(results[0])
print("Średnia Accuracy:", results[1])
print("Średni TPrate:", results[2])
print("Średni TNrate:", results[3])
print("Średni GMean:", results[4])
print("Średni AUC:", results[5])

jvm.stop()

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['C:\\Users\\micha\\anaconda3\\envs\\weka_env\\Lib\\site-packages\\javabridge\\jars\\rhino-1.7R4.jar', 'C:\\Users\\micha\\anaconda3\\envs\\weka_env\\Lib\\site-packages\\javabridge\\jars\\runnablequeue.jar', 'C:\\Users\\micha\\anaconda3\\envs\\weka_env\\Lib\\site-packages\\javabridge\\jars\\cpython.jar', 'C:\\Users\\micha\\anaconda3\\envs\\weka_env\\lib\\site-packages\\weka\\lib\\arpack_combined.jar', 'C:\\Users\\micha\\anaconda3\\envs\\weka_env\\lib\\site-packages\\weka\\lib\\core.jar', 'C:\\Users\\micha\\anaconda3\\envs\\weka_env\\lib\\site-packages\\weka\\lib\\mtj.jar', 'C:\\Users\\micha\\anaconda3\\envs\\weka_env\\lib\\site-packages\\weka\\lib\\python-weka-wrapper.jar', 'C:\\Users\\micha\\anaconda3\\envs\\weka_env\\lib\\site-packages\\weka\\lib\\weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled


Dystrybucja klas w zbiorze danych: {'dobry': 1362, 'zly': 219}
Macierz błędów:
[[1344.       18.    ]
 [ 172.6667   46.3333]]
Średnia Accuracy: 87.9401
Średni TPrate: 0.2116
Średni TNrate: 0.9868
Średni GMean: 0.2819
Średni AUC: 0.7851
