examples' descriptions added, test fixes, seaborn initial
lukapecnik committed Nov 30, 2020
1 parent 992ce86 commit 7fe889a
Showing 19 changed files with 142 additions and 13 deletions.
5 changes: 5 additions & 0 deletions examples/classifier.py
@@ -3,6 +3,11 @@
from niaaml.data import CSVDataReader
import numpy

"""
In this example, we show how to individually use an implemented classifier and its methods. In this case we use AdaBoost for demonstration, but
you can use any of the implemented classifiers in the same way.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv', has_header=False, contains_classes=True)

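The hunk above shows only the data-reader setup. A minimal sketch of how the rest of this example presumably drives the classifier, assuming the classifier interface exposes set_parameters, fit and predict and the data reader exposes get_x/get_y (these names are assumptions, not shown in this hunk):

from niaaml.classifiers import AdaBoost

# hedged sketch: method and parameter names are assumed, not taken from this diff
classifier = AdaBoost()
classifier.set_parameters(n_estimators=50)                # optional hyperparameters (name assumed)
classifier.fit(data_reader.get_x(), data_reader.get_y())  # train on the CSV data loaded above
predictions = classifier.predict(data_reader.get_x())     # predict class labels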
4 changes: 4 additions & 0 deletions examples/export_pipeline_object.py
@@ -3,6 +3,10 @@
from niaaml.preprocessing.feature_selection import SelectKBest
from niaaml.preprocessing.feature_transform import Normalizer

"""
In this example, we show how to export a pipeline object into a file that can later be loaded back into a Python program as a Pipeline object.
"""

# instantiate a Pipeline object with AdaBoost classifier, SelectKBest feature selection algorithm and Normalizer as feature transformation algorithm
pipeline = Pipeline(
feature_selection_algorithm=SelectKBest(),
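A hedged sketch of how the export step presumably completes this example; the export method name is an assumption based on the example's description, while the .ppln extension matches the file loaded in load_pipeline_object_file.py below:

# save the Pipeline object so it can later be restored with Pipeline.load (method name assumed)
pipeline.export('exported_pipeline.ppln')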
5 changes: 5 additions & 0 deletions examples/export_pipeline_text.py
@@ -3,6 +3,11 @@
from niaaml.preprocessing.feature_selection import SelectKBest
from niaaml.preprocessing.feature_transform import Normalizer

"""
In this example, we show how to export a pipeline object into a text file in a user-friendly form. The text file cannot be loaded back into a Python program
as a Pipeline object.
"""

# instantiate a Pipeline object with AdaBoost classifier, SelectKBest feature selection algorithm and Normalizer as feature transformation algorithm
pipeline = Pipeline(
feature_selection_algorithm=SelectKBest(),
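Correspondingly, a sketch of the text export this example presumably ends with (the method name is an assumption):

# write a human-readable description of the pipeline; this text file cannot be loaded back as a Pipeline object
pipeline.export_text('exported_pipeline.txt')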
5 changes: 5 additions & 0 deletions examples/factories.py
@@ -3,6 +3,11 @@
from niaaml.preprocessing.feature_transform import FeatureTransformAlgorithmFactory
from niaaml.fitness import FitnessFactory

"""
In this example, we show how to use all of the implemented factories to create new object instances using their class names. You may also
import and instantiate objects directly, but it is more convenient to use factories in some cases.
"""

# instantiate all possible factories
classifier_factory = ClassifierFactory()
fsa_factory = FeatureSelectionAlgorithmFactory()
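A short sketch of how the factories instantiated above are presumably used, assuming they expose a get_result(class_name) style method (the method name is an assumption, not shown in this hunk):

# create new instances from class names instead of importing the classes directly
classifier = classifier_factory.get_result('AdaBoost')
feature_selector = fsa_factory.get_result('SelectKBest')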
5 changes: 5 additions & 0 deletions examples/feature_selection.py
@@ -3,6 +3,11 @@
from niaaml.data import CSVDataReader
from sklearn.feature_selection import chi2

"""
In this example, we show how to individually use an implemented feature selection algorithm and its methods. In this case we use SelectKBest for demonstration, but
you can use any of the implemented feature selection algorithms in the same way.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv', has_header=False, contains_classes=True)

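A minimal sketch of how the selection step presumably continues, assuming the feature selection interface exposes a select_features(x, y) method and the data reader exposes get_x/get_y (assumed names); the chi2 import above suggests a configurable scoring function, but the exact parameter name is not shown here:

from niaaml.preprocessing.feature_selection import SelectKBest

selector = SelectKBest()
# returns a mask of the selected features (assumed return value)
features_mask = selector.select_features(data_reader.get_x(), data_reader.get_y())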
5 changes: 5 additions & 0 deletions examples/feature_transform.py
@@ -2,6 +2,11 @@
import os
from niaaml.data import CSVDataReader

"""
In this example, we show how to individually use an implemented feature transform algorithm and its methods. In this case we use Normalizer for demonstration, but
you can use any of the implemented feature transform algorithms in the same way.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv', has_header=False, contains_classes=True)

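A minimal sketch of the transform step, assuming a fit/transform interface on the feature transform algorithms and get_x on the data reader (assumed names):

from niaaml.preprocessing.feature_transform import Normalizer

transformer = Normalizer()
transformer.fit(data_reader.get_x())                       # fit the transformer to the features
transformed = transformer.transform(data_reader.get_x())   # apply the transformation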
5 changes: 5 additions & 0 deletions examples/fitness.py
@@ -3,6 +3,11 @@
import os
import numpy

"""
In this example, we show how to individually use an implemented fitness function and its method. In this case we use Precision for demonstration, but
you can use any of the implemented fitness functions in the same way.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv', has_header=False, contains_classes=True)

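A sketch of how the fitness value is presumably computed at the end of this example; the Precision class comes from the example's description, while the get_fitness(predicted, expected) method name is an assumption:

from niaaml.fitness import Precision
import numpy

# dummy class labels just to illustrate the call (values are made up)
predicted = numpy.array(['Class 1', 'Class 1', 'Class 2', 'Class 2'])
expected = numpy.array(['Class 1', 'Class 2', 'Class 2', 'Class 2'])

fitness_function = Precision()
precision = fitness_function.get_fitness(predicted, expected)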
5 changes: 5 additions & 0 deletions examples/load_data_basic.py
@@ -1,6 +1,11 @@
from niaaml.data import BasicDataReader
import numpy

"""
In this example, we show how to instantiate BasicDataReader and use its methods. You can use it to store data in a single variable
or as an input to an instance of the PipelineOptimizer class.
"""

# a BasicDataReader instance takes arrays as input (x and y arrays)
data_reader = BasicDataReader(
x=numpy.random.uniform(low=0.0, high=15.0, size=(50, 3)),
5 changes: 5 additions & 0 deletions examples/load_data_csv.py
@@ -1,6 +1,11 @@
import os
from niaaml.data import CSVDataReader

"""
In this example, we show how to instantiate CSVDataReader and use its methods. You can use it to store data in a single variable
or as an input to an instance of the PipelineOptimizer class.
"""

# CSVDataReader gets a path to a CSV file as input, then reads and parses it into the x and y arrays
# the has_header and contains_classes arguments need to be set according to the input CSV file's structure
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv', has_header=False, contains_classes=True)
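Once constructed, both data readers are presumably queried the same way; the get_x/get_y accessor names are assumptions based on how the other examples consume a data reader:

x = data_reader.get_x()   # feature matrix parsed from the CSV file
y = data_reader.get_y()   # class labels (only available when contains_classes=True)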
4 changes: 4 additions & 0 deletions examples/load_pipeline_object_file.py
@@ -1,6 +1,10 @@
import os
from niaaml import Pipeline

"""
In this example, we show how to load a saved Pipeline object from a file. You can use all of its methods after it's been successfully loaded.
"""

# load Pipeline object from a file
pipeline = Pipeline.load(os.path.dirname(os.path.abspath(__file__)) + '/example_files/pipeline.ppln')

4 changes: 4 additions & 0 deletions examples/optimize_run_pipeline.py
@@ -6,6 +6,10 @@
import os
import numpy

"""
In this example, we show how to individually use the Pipeline class. You may use this if you want to test out a specific classification pipeline.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv', has_header=False, contains_classes=True)

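The rest of this example presumably instantiates a Pipeline and calls its optimize method, whose signature is documented in this commit's niaaml/pipeline.py hunk further down; the chosen component classes, the AdaBoost import and all numeric/string argument values below are illustrative assumptions:

from niaaml import Pipeline
from niaaml.classifiers import AdaBoost
from niaaml.preprocessing.feature_selection import SelectKBest
from niaaml.preprocessing.feature_transform import Normalizer

pipeline = Pipeline(
    feature_selection_algorithm=SelectKBest(),
    feature_transform_algorithm=Normalizer(),
    classifier=AdaBoost()
)

# optimize(x, y, population_size, number_of_evaluations, optimization_algorithm, fitness_function)
fitness = pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 20, 400, 'ParticleSwarmAlgorithm', 'Accuracy')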
5 changes: 5 additions & 0 deletions examples/run_pipeline_optimizer_array_data.py
@@ -2,6 +2,11 @@
from niaaml.data import BasicDataReader
import numpy

"""
In this example, we show how to use the PipelineOptimizer class. This example uses an instance of BasicDataReader.
The instantiated PipelineOptimizer will try to assemble the best pipeline from the components specified in its constructor.
"""

# prepare data reader using features and classes from arrays
# in this case random dummy arrays are generated
data_reader = BasicDataReader(
5 changes: 5 additions & 0 deletions examples/run_pipeline_optimizer_csv_data.py
@@ -2,6 +2,11 @@
from niaaml import PipelineOptimizer, Pipeline
from niaaml.data import CSVDataReader

"""
In this example, we show how to use the PipelineOptimizer class. This example uses an instance of CSVDataReader.
The instantiated PipelineOptimizer will try to assemble the best pipeline from the components specified in its constructor.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset.csv', has_header=False, contains_classes=True)

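A hedged sketch of how the optimizer itself is presumably configured and run; the constructor keywords, the run argument order and all component/algorithm names below are assumptions based on the example's description rather than this hunk:

pipeline_optimizer = PipelineOptimizer(
    data=data_reader,
    classifiers=['AdaBoost'],
    feature_selection_algorithms=['SelectKBest', 'SelectPercentile'],
    feature_transform_algorithms=['Normalizer']
)

# returns the best Pipeline assembled from the components above (argument order assumed)
pipeline = pipeline_optimizer.run('Accuracy', 20, 20, 400, 400, 'ParticleSwarmAlgorithm', 'ParticleSwarmAlgorithm')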
14 changes: 13 additions & 1 deletion niaaml/pipeline.py
@@ -127,6 +127,12 @@ def optimize(self, x, y, population_size, number_of_evaluations, optimization_al
optimization_algorithm (str): Name of the optimization algorithm to use.
fitness_function (str): Name of the fitness function to use.
Notes:
Stratified K-Fold Cross Validation in our optimization process splits the input dataset 11 times, but we are effectively
running a stratified 10-fold cross validation, since the first iteration is only used to fit the
feature selection and feature transform algorithms. This way the evaluation is faster with no difference in
quality.
Returns:
float: Best fitness value found in optimization process.
"""
@@ -277,6 +283,12 @@ def evaluate(D, sol):
D (uint): Number of dimensions.
sol (numpy.ndarray[float]): Individual of the population / possible solution.
Notes:
Stratified K-Fold Cross Validation in our optimization process splits the input dataset 11 times, but we are effectively
running a stratified 10-fold cross validation, since the first iteration is only used to fit the
feature selection and feature transform algorithms. This way the evaluation is faster with no difference in
quality.
Returns:
float: Fitness.
"""
@@ -350,7 +362,7 @@ def evaluate(D, sol):
self.__parent.set_feature_transform_algorithm(feature_transform_algorithm)
self.__parent.set_classifier(classifier)
self.__parent.set_selected_features_mask(selected_features_mask)
self.__parent.set_stats(OptimizationStats(predictions, y_test))
self.__parent.set_stats(OptimizationStats(predictions, y_test, scores))

return fitness
except:
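For readers unfamiliar with the evaluation scheme described in the Notes added above, a rough, self-contained illustration of the pattern using plain scikit-learn (an analogy with dummy data, not the library's actual implementation):

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
import numpy as np

# dummy, balanced data just to illustrate the splitting pattern
x = np.random.uniform(size=(50, 3))
y = np.array(['Class 1', 'Class 2'] * 25)

scores = []
skf = StratifiedKFold(n_splits=11)
for i, (train_index, test_index) in enumerate(skf.split(x, y)):
    if i == 0:
        # the first split would only be used to fit feature selection / feature transform algorithms
        continue
    # the remaining 10 splits behave like a stratified 10-fold cross validation
    clf = AdaBoostClassifier().fit(x[train_index], y[train_index])
    scores.append(clf.score(x[test_index], y[test_index]))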
2 changes: 1 addition & 1 deletion niaaml/tests/test_pipeline.py
@@ -81,7 +81,7 @@ def test_pipeline_setters_work_fine(self):
'Class 1', 'Class 1', 'Class 1', 'Class 2', 'Class 1', 'Class 1',
'Class 2', 'Class 2', 'Class 1', 'Class 2', 'Class 1', 'Class 2',
'Class 2', 'Class 2'])
self.__pipeline.set_stats(OptimizationStats(self.__predicted, self.__y))
self.__pipeline.set_stats(OptimizationStats(self.__predicted, self.__y, numpy.array([0.88, 0.9, 0.91, 0.87, 0.7, 0.98, 0.95, 0.86, 0.88, 0.76])))

self.assertIsInstance(self.__pipeline.get_classifier(), AdaBoost)
self.assertIsInstance(self.__pipeline.get_feature_selection_algorithm(), SelectPercentile)
31 changes: 23 additions & 8 deletions niaaml/tests/test_utilities.py
@@ -1,6 +1,8 @@
from unittest import TestCase
from niaaml import ParameterDefinition, MinMax, OptimizationStats, get_bin_index
import numpy as np
import tempfile
import os

class UtilitiesTestCase(TestCase):
def test_get_bin_index_works_fine(self):
@@ -21,21 +23,34 @@ def test_works_fine(self):
self.assertEqual(parameter_definition.param_type, float)

class OptimizationStatsTestCase(TestCase):
def test_works_fine(self):
self.__y = np.array(['Class 1', 'Class 1', 'Class 1', 'Class 2', 'Class 1', 'Class 2',
def setUp(self):
y = np.array(['Class 1', 'Class 1', 'Class 1', 'Class 2', 'Class 1', 'Class 2',
'Class 2', 'Class 2', 'Class 2', 'Class 1', 'Class 1', 'Class 2',
'Class 1', 'Class 2', 'Class 1', 'Class 1', 'Class 1', 'Class 1',
'Class 2', 'Class 1'])
self.__predicted = np.array(['Class 1', 'Class 1', 'Class 1', 'Class 2', 'Class 2', 'Class 2',
predicted = np.array(['Class 1', 'Class 1', 'Class 1', 'Class 2', 'Class 2', 'Class 2',
'Class 1', 'Class 1', 'Class 1', 'Class 2', 'Class 1', 'Class 1',
'Class 2', 'Class 2', 'Class 1', 'Class 2', 'Class 1', 'Class 2',
'Class 2', 'Class 2'])

stats = OptimizationStats(self.__predicted, self.__y)
self.assertEqual(stats._accuracy, 0.5)
self.assertEqual(stats._precision, 0.5199999999999999)
self.assertEqual(stats._cohen_kappa, 0.0)
self.assertEqual(stats._f1_score, 0.505050505050505)
self.__stats = OptimizationStats(predicted, y, np.array([0.88, 0.9, 0.91, 0.87, 0.7, 0.98, 0.95, 0.86, 0.88, 0.76]))

def test_works_fine(self):
self.assertEqual(self.__stats._accuracy, 0.5)
self.assertEqual(self.__stats._precision, 0.5199999999999999)
self.assertEqual(self.__stats._cohen_kappa, 0.0)
self.assertEqual(self.__stats._f1_score, 0.505050505050505)
self.assertTrue((np.array([0.88, 0.9, 0.91, 0.87, 0.7, 0.98, 0.95, 0.86, 0.88, 0.76]) == self.__stats._fitness_function_values).all())

def test_export_works_fine(self):
with tempfile.TemporaryDirectory() as tmp:
self.__stats.export_boxplot(os.path.join(tmp, 'boxplot'))
self.assertTrue(os.path.exists(os.path.join(tmp, 'boxplot.png')))
self.assertEqual(1, len([name for name in os.listdir(tmp)]))

self.__stats.export_boxplot(os.path.join(tmp, 'boxplot.png'))
self.assertTrue(os.path.exists(os.path.join(tmp, 'boxplot.png')))
self.assertEqual(1, len([name for name in os.listdir(tmp)]))

class MinMaxTestCase(TestCase):
def test_works_fine(self):
25 changes: 23 additions & 2 deletions niaaml/utilities.py
@@ -1,6 +1,8 @@
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_score, cohen_kappa_score, f1_score
import numpy as np
import seaborn as sns
import os

__all__ = [
'MinMax',
@@ -146,12 +148,31 @@ class OptimizationStats:
_f1_score (float): Calculated F1-score.
"""

def __init__(self, predicted, expected, **kwargs):
r"""Initialize the factory."""
def __init__(self, predicted, expected, fitness_function_values, **kwargs):
r"""Initialize the optimization stats.
Arguments:
predicted (Iterable[any]): Array of predicted classes.
expected (Iterable[any]): Array of expected classes.
fitness_function_values (numpy.array[float]): Array of fitness function's values in the evaluation process.
"""
self._accuracy = accuracy_score(expected, predicted)
self._precision = precision_score(expected, predicted, average='weighted')
self._cohen_kappa = cohen_kappa_score(expected, predicted)
self._f1_score = f1_score(expected, predicted, average='weighted')
self._fitness_function_values = fitness_function_values

def export_boxplot(self, file_name):
r"""Export boxplot of fitness function's values.
Arguments:
file_name (str): Output file name.
"""
if len(os.path.splitext(file_name)[1]) == 0 or os.path.splitext(file_name)[1] != '.png':
file_name = file_name + '.png'

boxplot = sns.boxplot(data=[self._fitness_function_values])
boxplot.figure.savefig(file_name)

def to_string(self):
r"""User friendly representation of the object.
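Taken together with the constructor change above, the new boxplot export can be exercised roughly like this; the values are made up, and the calls mirror the signatures shown in this hunk:

import numpy as np
from niaaml import OptimizationStats

predicted = np.array(['Class 1', 'Class 2', 'Class 2', 'Class 1'])
expected = np.array(['Class 1', 'Class 2', 'Class 1', 'Class 1'])
fitness_values = np.array([0.88, 0.9, 0.91, 0.87, 0.7])

stats = OptimizationStats(predicted, expected, fitness_values)
stats.export_boxplot('fitness_boxplot')   # '.png' is appended automatically when missing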
20 changes: 19 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -18,6 +18,7 @@ python = "^3.8"
numpy = "^1.19.1"
scikit-learn = "^0.23.2"
NiaPy = "^2.0.0rc11"
seaborn = "^0.11.0"

[tool.poetry.dev-dependencies]
sphinx = "^3.3.1"
