remove 10-fold cross validation, remove boxplot export

firefly-cpp · Dec 5, 2020 · 7d9c4fa · 7d9c4fa
1 parent d355128
commit 7d9c4fa
Show file tree

Hide file tree

Showing 10 changed files with 180 additions and 298 deletions.
diff --git a/README.md b/README.md
@@ -150,12 +150,6 @@ You can also save a user-friendly representation of a pipeline to a text file.
 pipeline.export_text('pipeline.txt')
 ```
 
-Or even export results of the fitness function after a 10-fold cross validation during the pipeline optimization process as a box plot.
-
-```python
-pipeline.export_boxplot('boxplot.png')
-```
-
 This is a very simple example with dummy data. It is only intended to give you a basic idea on how to use the framework.
 
 ### Example of a Pipeline Component Implementation

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
@@ -34,13 +34,12 @@ Create a new file, with name, for example *my_first_pipeline.py* and paste in th
     pipeline = pipeline_optimizer.run('Accuracy', 20, 20, 400, 400, 'ParticleSwarmAlgorithm', 'ParticleSwarmAlgorithm')
 
 **As you can see, pipeline components, fitness function and optimization algorithms are always passed into pipeline optimization using their class names.** The example below uses the Particle Swarm Algorithm as the optimization algorithm. You can find a list of all available algorithms in the `NiaPy's documentation <https://niapy.readthedocs.io/en/stable/>`_.
-Now you can run it using the command ``python my_first_pipeline.py``. The code currently does not do much as we want to save our pipeline to a file so we can use it later, save a user-friendly representation of it to a text file or even export a pipeline's 10-fold cross validation results into an image file in a form of box plot. You can choose one or all of the scenarios by adding the code below.
+Now you can run it using the command ``python my_first_pipeline.py``. The code currently does not do much, but we can save our pipeline to a file so we can use it later or save a user-friendly representation of it to a text file. You can choose one or both of the scenarios by adding the code below.
 
 .. code:: python
 
     pipeline.export('pipeline.ppln')
     pipeline.export_text('pipeline.txt')
-    pipeline.export_boxplot('boxplot.png')
 
 If you want to load and use the saved pipeline later, you can use the following code.
 

diff --git a/examples/export_pipeline_boxplot.py b/examples/export_pipeline_boxplot.py
diff --git a/examples/optimization_stats.py b/examples/optimization_stats.py
@@ -3,7 +3,7 @@
 
 """
 In this example, we show how the OptimizationStats class can be used. Normally, it is used in the background when the Pipeline's optimize method is called.
-You may also use it on its own if you find any use.
+You may also use it on its own if you find it useful.
 """
 
 # dummy array with expected results of classification process
@@ -18,15 +18,8 @@
 'Class 2', 'Class 2', 'Class 1', 'Class 2', 'Class 1', 'Class 2',
 'Class 2', 'Class 2'])
 
-# let's say these are fitness scores of the 10-fold cross validation
-fitness_scores = np.array([0.5, 0.55, 0.45, 0.57, 0.6, 0.47, 0.53, 0.52, 0.58, 0.44])
-
 # instantiate OptimizationStats
-# let's say the used fitness function's name is Accuracy
-stats = OptimizationStats(predicted, y, fitness_scores, 'Accuracy')
-
-# export boxplot of the 10-fold cross validation scores
-stats.export_boxplot('boxplot.png')
+stats = OptimizationStats(predicted, y)
 
 # print user-friendly text representation
 print(stats.to_string())
diff --git a/niaaml/pipeline.py b/niaaml/pipeline.py
@@ -1,6 +1,5 @@
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
-from sklearn.model_selection import StratifiedKFold
 from niaaml.utilities import MinMax, get_bin_index, OptimizationStats
 from niaaml.fitness import FitnessFactory
 from NiaPy.benchmarks import Benchmark
@@ -127,12 +126,6 @@ def optimize(self, x, y, population_size, number_of_evaluations, optimization_al
             optimization_algorithm (str): Name of the optimization algorithm to use.
             fitness_function (str): Name of the fitness function to use.
         
-        Notes:
-            Stratified K-Fold Cross Validation in our optimization process splits a dataset on the input 11 times, 
-            but we are actually running a stratified 10 fold cross validation since the first iteration is only used to fit 
-            feature selection and feature transform algorithms. This way the evaluation is faster with no difference in 
-            quality.
-        
         Returns:
             float: Best fitness value found in optimization process.
         """
@@ -210,18 +203,6 @@ def export_text(self, file_name):
         with open(file_name, 'w') as f:
             f.write(pipeline.to_string())
 
-    def export_boxplot(self, file_name):
-        r"""Export boxplot of fitness function's values in the 10-fold cross validation's process.
-        Uses OptimizationStats' export_boxplot method.
-
-        Arguments:
-            file_name (str): Output file name.
-        
-        See also:
-            * :func:`niaaml.utilities.OptimizationStats.export_boxplot`
-        """
-        self.__best_stats.export_boxplot(file_name)
-
     @staticmethod
     def load(file_name):
         r"""Loads Pipeline object from a file.
@@ -294,12 +275,6 @@ def evaluate(D, sol):
             Arguments:
                 D (uint): Number of dimensions.
                 sol (numpy.ndarray[float]): Individual of population/ possible solution.
-            
-            Notes:
-                Stratified K-Fold Cross Validation in our optimization process splits a dataset on the input 11 times, 
-                but we are actually running a stratified 10 fold cross validation since the first iteration is only used to fit 
-                feature selection and feature transform algorithms. This way the evaluation is faster with no difference in 
-                quality.
 
             Returns:
                 float: Fitness.
@@ -341,40 +316,34 @@ def evaluate(D, sol):
                 x = copy.deepcopy(self.__x)
                 y = copy.deepcopy(self.__y)
 
-                scores = np.array([], dtype=float)
-                kf = StratifiedKFold(n_splits=11, random_state=0, shuffle=True)
                 selected_features_mask = None
-                fit_iteration = True
-                for train_index, test_index in kf.split(x, y):
-                    x_train, x_test, y_train, y_test = x[train_index], x[test_index], y[train_index], y[test_index]
-
-                    if fit_iteration:
-                        if feature_selection_algorithm is None:
-                            selected_features_mask = np.ones(x.shape[1], dtype=bool)
-                        else:
-                            selected_features_mask = feature_selection_algorithm.select_features(x_train, y_train)
-                        x = x[:, selected_features_mask]
-
-                        if feature_transform_algorithm is not None:
-                            x_train = x_train[:, selected_features_mask]
-                            feature_transform_algorithm.fit(x_train)
-                            feature_transform_algorithm.transform(x)
-
-                        fit_iteration = False
-                    else:
-                        classifier.fit(x_train, y_train)
-                        predictions = classifier.predict(x_test)
-                        scores = np.append(scores, self.__fitness_function.get_fitness(predictions, y_test))
+
+                x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
+
+                if feature_selection_algorithm is None:
+                    selected_features_mask = np.ones(x.shape[1], dtype=bool)
+                else:
+                    selected_features_mask = feature_selection_algorithm.select_features(x_train, y_train)
+
+                x_train = x_train[:, selected_features_mask]
+                x_test = x_test[:, selected_features_mask]
+
+                if feature_transform_algorithm is not None:
+                    feature_transform_algorithm.fit(x_train)
+                    x_train = feature_transform_algorithm.transform(x_train)
+                    x_test = feature_transform_algorithm.transform(x_test)
 
-                fitness = np.mean(scores) * -1
+                classifier.fit(x_train, y_train)
+                predictions = classifier.predict(x_test)
+                fitness = self.__fitness_function.get_fitness(predictions, y_test) * -1
 
                 if fitness < self.__current_best_fitness:
                     self.__current_best_fitness = fitness
                     self.__parent.set_feature_selection_algorithm(feature_selection_algorithm)
                     self.__parent.set_feature_transform_algorithm(feature_transform_algorithm)
                     self.__parent.set_classifier(classifier)
                     self.__parent.set_selected_features_mask(selected_features_mask)
-                    self.__parent.set_stats(OptimizationStats(predictions, y_test, scores, self.__fitness_function.Name))
+                    self.__parent.set_stats(OptimizationStats(predictions, y_test))
 
                 return fitness
             except:

diff --git a/niaaml/tests/test_pipeline.py b/niaaml/tests/test_pipeline.py
@@ -43,19 +43,6 @@ def test_pipeline_run_works_fine(self):
         s2 = set(predicted)
         self.assertTrue(s2.issubset(s1))
         self.assertTrue(len(s2) > 0 and len(s2) <= 2)
-
-    def test_pipeline_export_boxplot_works_fine(self):
-        data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/tests_files/dataset_header_classes.csv', has_header=True, contains_classes=True)
-        self.__pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 20, 40, 'ParticleSwarmAlgorithm', 'Accuracy')
-
-        with tempfile.TemporaryDirectory() as tmp:
-            self.__pipeline.export_boxplot(os.path.join(tmp, 'boxplot'))
-            self.assertTrue(os.path.exists(os.path.join(tmp, 'boxplot.png')))
-            self.assertEqual(1, len([name for name in os.listdir(tmp)]))
-
-            self.__pipeline.export_boxplot(os.path.join(tmp, 'boxplot.png'))
-            self.assertTrue(os.path.exists(os.path.join(tmp, 'boxplot.png')))
-            self.assertEqual(1, len([name for name in os.listdir(tmp)]))
 
     def test_pipeline_export_works_fine(self):
         with tempfile.TemporaryDirectory() as tmp:
@@ -94,7 +81,7 @@ def test_pipeline_setters_work_fine(self):
        'Class 1', 'Class 1', 'Class 1', 'Class 2', 'Class 1', 'Class 1',
        'Class 2', 'Class 2', 'Class 1', 'Class 2', 'Class 1', 'Class 2',
        'Class 2', 'Class 2'])
-        self.__pipeline.set_stats(OptimizationStats(self.__predicted, self.__y, numpy.array([0.88, 0.9, 0.91, 0.87, 0.7, 0.98, 0.95, 0.86, 0.88, 0.76]), 'Accuracy'))
+        self.__pipeline.set_stats(OptimizationStats(self.__predicted, self.__y))
 
         self.assertIsInstance(self.__pipeline.get_classifier(), AdaBoost)
         self.assertIsInstance(self.__pipeline.get_feature_selection_algorithm(), SelectPercentile)

diff --git a/niaaml/tests/test_utilities.py b/niaaml/tests/test_utilities.py
@@ -33,25 +33,13 @@ def setUp(self):
        'Class 2', 'Class 2', 'Class 1', 'Class 2', 'Class 1', 'Class 2',
        'Class 2', 'Class 2'])
 
-        self.__stats = OptimizationStats(predicted, y, np.array([0.88, 0.9, 0.91, 0.87, 0.7, 0.98, 0.95, 0.86, 0.88, 0.76]), 'Accuracy')
+        self.__stats = OptimizationStats(predicted, y)
 
     def test_works_fine(self):
         self.assertEqual(self.__stats._accuracy, 0.5)
         self.assertEqual(self.__stats._precision, 0.5199999999999999)
         self.assertEqual(self.__stats._cohen_kappa, 0.0)
         self.assertEqual(self.__stats._f1_score, 0.505050505050505)
-        self.assertTrue((np.array([0.88, 0.9, 0.91, 0.87, 0.7, 0.98, 0.95, 0.86, 0.88, 0.76]) == self.__stats._fitness_function_values).all())
-        self.assertEqual(self.__stats._fitness_function_name, 'Accuracy')
-
-    def test_export_works_fine(self):
-        with tempfile.TemporaryDirectory() as tmp:
-            self.__stats.export_boxplot(os.path.join(tmp, 'boxplot'))
-            self.assertTrue(os.path.exists(os.path.join(tmp, 'boxplot.png')))
-            self.assertEqual(1, len([name for name in os.listdir(tmp)]))
-
-            self.__stats.export_boxplot(os.path.join(tmp, 'boxplot.png'))
-            self.assertTrue(os.path.exists(os.path.join(tmp, 'boxplot.png')))
-            self.assertEqual(1, len([name for name in os.listdir(tmp)]))
 
 class MinMaxTestCase(TestCase):
     def test_works_fine(self):

diff --git a/niaaml/utilities.py b/niaaml/utilities.py
@@ -1,8 +1,5 @@
-from sklearn import preprocessing
 from sklearn.metrics import accuracy_score, precision_score, cohen_kappa_score, f1_score
 import numpy as np
-import os
-import matplotlib.pyplot as plt
 
 __all__ = [
     'MinMax',
@@ -157,45 +154,24 @@ class OptimizationStats:
         _precision (float): Calculated precision.
         _cohen_kappa (float): Calculated Cohen's kappa.
         _f1_score (float): Calculated F1-score.
-        _fitness_function_values (numpy.array[float]): Array of fitness function's values in the evaluation process (10-fold cross validation's results).
-        _fitness_function_name (str): Name of the used fitness function.
     """
 
-    def __init__(self, predicted, expected, fitness_function_values, fitness_function_name, **kwargs):
+    def __init__(self, predicted, expected, **kwargs):
         r"""Initialize the optimization statistics.
 
         Arguments:
             predicted (Iterable[any]): Array of predicted classes.
             expected (Iterable[any]): Array of expected classes.
-            fitness_function_values (numpy.array[float]): Array of fitness function's values in the evaluation process (10-fold cross validation's results).
-            fitness_function_name (str): Name of the used fitness function.
         """
         self._accuracy = accuracy_score(expected, predicted)
         self._precision = precision_score(expected, predicted, average='weighted')
         self._cohen_kappa = cohen_kappa_score(expected, predicted)
         self._f1_score = f1_score(expected, predicted, average='weighted')
-        self._fitness_function_values = fitness_function_values
-        self._fitness_function_name = fitness_function_name
-
-    def export_boxplot(self, file_name):
-        r"""Export boxplot of fitness function's values.
-
-        Arguments:
-            file_name (str): Output file name.
-        """
-        if len(os.path.splitext(file_name)[1]) == 0 or os.path.splitext(file_name)[1] != '.png':
-            file_name = file_name + '.png'
-
-        fig, ax = plt.subplots()
-        ax.set_title(self._fitness_function_name)
-        ax.boxplot(self._fitness_function_values)
-        ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
-        plt.savefig(file_name)
 
     def to_string(self):
         r"""User friendly representation of the object.
 
         Returns:
             str: User friendly representation of the object.
         """
-        return 'Accuracy: {acc},\nPrecision: {prc},\nCohen\'s kappa: {ck},\nF1-score: {f1},\n\nFitness function\'s ({fn}) 10-fold cross validation results: {arr}'.format(acc=self._accuracy, prc=self._precision, ck=self._cohen_kappa, f1=self._f1_score, fn=self._fitness_function_name, arr=np.array2string(self._fitness_function_values, separator=', '))
+        return 'Accuracy: {acc},\nPrecision: {prc},\nCohen\'s kappa: {ck},\nF1-score: {f1}'.format(acc=self._accuracy, prc=self._precision, ck=self._cohen_kappa, f1=self._f1_score)