Add EDA for input data set #125 #145

Merged · 30 commits · Aug 25, 2020
4 changes: 3 additions & 1 deletion requirements.txt
@@ -14,4 +14,6 @@ pyarrow==0.16.0
 tabulate==0.8.7
 matplotlib>=3.2.2
 dtreeviz==0.8.2
-shap==0.35.0
+shap==0.35.0
+seaborn==0.10.1
+wordcloud==1.7.0
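A quick sanity check that the two new dependencies import at the pinned versions (a minimal sketch, not part of the diff):

```python
# Verify the two new plotting dependencies (versions pinned above).
import seaborn
import wordcloud

print(seaborn.__version__)    # expected: 0.10.1
print(wordcloud.__version__)  # expected: 1.7.0
```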
18 changes: 12 additions & 6 deletions supervised/automl.py
@@ -27,6 +27,8 @@
 from supervised.utils.config import LOG_LEVEL
 from supervised.utils.leaderboard_plots import LeaderboardPlots
 from supervised.utils.metric import Metric
+from supervised.preprocessing.eda import EDA
+

 logging.basicConfig(
     format="%(asctime)s %(name)s %(levelname)s %(message)s", level=logging.ERROR
@@ -481,9 +483,7 @@ def _get_learner_time_limit(self, model_type):
                 - self._time_spend["default_algorithms"]
             )
             if self._stack_models:
-                tt *= (
-                    0.6
-                )  # leave some time for stacking (approx. 40% of the time left)
+                tt *= 0.6  # leave some time for stacking (approx. 40% of the time left)
             tt /= 2.0  # leave some time for hill-climbing
             tt /= tune_algs_cnt  # give time equally for each algorithm
             tt /= k_folds  # time is per learner (per fold)
@@ -497,9 +497,7 @@ def _get_learner_time_limit(self, model_type):
                 - self._time_spend["not_so_random"]
             )
             if self._stack_models:
-                tt *= (
-                    0.4
-                )  # leave some time for stacking (approx. 60% of the time left)
+                tt *= 0.4  # leave some time for stacking (approx. 60% of the time left)
             tt /= tune_algs_cnt  # give time equally for each algorithm
             tt /= k_folds  # time is per learner (per fold)
         return tt
@@ -1008,6 +1006,14 @@ def fit(self, X_train, y_train, X_validation=None, y_validation=None):
                 "AutoML needs X_train matrix to be a Pandas DataFrame"
             )

+        # EDA: generate data plots and a Markdown summary at the highest
+        # explain level
+        if self._explain_level == 2:
+            eda_path = os.path.join(self._results_path, "EDA")
+            if not os.path.exists(eda_path):
+                os.mkdir(eda_path)
+            EDA.compute(X_train, y_train, eda_path)
+
         self._set_ml_task(y_train)

         if X_train is not None:
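The hook above only runs at the highest explain level. A minimal usage sketch, mirroring the test added in this PR; the CSV path and the `target` column name are hypothetical:

```python
# Sketch: trigger the new EDA step via explain_level=2.
# "train.csv" and "target" are placeholders for the caller's data.
import pandas as pd
from supervised import AutoML

df = pd.read_csv("train.csv")
X = df.drop(columns=["target"])
y = df["target"]

automl = AutoML(results_path="automl_results", explain_level=2)
automl.fit(X, y)
# Plots and Readme.md are written to automl_results/EDA/
```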
141 changes: 141 additions & 0 deletions supervised/preprocessing/eda.py
@@ -0,0 +1,141 @@
import os
from collections import defaultdict

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

from supervised.preprocessing.preprocessing_utils import PreprocessingUtils


class EDA:
    @staticmethod
    def compute(X_train, y_train, eda_path):

        # Per-feature summaries collected here feed the Readme.md report
        inform = defaultdict(list)

        if isinstance(y_train, pd.Series):

            if PreprocessingUtils.get_type(y_train) == "categorical":
                plt.figure(figsize=(5, 5))
                sns.countplot(y_train, color="blue")
                plt.title("Target class distribution")
            else:
                plt.figure(figsize=(5, 5))
                sns.distplot(y_train, color="blue")
                plt.title("Target distribution")

            plt.tight_layout(pad=2.0)
            plot_path = os.path.join(eda_path, "target.png")
            plt.savefig(plot_path)
            plt.close("all")

            # Missing is reported as a percentage, matching the columns below
            inform["missing"].append(
                pd.isnull(y_train).sum() * 100 / y_train.shape[0]
            )
            inform["unique"].append(y_train.nunique())
            inform["feature_type"].append(PreprocessingUtils.get_type(y_train))
            inform["plot"].append("![](target.png)")
            inform["feature"].append("target")
            inform["desc"].append(y_train.describe().to_dict())

        for col in X_train.columns:

            inform["feature_type"].append(PreprocessingUtils.get_type(X_train[col]))

            if PreprocessingUtils.get_type(X_train[col]) in ("categorical", "discrete"):
                # Bar plot of the 10 most frequent values
                plt.figure(figsize=(5, 5))
                sns.countplot(
                    X_train[col],
                    order=X_train[col].value_counts().iloc[:10].index,
                    color="blue",
                )
                plt.title(f"{col} class distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")

            # "continous" (sic) matches the type name used by PreprocessingUtils
            elif PreprocessingUtils.get_type(X_train[col]) == "continous":
                plt.figure(figsize=(5, 5))
                sns.distplot(X_train[col], color="blue")
                plt.title(f"{col} value distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")

            elif PreprocessingUtils.get_type(X_train[col]) == "text":
                # Word cloud of the 400 most frequent words
                plt.figure(figsize=(5, 5), dpi=70)
                word_string = " ".join(X_train[col].str.lower())
                wordcloud = WordCloud(
                    width=500,
                    height=500,
                    stopwords=STOPWORDS,
                    background_color="white",
                    max_words=400,
                    max_font_size=None,
                ).generate(word_string)

                plt.imshow(wordcloud, aspect="auto", interpolation="nearest")
                plt.axis("off")
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")

            elif PreprocessingUtils.get_type(X_train[col]) == "datetime":
                plt.figure(figsize=(5, 5))
                pd.to_datetime(X_train[col]).plot(grid=True, color="blue")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")

            inform["missing"].append(
                pd.isnull(X_train[col]).sum() * 100 / X_train.shape[0]
            )
            inform["unique"].append(int(X_train[col].nunique()))
            inform["plot"].append(f"![]({col}.png)")
            inform["feature"].append(str(col))
            inform["desc"].append(X_train[col].describe().to_dict())

        df = pd.DataFrame(inform)

        # The with-block closes the file, so no explicit close() is needed
        with open(os.path.join(eda_path, "Readme.md"), "w") as fout:
            for i, row in df.iterrows():
                fout.write(f"## Feature : {row['feature']}\n")
                fout.write(f"- **Feature type** : {row['feature_type']}\n")
                fout.write(f"- **Missing** : {row['missing']}%\n")
                fout.write(f"- **Unique** : {row['unique']}\n")

                for key in row["desc"].keys():
                    if key in ("25%", "50%", "75%"):
                        # "25%" -> "25th Percentile", etc.
                        fout.write(
                            f"- **{key[:-1]}th Percentile** : {row['desc'][key]}\n"
                        )
                    else:
                        fout.write(f"- **{key.capitalize()}** : {row['desc'][key]}\n")

                fout.write(f"- {row['plot']}\n")
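`EDA.compute` can also be called on its own; a minimal sketch with toy data (the directory is created here because `compute` assumes it exists; in `AutoML.fit` it is created by the hook above):

```python
# Standalone sketch of EDA.compute with toy data; names are illustrative.
import os
import pandas as pd
from supervised.preprocessing.eda import EDA

X = pd.DataFrame(
    {"age": [22, 35, 58, 41], "city": ["NY", "LA", "NY", "SF"]}
)
y = pd.Series([0, 1, 0, 1], name="target")

eda_path = "eda_report"
os.makedirs(eda_path, exist_ok=True)
EDA.compute(X, y, eda_path)
# Writes age.png, city.png, target.png and Readme.md into eda_report/
```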
54 changes: 54 additions & 0 deletions tests/tests_preprocessing/test_eda.py
@@ -0,0 +1,54 @@
import os
import shutil
import unittest

import pandas as pd
from sklearn import datasets

from supervised import AutoML


class EDATest(unittest.TestCase):

    automl_dir = "automl_1"

    def tearDown(self):
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_explain_default(self):
        a = AutoML(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=["Random Forest"],
            train_ensemble=False,
            explain_level=2,
        )

        X, y = datasets.make_classification(n_samples=100, n_features=5)

        X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
        y = pd.Series(y, name="class")

        a.fit(X, y)

        result_files = os.listdir(os.path.join(a._results_path, "EDA"))

        # One plot per feature, plus the target plot and the Markdown report
        for col in X.columns:
            self.assertIn(f"{col}.png", result_files)

        self.assertIn("target.png", result_files)
        self.assertIn("Readme.md", result_files)
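To run the file directly rather than through a test runner, the standard unittest entry point can be appended (not part of this PR):

```python
# Optional entry point so the file can be run with
# `python tests/tests_preprocessing/test_eda.py`.
if __name__ == "__main__":
    unittest.main()
```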