Add EDA for input data set #125 #145

Merged · 30 commits · Aug 25, 2020
4 changes: 3 additions & 1 deletion requirements.txt
@@ -14,4 +14,6 @@ pyarrow==0.16.0
 tabulate==0.8.7
 matplotlib>=3.2.2
 dtreeviz==0.8.2
-shap==0.35.0
+shap==0.35.0
+seaborn==0.10.1
+wordcloud==1.7.0
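A quick sanity check that the two new dependencies import at the pinned versions (a minimal sketch, not part of the diff):

```python
# Verify the two new plotting dependencies (versions pinned above).
import seaborn
import wordcloud

print(seaborn.__version__)    # expected: 0.10.1
print(wordcloud.__version__)  # expected: 1.7.0
```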
18 changes: 12 additions & 6 deletions supervised/automl.py
@@ -27,6 +27,8 @@
 from supervised.utils.config import LOG_LEVEL
 from supervised.utils.leaderboard_plots import LeaderboardPlots
 from supervised.utils.metric import Metric
+from supervised.preprocessing.eda import EDA
+

 logging.basicConfig(
     format="%(asctime)s %(name)s %(levelname)s %(message)s", level=logging.ERROR
@@ -481,9 +483,7 @@ def _get_learner_time_limit(self, model_type):
                 - self._time_spend["default_algorithms"]
             )
             if self._stack_models:
-                tt *= (
-                    0.6
-                )  # leave some time for stacking (approx. 40% of the time left)
+                tt *= 0.6  # leave some time for stacking (approx. 40% of the time left)
             tt /= 2.0  # leave some time for hill-climbing
             tt /= tune_algs_cnt  # give time equally for each algorithm
             tt /= k_folds  # time is per learner (per fold)
@@ -497,9 +497,7 @@ def _get_learner_time_limit(self, model_type):
                 - self._time_spend["not_so_random"]
             )
             if self._stack_models:
-                tt *= (
-                    0.4
-                )  # leave some time for stacking (approx. 60% of the time left)
+                tt *= 0.4  # leave some time for stacking (approx. 60% of the time left)
             tt /= tune_algs_cnt  # give time equally for each algorithm
             tt /= k_folds  # time is per learner (per fold)
         return tt
@@ -1008,6 +1006,14 @@ def fit(self, X_train, y_train, X_validation=None, y_validation=None):
                 "AutoML needs X_train matrix to be a Pandas DataFrame"
             )

+        # EDA: generate data plots and a Markdown summary at the highest
+        # explain level
+        if self._explain_level == 2:
+            eda_path = os.path.join(self._results_path, "EDA")
+            if not os.path.exists(eda_path):
+                os.mkdir(eda_path)
+            EDA.compute(X_train, y_train, eda_path)
+
         self._set_ml_task(y_train)

         if X_train is not None:
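The hook above only runs at the highest explain level. A minimal usage sketch, mirroring the test added in this PR; the CSV path and the `target` column name are hypothetical:

```python
# Sketch: trigger the new EDA step via explain_level=2.
# "train.csv" and "target" are placeholders for the caller's data.
import pandas as pd
from supervised import AutoML

df = pd.read_csv("train.csv")
X = df.drop(columns=["target"])
y = df["target"]

automl = AutoML(results_path="automl_results", explain_level=2)
automl.fit(X, y)
# Plots and Readme.md are written to automl_results/EDA/
```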
141 changes: 141 additions & 0 deletions supervised/preprocessing/eda.py
@@ -0,0 +1,141 @@
import os
from collections import defaultdict

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

from supervised.preprocessing.preprocessing_utils import PreprocessingUtils


class EDA:
    @staticmethod
    def compute(X_train, y_train, eda_path):

        # Per-feature summaries collected here feed the Readme.md report
        inform = defaultdict(list)

        if isinstance(y_train, pd.Series):

            if PreprocessingUtils.get_type(y_train) == "categorical":
                plt.figure(figsize=(5, 5))
                sns.countplot(y_train, color="blue")
                plt.title("Target class distribution")
            else:
                plt.figure(figsize=(5, 5))
                sns.distplot(y_train, color="blue")
                plt.title("Target distribution")

            plt.tight_layout(pad=2.0)
            plot_path = os.path.join(eda_path, "target.png")
            plt.savefig(plot_path)
            plt.close("all")

            # Missing is reported as a percentage, matching the columns below
            inform["missing"].append(
                pd.isnull(y_train).sum() * 100 / y_train.shape[0]
            )
            inform["unique"].append(y_train.nunique())
            inform["feature_type"].append(PreprocessingUtils.get_type(y_train))
            inform["plot"].append("![](target.png)")
            inform["feature"].append("target")
            inform["desc"].append(y_train.describe().to_dict())

        for col in X_train.columns:

            inform["feature_type"].append(PreprocessingUtils.get_type(X_train[col]))

            if PreprocessingUtils.get_type(X_train[col]) in ("categorical", "discrete"):
                # Bar plot of the 10 most frequent values
                plt.figure(figsize=(5, 5))
                sns.countplot(
                    X_train[col],
                    order=X_train[col].value_counts().iloc[:10].index,
                    color="blue",
                )
                plt.title(f"{col} class distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")

            # "continous" (sic) matches the type name used by PreprocessingUtils
            elif PreprocessingUtils.get_type(X_train[col]) == "continous":
                plt.figure(figsize=(5, 5))
                sns.distplot(X_train[col], color="blue")
                plt.title(f"{col} value distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")

            elif PreprocessingUtils.get_type(X_train[col]) == "text":
                # Word cloud of the 400 most frequent words
                plt.figure(figsize=(5, 5), dpi=70)
                word_string = " ".join(X_train[col].str.lower())
                wordcloud = WordCloud(
                    width=500,
                    height=500,
                    stopwords=STOPWORDS,
                    background_color="white",
                    max_words=400,
                    max_font_size=None,
                ).generate(word_string)

                plt.imshow(wordcloud, aspect="auto", interpolation="nearest")
                plt.axis("off")
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")

            elif PreprocessingUtils.get_type(X_train[col]) == "datetime":
                plt.figure(figsize=(5, 5))
                pd.to_datetime(X_train[col]).plot(grid=True, color="blue")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")

            inform["missing"].append(
                pd.isnull(X_train[col]).sum() * 100 / X_train.shape[0]
            )
            inform["unique"].append(int(X_train[col].nunique()))
            inform["plot"].append(f"![]({col}.png)")
            inform["feature"].append(str(col))
            inform["desc"].append(X_train[col].describe().to_dict())

        df = pd.DataFrame(inform)

        # The with-block closes the file, so no explicit close() is needed
        with open(os.path.join(eda_path, "Readme.md"), "w") as fout:
            for i, row in df.iterrows():
                fout.write(f"## Feature : {row['feature']}\n")
                fout.write(f"- **Feature type** : {row['feature_type']}\n")
                fout.write(f"- **Missing** : {row['missing']}%\n")
                fout.write(f"- **Unique** : {row['unique']}\n")

                for key in row["desc"].keys():
                    if key in ("25%", "50%", "75%"):
                        # "25%" -> "25th Percentile", etc.
                        fout.write(
                            f"- **{key[:-1]}th Percentile** : {row['desc'][key]}\n"
                        )
                    else:
                        fout.write(f"- **{key.capitalize()}** : {row['desc'][key]}\n")

                fout.write(f"- {row['plot']}\n")
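`EDA.compute` can also be called on its own; a minimal sketch with toy data (the directory is created here because `compute` assumes it exists; in `AutoML.fit` it is created by the hook above):

```python
# Standalone sketch of EDA.compute with toy data; names are illustrative.
import os
import pandas as pd
from supervised.preprocessing.eda import EDA

X = pd.DataFrame(
    {"age": [22, 35, 58, 41], "city": ["NY", "LA", "NY", "SF"]}
)
y = pd.Series([0, 1, 0, 1], name="target")

eda_path = "eda_report"
os.makedirs(eda_path, exist_ok=True)
EDA.compute(X, y, eda_path)
# Writes age.png, city.png, target.png and Readme.md into eda_report/
```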
54 changes: 54 additions & 0 deletions tests/tests_preprocessing/test_eda.py
@@ -0,0 +1,54 @@
import os
import shutil
import unittest

import pandas as pd
from sklearn import datasets

from supervised import AutoML


class EDATest(unittest.TestCase):

    automl_dir = "automl_1"

    def tearDown(self):
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_explain_default(self):
        a = AutoML(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=["Random Forest"],
            train_ensemble=False,
            explain_level=2,
        )

        X, y = datasets.make_classification(n_samples=100, n_features=5)

        X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
        y = pd.Series(y, name="class")

        a.fit(X, y)

        result_files = os.listdir(os.path.join(a._results_path, "EDA"))

        # One plot per feature, plus the target plot and the Markdown report
        for col in X.columns:
            self.assertIn(f"{col}.png", result_files)

        self.assertIn("target.png", result_files)
        self.assertIn("Readme.md", result_files)
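To run the file directly rather than through a test runner, the standard unittest entry point can be appended (not part of this PR):

```python
# Optional entry point so the file can be run with
# `python tests/tests_preprocessing/test_eda.py`.
if __name__ == "__main__":
    unittest.main()
```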