In [1]:
import time
# Obróbka danych
import pandas as pd
import numpy as np

# Wizualizacja 
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mgno 

# Przygotowanie danych 
from hyperopt import hp, fmin, tpe, STATUS_OK, space_eval
from hyperopt import Trials
from sklearn.model_selection import cross_validate,train_test_split,cross_val_score,KFold 
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer,make_column_transformer
from imblearn.over_sampling import SMOTE
from sklearn.impute import  KNNImputer
from imblearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PowerTransformer
from IPython.display import display, HTML
from sklearn.metrics import confusion_matrix,accuracy_score,make_scorer,precision_score,recall_score,f1_score,classification_report
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn import set_config


# Modele
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import xgboost as xgb

#Ziarno losowe
random_state=42

#Tworzenie logów
import logging

# Wyświetlanie notebooka
display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

# Wizualizacje wykresów
fontdict = {'family': 'Times New Roman',
        'color':  'black',
        'weight': 'normal',
        'size': 25,
        }
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
def log(path, file):
    """[Create a log file to record the experiment's logs]
    
    Arguments:
        path {string} -- path to the directory
        file {string} -- file name
    
    Returns:
        [func] -- [logger that record logs]
    """

    log_file = os.path.join(path, file)

    if not os.path.isfile(log_file):
        open(log_file, "w+").close()

    console_logging_format = "%(levelname)s %(message)s"
    file_logging_format = "%(levelname)s: %(asctime)s: %(message)s"

    logging.basicConfig(level=logging.INFO, format=console_logging_format)
    logger = logging.getLogger()
    
    handler = logging.FileHandler(log_file)

    handler.setLevel(logging.INFO)
    
    formatter = logging.Formatter(file_logging_format)
    handler.setFormatter(formatter)

    logger.addHandler(handler)

    return logger

In [3]:
df = pd.read_csv("./waterQuality1.csv")
df.sample(10)

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
4934,0.09,0.83,0.04,0.08,0.03,0.06,0.07,0.04,1.35,0.05,...,0.059,6.74,0.91,0.003,8.68,0.25,0.03,0.01,0.03,0
1326,3.72,9.82,0.58,3.4,0.03,6.2,0.1,1.09,0.37,0.0,...,0.075,16.11,1.22,0.009,28.0,5.69,0.05,0.38,0.09,0
2730,0.12,5.64,0.03,3.93,0.006,7.86,0.49,0.47,0.89,0.78,...,0.038,1.74,1.16,0.009,55.58,3.71,0.02,0.36,0.06,0
4371,0.02,3.95,0.07,0.61,0.03,0.33,0.04,0.03,0.83,0.0,...,0.194,8.16,0.43,0.006,1.75,0.72,0.1,0.04,0.05,0
1532,4.9,11.33,0.01,1.6,0.03,6.95,0.87,0.92,1.2,0.0,...,0.01,19.13,2.07,0.002,23.68,4.58,0.01,0.2,0.04,0
2225,0.02,8.71,0.05,2.46,0.002,7.33,0.43,0.7,0.8,0.97,...,0.1,18.78,1.99,0.001,27.53,5.96,0.05,0.08,0.0,0
1115,3.83,19.07,0.92,1.61,0.11,6.66,0.7,0.29,0.5,0.0,...,0.136,18.97,2.25,0.001,33.42,3.09,0.02,0.38,0.02,0
3148,0.01,9.24,0.29,2.14,0.06,7.61,0.19,1.22,1.06,0.27,...,0.103,3.71,1.89,0.007,21.24,4.35,0.01,0.41,0.07,0
3204,0.13,12.22,0.58,4.18,0.1,4.0,0.07,1.89,0.17,0.66,...,0.124,16.63,2.02,0.009,19.98,3.75,0.07,0.32,0.02,0
6735,0.05,28.63,0.08,0.16,0.05,0.38,0.09,0.72,0.53,0.35,...,0.03,6.98,1.8,0.009,0.14,0.49,0.01,0.08,0.09,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7999 entries, 0 to 7998
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   aluminium    7999 non-null   float64
 1   ammonia      7999 non-null   object 
 2   arsenic      7999 non-null   float64
 3   barium       7999 non-null   float64
 4   cadmium      7999 non-null   float64
 5   chloramine   7999 non-null   float64
 6   chromium     7999 non-null   float64
 7   copper       7999 non-null   float64
 8   flouride     7999 non-null   float64
 9   bacteria     7999 non-null   float64
 10  viruses      7999 non-null   float64
 11  lead         7999 non-null   float64
 12  nitrates     7999 non-null   float64
 13  nitrites     7999 non-null   float64
 14  mercury      7999 non-null   float64
 15  perchlorate  7999 non-null   float64
 16  radium       7999 non-null   float64
 17  selenium     7999 non-null   float64
 18  silver       7999 non-null   float64
 19  uraniu

In [5]:
df[df['ammonia'] == '#NUM!']

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
7551,0.03,#NUM!,0.08,0.79,0.07,0.08,0.05,0.58,0.34,0.0,...,0.183,4.37,1.43,0.007,0.62,2.54,0.07,0.05,0.05,#NUM!
7568,0.06,#NUM!,0.07,1.72,0.08,0.32,0.01,1.11,0.61,0.0,...,0.178,12.1,2.03,0.008,1.37,2.05,0.06,0.1,0.07,#NUM!
7890,0.01,#NUM!,0.08,0.49,0.0,0.07,0.09,0.06,0.72,0.57,...,0.088,9.57,1.45,0.009,7.67,7.7,0.03,0.05,0.02,#NUM!


In [6]:
df=df[df['ammonia'] != '#NUM!']
df.shape

(7996, 21)

In [7]:
df['ammonia'] = df['ammonia'].astype(float)
df['is_safe'] = df['is_safe'].astype(float)

In [None]:
desc = df.describe().T
f,ax = plt.subplots(figsize=(12,8))
sns.heatmap(desc,annot=True,cmap = "Blues", fmt= '.0f',
            ax=ax,linewidths = 5, cbar = False,
            annot_kws={"size": 16})
plt.xticks(size = 18)
plt.yticks(size = 14, rotation = 0)
plt.title("Statystyki opisowe", fontdict=fontdict)
plt.show()

In [None]:
mgno.bar(df, figsize=(10,5), fontsize=12,sort="ascending");
plt.title('Wykres pustych wartości',fontdict=fontdict)
plt.show()

In [None]:
plt.figure(figsize=(15,8))
corr = df.corr(method="spearman")
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True,linewidths=0.2,vmin=-1,vmax=1,mask=mask, cmap='Blues',center=0,cbar_kws={'alpha':0.1})
plt.title('Macierz Korelacji',fontdict=fontdict)
plt.show()

In [None]:
cols = df.columns[:-1]
for col in cols:
    print("{} = {}".format(col,df[col].skew()))

In [None]:
cols = df.columns[:-1]
threshold = 0.2

for col in cols:
    if df[col].skew() > threshold:
        df[col] = np.log1p(df[col]) #np.log1p
    else:
        pass

In [None]:
cols = df.columns
for col in cols:
    fig, axes = plt.subplots(2, 1,figsize=(12, 7))
    
    axes[0].axvline(x=df[col].mean(), linewidth=3, color='r', label="mean", alpha=0.5,)
    axes[0].axvline(x=df[col].median(), linewidth=3, color='y', label="median", alpha=0.5,)
    sns.histplot(df[col],ax=axes[0])
    
    axes[1].axvline(x=df[col].mean(), linewidth=3, color='r', label="mean", alpha=0.5,)
    axes[1].axvline(x=df[col].median(), linewidth=3, color='y', label="median", alpha=0.5,)
    sns.boxplot(x=df[col],ax=axes[1])

    axes[0].set_ylabel('Ilość')
    axes[0].set_xlabel('')
    axes[0].set_title("Rozkład zmiennej {}".format(col))
    axes[0].legend(["średnia", "mediana"])
    axes[1].legend(["średnia", "mediana"])
    axes[1].set_xlabel('')

In [None]:
cols = df.columns
for col in cols:
    print("{} = {}".format(col,df[col].skew()))

In [None]:
fig = plt.subplots(figsize=(12, 7))
ax.set_title("Ilość próbki is_safe")
sns.countplot(x ='is_safe', data = df);

In [8]:
X  = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state,shuffle=True,stratify=y)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(5597, 20) (5597,)
(2399, 20) (2399,)


In [None]:
xgb_model = xgb.XGBClassifier(random_state = random_state,eval_metric = "mlogloss")
xgb_model.fit(X_train, y_train)
plt.rcParams["figure.figsize"] = (14, 7)
print("Feature Importances : ", xgb_model.feature_importances_)

xgb.plot_importance(xgb_model,height=0.5,)
plt.show();

In [None]:
#Oversampling
smote = SMOTE(random_state=random_state)
X_train, y_train = smote.fit_resample(X_train, y_train)
print(X_train.shape,y_train.shape)
full = X_train.copy()
full["is_safe"] = y_train
sns.countplot(x ='is_safe', data = full);

In [10]:
numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns

In [11]:
pipeline = Pipeline([
    ('transformation', PowerTransformer()),
    ('smote', SMOTE(random_state=random_state)),
    ('feature_selection', SelectFromModel(xgb.XGBClassifier(random_state = random_state,eval_metric = "mlogloss"),threshold=0.015)),
    ('numerical_imputer', KNNImputer(missing_values=np.nan, n_neighbors=3)),
    ('numerical_transformer', StandardScaler()),
])

"""
    ('smote', SMOTE(random_state=random_state)),
    ('feature_selection', SelectFromModel(xgb.XGBClassifier(random_state = random_state,eval_metric = "mlogloss"),threshold=0.015)),
"""
#pipeline.fit_transform(X_train, y_train)

'\n    (\'smote\', SMOTE(random_state=random_state)),\n    (\'feature_selection\', SelectFromModel(xgb.XGBClassifier(random_state = random_state,eval_metric = "mlogloss"),threshold=0.015)),\n'

In [12]:
data_processing_pipeline = ColumnTransformer([
    ("numerical_preprocessor", pipeline, numerical_columns)
])

data_processing_pipeline.fit_transform(X_train, y_train)

array([[-0.73857156,  0.59678752,  0.48324066, ..., -1.02620647,
        -0.96442723,  1.47475982],
       [-0.90629472,  0.09889112, -0.93115239, ...,  0.4869188 ,
        -0.81030177, -1.59479346],
       [-0.79247208, -1.89536852,  0.3517586 , ..., -0.12049388,
        -1.30392778,  1.08972864],
       ...,
       [ 0.98863577, -0.39009601, -0.87856874, ..., -0.57393193,
         1.45563908, -1.25777155],
       [ 0.59575825, -0.85369007,  1.9162457 , ...,  0.59996826,
        -0.81030177, -1.21245956],
       [ 1.204561  ,  0.60936976, -0.40581835, ..., -0.02731892,
         1.26545375, -0.38453684]])

In [13]:
data_processing_pipeline.fit_transform(X_train, y_train).shape

(9918, 15)

In [14]:
    pipe = Pipeline(steps = [
    ('data_processing_pipeline', data_processing_pipeline),
    ('classifier', xgb.XGBClassifier(eval_metric = "mlogloss",random_state = random_state))
    ])

In [15]:
score = cross_val_score(pipe, X_train, y_train, cv=10, scoring='accuracy',n_jobs=-1,error_score="raise").mean()

XGBoostError: [18:51:50] ..\src\data\data.cc:556: Check failed: labels_.Size() == num_row_ (5037 vs. 8926) : Size of labels must equal to number of rows.

In [None]:
#TEST -----------------------------------------------------------------------------------------

In [16]:
set_config(display='diagram')
display(data_processing_pipeline)
set_config(display='text')

In [None]:
classifiers = [
        {
        'name': 'BaggingClassifier()',
        'class': BaggingClassifier(),
        'max_evals': 100,
        'params': {
            'n_estimators': hp.uniformint('n_estimators', 100, 500),
            'random_state' : 42
           }
    },
    {
        'name': 'RandomForestClassifier()',
        'class': RandomForestClassifier(),
        'max_evals': 100,
        'params': {
            'n_estimators': hp.uniformint('n_estimators', 100, 500),
            'max_depth': hp.uniformint('max_depth', 2, 20),
            'min_samples_leaf':hp.uniformint('min_samples_leaf', 1, 5),
            'min_samples_split':hp.uniformint('min_samples_split', 2, 10),
            'bootstrap': hp.choice('bootstrap', [True,  False]),
            'max_features': hp.choice('max_features', ['auto', 'sqrt']),
            'random_state' : 42
           }
    },
     {
        'name': 'GradientBoostingClassifier()',
        'class': GradientBoostingClassifier(),
        'max_evals': 100,
        'params': {
            'n_estimators': hp.uniformint('n_estimators', 100, 500),
            'max_depth': hp.uniformint('max_depth', 2, 20),
            'random_state' : 42
           }
    },
    {
        'name': 'XGBClassifier()',
        'class': xgb.XGBClassifier(eval_metric = "mlogloss"),
        'max_evals': 100,
        'eval_metric': "mlogloss",
        'params' : {
            'learning_rate': hp.loguniform ('learning_rate', 0.01, 0.5),
            'max_depth': hp.choice('max_depth', np.arange(2, 11).tolist()),
            'min_child_weight': hp.choice('min_child_weight', np.arange(0, 101).tolist()),
            'gamma': hp.loguniform('gamma', 0.0, 2.0),
            'subsample': hp.uniform('subsample', 0.5, 1.0),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
            'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1.0),
            'reg_alpha': hp.loguniform('reg_alpha', 0.0, 2.0),
            'reg_lambda': hp.loguniform('reg_lambda', 0.0, 2.0),
            'random_state' : 42
        }
    },    
]

In [None]:
classifiers = [
     {
        'name': 'XGBClassifier()',
        'class': xgb.XGBClassifier(eval_metric = "mlogloss"),
        'max_evals': 100,
        'eval_metric': "mlogloss",
        'params' : {
            'learning_rate': hp.loguniform ('learning_rate', 0.01, 0.5),
            'max_depth': hp.choice('max_depth', np.arange(2, 11).tolist()),
            'min_child_weight': hp.choice('min_child_weight', np.arange(0, 101).tolist()),
            'gamma': hp.loguniform('gamma', 0.0, 2.0),
            'subsample': hp.uniform('subsample', 0.5, 1.0),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
            'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1.0),
            'reg_alpha': hp.loguniform('reg_alpha', 0.0, 2.0),
            'reg_lambda': hp.loguniform('reg_lambda', 0.0, 2.0),
            'random_state' : 42
        }
    },    
]

In [None]:
import os
logger = log(path="./logs/", file="score.logs")

In [None]:
def objective(space):
    pipe_params = {}
    
    for s in space:
        pipe_params[f"classifier__{s}"] = space[s]
        
    pipe.set_params(**pipe_params)

    score = cross_val_score(pipe, X_train, y_train, cv=10, scoring='accuracy',n_jobs=-1,error_score="raise").mean()
    return {'loss': 1- score, 'status': STATUS_OK, 'accuracy': score}

In [None]:
trials_df = []
 
for cl in classifiers:
    cl_name = cl['class'].__class__.__name__
    print(f"\n\n{cl_name}")
    
    pipe = Pipeline(steps = [
    ('data_processing_pipeline', data_processing_pipeline),
    ('classifier', cl['class'])
    ])
    
    space = {}
    for k in cl['params']:
        space[k] = cl['params'][k]
    
    max_evals = cl['max_evals']
    
    trials = Trials()
    best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    trials=trials)
     
    best_params = space_eval(space, best)
    print('\nThe best params:')
    print ("{:<30} {}".format('Parameter','Selected'))
    for k, v in best_params.items():
        print ("{:<30} {}".format(k, v))
    
    for trial in trials.trials:
        trials_df.append({
            'classifier': cl_name,
            'loss': trial['result']['loss'],
            'accuracy': trial['result']['accuracy'],
            'params': trial['misc']['vals']
            })

In [None]:
trials_df = pd.DataFrame(trials_df)
trials_df.sort_values('accuracy', ascending=False)

In [None]:
top_models = trials_df.sort_values('accuracy', ascending=False)
logger.info("-------------------------------")
logger.info("TOP 10 MODELS WITH Imputation: Mean")
logger.info(top_models.head(10))

In [None]:
plt.figure(figsize=(15,8))
sns.swarmplot(data=trials_df, y='classifier', x='accuracy');

In [None]:
print("The best model " + top_models.iloc[0][0])
parameters=top_models.iloc[0][3]

keys = []
values = []
for key, value in parameters.items():
    keys.append(key)
    values.append(int(value[0]))
params = dict(zip(keys, values))

In [None]:
pipe_finall = Pipeline(steps = [
    ('data_processing_pipeline', data_processing_pipeline),
    ('classifier', XGBClassifier(**params))
    ])


pipe_finall.fit(X_train, y_train)

In [None]:
data_processing_pipeline.transform(X_test)
y_pred = pipe_finall.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
print(accuracy_score(y_test,y_pred))

In [None]:
print(sns.heatmap(confusion_matrix(y_test,y_pred),vmin=0,vmax=500,annot=True,cmap='Blues',fmt='g'));