# Drinking Water Potability Project

**Charles Serve-Catelin** - **Samuel Pujade** - **Mathieu Ract**

## Data Loading and Cleaning

In [None]:
import project
from importlib import reload
# Reload `project` so that edits made to the module since it was first
# imported are picked up without restarting the kernel.
reload(project)

# NOTE(review): every helper below lives in the local `project` module, which
# is not visible from this notebook; the section labels are inferred from the
# function names and should be confirmed against the module. The calls appear
# to share state held inside `project`, so their order presumably matters.

# --- Data loading and cleaning ---
project.avoid_warnings()
project.load_data("./data/drinking_water_potability.csv", disp=False)
project.display_explanatory_variables(disp=False)
project.check_null_values(disp=False)
project.cleaning_dataset('delete') # accepted values: 'mean' or 'delete'
project.cope_outliers('delete') # accepted values: 'delete' or 'Q+1.5' or 'Q'

# --- Train/test preparation and metric selection ---
project.split_dataset(ratio=0.8, disp=False)
project.scaling_trainset()
project.set_metric('accuracy') # accepted values: 'accuracy' or 'f1_score'

# --- Baseline models: each one is fitted, then tested ---
# k-nearest neighbors
project.fitting_KNN_model()
project.testing_KNN_model()

# Logistic regression
project.fitting_LR_model()
project.testing_LR_model()

# Random forest
project.fitting_RF_model()
project.testing_RF_model()

# Support vector machine
project.fitting_SVM_model()
project.testing_SVM_model()

# XGBoost
project.fitting_XGboost_model()
project.testing_XGboost_model()

## Tuning kNN hyperparameters

We first define the parameter grids that the hyperparameter searches will explore:

In [None]:
# Wide search space for kNN (30 * 2 * 50 * 2 = 6000 combinations), explored
# with the randomized search below.
# NOTE(review): in scikit-learn, `leaf_size` influences tree construction/query
# speed and memory rather than predictions; if the underlying estimator is
# KNeighborsClassifier, searching over it presumably cannot change the score.
param_grid_kNN = {
    # Number of neighbors to use: 1..30
    'n_neighbors': [*range(1, 31)],
    # Weight function used in prediction
    'weights': ['uniform', 'distance'],
    # Leaf size passed to BallTree or KDTree: 1..50
    'leaf_size': [*range(1, 51)],
    # Power parameter for the Minkowski metric
    'p': [1, 2],
}

# Narrow search space for kNN (10 * 1 * 10 * 1 = 100 combinations), small
# enough to be explored exhaustively by the grid search below.
param_grid_kNN_small = {
    # Number of neighbors to use: 20..29
    'n_neighbors': [*range(20, 30)],
    # Weight function used in prediction
    'weights': ['uniform'],
    # Leaf size passed to BallTree or KDTree: 20..29
    'leaf_size': [*range(20, 30)],
    # Power parameter for the Minkowski metric
    'p': [2],
}

# Run both searches; each call returns the best parameters it found.
best_params_kNN_RS = project.tuning_kNN_hyperparameters(
    param_grid_kNN, 'RandomizedSearchCV'
)
best_params_kNN_GS = project.tuning_kNN_hyperparameters(
    param_grid_kNN_small, 'GridSearchCV'
)

In [None]:
# Pass the parameters returned by each search (randomized first, then grid)
# to the tuned-kNN fitting helper.
# NOTE(review): `fitting_kNN_tuned_model` is defined in `project`; whether it
# also evaluates or reports the score is not visible from this notebook.
project.fitting_kNN_tuned_model(best_params_kNN_RS)
project.fitting_kNN_tuned_model(best_params_kNN_GS)

## Tuning Logistic Regression hyperparameters

We first define the parameter grid that the hyperparameter searches will explore:

In [None]:
# Search space for logistic regression (5 * 1 * 5 = 25 combinations), shared
# by the randomized and the grid search below.
param_grid_LR = {
    # Values tried: 1, 21, 41, 61, 81.
    # NOTE(review): assuming scikit-learn's LogisticRegression, `C` is the
    # *inverse* of the regularization strength (smaller C = stronger penalty).
    'C': [*range(1, 100, 20)],
    # Norm of the penalty
    'penalty': ['l2'],
    # Algorithm to use in the optimization problem
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

# Run both searches on the same grid; each call returns the best parameters
# it found.
best_params_LR_RS = project.tuning_LR_hyperparameters(
    param_grid_LR, 'RandomizedSearchCV'
)
best_params_LR_GS = project.tuning_LR_hyperparameters(
    param_grid_LR, 'GridSearchCV'
)

In [None]:
# Pass the parameters returned by each search (randomized first, then grid)
# to the tuned logistic-regression fitting helper.
# NOTE(review): `fitting_LR_tuned_model` is defined in `project`; whether it
# also evaluates or reports the score is not visible from this notebook.
project.fitting_LR_tuned_model(best_params_LR_RS)
project.fitting_LR_tuned_model(best_params_LR_GS)

## Tuning RF hyperparameters

We first define the parameter grids that the hyperparameter searches will explore:

In [None]:
# Wide search space for the random forest (9 * 11 * 3 * 3 * 2 = 1782
# combinations), explored with the randomized search below.
param_grid_RF = {
    # The number of trees in the forest: 200, 400, ..., 1800
    'n_estimators': [*range(200, 2000, 200)],
    # Max number of levels in each decision tree: 10, 20, ..., 100, or None
    'max_depth': [*range(10, 110, 10), None],
    # Min number of data points placed in a node before the node is split
    'min_samples_split': [2, 5, 10],
    # Min number of data points allowed in a leaf node
    'min_samples_leaf': [1, 2, 4],
    # Method for sampling data points (with or without replacement)
    'bootstrap': [True, False],
}

# Narrow search space for the random forest (2 * 3 * 1 * 1 * 1 = 6
# combinations), explored exhaustively by the grid search below.
param_grid_RF_small = {
    # The number of trees in the forest: 550, 600
    'n_estimators': [*range(550, 650, 50)],
    # Max number of levels in each decision tree: 20, 30, or None
    'max_depth': [*range(20, 40, 10), None],
    # Min number of data points placed in a node before the node is split
    'min_samples_split': [5],
    # Min number of data points allowed in a leaf node
    'min_samples_leaf': [4],
    # Method for sampling data points (with or without replacement)
    'bootstrap': [True],
}

# Run both searches; each call returns the best parameters it found.
best_params_RF_RS = project.tuning_RF_hyperparameters(
    param_grid_RF, 'RandomizedSearchCV'
)
best_params_RF_GS = project.tuning_RF_hyperparameters(
    param_grid_RF_small, 'GridSearchCV'
)

In [None]:
# Pass the parameters returned by each search (randomized first, then grid)
# to the tuned random-forest fitting helper.
# NOTE(review): `fitting_RF_tuned_model` is defined in `project`; whether it
# also evaluates or reports the score is not visible from this notebook.
project.fitting_RF_tuned_model(best_params_RF_RS)
project.fitting_RF_tuned_model(best_params_RF_GS)

## Tuning SVM hyperparameters

We first define the parameter grids that the hyperparameter searches will explore:

In [None]:
# Search space for the SVM (4 * 4 * 4 = 64 combinations), used by the
# randomized search below.
param_grid_SVM = {
    # Kernel type
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    # Regularization parameter
    'C': [0.1, 1, 10, 100],
    # NOTE(review): presumably the kernel coefficient of scikit-learn's SVC,
    # which that estimator ignores for the 'linear' kernel; confirm.
    'gamma': [1, 0.1, 0.01, 0.001],
}

# Search space used by the grid search below.
# NOTE(review): despite its name, this grid is currently identical to
# `param_grid_SVM`; narrow it if a reduced grid was intended.
param_grid_SVM_small = {
    # Kernel type
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    # Regularization parameter
    'C': [0.1, 1, 10, 100],
    # Same candidate values as in `param_grid_SVM`
    'gamma': [1, 0.1, 0.01, 0.001],
}

# Run both searches; each call returns the best parameters it found.
best_params_SVM_RS = project.tuning_SVM_hyperparameters(
    param_grid_SVM, 'RandomizedSearchCV'
)
best_params_SVM_GS = project.tuning_SVM_hyperparameters(
    param_grid_SVM_small, 'GridSearchCV'
)

In [None]:
# Pass the parameters returned by each search (randomized first, then grid)
# to the SVM helper.
# NOTE(review): unlike the other models' `fitting_*_tuned_model` helpers, this
# one is named `fitting_testing_best_SVM_model`; it presumably fits and tests
# in a single call — confirm in `project`.
project.fitting_testing_best_SVM_model(best_params_SVM_RS)
project.fitting_testing_best_SVM_model(best_params_SVM_GS)

## Tuning XGBoost hyperparameters

We first define the parameter grid that the hyperparameter search will explore:

In [None]:
# Search space for XGBoost (3 * 5 * 3 * 3 * 3 = 405 combinations), explored
# exhaustively by the grid search below.
# NOTE(review): the per-key descriptions assume these map onto the XGBoost
# parameters of the same name; confirm against the estimator built in `project`.
param_grid_XGboost = {
    # Minimum sum of instance weight needed in a child
    'min_child_weight': [1, 5, 10],
    # Minimum loss reduction required to make a further split
    'gamma': [0.5, 1, 1.5, 2, 5],
    # Fraction of training rows sampled for each tree
    'subsample': [0.6, 0.8, 1.0],
    # Fraction of columns sampled for each tree
    'colsample_bytree': [0.6, 0.8, 1.0],
    # Maximum depth of a tree
    'max_depth': [3, 4, 5],
}

# Only a grid search is run for XGBoost; the call returns the best parameters
# it found.
best_params_XGboost_GS = project.tuning_XGboost_hyperparameters(
    param_grid_XGboost, 'GridSearchCV'
)

In [None]:
# Pass the parameters returned by the grid search to the tuned-XGBoost
# fitting helper.
# NOTE(review): `fitting_XGboost_tuned_model` is defined in `project`; whether
# it also evaluates or reports the score is not visible from this notebook.
project.fitting_XGboost_tuned_model(best_params_XGboost_GS)