In [None]:
# Report the interpreter version the notebook is executed with.
from platform import python_version

interpreter_version = python_version()
print(interpreter_version)

In [None]:
######################################## Read Data ########################################################"
# Load the raw dataset, fill missing values and separate the features (X)
# from the regression target (y).
import pandas as pd
import scipy
import scipy.stats  # pearsonr is used by the model cells; the submodule must be imported explicitly

##############################################
VERBOSE_VALUE = 0 # Verbosity of the GridSearch execution logs
TEST_SIZE_VALUE = 0.1 # Fraction of the dataset held out for the test step
CROSS_VALID = 5 # Number of folds (K) for cross validation
###############################################


data = pd.read_csv("/home/manel/Bureau/BD/donnee+pluvio/smv-complet/donnee_brute_sans_na/smv.csv")
### Replace every missing value with 0 (rows are kept, nothing is removed)
data = data.fillna(0)

# Features: everything except the two columns "Ecoli" and "EI".
X = data.drop(["Ecoli","EI"],axis=1)
# Target: selected by name instead of by position, so that a change in the
# column order of the CSV cannot silently swap the target for another column.
y = data["Ecoli"]
data = data.drop(["EI"],axis=1)

print(X)
print(y)


In [None]:

######################################## Data Information ########################################################"
# Print summary statistics and the column/dtype overview of the loaded frame.
print("  * * * * * * * I)  Dataframe information * * * * * * *  ")
print(" 1) Statistics  _  describe ()  ")
print(data.describe())
print("______________________________________")
print(" 2) Informaton (Column, Non-Null, Count, Dtype)  _  info ()  ")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
data.info()

print("TYPE === ",type(data))


In [None]:
# Exploratory profiling report of the dataset, rendered inline in the notebook.
#! pip install pandas_profiling
from pandas_profiling import ProfileReport

report_options = {
    'title': 'Analyse du jeu de la Marne',
    'html': {'style': {'full_width': True}},
}
prof = ProfileReport(data, **report_options)
prof.to_notebook_iframe()

In [None]:
##############################################################################################"
######################################## Data Split  #########################################"
##############################################################################################"

# Split data on training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE_VALUE, random_state=0)
# The split is done randomly by dividing into training and test data
# The random state allows to have a seed for reproducibility.
# Changing the value modifies the split applied to the data in order to have different training and test sets.


# Remove the date column from the data training
X_train = X_train.drop(["date"],axis=1)

# Build the table that collects the test rows and, later, one prediction
# column per model. The test features (date included) are also written to
# data_Test.csv.
print("  * * * * * * * -- Export -- * * * * * * *  ")
X_test.to_csv('data_Test.csv', index=False, header=True, sep=";")

# reset_index(drop=True) returns a new frame numbered 0..n-1, so the real
# target and the prediction arrays added afterwards line up row by row without
# a round-trip through the CSV file.
DataToExport = X_test.reset_index(drop=True)
print(type(DataToExport))
# .values strips the original index so the assignment is purely positional.
DataToExport["Ecolireal"] = y_test.values

# Preview of the data to export
print(DataToExport.head())
print("  * * * * * * * -- DATA TEST building -- OK -- * * * * * * *  ")

# The date is only needed in the export table, not as a model input.
X_test = X_test.drop(["date"],axis=1)

### Data standardization

Standardization rescales the variables before training. The mean and standard deviation are estimated on the training data only and then applied to both the training and the test data, so that the models never have access to the values in the test dataset.
Standardization by the mean (z-score): for each feature, subtract the training mean and divide by the training standard deviation.

In [None]:
# Standardize features and target with statistics estimated on the training
# split only, then apply the same transformation to the test split.
# NOTE: this cell overwrites X_train/X_test/y_train/y_test in place, so it must
# be run exactly once after the split cell (re-running it standardizes twice).
train_mean = X_train.mean()
train_std = X_train.std()
# A constant feature has a standard deviation of 0; dividing by it would fill
# the column with NaN/inf and make every estimator fail. Scale such columns by
# 1 instead, which leaves them at 0 after centering.
train_std = train_std.replace(0, 1)
y_mean = y_train.mean()
y_std = y_train.std()

X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std
y_train = (y_train - y_mean) / y_std
y_test = (y_test - y_mean) / y_std

In [None]:
# Display the training features after standardization.
X_train

In [None]:
# Display the training target after standardization.
y_train

In [None]:
# Standard deviation of the test target after standardization. The test target
# was scaled with the training mean/std, so the value is not necessarily 1.
import numpy as np
sd=np.std(y_test)
print(sd)

In [None]:
##### ---------------------- Imports ----------------------------------------------------------------------
import math
import pickle

import numpy as np
import scipy.stats  # explicit submodule import: a bare `import scipy` does not reliably expose scipy.stats
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV # Create the parameter grid based on the results of random search
from sklearn.neighbors import KNeighborsRegressor


##############################################################################################"
######################################## MODEL KNN Split  ####################################"
##############################################################################################"
# Tune a k-nearest-neighbours regressor with a cross-validated grid search,
# evaluate it on the test split, pickle the fitted search object and add the
# de-standardized predictions to DataToExport.


## --- Local Params
modelName = "KNN"
MethodID = "A"

print( " ################# ------------- ########################## ")
print( "Model " ,modelName," is runing ........ !  ")
param_gridKNN = {'n_neighbors': np.arange(1, 30, 2),
              'weights': ['uniform', 'distance']
         }


# Selection of the best model
modelKNN =  KNeighborsRegressor()
best_model_searchKNN = GridSearchCV(estimator = modelKNN, param_grid = param_gridKNN, cv = CROSS_VALID,
                           verbose = VERBOSE_VALUE)
best_model_searchKNN.fit(X_train,y_train)


#  Show which is the best model
best_grid = best_model_searchKNN.best_estimator_
print("  ------------------------------------  ")
print (MethodID+"-1) BEST Configuration ("+modelName+") is  ==== ",best_grid )
print("  ------------------------------------  ")


# Predict the values using best_model
Y_pred_KNN = best_model_searchKNN.predict(X_test)
Y=y_test


# Errors for performance evaluation.
# Both Y and the predictions are still on the standardized scale here.
mse = mean_squared_error(Y,Y_pred_KNN)
rmse = math.sqrt(mse)
mae=mean_absolute_error(Y,Y_pred_KNN)

print(MethodID+"-2) Evaluation  "+modelName+" Results : ")
print("--> RMSE (("+modelName+"))=",rmse)
print("--> MAE (("+modelName+"))=",mae)
r = scipy.stats.pearsonr(Y,Y_pred_KNN)
print("--> Pearson Correlation (("+modelName+"))=",r)


# Export the model (the context manager guarantees the file is flushed and closed)
filename = 'bestModel'+modelName+'-SMV-nrml-Ecoli.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model_searchKNN, model_file)
print("  ------------------------------------  ")
print( MethodID+"-3) Model " ,modelName," is generated and exported -- OK --  ")
print( " ################# ------------- ########################## ")

#Destandardization of predicted values
Y_pred_KNN = (Y_pred_KNN*y_std)+ y_mean
DataToExport["KNN_Ecoli_pred"] = Y_pred_KNN



In [None]:
##############################################################################################"
######################################## MODEL RF  Split  ####################################"
##############################################################################################"
# Tune a random-forest regressor with a cross-validated grid search, evaluate
# it on the test split, pickle the fitted search object and add the
# de-standardized predictions to DataToExport.

import math
import pickle

import scipy.stats  # explicit submodule import: a bare `import scipy` does not reliably expose scipy.stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor  # not used in this cell; kept so the notebook namespace is unchanged

## --- Local Params
modelName = "RF"
MethodID = "B"

print( " ################# ------------- ########################## ")
print( "Model " ,modelName," is runing ........ !  ")


param_gridRF = {
    'bootstrap': [True],
    'max_features': [2,3,4,5,6,7,8,9,10,11],
    'n_estimators': [10, 50, 200]
}


# Selection of the best model.
# The forest is seeded so that the search result is reproducible between runs.
modelRF =  RandomForestRegressor(random_state=0)
best_model_searchRF = GridSearchCV(estimator = modelRF, param_grid = param_gridRF, cv = CROSS_VALID,
                           verbose = VERBOSE_VALUE)
best_model_searchRF.fit(X_train,y_train)


#  Show which is the best model
best_grid = best_model_searchRF.best_estimator_
print("  ------------------------------------  ")
print (MethodID+"-1) BEST Configuration ("+modelName+") is  ==== ",best_grid )
print("  ------------------------------------  ")


# Predict the values using best_model
Y_pred_RF = best_model_searchRF.predict(X_test)
Y=y_test


# Errors for performance evaluation.
# Both Y and the predictions are still on the standardized scale here.
mse = mean_squared_error(Y,Y_pred_RF)
rmse = math.sqrt(mse)
mae=mean_absolute_error(Y,Y_pred_RF)

print(MethodID+"-2) Evaluation  "+modelName+" Results : ")
print("--> RMSE (("+modelName+"))=",rmse)
print("--> MAE (("+modelName+"))=",mae)
r = scipy.stats.pearsonr(Y,Y_pred_RF)
print("--> Pearson Correlation (("+modelName+"))=",r)


# Export the model (the context manager guarantees the file is flushed and closed)
filename = 'bestModel'+modelName+'-SMV-nrml-Ecoli.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model_searchRF, model_file)
print("  ------------------------------------  ")
print( MethodID+"-3) Model " ,modelName," is generated and exported -- OK --  ")
print( " ################# ------------- ########################## ")

#Destandardization of predicted values
Y_pred_RF = (Y_pred_RF*y_std)+ y_mean
DataToExport["RF_Ecoli_pred"] = Y_pred_RF




In [None]:
##############################################################################################"
######################################## MODEL DT  Split  ####################################"
####################################### Decision tree ########################################"
# Tune a decision-tree regressor with a cross-validated grid search, evaluate
# it on the test split, pickle the fitted search object and add the
# de-standardized predictions to DataToExport.

import math
import pickle

import scipy.stats  # explicit submodule import: a bare `import scipy` does not reliably expose scipy.stats
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor  # not used in this cell; kept so the notebook namespace is unchanged
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor  # not used in this cell; kept so the notebook namespace is unchanged

## --- Local Params
modelName = "DT"
MethodID = "C"

print( " ################# ------------- ########################## ")
print( "Model " ,modelName," is runing ........ !  ")


# Only genuine hyperparameters are searched. The random seed is not one:
# selecting the "best" seed merely fits the cross-validation noise, so the seed
# is fixed once on the estimator instead of being part of the grid.
param_gridDT = {"max_depth": [1, 2, 10, 100],
              "min_samples_leaf":[1, 2, 10, 100]
         }


# Selection of the best model
modelDT =  tree.DecisionTreeRegressor(random_state=0)
best_model_searchDT = GridSearchCV(estimator = modelDT, param_grid = param_gridDT, cv = CROSS_VALID,
                           verbose = VERBOSE_VALUE)
best_model_searchDT.fit(X_train,y_train)


#  Show which is the best model
best_grid = best_model_searchDT.best_estimator_
print("  ------------------------------------  ")
print (MethodID+"-1) BEST Configuration ("+modelName+") is  ==== ",best_grid )
print("  ------------------------------------  ")


# Predict the values using best_model
Y_pred_DT = best_model_searchDT.predict(X_test)
Y=y_test


# Errors for performance evaluation.
# Both Y and the predictions are still on the standardized scale here.
mse = mean_squared_error(Y,Y_pred_DT)
rmse = math.sqrt(mse)
mae=mean_absolute_error(Y,Y_pred_DT)

print(MethodID+"-2) Evaluation  "+modelName+" Results : ")
print("--> RMSE (("+modelName+"))=",rmse)
print("--> MAE (("+modelName+"))=",mae)
r = scipy.stats.pearsonr(Y,Y_pred_DT)
print("--> Pearson Correlation (("+modelName+"))=",r)


# Export the model (the context manager guarantees the file is flushed and closed)
filename = 'bestModel'+modelName+'-SMV-nrml-Ecoli.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model_searchDT, model_file)
print("  ------------------------------------  ")
print( MethodID+"-3) Model " ,modelName," is generated and exported -- OK --  ")
print( " ################# ------------- ########################## ")

#Destandardization of predicted values
Y_pred_DT = (Y_pred_DT*y_std)+ y_mean
DataToExport["DT_Ecoli_pred"] = Y_pred_DT



In [None]:
##############################################################################################"
######################################## MODEL SVM  Split  ####################################"
################################### Support vector machine ####################################"
######################################### non linear ##########################################"
# Tune a support-vector regressor with a cross-validated grid search, evaluate
# it on the test split, pickle the fitted search object and add the
# de-standardized predictions to DataToExport.

import math
import pickle

import scipy.stats  # explicit submodule import: a bare `import scipy` does not reliably expose scipy.stats
from sklearn.ensemble import RandomForestRegressor  # not used in this cell; kept so the notebook namespace is unchanged
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor  # not used in this cell; kept so the notebook namespace is unchanged
from sklearn.svm import SVR

## --- Local Params
modelName = "SVM"
MethodID = "D"

print( " ################# ------------- ########################## ")
print( "Model " ,modelName," is runing ........ !  ")


# coef0 only influences the sigmoid (and poly) kernels; crossing it with the
# rbf kernel would just refit identical models. Two sub-grids cover the same
# set of distinct models with fewer fits.
param_gridSVM = [
    {'kernel': ['sigmoid'],
     'coef0': [0.01, 10, 0.5],
     'gamma': ['auto', 'scale']},
    {'kernel': ['rbf'],
     'gamma': ['auto', 'scale']},
]


# Selection of the best model
modelSVM = SVR()
best_model_searchSVM = GridSearchCV(estimator = modelSVM, param_grid = param_gridSVM, cv = CROSS_VALID,
                           verbose = VERBOSE_VALUE)
best_model_searchSVM.fit(X_train,y_train)


#  Show which is the best model
best_grid = best_model_searchSVM.best_estimator_
print("  ------------------------------------  ")
print (MethodID+"-1) BEST Configuration ("+modelName+") is  ==== ",best_grid )
print("  ------------------------------------  ")


# Predict the values using best_model
Y_pred_SVM = best_model_searchSVM.predict(X_test)
Y=y_test


# Errors for performance evaluation.
# Both Y and the predictions are still on the standardized scale here.
mse = mean_squared_error(Y,Y_pred_SVM)
rmse = math.sqrt(mse)
mae=mean_absolute_error(Y,Y_pred_SVM)

print(MethodID+"-2) Evaluation  "+modelName+" Results : ")
print("--> RMSE (("+modelName+"))=",rmse)
print("--> MAE (("+modelName+"))=",mae)
r = scipy.stats.pearsonr(Y,Y_pred_SVM)
print("--> Pearson Correlation (("+modelName+"))=",r)


# Export the model (the context manager guarantees the file is flushed and closed)
filename = 'bestModel'+modelName+'-SMV-nrml-Ecoli.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model_searchSVM, model_file)
print("  ------------------------------------  ")
print( MethodID+"-3) Model " ,modelName," is generated and exported -- OK --  ")
print( " ################# ------------- ########################## ")

#Destandardization of predicted values
Y_pred_SVM = (Y_pred_SVM*y_std)+ y_mean
DataToExport["SVM_Ecoli_pred"] = Y_pred_SVM


In [None]:
##############################################################################################"
#################################### MODEL AdaBoost  Split  ##################################"
##############################################################################################"
# Tune an AdaBoost regressor (decision-tree base learner) with a
# cross-validated grid search, evaluate it on the test split, pickle the fitted
# search object and add the de-standardized predictions to DataToExport.

import inspect
import math
import pickle

import scipy.stats  # explicit submodule import: a bare `import scipy` does not reliably expose scipy.stats
from sklearn import tree
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor  # not used in this cell; kept so the notebook namespace is unchanged
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

## --- Local Params
modelName = "AdaBoost"
MethodID = "E"

print( " ################# ------------- ########################## ")
print( "Model " ,modelName," is runing ........ !  ")

# scikit-learn 1.2 renamed the `base_estimator` argument to `estimator` and the
# old name was removed in 1.4. Use whichever name the installed version
# accepts, both for the constructor and for the nested grid key.
base_estimator_param = (
    "estimator"
    if "estimator" in inspect.signature(AdaBoostRegressor).parameters
    else "base_estimator"
)

param_gridADaBoost = {base_estimator_param + '__min_samples_leaf':[1,2, 5],
              "learning_rate": [0.2,0.5],
              "n_estimators": [20, 50, 100]
         }


# Selection of the best model
# max_features=None considers every feature at each split. For a regression
# tree this is what the former "auto" value meant; "auto" is rejected by recent
# scikit-learn releases.
DTC = tree.DecisionTreeRegressor(random_state = 11, max_features = None,
                                 max_depth = None)
# The booster is seeded so that the search result is reproducible between runs.
ModelAdaBoostDTC = AdaBoostRegressor(random_state = 0, **{base_estimator_param: DTC})
best_model_searchAdaBoost = GridSearchCV(estimator = ModelAdaBoostDTC, param_grid = param_gridADaBoost, cv = CROSS_VALID,
                           verbose = VERBOSE_VALUE)
best_model_searchAdaBoost.fit(X_train,y_train)


#  Show which is the best model
best_grid = best_model_searchAdaBoost.best_estimator_
print("  ------------------------------------  ")
print (MethodID+"-1) BEST Configuration ("+modelName+") is  ==== ",best_grid )
print("  ------------------------------------  ")


# Predict the values using best_model
Y_pred_AdaBoost = best_model_searchAdaBoost.predict(X_test)
Y=y_test


# Errors for performance evaluation.
# Both Y and the predictions are still on the standardized scale here.
mse = mean_squared_error(Y,Y_pred_AdaBoost)
rmse = math.sqrt(mse)
mae=mean_absolute_error(Y,Y_pred_AdaBoost)

print(MethodID+"-2) Evaluation  "+modelName+" Results : ")
print("--> RMSE (("+modelName+"))=",rmse)
print("--> MAE (("+modelName+"))=",mae)
r = scipy.stats.pearsonr(Y,Y_pred_AdaBoost)
print("--> Pearson Correlation (("+modelName+"))=",r)


# Export the model (the context manager guarantees the file is flushed and closed)
filename = 'bestModel'+modelName+'-SMV-nrml-Ecoli.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model_searchAdaBoost, model_file)
print("  ------------------------------------  ")
print( MethodID+"-3) Model " ,modelName," is generated and exported -- OK --  ")
print( " ################# ------------- ########################## ")

#Destandardization of predicted values
Y_pred_AdaBoost = (Y_pred_AdaBoost*y_std)+ y_mean
DataToExport["AdaBoost_Ecoli_pred"] = Y_pred_AdaBoost



In [None]:
##############################################################################################"
############################## MODEL Bagging with RF estimator ###############################"
##############################################################################################"
# Tune a bagging ensemble of random forests with a cross-validated grid search,
# evaluate it on the test split, pickle the fitted search object and add the
# de-standardized predictions to DataToExport.

import inspect
import math
import pickle

import scipy.stats  # explicit submodule import: a bare `import scipy` does not reliably expose scipy.stats
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

## --- Local Params
modelName = "BaggingRF"
MethodID = "H"

print( " ################# ------------- ########################## ")
print( "Model " ,modelName," is runing ........ !  ")

# No synthetic make_regression data is generated here: assigning it to X and y
# would overwrite the notebook's real dataset, and fitting the estimator on it
# is useless because GridSearchCV clones the estimator before fitting.

# max_features is given as a fraction of the available features so the grid is
# valid whatever the number of columns (0 features is not a legal value).
param_gridBag = {
    'bootstrap': [True],
    'max_features': [0.5, 1.0],
    'n_estimators': [10, 50, 200]
}

# scikit-learn 1.2 renamed the `base_estimator` argument to `estimator` and the
# old name was removed in 1.4. Use whichever name the installed version accepts.
base_estimator_param = (
    "estimator"
    if "estimator" in inspect.signature(BaggingRegressor).parameters
    else "base_estimator"
)


# Selection of the best model
modelRF =  RandomForestRegressor()
# Without an explicit base estimator BaggingRegressor would use a decision tree.
modelBaggingRF = BaggingRegressor(n_estimators=10, random_state=0, **{base_estimator_param: modelRF})

best_model_searchBag2 = GridSearchCV(estimator = modelBaggingRF, param_grid = param_gridBag, cv = CROSS_VALID,
                           verbose = VERBOSE_VALUE)
best_model_searchBag2.fit(X_train,y_train)


#  Show which is the best model
best_grid = best_model_searchBag2.best_estimator_
print("  ------------------------------------  ")
print (MethodID+"-1) BEST Configuration ("+modelName+") is  ==== ",best_grid )
print("  ------------------------------------  ")


# Predict the values using best_model
Y_pred_Bag2 = best_model_searchBag2.predict(X_test)
Y=y_test


# Errors for performance evaluation.
# Both Y and the predictions are still on the standardized scale here.
mse = mean_squared_error(Y,Y_pred_Bag2)
rmse = math.sqrt(mse)
mae=mean_absolute_error(Y,Y_pred_Bag2)

print(MethodID+"-2) Evaluation  "+modelName+" Results : ")
print("--> RMSE (("+modelName+"))=",rmse)
print("--> MAE (("+modelName+"))=",mae)
r = scipy.stats.pearsonr(Y,Y_pred_Bag2)
print("--> Pearson Correlation (("+modelName+"))=",r)


# Export the model (the context manager guarantees the file is flushed and closed)
filename = 'bestModel'+modelName+'-SMV-nrml-Ecoli.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model_searchBag2, model_file)
print("  ------------------------------------  ")
print( MethodID+"-3) Model " ,modelName," is generated and exported -- OK --  ")
print( " ################# ------------- ########################## ")

#Destandardization of predicted values
Y_pred_Bag2 = (Y_pred_Bag2*y_std)+ y_mean
DataToExport["Bagging_Ecoli_pred"] = Y_pred_Bag2


In [None]:
# Display the export table: the test rows plus the prediction columns added by
# the model cells that have been run so far.
DataToExport

In [None]:
######################################## Export the data  ####################################"

# Write the test rows together with the prediction columns of all models to one CSV file.
separator_line = " ======================================================= "
print(separator_line, separator_line, sep="\n")
print("  * * * * * * * IV) Export CSV  * * * * * * *  ")
export_path = '/home/manel/Bureau/BD/donnee+pluvio/smv-complet/ML/data_prediValuesML_ecoli_smv.csv'
DataToExport.to_csv(export_path, sep=";", header=True, index=False)


In [None]:
##############################################################################################"
################################### Analyse the RF model  ####################################"
##############################################################################################"
# Refit a random forest with fixed hyperparameters and pass it, with the test
# features, to treeinterpreter's predict, which returns three values.

#! pip install treeinterpreter
from sklearn.ensemble import RandomForestRegressor  # imported here so the cell does not rely on an earlier model cell
from treeinterpreter import treeinterpreter as ti

# Seeded so that the forest, and therefore the values returned below, are
# reproducible between runs.
rf = RandomForestRegressor(max_features=8, n_estimators=200, random_state=0)
rf.fit(X_train,y_train)

prediction, bias, contributions = ti.predict(rf, X_test)

In [None]:
# Display the three values returned by ti.predict for the test set.
prediction, bias, contributions