
# Deep Learning Model Implementation for Binary Classification

This notebook implements a deep learning model for binary classification using a small numerical dataset. The model architecture will consist of dense layers with dropout and regularization to prevent overfitting.


In [5]:
# %conda install tensorflow
# Necessary imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l1_l2


In [15]:

# Load and preprocess the data
data = pd.read_csv('/home/matt/Projects/water-ml/datasets/labeled.csv')  # Update with actual path
data.replace('ND', 0, inplace=True)

# Encoding categorical features
categorical_columns = ['Taxa A1', 'Taxa A2', 'Taxa A3', 'Taxa A4', 'Taxa A5', 'Taxa B1', 'Taxa B2', 'Taxa B3']  # Update as needed
for col in categorical_columns:
    data[col] = data[col].astype(str)
    
encoder = OneHotEncoder(sparse=False)    
data_encoded = pd.DataFrame(encoder.fit_transform(data[categorical_columns]))
data_encoded.columns = encoder.get_feature_names_out(categorical_columns)
data.drop(categorical_columns ,axis=1, inplace=True)
data = pd.concat([data, data_encoded], axis=1)

# Separating features and target variable
X = data.drop(['Scheme', 'Sample'], axis=1)  # Update target and identifier columns as needed
y = data['Scheme'].map({'Stable': 0, 'Failure': 1})

# Normalizing the dataset
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



In [21]:
unlabeled_data = pd.read_csv('/home/matt/Projects/water-ml/datasets/unlabeled.csv')  # Update with actual path
unlabeled_data.replace('ND', 0, inplace=True)

categorical_columns = ['Taxa A1', 'Taxa A2', 'Taxa A3', 'Taxa A4', 'Taxa A5', 'Taxa B1', 'Taxa B2', 'Taxa B3']
for col in categorical_columns:
    unlabeled_data[col] = unlabeled_data[col].astype(str)

encoder = OneHotEncoder(sparse=False)    
data_encoded = pd.DataFrame(encoder.fit_transform(unlabeled_data[categorical_columns]))
data_encoded.columns = encoder.get_feature_names_out(categorical_columns)
unlabeled_data.drop(categorical_columns ,axis=1, inplace=True)
unlabeled_data = pd.concat([unlabeled_data, data_encoded], axis=1)

X_unlabeled = unlabeled_data.drop(['Scheme', 'Sample'], axis=1)  # Update target and identifier columns as needed
y_unlabeled = unlabeled_data['Scheme'].map({'Stable': 0, 'Failure': 1})

scaler = StandardScaler()
X_unlabeled_scaled = scaler.fit_transform(X_unlabeled)

y_unlabeled



0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
225   NaN
226   NaN
227   NaN
228   NaN
229   NaN
Name: Scheme, Length: 230, dtype: float64

In [20]:
from imblearn.over_sampling import SMOTE

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Augmenting the dataset with SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# TODO: Fix the SMOTE for the unlabeled , need a y, currentl all NaN see the rnb+ impelmentation for fix
# Balance the unlabeled data
# X_unlabeled_smote = smote.fit_resample(X_unlabeled_scaled)

TypeError: BaseSampler.fit_resample() missing 1 required positional argument: 'y'

In [7]:
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import GaussianNoise

def create_model(l1_rate=0.001, l2_rate=0.001, learning_rate=0.001, noise_level=0.01):
    model = Sequential()
    model.add(GaussianNoise(noise_level, input_shape=(X_smote.shape[1],)))
    model.add(Dense(32, activation='relu', kernel_regularizer=l1_l2(l1=l1_rate, l2=l2_rate)))
    model.add(Dropout(0.3))
    model.add(Dense(16, activation='relu', kernel_regularizer=l1_l2(l1=l1_rate, l2=l2_rate)))
    model.add(Dense(1, activation='sigmoid'))
    
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [8]:
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)

param_grid = {
    'l1_rate': [0.001, 0.01, 0.1],
    'l2_rate': [0.001, 0.01, 0.1],
    'learning_rate': [0.001, 0.01, 0.1], 
    'noise_level': [0.01, 0.05, 0.1]
}

  model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)


In [9]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=10, scoring='accuracy')
grid_result = grid.fit(X_smote, y_smote)

2024-02-20 15:30:12.493920: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-20 15:30:12.498335: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-20 15:30:12.562660: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_B

KeyboardInterrupt: 

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Current Best:  
# 0.882428 (0.052831) with: {'l1_rate': 0.001, 'l2_rate': 0.1, 'learning_rate': 0.001} CV:3 noise_level=0.01

# Previous Best:


Best: 0.901732 using {'l1_rate': 0.01, 'l2_rate': 0.01, 'learning_rate': 0.01, 'noise_level': 0.05}
0.877706 (0.063573) with: {'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 0.001, 'noise_level': 0.01}
0.887446 (0.075460) with: {'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 0.001, 'noise_level': 0.05}
0.888095 (0.089538) with: {'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 0.001, 'noise_level': 0.1}
0.882900 (0.085887) with: {'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 0.01, 'noise_level': 0.01}
0.850216 (0.091181) with: {'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 0.01, 'noise_level': 0.05}
0.849784 (0.106529) with: {'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 0.01, 'noise_level': 0.1}
0.636147 (0.278322) with: {'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 0.1, 'noise_level': 0.01}
0.450216 (0.281327) with: {'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 0.1, 'noise_level': 0.05}
0.371212 (0.250367) with: {'l1_rate': 0.001, 