
# Deep Learning Model Implementation for Binary Classification

This notebook implements a deep learning model for binary classification using a small numerical dataset. The model architecture will consist of dense layers with dropout and regularization to prevent overfitting.


In [572]:
# %conda install tensorflow
# Necessary imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l1_l2


In [601]:
# Load and preprocess the data
data = pd.read_csv('/home/matt/Projects/water-ml/datasets/labeled.csv')
data.replace('ND', 0, inplace=True)

data_psudolabeled = pd.read_csv('/home/matt/Projects/water-ml/datasets/sheet_3_psudo_labeled.csv')
stable_fail_labels = data_psudolabeled['Scheme'].map({1: 'Stable', 0: 'Failure'})
data_psudolabeled['Scheme'] = stable_fail_labels
data_psudolabeled.replace('ND', 0, inplace=True)

data = pd.concat([data, data_psudolabeled], ignore_index=True)

data.head(10)


Unnamed: 0,Sample,Scheme,Taxa A1,Taxa A2,Taxa A3,Taxa A4,Taxa A5,Taxa B1,Taxa B2,Taxa B3
0,SA PT 1.5,Stable,0.0,,0,,0.0,,,0.0
1,SA-1 CT,Failure,0.0121,,0,,0.0001,,,0.002
2,SA-2,Stable,0.0003,,0,,0.0,,,0.0
3,SA-2 CT,Failure,0.0284,,0,,0.0,,,0.0
4,SA-3,Stable,0.0002,,0,,0.0,,,0.0
5,SA-4,Stable,0.0,,0,,0.0,,,0.0
6,SA-5 CT,Stable,0.0,,0,,0.0,,,0.0001
7,WA-1A,Stable,0.0,,0,,0.0,,,0.0
8,WA-1B,Stable,0.0008,,0,,0.0,,,0.0
9,WA-2,Stable,0.0002,,0,,0.0,,,0.0


In [594]:
# Encoding categorical features
categorical_columns = ['Taxa A1', 'Taxa A2', 'Taxa A3', 'Taxa A4', 'Taxa A5', 'Taxa B1', 'Taxa B2', 'Taxa B3']  # Update as needed
for col in categorical_columns:
    data[col] = data[col].astype(str)
    
encoder = OneHotEncoder(sparse=False)    
data_encoded = pd.DataFrame(encoder.fit_transform(data[categorical_columns]))
data_encoded.columns = encoder.get_feature_names_out(categorical_columns)
data.drop(categorical_columns ,axis=1, inplace=True)
data = pd.concat([data, data_encoded], axis=1)

# Separating features and target variable
X = data.drop(['Scheme', 'Sample'], axis=1)  # Update target and identifier columns as needed
y = data['Scheme'].map({'Stable': 0, 'Failure': 1})

# Normalizing the dataset
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



Unnamed: 0,Sample,Scheme,Taxa A1,Taxa A2,Taxa A3,Taxa A4,Taxa A5,Taxa B1,Taxa B2,Taxa B3
0,Arkansas and Lower Mississippi,Stable,0.0,0.0,0,0.0,0,0.0,0.0,0.0056
1,Arkansas and Lower Mississippi,Stable,0.0,0.0,0,0.0,0,0.0,0.0,0.0095
2,Arkansas and Lower Mississippi,Stable,0.0,0.0,0,0.0021,0,0.0031,0.0,0.5849
3,Arkansas and Lower Mississippi,Stable,0.0,0.0008,0,0.003,0,0.0,0.0008,0.003
4,Arkansas and Lower Mississippi,Stable,0.0,0.0077,0,0.0,0,0.0,0.0346,0.2308
5,Arkansas and Lower Mississippi,Stable,0.0,0.0018,0,0.0018,0,0.0,0.014,0.0595
6,Arkansas and Lower Mississippi,Stable,0.0,0.0052,0,0.0009,0,0.0061,0.0506,0.5681
7,Arkansas and Lower Mississippi,Stable,0.0,0.0032,0,0.0011,0,0.0032,0.0485,0.4335
8,Arkansas and Lower Mississippi,Failure,0.0011,0.1171,0,0.0,0,0.0,0.0,0.7341
9,Arkansas and Lower Mississippi,Failure,0.0011,0.0995,0,0.0011,0,0.0,0.0,0.569


In [574]:
unlabeled_data = pd.read_csv('/home/matt/Projects/water-ml/datasets/unlabeled.csv')  # Update with actual path
unlabeled_data.replace('ND', 0, inplace=True)

categorical_columns = ['Taxa A1', 'Taxa A2', 'Taxa A3', 'Taxa A4', 'Taxa A5', 'Taxa B1', 'Taxa B2', 'Taxa B3']
for col in categorical_columns:
    unlabeled_data[col] = unlabeled_data[col].astype(str)

encoder = OneHotEncoder(sparse=False)    
data_encoded = pd.DataFrame(encoder.fit_transform(unlabeled_data[categorical_columns]))
data_encoded.columns = encoder.get_feature_names_out(categorical_columns)
unlabeled_data.drop(categorical_columns ,axis=1, inplace=True)
unlabeled_data = pd.concat([unlabeled_data, data_encoded], axis=1)

X_unlabeled = unlabeled_data.drop(['Scheme', 'Sample'], axis=1)  # Update target and identifier columns as needed
y_unlabeled = unlabeled_data['Scheme'].map({'Stable': 0, 'Failure': 1})

scaler = StandardScaler()
X_unlabeled_scaled= scaler.fit_transform(X_unlabeled)




In [575]:
from imblearn.over_sampling import SMOTE

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.15, random_state=42)

# Augmenting the dataset with SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# TODO: Apply SMOTE after psuedo labeling
# Balance the unlabeled data
# X_unlabeled_smote, y_unlabeled_smote = smote.fit_resample(X_unlabeled_scaled, y_unlabeled)

In [576]:
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import GaussianNoise

# Current Best:  
# Best: 0.901732 using {'l1_rate': 0.01, 'l2_rate': 0.01, 'learning_rate': 0.01, 'noise_level': 0.05}

# def create_model(l1_rate=0.01, l2_rate=0.01, learning_rate=0.01, noise_level=0.05):
#     model = Sequential()
#     # model.add(GaussianNoise(noise_level, input_shape=(X_smote.shape[1],)))
#     model.add(Dense(30, activation='relu', kernel_regularizer=l1_l2(l1=l1_rate, l2=l2_rate)))
#     # model.add(Dropout(0.3))
#     model.add(Dense(15, activation='relu', kernel_regularizer=l1_l2(l1=l1_rate, l2=l2_rate)))
#     model.add(Dense(1, activation='sigmoid'))
    
#     optimizer = Adam(learning_rate=learning_rate)
#     model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# Previous: l1_rate=0.01, l2_rate=0.05, learning_rate=0.05, noise_level=0.01
# Best: 0.911039 using {'l1_rate': 0.001, 'l2_rate': 0.1, 'learning_rate': 0.001, 'noise_level': 0.1}
# Best: 0.901299 using {'dropout_rate_1': 0.5, 'dropout_rate_2': 0.5, 'l1_rate': 0.01, 'l2_rate': 0.001, 'learning_rate': 0.01, 'noise_level': 0.05}
def create_model(l1_rate=0.01, l2_rate=0.001, learning_rate=0.01, noise_level=0.05, dropout_rate_1=0.5, dropout_rate_2=0.5):
    model = Sequential()
    model.add(GaussianNoise(noise_level, input_shape=(X_smote.shape[1],)))
    
    model.add(Dense(64, activation='relu', kernel_regularizer=l1_l2(l1=l1_rate, l2=l2_rate)))
    model.add(Dropout(dropout_rate_1))
    model.add(Dense(32, activation='relu', kernel_regularizer=l1_l2(l1=l1_rate, l2=l2_rate)))
    model.add(Dropout(dropout_rate_2))
    # model.add(Dense(30, activation='tanh', kernel_regularizer=l1_l2(l1=l1_rate, l2=l2_rate)))
    
    # model.add(Dense(64, activation='relu', input_shape=(X_scaled.shape[1],), kernel_regularizer=l1_l2(l1=0.01, l2=0.01)))
    # model.add(Dropout(0.5))
    # model.add(Dense(32, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01)))
    # model.add(Dropout(0.5))
    
    model.add(Dense(1, activation='sigmoid'))
    
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [577]:
# def create_model(l1_rate=0.01, l2_rate=0.01, learning_rate=0.01, noise_level=0.05, 
#                  include_blur=False, blur_size=1, num_layers=2, units_per_layer=[8, 16], dropout_rate=0.3):
#     model = Sequential()
    
#     # Initial layer with GaussianNoise
#     model.add(GaussianNoise(noise_level, input_shape=(X_smote.shape[1],)))
    
#     if include_blur:
#         # Add GaussianBlur layer if included
#         model.add(GaussianBlur(blur_size))

#     for i in range(num_layers):
#         # Add the specified number of Dense layers
#         model.add(Dense(units_per_layer[i], activation='relu', kernel_regularizer=l1_l2(l1=l1_rate, l2=l2_rate)))
#         model.add(Dropout(dropout_rate))
    
#     # Output layer
#     model.add(Dense(1, activation='sigmoid'))
    
#     optimizer = Adam(learning_rate=learning_rate)
#     model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# TODO: Just psudeolabel the unlabaled data with rnb+ and rnb models. See if this achieves better results than just the rnb and rnb+ models. Even failure would highlight the 
# need to publish more labeled sets for training. How much psudo labeling is too much? Utilizing traditional ML  models for psudeolabeling traiing data for deep learning models. 
# TODO: Look into contrastive learning for semi-supervised learning of unlabeled data

In [578]:
# from keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import GridSearchCV

# # Assuming X_smote and y_smote are your features and labels

# # Wrap the model with KerasClassifier
# model = KerasClassifier(build_fn=create_model, verbose=0)

# # Define the parameter grid
# param_grid = {
#     'l1_rate': [0.01, 0.001],
#     'l2_rate': [0.01, 0.001],
#     'learning_rate': [0.01, 0.001],
#     'noise_level': [0.01, 0.05],
#     'include_blur': [True, False],
#     'blur_size': [1, 2],
#     'num_layers': [1, 2, 3],  # Number of layers
#     'units_per_layer': [[8], [8, 16], [8, 16, 32]],  # Corresponding units in each layer
#     'dropout_rate': [0.3, 0.5],
#     'batch_size': [32, 64],  # Example of how to include batch size in the grid
#     'epochs': [10, 20]  # Example of how to include epochs in the grid
# }

# # Initialize GridSearchCV
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3)

# # Fit the model
# grid_result = grid.fit(X_smote, y_smote)

# # Summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [579]:
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)

param_grid = {
    'l1_rate': [0.001, 0.01, 0.1],
    'l2_rate': [0.001, 0.01, 0.1],
    'learning_rate': [0.00001, 0.0001, 0.01], 
    'noise_level': [0.01, 0.05, 0.1], 
    'dropout_rate_1': [0.3, 0.5],
    'dropout_rate_2': [0.3, 0.5]
}

  model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)


In [580]:
from tensorflow.keras.callbacks import EarlyStopping

# Implementing Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# Model training with Early Stopping
# history = model.fit(X_smote, y_smote, epochs=100, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=1)
# history = model.fit(X_smote, y_smote, epochs=100, validation_data=(X_test, y_test), verbose=1)

In [581]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=10, scoring='accuracy')
grid_result = grid.fit(X_smote, y_smote)

2024-02-22 15:10:31.114041: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-22 15:10:31.126762: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-22 15:10:31.143552: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_B















KeyboardInterrupt: 

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Current Best:  Switch to matthews_corrcoef for grid search scoring
# Best: 0.901732 using {'l1_rate': 0.01, 'l2_rate': 0.01, 'learning_rate': 0.01, 'noise_level': 0.05}
# Best: 0.911039 using {'l1_rate': 0.001, 'l2_rate': 0.1, 'learning_rate': 0.001, 'noise_level': 0.1}

# Previous Best:
# 0.882428 (0.052831) with: {'l1_rate': 0.001, 'l2_rate': 0.1, 'learning_rate': 0.001} CV:3 noise_level=0.01

Best: 0.900000 using {'dropout_rate_1': 0.3, 'dropout_rate_2': 0.3, 'l1_rate': 0.001, 'l2_rate': 0.01, 'learning_rate': 0.0001, 'noise_level': 0.01}
0.647826 (0.146090) with: {'dropout_rate_1': 0.3, 'dropout_rate_2': 0.3, 'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 1e-05, 'noise_level': 0.01}
0.660870 (0.192290) with: {'dropout_rate_1': 0.3, 'dropout_rate_2': 0.3, 'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 1e-05, 'noise_level': 0.05}
0.652174 (0.104710) with: {'dropout_rate_1': 0.3, 'dropout_rate_2': 0.3, 'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 1e-05, 'noise_level': 0.1}
0.895652 (0.073271) with: {'dropout_rate_1': 0.3, 'dropout_rate_2': 0.3, 'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 0.0001, 'noise_level': 0.01}
0.878261 (0.088679) with: {'dropout_rate_1': 0.3, 'dropout_rate_2': 0.3, 'l1_rate': 0.001, 'l2_rate': 0.001, 'learning_rate': 0.0001, 'noise_level': 0.05}
0.873913 (0.096145) with: {'dropout_rate_1': 0.3, 'dropout_rate_2': 0.3, 'l1_rat