<a href="https://colab.research.google.com/github/mutabazichristian/water-quality-model/blob/christian/dropout%26reg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [197]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from google.colab import drive


In [198]:
# Google Drive
drive.mount('/content/drive')
np.random.seed(42)
tf.random.set_seed(42)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/ALU/datasets/water_potability.csv')

In [212]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

In [213]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train_resampled.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l1(0.0001)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l1(0.0001)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l1(0.0001)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.003,
    decay_steps=5000,
    decay_rate=0.95,
    staircase=True
)
optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr_schedule)

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=30,
    min_delta=0.0001,
    restore_best_weights=True
)

class_weights = {0: 1., 1: len(y_train) / (2 * len(y_train[y_train == 1]))}

In [214]:
history = model.fit(
    X_train_resampled, y_train_resampled,
    validation_data=(X_test_preprocessed, y_test),
    epochs=150,
    batch_size=32,
    callbacks=[early_stopping],
    class_weight=class_weights
)

Epoch 1/150
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.5339 - loss: 0.8777 - val_accuracy: 0.6189 - val_loss: 0.7246
Epoch 2/150
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6227 - loss: 0.8070 - val_accuracy: 0.6143 - val_loss: 0.7152
Epoch 3/150
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6478 - loss: 0.7769 - val_accuracy: 0.6159 - val_loss: 0.7143
Epoch 4/150
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6531 - loss: 0.7699 - val_accuracy: 0.6280 - val_loss: 0.6974
Epoch 5/150
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6729 - loss: 0.7525 - val_accuracy: 0.6143 - val_loss: 0.7069
Epoch 6/150
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6754 - loss: 0.7417 - val_accuracy: 0.6128 - val_loss: 0.6993
Epoch 7/150
[1m100/1

In [215]:
y_pred = (model.predict(X_test_preprocessed) > 0.5).astype(int).flatten()
print("Classification Report:")
print(classification_report(y_test, y_pred))
loss, accuracy = model.evaluate(X_test_preprocessed, y_test)
print(f"\nTest Accuracy: {accuracy}")
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.68      0.69       400
           1       0.52      0.55      0.53       256

    accuracy                           0.63       656
   macro avg       0.61      0.61      0.61       656
weighted avg       0.63      0.63      0.63       656

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6118 - loss: 0.7069 

Test Accuracy: 0.6280487775802612
F1 Score: 0.5343511450381679
Precision: 0.5223880597014925
Recall: 0.546875
