In [3]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE, SelectKBest, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

file_path = '/content/drive/MyDrive/water_potability_preprocessed.csv'
df = pd.read_csv(file_path)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Step 3: Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Random Forest Feature Importance
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X, y)
rf_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
print("\nRandom Forest Feature Importances:")
print(rf_importances.sort_values(ascending=False))

# Step 5: Recursive Feature Elimination (RFE)
rfe_model = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(estimator=rfe_model, n_features_to_select=5)
rfe.fit(X_scaled, y)
rfe_features = X.columns[rfe.support_]
print("\nFeatures selected by RFE:")
print(rfe_features)

# Step 6: Lasso Regularization (L1-based)
lasso = LassoCV(cv=5, random_state=42).fit(X_scaled, y)
lasso_importances = pd.Series(lasso.coef_, index=X.columns)
print("\nLasso Feature Importances:")
print(lasso_importances[lasso_importances != 0])

# Step 7: Mutual Information (MI)
mi_scores = mutual_info_classif(X_scaled, y, random_state=42)
mi_results = pd.Series(mi_scores, index=X.columns)
print("\nMutual Information Scores:")
print(mi_results.sort_values(ascending=False))

# Step 8: ANOVA F-Test
anova_selector = SelectKBest(score_func=f_classif, k='all')
anova_scores = anova_selector.fit(X, y)
anova_results = pd.Series(anova_scores.scores_, index=X.columns)
print("\nANOVA F-test Scores:")
print(anova_results.sort_values(ascending=False))



Random Forest Feature Importances:
ph                 0.129176
Sulfate            0.128030
Hardness           0.122093
Solids             0.115762
Chloramines        0.115576
Organic_carbon     0.099438
Conductivity       0.098670
Trihalomethanes    0.097173
Turbidity          0.094082
dtype: float64

Features selected by RFE:
Index(['Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Organic_carbon'], dtype='object')

Lasso Feature Importances:
Solids    1.084467e-17
dtype: float64

Mutual Information Scores:
Hardness           0.026560
Conductivity       0.007077
Organic_carbon     0.004014
Turbidity          0.003075
Sulfate            0.002004
Solids             0.001066
ph                 0.000149
Chloramines        0.000000
Trihalomethanes    0.000000
dtype: float64

ANOVA F-test Scores:
Solids             3.732062
Organic_carbon     2.949523
Chloramines        1.852296
Sulfate            1.392522
Hardness           0.626928
Conductivity       0.216326
Trihalomethanes    0.158615
p

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


file_path = '/content/drive/MyDrive/water_potability_preprocessed.csv'
df = pd.read_csv(file_path)

# Use selected features (from RFE/ANOVA)
selected_features = ['Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Organic_carbon']
X = df[selected_features]
y = df['Potability']

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Step 4: Define Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

# Step 5: Train and Evaluate Models
results = {}
for model_name, model in models.items():
    # Cross-Validation Score
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Evaluate Metrics
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None

    results[model_name] = {
        "Accuracy": accuracy,
        "ROC-AUC": roc_auc,
        "CV Accuracy (Mean)": scores.mean(),
        "CV Accuracy (Std Dev)": scores.std()
    }

# Step 6: Display Results
for model_name, metrics in results.items():
    print(f"\nModel: {model_name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")

# Step 7: Classification Report for Best Model
best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train, y_train)
y_best_pred = best_model.predict(X_test)
print("\nClassification Report for Best Model:")
print(classification_report(y_test, y_best_pred))



Model: Logistic Regression
Accuracy: 0.6103763987792472
ROC-AUC: 0.5347693646649261
CV Accuracy (Mean): 0.6096811941661672
CV Accuracy (Std Dev): 0.00041713870475798104

Model: Random Forest
Accuracy: 0.6266531027466938
ROC-AUC: 0.6150348128807658
CV Accuracy (Mean): 0.6258231774029359
CV Accuracy (Std Dev): 0.009391343885950308

Model: SVM
Accuracy: 0.6429298067141404
ROC-AUC: 0.6076196692776328
CV Accuracy (Mean): 0.649365908420622
CV Accuracy (Std Dev): 0.006888886828704235

Model: Gradient Boosting
Accuracy: 0.6236012207527976
ROC-AUC: 0.581792863359443
CV Accuracy (Mean): 0.6310547896985093
CV Accuracy (Std Dev): 0.014263427827714953

Model: KNN
Accuracy: 0.5971515768056969
ROC-AUC: 0.5707811140121846
CV Accuracy (Mean): 0.6057719934164835
CV Accuracy (Std Dev): 0.021267635205332175

Classification Report for Best Model:
              precision    recall  f1-score   support

         0.0       0.64      0.87      0.74       600
         1.0       0.55      0.25      0.34       38