In [3]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import make_scorer, accuracy_score, classification_report

# Load your dataset
# Replace 'your_dataset.csv' with the actual file path or URL of your dataset
data = pd.read_csv('waterPotability_updated.csv')

# Select features and target variable
features = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
            'Organic_carbon', 'Trihalomethanes', 'Turbidity']
target = 'Potability'

# Drop rows with missing values for simplicity
data = data.dropna(subset=[target] + features)

# Convert 'Potability' column to numerical labels if needed
# (Assuming 'Potability' is binary: 0 or 1)
data[target] = data[target].astype(int)

# Create Bernoulli Naive Bayes classifier
nb_classifier = BernoulliNB()

# Define cross-validation strategy (Stratified K-Folds for binary classification)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform 10-fold cross-validation
scores = cross_val_score(nb_classifier, data[features], data[target],
                         scoring=make_scorer(accuracy_score), cv=cv)

# Print cross-validation results
print(f'Accuracy for each fold: {scores}')
print(f'Mean accuracy: {scores.mean()}')

# Additional: Display classification metrics for each fold
for i, (train_idx, test_idx) in enumerate(cv.split(data[features], data[target])):
    X_train, X_test = data.iloc[train_idx][features], data.iloc[test_idx][features]
    y_train, y_test = data.iloc[train_idx][target], data.iloc[test_idx][target]

    nb_classifier.fit(X_train, y_train)
    y_pred = nb_classifier.predict(X_test)

    print(f'\nClassification Report - Fold {i + 1}:')
    print(classification_report(y_test, y_pred))


Accuracy for each fold: [0.6097561  0.6097561  0.6097561  0.6097561  0.6097561  0.6097561
 0.6116208  0.6116208  0.60856269 0.60856269]
Mean accuracy: 0.6098903557842918

Classification Report - Fold 1:
              precision    recall  f1-score   support

           0       0.61      1.00      0.76       200
           1       0.00      0.00      0.00       128

    accuracy                           0.61       328
   macro avg       0.30      0.50      0.38       328
weighted avg       0.37      0.61      0.46       328


Classification Report - Fold 2:
              precision    recall  f1-score   support

           0       0.61      1.00      0.76       200
           1       0.00      0.00      0.00       128

    accuracy                           0.61       328
   macro avg       0.30      0.50      0.38       328
weighted avg       0.37      0.61      0.46       328


Classification Report - Fold 3:
              precision    recall  f1-score   support

           0       0.6

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              precision    recall  f1-score   support

           0       0.61      1.00      0.76       200
           1       0.00      0.00      0.00       127

    accuracy                           0.61       327
   macro avg       0.31      0.50      0.38       327
weighted avg       0.37      0.61      0.46       327


Classification Report - Fold 8:
              precision    recall  f1-score   support

           0       0.61      1.00      0.76       200
           1       0.00      0.00      0.00       127

    accuracy                           0.61       327
   macro avg       0.31      0.50      0.38       327
weighted avg       0.37      0.61      0.46       327


Classification Report - Fold 9:
              precision    recall  f1-score   support

           0       0.61      1.00      0.76       199
           1       0.00      0.00      0.00       128

    accuracy                           0.61       327
   macro avg       0.30      0.50      0.38       327
weighted

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
