In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Step 2: Load Dataset
# The CSV path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="water_potability.csv")
# Preview the first five rows as the cell output.
df.head(n=5)

In [None]:
# Step 3: Data Exploration
# `info()` prints its report directly, so it needs no wrapper.
df.info()
# Only the last expression of a cell is rendered automatically; a bare
# `df.describe()` in the middle of the cell would be computed and thrown away.
# `display` (provided by the IPython kernel) renders it explicitly.
display(df.describe())
# Per-column count of missing values (shown as the cell output).
df.isnull().sum()

In [None]:
# Step 4: Handle Missing Values
# Impute feature columns only. Mean-filling the 'Potability' target would
# turn a missing label into a fractional value that is not a valid class,
# so rows without a label are dropped instead.
feature_cols = df.columns.drop('Potability')
df = df.dropna(subset=['Potability'])
# `fillna` with a Series fills each column with the value at the matching
# index label; columns absent from the Series (the target) are left untouched.
# Reassigning instead of `inplace=True` keeps the step explicit.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split in Step 7, so test rows influence the imputation values.
# Consider imputing after the split using training-set statistics only.
df = df.fillna(df[feature_cols].mean())
# Confirm that no missing values remain (shown as the cell output).
df.isnull().sum()

In [None]:
# Step 5: Data Visualization
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
plt.figure(figsize=(12, 6))
heatmap_ax = sns.heatmap(corr_matrix, cmap="coolwarm", annot=True)
heatmap_ax.set_title("Feature Correlation")
plt.show()

# Bar chart with the number of rows in each Potability class.
count_ax = sns.countplot(data=df, x='Potability')
count_ax.set_title("Class Distribution (0: Not Drinkable, 1: Drinkable)")
plt.show()

In [None]:
# Step 6: Feature & Target Split
# Target: the 'Potability' column.
y = df['Potability']
# Features: every remaining column.
X = df.drop(columns='Potability')

In [None]:
# Step 7: Train-Test Split
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible. `stratify=y` keeps the class proportions of the target the
# same in both partitions, so the test metrics are not skewed by a split
# that happens to over- or under-sample one class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
# Step 8: Feature Scaling
# Learn the scaling statistics from the training partition only ...
scaler = StandardScaler()
scaler.fit(X_train)
# ... then apply that same fitted transform to both partitions.
X_test_scaled = scaler.transform(X_test)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Step 9: Model Training
# Random forest with 100 trees; the seed is fixed for reproducibility.
model = RandomForestClassifier(random_state=42, n_estimators=100)
# Fit on the scaled training features (the fitted estimator is the cell output).
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Step 10: Predictions & Evaluation
# Predict labels for the held-out, scaled test features.
y_pred = model.predict(X_test_scaled)

# Compute each metric first, then report them in order.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)
print("\nConfusion Matrix:\n", test_confusion)

In [None]:
# Step 11: Sample Prediction
# One hand-written observation. The values are positional and assumed to
# follow the column order of X -- TODO confirm against the dataset schema.
sample_values = [[7.2, 150, 12000, 7.0, 330, 420, 15.0, 66.0, 4.5]]
# The scaler was fitted on a DataFrame (X_train), so pass a DataFrame with the
# same column names rather than a bare array: this avoids scikit-learn's
# feature-name mismatch warning, and the DataFrame constructor raises
# immediately if the number of values does not match the number of features.
sample_data = pd.DataFrame(sample_values, columns=X.columns, dtype=float)
sample_scaled = scaler.transform(sample_data)
sample_pred = model.predict(sample_scaled)

# Class 1 is reported as potable; any other prediction as not drinkable.
if sample_pred[0] == 1:
    print("The water is likely drinkable (Potable).")
else:
    print("The water is NOT drinkable.")