In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv('water_quality_data.csv')  # Your dataset file

# Quick look at the data: first rows, schema / non-null counts, summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print() emits a stray "None" line.
df.info()
print(df.describe())

         pH  Temperature  Turbidity  Electrical_Conductivity  Potable  \
0  6.810890    27.454043   1.851329               802.668588        0   
1  8.827500    23.402409   5.419009               744.813723        0   
2  8.061979    17.738190   8.729458                87.180995        0   
3  7.595305    30.344875   7.322249               544.809350        0   
4  6.046065    27.118279   8.065611               601.283647        0   

   Good_for_Agriculture  
0                     1  
1                     0  
2                     0  
3                     1  
4                     1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   pH                       500 non-null    float64
 1   Temperature              500 non-null    float64
 2   Turbidity                500 non-null    float64
 3   Electrical_Conductivity  5

In [3]:
# Discard every row that has at least one missing value
# (imputing with df.fillna() would be the alternative).
df = df.dropna(axis=0, how='any')

In [4]:
# Measurement columns used as model inputs.
feature_columns = ['pH', 'Temperature', 'Turbidity', 'Electrical_Conductivity']
X = df[feature_columns]

# Two separate label columns, one per prediction task.
y_potable, y_agri = df['Potable'], df['Good_for_Agriculture']

In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
X_train, X_test, y_potable_train, y_potable_test = train_test_split(X_scaled, y_potable, test_size=0.3, random_state=42)
_, _, y_agri_train, y_agri_test = train_test_split(X_scaled, y_agri, test_size=0.3, random_state=42)

In [7]:
# Potability classifier: random forest with a fixed seed, trained on the
# training split.
model_potable = RandomForestClassifier(random_state=42)
model_potable.fit(X=X_train, y=y_potable_train)

In [8]:
# Agricultural-suitability classifier: same estimator settings and training
# features as the potability model, but fitted against the agriculture labels.
model_agri = RandomForestClassifier(random_state=42)
model_agri.fit(X=X_train, y=y_agri_train)

In [9]:
# Score the potability model on the held-out rows.
y_potable_pred = model_potable.predict(X_test)

# Per-class precision / recall / F1, and the raw confusion counts.
potable_report = classification_report(y_potable_test, y_potable_pred)
potable_confusion = confusion_matrix(y_potable_test, y_potable_pred)

print("Potable Water Prediction Results:", potable_report, potable_confusion, sep="\n")

Potable Water Prediction Results:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       126
           1       1.00      0.92      0.96        24

    accuracy                           0.99       150
   macro avg       0.99      0.96      0.97       150
weighted avg       0.99      0.99      0.99       150

[[126   0]
 [  2  22]]


In [10]:
# Score the agricultural-suitability model on the held-out rows.
y_agri_pred = model_agri.predict(X_test)

# Per-class precision / recall / F1, and the raw confusion counts.
agri_report = classification_report(y_agri_test, y_agri_pred)
agri_confusion = confusion_matrix(y_agri_test, y_agri_pred)

print("Agricultural Water Prediction Results:", agri_report, agri_confusion, sep="\n")

Agricultural Water Prediction Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        66
           1       1.00      1.00      1.00        84

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

[[66  0]
 [ 0 84]]


In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

# Load dataset
df = pd.read_csv('water_quality_data.csv')

# Features and targets
X = df[['pH', 'Temperature', 'Turbidity', 'Electrical_Conductivity']]
y1 = df['Potable']
y2 = df['Good_for_Agriculture']

# Split once for both targets: passing both label columns to a single call
# guarantees they share the same train/test row assignment.
X_train1, X_test1, y_train1, y_test1, y_train2, y_test2 = train_test_split(
    X, y1, y2, test_size=0.2, random_state=42
)
# Both targets use the same feature partition; the second pair of names is
# kept as aliases so code referring to X_train2 / X_test2 keeps working.
X_train2, X_test2 = X_train1, X_test1

# Train models. The seed is fixed so that re-running this cell produces the
# same forests, i.e. the saved .pkl artifacts are reproducible.
model_potable = RandomForestClassifier(random_state=42)
model_agri = RandomForestClassifier(random_state=42)

model_potable.fit(X_train1, y_train1)
model_agri.fit(X_train2, y_train2)

# Save models
joblib.dump(model_potable, 'model_potable.pkl')
joblib.dump(model_agri, 'model_agri.pkl')

print("Models saved successfully.")

Models saved successfully.


In [1]:
import joblib

# Reload the persisted potability model and confirm what kind of estimator it is.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load model files that come from a trusted source.
model = joblib.load('model_potable.pkl')
print(type(model))  # Expected: sklearn.ensemble._forest.RandomForestClassifier

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
