In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [10]:
# Load the raw water-consumption dataset from a CSV file located in the
# notebook's working directory.
csv_path = "global_water_consumption.csv"
df = pd.read_csv(csv_path)

# Report the (rows, columns) shape, then preview the first five records as the
# cell's rich output.
print("Original Shape:", df.shape)
df.head(5)

Original Shape: (500, 10)


Unnamed: 0,Country,Year,Total Water Consumption (Billion Cubic Meters),Per Capita Water Use (Liters per Day),Agricultural Water Use (%),Industrial Water Use (%),Household Water Use (%),Rainfall Impact (Annual Precipitation in mm),Groundwater Depletion Rate (%),Water Scarcity Level
0,Argentina,2000,481.49,235.431429,48.55,20.844286,30.1,1288.698571,3.255714,Moderate
1,Argentina,2001,455.063,299.551,48.465,26.943,22.55,1371.729,3.12,Moderate
2,Argentina,2002,482.749231,340.124615,50.375385,29.042308,23.349231,1590.305385,2.733846,Moderate
3,Argentina,2003,452.66,326.756667,49.086667,30.476,24.44,1816.012667,2.708,Moderate
4,Argentina,2004,634.566,230.346,38.67,36.67,23.924,815.998,1.902,Moderate


In [4]:
# Keep only the first occurrence of each fully identical row; the original
# index labels of the surviving rows are left untouched.
is_repeat = df.duplicated()
df = df.loc[~is_repeat]
print("After removing duplicates:", df.shape)

After removing duplicates: (500, 10)


In [5]:
# Encode the target column as integer class codes. LabelEncoder assigns codes
# in sorted order of the labels, so the mapping is printed for reference when
# interpreting predictions later.
target_col = "Water Scarcity Level"
le = LabelEncoder()
df[target_col] = le.fit_transform(df[target_col])
class_codes = le.transform(le.classes_)
print("Target Classes Mapping:", dict(zip(le.classes_, class_codes)))

# One-hot encode Country. drop_first=True omits the first category's indicator
# column, so that country is represented by all remaining indicators being off.
df = pd.get_dummies(df, columns=["Country"], drop_first=True)

Target Classes Mapping: {'High': np.int64(0), 'Low': np.int64(1), 'Moderate': np.int64(2)}


In [6]:
# Separate the encoded target (y) from the feature matrix (X): X is every
# column of df except the target.
y = df["Water Scarcity Level"]
X = df.drop(columns=["Water Scarcity Level"])

In [7]:
# Standardise every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fit on *all* rows of X, i.e. before any
# train/test partitioning, so its learned mean/variance include rows that are
# later held out for testing. Use X_scaled only where that is acceptable.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [8]:
# Hold out 20% of the rows for testing. The split is stratified on y so the
# class proportions are (approximately) preserved in both partitions, and it
# is seeded so the partitioning is reproducible.
#
# The split is performed on the *unscaled* features X. Standardisation
# statistics must be learned from the training rows only; splitting an array
# that was scaled using all rows would leak test-set mean/variance into
# preprocessing and make the held-out evaluation optimistically biased.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training partition only, then apply that same
# transformation to the test partition. `scaler` is rebound on purpose so that
# any later use of it (e.g. transforming new data) matches the features the
# model is trained on. Both results are NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Sanity-check partition sizes and the per-class balance of each partition.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train distribution:\n", y_train.value_counts())
print("y_test distribution:\n", y_test.value_counts())

X_train shape: (400, 27)
X_test shape: (100, 27)
y_train distribution:
 Water Scarcity Level
2    288
1     98
0     14
Name: count, dtype: int64
y_test distribution:
 Water Scarcity Level
2    72
1    24
0     4
Name: count, dtype: int64
