<a href="https://colab.research.google.com/github/nano-rayhan/Water-Scarcity-Label-App/blob/main/water_scarcity_label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Read the 2000-2025 global water consumption CSV into a DataFrame and
# show its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='/content/global_water_consumption_2000_2025.csv')
df.head(n=5)

Unnamed: 0,Country,Year,Total Water Consumption (Billion m3),Per Capita Water Use (L/Day),Agricultural Water Use (%),Industrial Water Use (%),Household Water Use (%),Rainfall Impact (mm),Groundwater Depletion Rate (%),Water Scarcity Level
0,China,2000,586.04,146.6,63.8,23.1,13.1,633.8,3.99,High
1,China,2001,590.09,161.4,63.7,23.2,13.2,854.9,4.16,High
2,China,2002,600.0,150.2,63.6,23.2,13.2,1280.2,4.24,Moderate
3,China,2003,610.45,162.5,63.5,23.3,13.3,900.1,4.4,High
4,China,2004,617.08,148.7,63.4,23.3,13.3,641.2,4.57,High


In [None]:
# List the distinct country names present in the dataset.
df.loc[:, 'Country'].unique()

array(['China', 'India', 'USA', 'Indonesia', 'Pakistan', 'Brazil',
       'Nigeria', 'Bangladesh', 'Russia', 'Mexico', 'Japan', 'Ethiopia',
       'Philippines', 'Egypt', 'Vietnam', 'DR Congo', 'Turkey', 'Iran',
       'Germany', 'Thailand', 'United Kingdom', 'France', 'Italy',
       'Tanzania', 'South Africa', 'Myanmar', 'Kenya', 'South Korea',
       'Colombia', 'Spain', 'Uganda', 'Argentina', 'Algeria', 'Sudan',
       'Ukraine', 'Iraq', 'Afghanistan', 'Poland', 'Canada', 'Morocco',
       'Saudi Arabia', 'Uzbekistan', 'Malaysia', 'Peru', 'Angola',
       'Ghana', 'Mozambique', 'Yemen', 'Nepal', 'Venezuela', 'Madagascar',
       'Cameroon', "Côte d'Ivoire", 'North Korea', 'Australia', 'Niger',
       'Taiwan', 'Sri Lanka', 'Burkina Faso', 'Mali', 'Romania', 'Malawi',
       'Chile', 'Kazakhstan', 'Zambia', 'Guatemala', 'Ecuador', 'Syria',
       'Netherlands', 'Senegal', 'Cambodia', 'Chad', 'Somalia',
       'Zimbabwe', 'Guinea', 'Rwanda', 'Benin', 'Burundi', 'Tunisia',
       'Bol

In [None]:
# Ordinal encoding of the target: higher integer = more severe scarcity.
label_map = {
    'Low': 0,
    'Moderate': 1,
    'High': 2,
    'Critical': 3,
}

scarcity = df['Water Scarcity Level']

# Re-running this cell must be harmless: once the column already holds the
# encoded integers, mapping it again would turn every value into NaN.
if not scarcity.isin(list(label_map.values())).all():
    # Fail loudly on labels missing from label_map instead of silently
    # producing NaN targets that only blow up later during model fitting.
    unknown = set(scarcity.dropna().unique()) - set(label_map)
    if unknown:
        raise ValueError(
            "Unmapped 'Water Scarcity Level' values: "
            + ", ".join(sorted(map(str, unknown)))
        )
    df['Water Scarcity Level'] = scarcity.map(label_map)

In [None]:
# Target vector: the encoded scarcity level. Feature frame: every other column.
y = df['Water Scarcity Level']
X = df.drop('Water Scarcity Level', axis='columns')

# Pipeline

In [None]:
# Column names grouped by dtype: float64/int64 columns are treated as numeric
# features, object columns as categorical features.
num_features, cat_features = (
    X.select_dtypes(include=dtype_names).columns
    for dtype_names in (['float64', 'int64'], ['object'])
)

In [None]:
# Numeric branch: a single standardisation step.
num_transformer = Pipeline([('scaler', StandardScaler())])

In [None]:
# Categorical branch: one-hot encode; categories unseen during fit are ignored
# at transform time (handle_unknown='ignore') instead of raising.
cat_transformer = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore'))])

In [None]:
# Route each column group through its own transformer. The categorical branch
# is listed first, so its columns come first in the transformed output.
preprocessor = ColumnTransformer([
    ('cat', cat_transformer, cat_features),
    ('num', num_transformer, num_features),
])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Random Forest classifier

In [None]:
# Baseline random forest: 300 unpruned trees split on entropy, with small
# minimum split/leaf sizes and a fixed seed for reproducibility.
rf_clf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=42,
)


In [None]:

# Train the random forest on encoded features. X_train still contains the raw
# 'Country' strings, which RandomForestClassifier cannot consume, so the
# preprocessor (one-hot encoding + scaling) is fitted on the training split
# here and the forest is trained on its output.
rf_clf.fit(preprocessor.fit_transform(X_train), y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Training-set accuracy of the random forest. The raw frame holds 'Country'
# strings the forest cannot consume, so features go through the preprocessor
# first; this requires `preprocessor` to be fitted on the training split.
y_pred = rf_clf.predict(preprocessor.transform(X_train))
accuracy_score(y_train, y_pred)

0.9977564102564103

In [None]:
# Held-out accuracy of the random forest. The test frame is encoded with the
# preprocessor (transform only, never re-fitted on test data) because the
# forest cannot consume the raw 'Country' strings; this requires
# `preprocessor` to be fitted on the training split.
y_pred = rf_clf.predict(preprocessor.transform(X_test))
accuracy_score(y_test, y_pred)

0.9935897435897436

In [None]:
# Rows are true classes, columns are predicted classes (encoded 0-3).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[595,   0,   0,   0],
       [  0, 111,   0,   0],
       [  0,   2,  62,   0],
       [  2,   1,   0,   7]])

# Grid search (Random Forest)

In [None]:
# Random-forest search space: forest size, tree depth cap and split criterion.
param_grid = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[None, 3, 5, 9, 11],
    criterion=['gini', 'entropy'],
)

# Exhaustive 5-fold search scored on accuracy, using all available cores.
grid = GridSearchCV(rf_clf, param_grid, scoring='accuracy', n_jobs=-1, cv=5)

In [None]:
# Run the random-forest grid search on encoded features: the searched
# estimator is the bare forest, which cannot consume the raw 'Country' strings
# in X_train. NOTE: the preprocessor is fitted on the whole training split, so
# scaling statistics are shared across CV folds rather than computed per fold.
grid.fit(preprocessor.fit_transform(X_train), y_train)
grid.best_params_

{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 200}

# Logistic Regression

In [None]:
# L2-regularised logistic regression (C=1.0) optimised with L-BFGS.
model = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs')

In [None]:
# Fit logistic regression on encoded features: X_train still contains the raw
# 'Country' strings, which the estimator cannot consume, so the training split
# is one-hot encoded and scaled by the preprocessor first.
model.fit(preprocessor.fit_transform(X_train), y_train)

In [None]:
# Held-out accuracy of logistic regression. The test frame is encoded with the
# preprocessor (transform only) because the model cannot consume the raw
# 'Country' strings; this requires `preprocessor` to be fitted on the
# training split.
y_pred = model.predict(preprocessor.transform(X_test))
accuracy_score(y_test, y_pred)

0.95

# XGBoost classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees for the four scarcity classes: 300 rounds at a 0.05
# learning rate, with 80% row and column subsampling and a fixed seed.
xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=None,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    num_class=4,
    random_state=42,
)

In [None]:
# End-to-end pipeline: column preprocessing followed by the XGBoost classifier,
# so it can be fitted and queried directly on the raw feature frame.
model_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', xgb_model),
])

In [None]:
# Fit the full pipeline on the training split, predict the held-out split and
# report the test accuracy as the cell output.
y_pred = model_pipe.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)


0.9961538461538462

In [None]:
# Confusion matrix for the XGBoost pipeline. Its test predictions are stored
# in `y_pred`; `y_pred_xgb` is never defined in this notebook.
confusion_matrix(y_test, y_pred)

array([[595,   0,   0,   0],
       [  0, 111,   0,   0],
       [  0,   1,  63,   0],
       [  1,   1,   0,   8]])

# Grid search (XGBoost)

In [None]:
# XGBoost search space: boosting rounds, tree depth cap and learning rate.
param_grid = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[None, 3, 5, 9, 11],
    learning_rate=[0.01, 0.05, 0.1],
)

# Exhaustive 5-fold search over the bare XGBoost estimator, scored on accuracy
# and run on all available cores.
grid = GridSearchCV(xgb_model, param_grid, scoring='accuracy', n_jobs=-1, cv=5)

In [None]:
# The searched estimator is the bare XGBoost model (no preprocessing step), so
# it needs already-encoded features. `X_train_scaled` is not defined anywhere
# else in the notebook; build it here from the training split.
# NOTE: the preprocessor is fitted on the whole training split, so scaling
# statistics are shared across CV folds rather than computed per fold.
X_train_scaled = preprocessor.fit_transform(X_train)
grid.fit(X_train_scaled, y_train)

In [None]:
# Hyper-parameter combination with the best mean cross-validated accuracy
# found by the XGBoost grid search.
grid.best_params_

{'learning_rate': 0.05, 'max_depth': None, 'n_estimators': 300}