In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv("/content/cleaned_global_water_consumption.csv")
df.head()

Unnamed: 0,Country,Year,Total Water Consumption (Billion Cubic Meters),Per Capita Water Use (Liters per Day),Agricultural Water Use (%),Industrial Water Use (%),Household Water Use (%),Rainfall Impact (Annual Precipitation in mm),Groundwater Depletion Rate (%)
0,Argentina,2000,481.49,235.431429,48.55,20.844286,30.1,1288.698571,3.255714
1,Argentina,2001,455.063,299.551,48.465,26.943,22.55,1371.729,3.12
2,Argentina,2002,482.749231,340.124615,50.375385,29.042308,23.349231,1590.305385,2.733846
3,Argentina,2003,452.66,326.756667,49.086667,30.476,24.44,1816.012667,2.708
4,Argentina,2004,634.566,230.346,38.67,36.67,23.924,815.998,1.902


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Country                                         500 non-null    object 
 1   Year                                            500 non-null    int64  
 2   Total Water Consumption (Billion Cubic Meters)  500 non-null    float64
 3   Per Capita Water Use (Liters per Day)           500 non-null    float64
 4   Agricultural Water Use (%)                      500 non-null    float64
 5   Industrial Water Use (%)                        500 non-null    float64
 6   Household Water Use (%)                         500 non-null    float64
 7   Rainfall Impact (Annual Precipitation in mm)    500 non-null    float64
 8   Groundwater Depletion Rate (%)                  500 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory us

In [None]:
df.describe()

Unnamed: 0,Year,Total Water Consumption (Billion Cubic Meters),Per Capita Water Use (Liters per Day),Agricultural Water Use (%),Industrial Water Use (%),Household Water Use (%),Rainfall Impact (Annual Precipitation in mm),Groundwater Depletion Rate (%)
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,2012.0,501.22443,276.004782,50.180829,27.792837,24.832515,1544.8243,2.573037
std,7.218324,96.078937,42.669593,5.566886,4.36166,2.956135,292.786579,0.48063
min,2000.0,129.636667,111.708333,28.905,13.276667,13.668333,700.23,1.3
25%,2006.0,441.447385,250.225406,46.513611,25.018,23.015355,1353.734583,2.235
50%,2012.0,502.197154,276.430556,50.318482,27.648539,25.071944,1537.537724,2.555778
75%,2018.0,563.849594,300.22175,54.061964,30.698333,26.720167,1746.402425,2.887202
max,2024.0,798.418,404.35,66.52,43.583333,34.202,2533.678,4.322


In [None]:
df.isnull().sum()

Unnamed: 0,0
Country,0
Year,0
Total Water Consumption (Billion Cubic Meters),0
Per Capita Water Use (Liters per Day),0
Agricultural Water Use (%),0
Industrial Water Use (%),0
Household Water Use (%),0
Rainfall Impact (Annual Precipitation in mm),0
Groundwater Depletion Rate (%),0


In [None]:
# Step 2: Preprocess Data
# Encode categorical column 'State'
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['Country'] = label_encoder.fit_transform(df['Country'])

In [None]:
df.drop(columns=["Year"],inplace=True)

In [None]:
df.head()

Unnamed: 0,Country,Total Water Consumption (Billion Cubic Meters),Per Capita Water Use (Liters per Day),Agricultural Water Use (%),Industrial Water Use (%),Household Water Use (%),Rainfall Impact (Annual Precipitation in mm),Groundwater Depletion Rate (%)
0,0,481.49,235.431429,48.55,20.844286,30.1,1288.698571,3.255714
1,0,455.063,299.551,48.465,26.943,22.55,1371.729,3.12
2,0,482.749231,340.124615,50.375385,29.042308,23.349231,1590.305385,2.733846
3,0,452.66,326.756667,49.086667,30.476,24.44,1816.012667,2.708
4,0,634.566,230.346,38.67,36.67,23.924,815.998,1.902


In [None]:
# Step 3: Seperate x and y (features and target variable)
from sklearn.model_selection import train_test_split
x = df.drop(columns=["Groundwater Depletion Rate (%)"])  # Features
y = df["Groundwater Depletion Rate (%)"]  # Target variable

In [None]:
x

Unnamed: 0,Country,Total Water Consumption (Billion Cubic Meters),Per Capita Water Use (Liters per Day),Agricultural Water Use (%),Industrial Water Use (%),Household Water Use (%),Rainfall Impact (Annual Precipitation in mm)
0,0,481.490000,235.431429,48.550000,20.844286,30.100000,1288.698571
1,0,455.063000,299.551000,48.465000,26.943000,22.550000,1371.729000
2,0,482.749231,340.124615,50.375385,29.042308,23.349231,1590.305385
3,0,452.660000,326.756667,49.086667,30.476000,24.440000,1816.012667
4,0,634.566000,230.346000,38.670000,36.670000,23.924000,815.998000
...,...,...,...,...,...,...,...
495,19,418.097000,292.970000,47.448000,25.266000,27.538000,1510.662000
496,19,572.094000,275.978000,46.195000,32.223000,26.720000,754.615000
497,19,440.978000,292.039000,54.810000,30.918000,22.638000,2119.898000
498,19,566.865000,261.197500,62.945000,25.207500,21.632500,1439.155000


In [None]:
y

Unnamed: 0,Groundwater Depletion Rate (%)
0,3.255714
1,3.120000
2,2.733846
3,2.708000
4,1.902000
...,...
495,2.431000
496,2.628000
497,2.871000
498,1.597500


In [None]:
# Step 4: Split data into training and testing sets (80% train, 20% test)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Step 5: Normalize the data to improve model performance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [None]:
xtrain

array([[-0.07800558, -0.87778046,  1.11137372, ..., -0.49488843,
        -0.87656438, -0.11974328],
       [ 1.34838214,  0.60359294, -0.69859757, ..., -0.1477491 ,
         1.18824755, -0.22177743],
       [-1.68269177, -0.20223398, -0.1726071 , ...,  1.51132502,
         0.65397103,  0.27236726],
       ...,
       [ 0.63518828,  1.06409874, -0.17855716, ...,  1.4323534 ,
        -0.371179  , -0.15412602],
       [ 1.34838214,  0.58688761,  0.45095648, ..., -0.38757642,
        -2.4616519 , -1.64260181],
       [-0.9694979 , -0.65764089, -0.92835535, ...,  0.53211025,
         0.73351132, -0.0429757 ]])

In [18]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain, ytrain)

In [19]:
ypred = model.predict(xtest)

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Step 1: Generate dummy dataset with 5 features (X shape: [100, 5])
X = np.random.rand(100, 5) * 100  # 100 samples, 5 features
# Let's create a target with a custom formula and some noise
y = 2 * X[:, 0] + 3 * X[:, 1] - 1.5 * X[:, 2] + 0.5 * X[:, 3] + 4 * X[:, 4] + np.random.randn(100) * 10

# Step 2: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train model
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 4: Predict and evaluate
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", round(mae, 2))
print("Mean Squared Error:", round(mse, 2))
print("R-squared Score:", round(r2, 2))

# Step 5: Take user input for prediction
print("\nEnter values for prediction (5 features):")
TWC = float(input(" Enter TWC: "))
CWU = float(input("  ENTER CWU : "))
AWU = float(input(" ENTER AWU  : "))
IWU = float(input(" Enter IWU : "))
HWU = float(input(" Enter HWU : "))

user_prediction = model.predict([[TWC, CWU, AWU, IWU, HWU]])
print(f"\nPredicted Target Value: {user_prediction[0]:.2f}")


Mean Absolute Error: 62.17
Mean Squared Error: 6504.54
R-squared Score: 0.79

Enter values for prediction (5 features):
 Enter TWC: 567
  ENTER CWU : 765
 ENTER AWU  : 897
 Enter IWU : 456
 Enter HWU : 865

Predicted Target Value: 780.33
