In [None]:
import pandas as pd
import glob
import os

# Merge every per-reservoir CSV in `path` into one wide table keyed by date and
# save it as `output_file`.
path = "./"
output_file = "water.csv"

# The merged output is written into the same directory that is globbed, so it
# must be excluded explicitly: on a re-run it would otherwise be treated as an
# input, and the merge on "Дата" would fail because that column has already
# been renamed to "Date" in the output.
# Sorting makes the column order of the result independent of the order in
# which the filesystem happens to list the files.
all_files = sorted(
    candidate
    for candidate in glob.glob(os.path.join(path, "*.csv"))
    if os.path.abspath(candidate) != os.path.abspath(output_file)
)

if not all_files:
    # Fail with a readable message instead of an IndexError on dfs[0] below.
    raise FileNotFoundError(
        f"No input CSV files found in {os.path.abspath(path)!r}"
    )

dfs = []
for file in all_files:
    df = pd.read_csv(file)
    # Columns not needed downstream (literally "Post code" and "Conventional
    # sign"); errors="ignore" tolerates files that lack them.
    df = df.drop(columns=["Код поста", "Условный знак"], errors="ignore")

    # The file name (without extension) identifies the reservoir and becomes
    # part of its value column name, e.g. "Value <file name>".
    reservoir_name = os.path.splitext(os.path.basename(file))[0]
    df = df.rename(columns={"Значение": f"Value {reservoir_name}"})

    dfs.append(df)

# Outer-join all frames on the date column so no date from any file is lost;
# dates missing in a given file end up as NaN in that file's value column.
final_df = dfs[0]
for df in dfs[1:]:
    final_df = pd.merge(final_df, df, on="Дата", how="outer")

final_df = final_df.rename(columns={"Дата": "Date"})
# utf-8-sig writes a BOM at the start of the file.
final_df.to_csv(output_file, index=False, encoding="utf-8-sig")

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Notebook-wide settings.
# Seed shared by every train/test split and every stochastic model below.
RANDOM_STATE = 42
# Merged table produced by the first cell of this notebook.
DATA_PATH = "water.csv"
# Column to predict. The merge cell names value columns "Value <file name>",
# so this refers to the series read from a file called "balhash-alakol.csv".
TARGET_COL = "Value balhash-alakol"

In [None]:
# (train fraction, test fraction) pairs evaluated by the experiment loops below.
splits = [
    (0.60, 0.40),
    (0.70, 0.30),
    (0.75, 0.25),
    (0.80, 0.20),
    (0.85, 0.15),
]

In [None]:
# Load the merged table from DATA_PATH. No dtypes or date parsing are requested
# here; the following cells parse "Date" and coerce the other columns to numeric.
df = pd.read_csv(DATA_PATH)

In [None]:
# Derive calendar features from the "Date" column (skipped when it is absent).
if 'Date' in df.columns:
    # Unparseable dates become NaT, so every derived feature is NaN for them.
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    when = df['Date'].dt

    # Angles for the cyclical encodings: month on a 12-step circle, day of
    # year on a 365-step circle.
    month_angle = 2 * np.pi * when.month / 12
    doy_angle = 2 * np.pi * when.dayofyear / 365

    # Insertion order of this dict fixes the order of the new columns.
    calendar_features = {
        'year': when.year,
        'month': when.month,
        'day': when.day,
        'month_sin': np.sin(month_angle),
        'month_cos': np.cos(month_angle),
        'day_of_year': when.dayofyear,
        'day_sin': np.sin(doy_angle),
        'day_cos': np.cos(doy_angle),
        'quarter': when.quarter,
    }
    for feature_name, feature_values in calendar_features.items():
        df[feature_name] = feature_values

In [None]:
# Coerce every column except "Date" to numeric, then handle missing values.
if TARGET_COL not in df.columns:
    # Fail here with a clear message rather than with a KeyError further down.
    raise KeyError(
        f"Target column {TARGET_COL!r} not found in the data; "
        f"available columns: {list(df.columns)}"
    )

for c in df.columns:
    if c != 'Date':
        # Non-numeric entries become NaN instead of raising.
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Rows without an observed target are dropped instead of median-filled:
# imputing the label would invent target values that the models are then
# trained on and scored against.
df = df.dropna(subset=[TARGET_COL]).reset_index(drop=True)

# Feature columns: everything numeric except the date and the target.
num_cols = [
    c for c in df.columns
    if c not in ['Date', TARGET_COL] and pd.api.types.is_numeric_dtype(df[c])
]

# Missing feature values are replaced by the column median.
# NOTE(review): the medians are computed over all rows, i.e. before any
# train/test split - confirm this is acceptable for the evaluation.
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

In [None]:
# Build lagged and rolling-mean features of the target.
# These are computed by shifting rows, so the rows must be in chronological
# order. The merged CSV does not guarantee that, so sort by the parsed date
# when one is available (stable sort; rows with NaT dates go last).
if 'Date' in df.columns and pd.api.types.is_datetime64_any_dtype(df['Date']):
    df = df.sort_values('Date', kind='stable').reset_index(drop=True)

# Target value 7 / 14 / 30 rows earlier.
for lag in [7, 14, 30]:
    df[f'{TARGET_COL}_lag_{lag}'] = df[TARGET_COL].shift(lag)

# Rolling means are taken over the *previous* rows only. Without the shift the
# window would contain the current row's target, leaking the value being
# predicted into its own features.
past_target = df[TARGET_COL].shift(1)
df[f'{TARGET_COL}_rolling_7'] = past_target.rolling(window=7).mean()
df[f'{TARGET_COL}_rolling_30'] = past_target.rolling(window=30).mean()

# Refresh the feature list so it includes the columns created above.
num_cols = [
    c for c in df.columns
    if c not in ['Date', TARGET_COL] and pd.api.types.is_numeric_dtype(df[c])
]

# The first rows have no history, so their lag/rolling values are NaN; fill
# them (and any other remaining gaps) with the column median.
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

In [None]:
# Feature matrix (all numeric non-target columns) and target vector.
X, y = df[num_cols], df[TARGET_COL]

In [None]:
# Standardise the features; the fitted scaler is kept in `scaler`.
# NOTE(review): the scaler is fitted on all rows, before any train/test split,
# so test rows contribute to the scaling statistics - confirm this is intended.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Baseline regressors compared in the experiment loop, keyed by display name.
# Insertion order is the order in which they are trained and reported.
algs = {}
algs['Decision Tree'] = DecisionTreeRegressor(random_state=RANDOM_STATE)
algs['Random Forest'] = RandomForestRegressor(
    n_estimators=100,
    random_state=RANDOM_STATE,
)
algs['KNN'] = KNeighborsRegressor(n_neighbors=5)

In [None]:
# Train and score every baseline model on every train/test split.
# Each entry of `results` is one (algorithm, split) row with RMSE and R2.
results = []

for i, (train_frac, test_frac) in enumerate(splits, start=1):
    # Only train_size is passed, so the test set is simply the remaining rows;
    # test_frac is used for reporting only.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_scaled, y, train_size=train_frac, random_state=RANDOM_STATE
    )
    for name, model in algs.items():
        # The same estimator object is refitted for each split.
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
        r2 = r2_score(Y_test, Y_pred)
        results.append({
            'Algorithm': name,
            'Iteration': i,
            'Num_features': X.shape[1],
            # round(), not int(): int() truncates, so a fraction whose product
            # lands just below a whole number (0.57 * 100 == 56.99999999999999)
            # would be reported one percent too low.
            'Train_size_%': round(train_frac * 100),
            'Test_size_%': round(test_frac * 100),
            'RMSE': rmse,
            'R2': r2
        })

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Scatter plot of the mean RMSE vs. mean R2 of each baseline model, averaged
# over all train/test splits.

# Build the frame from `results` here: in notebook order `results_df` is only
# created in a later cell, so on a fresh kernel (Restart & Run All) this cell
# would otherwise stop with a NameError.
results_df = pd.DataFrame(results)

avg_results = results_df.groupby('Algorithm')[['RMSE', 'R2']].mean().reset_index()

# Keep only the three baselines, whatever else `results` may contain.
avg_results = avg_results[avg_results['Algorithm'].isin(['Decision Tree', 'Random Forest', 'KNN'])]

plt.figure(figsize=(8, 6))

# Fixed colour and marker per algorithm.
colors = {'Decision Tree': 'C0', 'Random Forest': 'C2', 'KNN': 'C1'}
markers = {'Decision Tree': 'o', 'Random Forest': 's', 'KNN': 'X'}

for idx, row in avg_results.iterrows():
    alg = row['Algorithm']
    plt.scatter(row['RMSE'], row['R2'],
                color=colors[alg],
                marker=markers[alg],
                s=100,
                label=alg,
                linewidths=1.5)
    # Label the point itself, slightly offset so the text does not cover it.
    plt.annotate(alg,
                 xy=(row['RMSE'], row['R2']),
                 xytext=(5, 5),
                 textcoords='offset points',
                 fontsize=10)
plt.xlabel('RMSE', fontsize=12)
plt.ylabel('R²', fontsize=12)
plt.title('Average performance comparison of models by RMSE and R²', fontsize=14)
plt.grid(True, alpha=0.3)
plt.legend(title='Algorithm', loc='best')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.neural_network import MLPRegressor

# Hybrid model: a KNN regressor plus an MLP trained on the KNN's training
# residuals; the final prediction is the sum of both. One row per split is
# added to `results` under the name 'Hybrid KNN+NN'.

# Drop rows left behind by an earlier run of this cell, so re-running it
# replaces the hybrid results instead of stacking duplicates in `results`.
results = [row for row in results if row['Algorithm'] != 'Hybrid KNN+NN']

for i, (train_frac, test_frac) in enumerate(splits, start=1):
    # Same split parameters and seed as the baseline loop.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_scaled, y, train_size=train_frac, random_state=RANDOM_STATE
    )

    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(X_train, Y_train)

    # What the KNN gets wrong on its own training data.
    # NOTE(review): these are in-sample residuals; each training point is
    # presumably among its own neighbours, so they may understate the KNN's
    # out-of-sample error - confirm this is the intended correction target.
    residuals = Y_train - knn.predict(X_train)

    nn = MLPRegressor(
        hidden_layer_sizes=(64, 32),
        activation='relu',
        solver='adam',
        max_iter=1000,
        random_state=RANDOM_STATE
    )
    nn.fit(X_train, residuals)

    # KNN prediction corrected by the MLP's predicted residual.
    Y_pred_hybrid = knn.predict(X_test) + nn.predict(X_test)

    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred_hybrid))
    r2 = r2_score(Y_test, Y_pred_hybrid)

    results.append({
        'Algorithm': 'Hybrid KNN+NN',
        'Iteration': i,
        'Num_features': X.shape[1],
        # round(), not int(): int() truncates float error such as
        # 0.57 * 100 == 56.99999999999999 down to 56.
        'Train_size_%': round(train_frac * 100),
        'Test_size_%': round(test_frac * 100),
        'RMSE': rmse,
        'R2': r2
    })

In [None]:
# Collect every (algorithm, split) result row into one table.
results_df = pd.DataFrame(results)
# A bare last expression gives the notebook's rich table rendering instead of
# the plain-text dump produced by print().
results_df

In [None]:
import matplotlib.pyplot as plt

# One RMSE-vs-training-size line per algorithm found in results_df.
fig, ax = plt.subplots(figsize=(8, 6))

# sort=False keeps the algorithms in order of first appearance.
for alg_name, alg_rows in results_df.groupby("Algorithm", sort=False):
    ax.plot(alg_rows["Train_size_%"], alg_rows["RMSE"], marker="o", label=alg_name)

ax.set_xlabel("Training set size (%)")
ax.set_ylabel("RMSE")
ax.set_title("RMSE trends across train/test splits for different models")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# RMSE vs. training-set size for the plain KNN and the hybrid model only.
fig, ax = plt.subplots(figsize=(8, 6))

# (name stored in results_df, label shown in the legend)
curves = [
    ("KNN", "KNN"),
    ("Hybrid KNN+NN", "Hybrid KNN+MLP"),
]
for algorithm_key, legend_label in curves:
    subset = results_df[results_df["Algorithm"] == algorithm_key]
    ax.plot(subset["Train_size_%"], subset["RMSE"], marker="o", label=legend_label)

ax.set_xlabel("Training set size (%)")
ax.set_ylabel("RMSE")
ax.set_title("RMSE trends across train/test splits for different models")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Box plot of the KNN test residuals (actual - predicted), one box per split.
residuals_data = []

for split_no, (train_frac, test_frac) in enumerate(splits, start=1):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_scaled, y, train_size=train_frac, random_state=RANDOM_STATE
    )
    # Plain KNN with the same settings as the 'KNN' baseline.
    knn_model = KNeighborsRegressor(n_neighbors=5)
    knn_model.fit(X_train, Y_train)
    test_residuals = Y_test - knn_model.predict(X_test)
    # One record per test row, tagged with the split it came from.
    residuals_data.extend(
        {"Iteration": split_no, "Residual": value} for value in test_residuals
    )

residuals_df = pd.DataFrame(residuals_data)

plt.figure(figsize=(8, 6))
# NOTE(review): recent seaborn releases may warn about `palette` being passed
# without `hue` - check against the installed version.
sns.boxplot(x="Iteration", y="Residual", data=residuals_df, palette="Set2")
# Zero line: residuals centred on it indicate no systematic over/under-prediction.
plt.axhline(0, color="red", linestyle="--")
plt.xlabel("Iteration (Train/Test split index)")
plt.ylabel("Residuals (Actual - Predicted)")
plt.title("Distribution of residuals for KNN predictions")
plt.show()