In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pickle
import gzip

# Load the dataset
df = pd.read_csv("water_quality_data.csv")

# Select features and target
features = ["Salinity (ppt)", "Water Temp (?C)", "Secchi Depth (m)", "Air Temp-Celsius"]
target = "Dissolved Oxygen (mg/L)"

# Subset the dataframe
df = df[features + [target]]

# Handle missing values
for col in features + [target]:
    df[col].fillna(df[col].median(), inplace=True)

# Define X and y
X = df[features]
y = df[target]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f"Dissolved Oxygen Prediction Model - Mean Squared Error: {mse}")

# Save the model and scaler using pickle with gzip compression
# Save the model as a compressed file
with gzip.open("do_prediction_model.pkl.gz", "wb") as f:
    pickle.dump(rf_model, f)

# Save the scaler as a compressed file
with gzip.open("scaler_do.pkl.gz", "wb") as f:
    pickle.dump(scaler, f)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Dissolved Oxygen Prediction Model - Mean Squared Error: 2.694695801366
