In [27]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # To enable the IterativeImputer
from sklearn.impute import IterativeImputer

# Load the raw dataset that still contains missing water-quality readings
file_path = 'Complete.csv'
df = pd.read_csv(file_path)

# Columns whose missing values are filled by the imputer
# (chosen for their correlation with Phytoplankton (cells/ml))
columns_to_fill = [
    'pH (units)',
    'Ammonia (mg/L)',
    'Nitrate (mg/L)',
    'Inorganic Phosphate (mg/L)',
    'BOD (mg/l)',
    'Dissolved Oxygen (mg/l)',
]

# Extra column handed to the imputer alongside the columns being filled.
# NOTE(review): imputing from the phytoplankton count means the filled values
# carry information about that column. If it is later used as a prediction
# target with these columns as features, this is target leakage -- confirm
# that this is acceptable for the downstream analysis.
predictors = ['Phytoplankton (cells/ml)']

# Only these columns take part in the imputation
df_subset = df[predictors + columns_to_fill]

# MICE-style imputer; random_state is fixed so reruns give the same values
mice_imputer = IterativeImputer(max_iter=10, random_state=0)

# fit_transform returns a bare array, so re-attach the column labels AND the
# original row index. The assignment below aligns on index; without the index
# a non-default row index on `df` would misalign rows or leave NaNs behind.
df_imputed = pd.DataFrame(
    mice_imputer.fit_transform(df_subset),
    columns=df_subset.columns,
    index=df_subset.index,
)

# Write back only the filled columns; the predictor column is left untouched
df[columns_to_fill] = df_imputed[columns_to_fill]

# Remaining missing values per filled column (expected to be all zeros)
print(df[columns_to_fill].isnull().sum())

# Persist the imputed dataset for the later cells
df.to_csv('Complete_MICE_Imputed.csv', index=False)


pH (units)                    0
Ammonia (mg/L)                0
Nitrate (mg/L)                0
Inorganic Phosphate (mg/L)    0
BOD (mg/l)                    0
Dissolved Oxygen (mg/l)       0
dtype: int64


In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
import wandb


# Load the imputed dataset and drop every row that still has a missing value
# in any column (not only the columns used below)
merged_df = pd.read_csv('Complete_MICE_Imputed.csv').dropna()

# Select relevant features and target
features = [
    'Temperature',
    'Humidity',
    'Wind Speed',
    'pH (units)',
    'Ammonia (mg/L)',
    'Inorganic Phosphate (mg/L)',
    'BOD (mg/l)',
    'Total coliforms (MPN/100ml)',
]
target = 'Phytoplankton (cells/ml)'

# Split data into features (X) and target (y)
X = merged_df[features]
y = merged_df[target]

# Hold out 20% of the rows for the final evaluation; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features using statistics from the training rows only.
# (Tree ensembles are insensitive to monotonic rescaling; the step is kept so
# the fitted model sees the same inputs as before.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter grid: 3^7 = 2187 combinations
param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'alpha': [0, 0.1, 0.5],  # L1 regularization (XGBoost native name for reg_alpha)
    'lambda': [1, 1.5, 2]    # L2 regularization (XGBoost native name for reg_lambda)
}

# Initialize the XGBoost model
xgb_model = XGBRegressor()

# Exhaustive 3-fold cross-validated search scored by (negated) MSE.
# refit=True (the default, spelled out here) retrains the best configuration on
# the whole training set, so best_estimator_ is ready to use as-is.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    refit=True,
)
grid_search.fit(X_train_scaled, y_train)

# Already fitted on (X_train_scaled, y_train) by the refit step above; fitting
# it a second time would only repeat the same training run.
best_xgb = grid_search.best_estimator_

# Make predictions with the tuned model on the held-out rows
y_pred_xgb_tuned = best_xgb.predict(X_test_scaled)

# Calculate evaluation metrics on the held-out rows
mse_xgb_tuned = mean_squared_error(y_test, y_pred_xgb_tuned)
mae_xgb_tuned = mean_absolute_error(y_test, y_pred_xgb_tuned)
r2_xgb_tuned = r2_score(y_test, y_pred_xgb_tuned)

# Print the tuned model performance
print(f'Tuned XGBoost - Mean Squared Error: {mse_xgb_tuned}')
print(f'Tuned XGBoost - Mean Absolute Error: {mae_xgb_tuned}')
print(f'Tuned XGBoost - R^2 Score: {r2_xgb_tuned}')



Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
Tuned XGBoost - Mean Squared Error: 4750472531.364605
Tuned XGBoost - Mean Absolute Error: 38158.233733167144
Tuned XGBoost - R^2 Score: 0.6946681057590547


In [5]:
import pandas as pd
df = pd.read_csv('Complete_MICE_Imputed.csv')

# Select the phytoplankton column once and derive the three summary statistics
phyto_counts = df['Phytoplankton (cells/ml)']
mean = phyto_counts.mean()
highest = phyto_counts.max()
lowest = phyto_counts.min()

# Report in the order: mean, maximum, minimum
for label, value in (('Mean:', mean), ("Highest:", highest), ("Lowest:", lowest)):
    print(label, value)



Mean: 50138.43838028169
Highest: 1145724.0
Lowest: 10.0
