In [None]:
# @title
# Mount Google Drive inside the Colab runtime at /content/drive.
# This import only resolves when the notebook runs on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Show the current working directory.
# An explicit call is used instead of a bare `pwd`, which only works through
# IPython's automagic and stops working as soon as the cell has more than one line.
import os
os.getcwd()

In [None]:
# Make sure geopandas is importable; on Colab install it on the fly when missing.
try:
    import geopandas as gpd
except ModuleNotFoundError:
    if 'google.colab' in str(get_ipython()):
        !pip install geopandas --quiet
        # Import again after installing, otherwise `gpd` stays undefined for the
        # rest of this run even though the package is now available.
        import geopandas as gpd
    else:
        print('geopandas not found, please install via conda in your environment')

In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [None]:
# Identifiers that are embedded in the artifact file names below.
MODEL_VERSION = "v2"
INFERENCE_METRICS = "inference"

# File Paths — everything lives under one project folder on the mounted Drive.
_PROJECT_DIR = "/content/drive/MyDrive/IPAUA_Maz"
_DATASET_DIR = _PROJECT_DIR + "/dataset"
_MODELS_DIR = _PROJECT_DIR + "/models"

# Input table to run inference on.
DATASET_CSV = _DATASET_DIR + "/synthetic_data_zones_4_and_9.csv"
# Serialized K-means model for the selected version.
MODEL_FILE = _MODELS_DIR + "/kmeans_model_" + MODEL_VERSION + ".pkl"
# Where the computed clustering metrics are written.
METRICS = _MODELS_DIR + "/kmeans_" + INFERENCE_METRICS + "_metrics.csv"
# Where the input rows plus their cluster labels are written.
DATA_WITH_CLUSTERS = _DATASET_DIR + "/data_with_clusters.csv"
# Path of the serialized encoders file.
ENCODERS_FILE = _MODELS_DIR + "/encoders.pkl"

In [None]:
def preprocess_data(data, features, use_normalizer=False):
    """Turn the selected columns of ``data`` into one dense numeric feature matrix.

    Numeric columns are mean-imputed and standardized, then optionally passed
    through ``Normalizer``. Columns of dtype object/category are imputed with the
    most frequent value and one-hot encoded. The numeric part comes first in the
    result, followed by the one-hot columns. Columns of any other dtype are not
    picked up by either selection and therefore do not appear in the output.

    Every transformer is fitted on ``data`` itself inside this call.
    NOTE(review): when used for inference this re-fits the imputers, scaler and
    encoder on the new data rather than reusing ones fitted at training time —
    confirm this matches how the saved model was trained.

    Args:
        data: pandas DataFrame containing at least the columns in ``features``.
        features: list of column names to use.
        use_normalizer: if True, apply ``Normalizer`` to the scaled numeric part.

    Returns:
        2-D numpy array with one row per row of ``data``.

    Raises:
        ValueError: if none of ``features`` is numeric or object/category typed.
    """
    selected = data[features]
    numeric_features = selected.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = selected.select_dtypes(include=['object', 'category']).columns.tolist()

    # Each group is optional: fitting an imputer on a frame with zero columns
    # raises, so a group is only processed when it actually has columns.
    parts = []

    if numeric_features:
        # The identity progress_apply exists only to draw a per-column progress bar.
        tqdm.pandas(desc="Imputing missing values for numeric features")
        X_numeric = SimpleImputer(strategy="mean").fit_transform(
            data[numeric_features].progress_apply(lambda x: x)
        )
        X_numeric = StandardScaler().fit_transform(X_numeric)
        if use_normalizer:
            X_numeric = Normalizer().fit_transform(X_numeric)
        parts.append(X_numeric)

    if categorical_features:
        tqdm.pandas(desc="Imputing missing values for categorical features")
        X_categorical = SimpleImputer(strategy="most_frequent").fit_transform(
            data[categorical_features].progress_apply(lambda x: x)
        )
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        parts.append(encoder.fit_transform(X_categorical))

    if not parts:
        raise ValueError(
            "None of the requested features is numeric or categorical; nothing to preprocess."
        )

    # Numeric block first, then the one-hot block.
    return np.hstack(parts)


# Load the trained K-means model
def load_model(file_name):
    """Deserialize and return the object stored at ``file_name`` using joblib.

    joblib files are pickle-based, so only load files from a trusted source.
    """
    model = joblib.load(file_name)
    return model


# Load and preprocess new data
def load_and_preprocess_data(file_path, features, use_normalizer=False):
    """Read a CSV, keep only the ``features`` columns, and build the model input.

    Args:
        file_path: path of the CSV file to read.
        features: list of column names to keep and preprocess.
        use_normalizer: forwarded to ``preprocess_data``.

    Returns:
        Tuple ``(X_scaled, data)``: the matrix produced by ``preprocess_data`` and
        the DataFrame restricted to the feature columns (in CSV column order).
    """
    data = pd.read_csv(file_path)

    # Discard every column that was not requested.
    unused_columns = [col for col in data.columns if col not in features]
    data = data.drop(columns=unused_columns)

    # Forward use_normalizer so the caller's choice actually reaches preprocess_data.
    X_scaled = preprocess_data(data, features, use_normalizer=use_normalizer)

    return X_scaled, data


# Calculate performance metrics
def calculate_performance_metrics(X_scaled, clusters, kmeans):
    """Score a clustering of ``X_scaled`` and return the scores keyed by name.

    Args:
        X_scaled: feature matrix the labels refer to.
        clusters: cluster label for each row of ``X_scaled``.
        kmeans: accepted for interface compatibility; not used by the active metrics.

    Returns:
        dict with the keys 'CHI Score' and 'DBI Score', in that order.
    """
    # Inertia and Silhouette Score are intentionally not computed here.
    scorers = (
        ('CHI Score', calinski_harabasz_score),
        ('DBI Score', davies_bouldin_score),
    )
    return {label: scorer(X_scaled, clusters) for label, scorer in scorers}


# Perform inference and calculate metrics
def perform_inference(model_file, data_file, features):
    """Load a saved model, cluster the rows of a CSV file, and report metrics.

    Args:
        model_file: path of the serialized model, read with ``load_model``.
        data_file: path of the CSV file to cluster.
        features: list of column names used as model input.

    Returns:
        Tuple ``(X_scaled, data, clusters, metrics)``: the preprocessed matrix, the
        feature-only DataFrame, the predicted cluster labels, and the metrics dict.
    """
    model = load_model(model_file)
    feature_matrix, frame = load_and_preprocess_data(data_file, features)

    # Assign every row to one of the model's clusters.
    labels = model.predict(feature_matrix)

    # Compute the scores, then print them one per line.
    scores = calculate_performance_metrics(feature_matrix, labels, model)
    for name, score in tqdm(scores.items(), desc="Calculating performance metrics"):
        print(f"{name}: {score}")

    return feature_matrix, frame, labels, scores


# Function to visualize clusters
def visualize_clusters(X_scaled, clusters, title):
    """Scatter-plot the rows of ``X_scaled`` in 2-D PCA space, colored by cluster.

    Args:
        X_scaled: feature matrix to project.
        clusters: cluster label per row, used as the hue.
        title: figure title.
    """
    # Project onto the first two principal components (fitted on X_scaled itself).
    projection = PCA(n_components=2).fit_transform(X_scaled)
    first_component = projection[:, 0]
    second_component = projection[:, 1]

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=first_component, y=second_component, hue=clusters, palette="viridis")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.title(title)
    plt.legend(title="Cluster")
    plt.show()


# Function to visualize areas on map
def visualize_areas_on_map(best_areas, title):
    """Plot points at their Longitude/Latitude, colored by their Cluster value.

    Args:
        best_areas: table with "Longitude", "Latitude" and "Cluster" columns.
        title: figure title.
    """
    longitudes = best_areas["Longitude"]
    latitudes = best_areas["Latitude"]
    cluster_ids = best_areas["Cluster"]

    plt.figure(figsize=(10, 10))
    points = plt.scatter(longitudes, latitudes, c=cluster_ids, cmap="viridis", s=50)
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.title(title)
    # The colorbar maps marker colors back to cluster values.
    plt.colorbar(points, label="Cluster")
    plt.show()

In [None]:
# Columns fed to the model. Disabled candidates are kept as comments for reference.
features = [
    "soil_moisture",
    "NDBI",
    # "BU",  # Dropped since BU is already present in LULC Classes Column
    "Roughness",
    # "Slope",
    "NDVI",
    "LST",
    "UHI",
    "UTFVI",
    "NDWI",
    "SAVI",
    "lulc_classes",
    "LandUse",
    "GHI",
    # Air quality (disabled): "CH4", "CO", "HCHO", "NO2", "O3", "SO2"
    "Longitude",
    "Latitude",
]

# Run the whole pipeline: load the model, preprocess the CSV, predict, score.
X_scaled, data, clusters, metrics = perform_inference(MODEL_FILE, DATASET_CSV, features)

# Store the metrics as a two-column (Metric, Value) table.
metrics_df = pd.DataFrame(
    {"Metric": list(metrics.keys()), "Value": list(metrics.values())}
)
metrics_df.to_csv(METRICS, index=False)
print(f"Metrics Saved to: {METRICS}")

# Attach each row's cluster label, then write the labelled table to disk.
data["Cluster"] = clusters
data.to_csv(DATA_WITH_CLUSTERS, index=False)
print(f"Data with cluster assignments saved to: {DATA_WITH_CLUSTERS}")

In [None]:
# Snapshot of the column names currently present in `data`.
data_cols = list(data.columns)

In [None]:
# Compare the columns kept in `data` against the requested `features`.
# "Cluster" is added to `data` after inference and is not an input feature, so it
# is left out of the comparison (unless it was explicitly requested as a feature);
# otherwise the check would always report a mismatch.
derived_cols = {"Cluster"} - set(features)
compared_cols = [col for col in data_cols if col not in derived_cols]

# Check if the lengths match
lengths_match = len(compared_cols) == len(features)
print(f"Lengths match: {lengths_match}")

# Check if all items match (regardless of order)
items_match = set(compared_cols) == set(features)
print(f"Items match: {items_match}")

# Check for any differences
missing_in_original = set(features) - set(compared_cols)
missing_in_features = set(compared_cols) - set(features)

if not lengths_match or not items_match:
    print("Differences found:")
    if missing_in_original:
        print(f"Items in 'features' but not in 'data_cols': {missing_in_original}")
    if missing_in_features:
        print(f"Items in 'data_cols' but not in 'features': {missing_in_features}")
else:
    print("Both lists match perfectly.")

In [None]:
# Print a summary of `data` via DataFrame.info().
data.info()

In [None]:
# Display the preprocessed feature matrix returned by perform_inference.
X_scaled

In [None]:
# Visualize clusters: 2-D PCA projection of X_scaled, colored by predicted cluster.
visualize_clusters(X_scaled, clusters, "Clusters of Urban Agriculture Areas on New Data")

In [None]:
# Visualize areas on map: needs the "Longitude", "Latitude" and "Cluster" columns of `data`.
visualize_areas_on_map(data, "Clustered Areas for New Data")

In [None]:
# Display relevant data: the feature columns, in the order given by `features`.
# Indexing with `features` avoids keeping a second hand-written copy of the list
# that could drift out of sync with the one used for inference.
print(data[features])