# In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from google.colab import files
import io

# Uploading the dataset (opens the Colab file-picker; the result maps each
# uploaded file name to its raw bytes)
uploaded = files.upload()

# Fail with a clear message instead of an opaque IndexError when the mapping
# is empty (e.g. the upload dialog was dismissed without choosing a file).
if not uploaded:
    raise ValueError("No file was uploaded; please select a CSV file.")

# Read the uploaded CSV file (only the first uploaded file is used)
file_name = next(iter(uploaded))
print(file_name)
pd_data = pd.read_csv(io.StringIO(uploaded[file_name].decode('utf-8')))
print(pd_data.head())  # Displaying the first few rows of the dataset

# Separating numerical and categorical features.
# The last column of pd_data is the class label that the per-cluster models
# are trained to predict (it is read back with pd_data.iloc[:, -1]), so it is
# kept out of the feature matrix. Including it would leak the label into both
# the clustering and the classifiers.
feature_frame = pd_data.iloc[:, :-1]
numeric_columns = feature_frame.select_dtypes(include=['number']).columns
categorical_columns = feature_frame.select_dtypes(include=['object']).columns

if len(numeric_columns) == 0 and len(categorical_columns) == 0:
    raise ValueError("No numerical or categorical feature columns found.")

# Handling missing values for numerical columns, then normalizing them for
# clustering. The imputer and scaler reject zero-width input, so they are only
# fitted when at least one numerical feature exists.
numerical_imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
if len(numeric_columns) > 0:
    numerical_data = feature_frame[numeric_columns].to_numpy()
    numerical_data = numerical_imputer.fit_transform(numerical_data)
    numerical_data_normalized = scaler.fit_transform(numerical_data)
else:
    numerical_data = np.empty((len(pd_data), 0))
    numerical_data_normalized = numerical_data

feature_parts = [numerical_data_normalized]

# Handling missing values for categorical columns (if any) and one-hot
# encoding them
if len(categorical_columns) > 0:
    categorical_data = feature_frame[categorical_columns].to_numpy()
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    categorical_data = categorical_imputer.fit_transform(categorical_data)

    encoder = OneHotEncoder(handle_unknown='ignore')
    categorical_data_encoded = encoder.fit_transform(categorical_data)
    feature_parts.append(categorical_data_encoded.toarray())
else:
    print("No categorical columns found.")

# Stack the parts side by side first so the resulting frame gets unique
# positional column labels (concatenating two default-indexed frames would
# repeat the labels 0, 1, ...). Rows stay in the same order as pd_data.
data_combined = pd.DataFrame(np.hstack(feature_parts))

# Specify multiple values of num_clusters
num_clusters_values = [1, 3, 5, 7, 9]

# Class labels to predict, aligned row-for-row with data_combined
target = pd_data.iloc[:, -1]  # Replace with target column index or name

# Row whose class is predicted by the forest of the cluster it falls into
test_instance_idx = 0  # Choose a specific test instance index

# Loop over each value of num_clusters
for num_clusters in num_clusters_values:
    # K-means cannot form more clusters than there are rows
    if num_clusters > len(data_combined):
        print(f"Skipping k={num_clusters}: only {len(data_combined)} samples available.")
        continue

    # Step 1: Clustering the Data (K-means)
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(data_combined)

    # Step 2: Train Random Forest for Each Cluster
    random_forest_per_cluster = {}  # cluster id -> fitted Random Forest
    validation_per_cluster = {}  # cluster id -> (held-out features, held-out labels)

    for cluster_id in range(num_clusters):
        in_cluster = cluster_labels == cluster_id
        cluster_data = data_combined[in_cluster]
        cluster_target = target[in_cluster]

        # A classifier needs at least two classes; single-class (or empty)
        # clusters get no model.
        if cluster_target.nunique() <= 1:
            continue

        # Splitting data for training and validation
        train_data, valid_data, train_target, valid_target = train_test_split(
            cluster_data, cluster_target, test_size=0.2, random_state=42
        )

        # Train a Random Forest classifier on each cluster
        random_forest = RandomForestClassifier(random_state=42)  #  adjust parameters
        random_forest.fit(train_data, train_target)

        # Keep the model together with its held-out rows so the evaluation
        # below is not performed on data the model was trained on.
        random_forest_per_cluster[cluster_id] = random_forest
        validation_per_cluster[cluster_id] = (valid_data, valid_target)

    # Step 3: Ensemble Vote for a Test Instance
    # Double brackets keep the instance as a one-row frame (2-D input).
    test_instance = data_combined.iloc[[test_instance_idx]]
    test_cluster_id = kmeans.predict(test_instance)[0]

    if test_cluster_id not in random_forest_per_cluster:
        print(f"No votes in the ensemble for k={num_clusters}.")
        continue

    random_forest = random_forest_per_cluster[test_cluster_id]

    # The vote is the forest's prediction for the selected instance itself.
    # NOTE: the instance is drawn from the same data set, so it may have been
    # part of this forest's training split; the metrics below use held-out rows.
    most_popular_class = random_forest.predict(test_instance)[0]
    print(f"The ensemble vote for the test instance (k={num_clusters}) is: {most_popular_class}")

    # Calculate Accuracy, Precision, and Recall on the validation split of the
    # test instance's cluster, using per-row predictions.
    valid_data, valid_target = validation_per_cluster[test_cluster_id]
    valid_predictions = random_forest.predict(valid_data)

    accuracy = accuracy_score(valid_target, valid_predictions)
    print(f"Accuracy of the ensemble vote (k={num_clusters}): {accuracy:.4f}")

    # Calculate Precision and Recall using weighted average for multiclass
    precision = precision_score(valid_target, valid_predictions, average='weighted', zero_division=0)
    recall = recall_score(valid_target, valid_predictions, average='weighted', zero_division=0)

    print(f"Precision of the ensemble vote (k={num_clusters}): {precision:.4f}")
    print(f"Recall of the ensemble vote (k={num_clusters}): {recall:.4f}")
