<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_FE_MCA/blob/main/Code%20Sections/5.6.1%20DBSCAN%20EPS%20%2B%20Min%20Sample%20Tuning%20Stage%201.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.6.1 DBSCAN EPS + Min Sample Tuning: Stage 1 - Coarse Tuning**
### **Wider step sizes for eps and min_samples - APPROACH 3**

In [None]:
# Setup for one parallel run: pick the range of chunks this run handles, load the
# saved feature combinations, split them into fixed-size chunks, and load a 10%
# sample of the encoded dataset.
import ast                                                                        # Literal-only parsing of the stored feature lists
import pandas as pd                                                               # Parallel Processing Setup
start_chunk = 16                                                                  # Set these variables for each run. Edit as needed (1-indexed)
num_chunks_to_process = 5                                                         # Edit value to change number of chunks to be processed at 1 go
end_chunk = start_chunk + num_chunks_to_process - 1                               # Inclusive, 1-indexed

feature_df = pd.read_csv("Feature_Combo_All_with_set_number.csv")                 # Load the previously saved feature combinations CSV.

import math                                                                       # Split the list of feature sets into chunks of `chunk_size` sets.
# The 'features' column stores each feature set as the string repr of a list.
# ast.literal_eval only accepts Python literals, so (unlike eval) the contents
# of the CSV can never execute arbitrary code.
all_combinations = feature_df['features'].apply(ast.literal_eval).tolist()        # Convert string repr back to list
chunk_size = 143
chunks = [all_combinations[i:i + chunk_size] for i in range(0, len(all_combinations), chunk_size)]
total_chunks = len(chunks)

if not 1 <= start_chunk <= total_chunks:
    raise ValueError("start_chunk must be between 1 and total number of chunks")
end_chunk = min(end_chunk, total_chunks)                                          # Clamp so the last run may cover fewer chunks than requested

print(f"Processing chunks from {start_chunk} to {end_chunk} (each with {chunk_size} sets)")

url = "https://raw.githubusercontent.com/mohammadbadi/Clustering_FE_MCA/refs/heads/main/Output_CSV/FE_Encoded_New.csv" # URL of the Dataset
data = pd.read_csv(url)                                                           # Load the Dataset
data = data.sample(frac=0.1, random_state=42)                                     # 10% Sample from Dataset (fixed seed keeps runs reproducible)

import numpy as np                                                                # STAGE 1 - Coarse Tuning for Selected Chunks
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from google.colab import files

# STAGE 1 - Coarse tuning: for every feature set in the selected chunks, grid-search
# DBSCAN over a coarse (eps, min_samples) grid, rank the combinations by silhouette
# score, and save one CSV of per-set results per chunk.
eps_values_coarse = np.array([1, 2, 3, 4, 5])
min_samples_values_coarse = [5, 10, 15]


def _empty_result_row(set_number, feature_set):
    """Return a result row whose tuning columns are all None.

    Used when a feature set cannot be evaluated (missing columns) or when no
    (eps, min_samples) combination produced a scorable clustering.
    """
    return {
        "set_number": set_number,
        "features": feature_set,
        "best_eps": None,
        "best_min_samples": None,
        "best_silhouette_score": None,
        "top10_eps": None,
        "top10_min_samples": None,
        "top10_eps_min": None,
        "top10_eps_max": None,
        "top10_min_samples_min": None,
        "top10_min_samples_max": None
    }


for chunk_index in range(start_chunk, end_chunk + 1):                             # Process each chunk in the specified range.
    chunk = chunks[chunk_index - 1]                                               # converting 1-indexed to 0-indexed
    results = []
    for local_set_number, feature_set in enumerate(chunk, start=1):               # Process each set in the current chunk.
        global_set_number = (chunk_index - 1) * chunk_size + local_set_number     # Compute global set_number for clarity.

        missing_features = [feat for feat in feature_set if feat not in data.columns]
        if missing_features:                                                      # If features are missing, record a row with Nones.
            results.append(_empty_result_row(global_set_number, feature_set))
            print(f"Set {global_set_number} evaluated. Missing features: {missing_features}")
            continue

        df_subset = data[feature_set]
        scaler = StandardScaler()                                                 # Standardize the features before the distance-based clustering
        df_subset = pd.DataFrame(scaler.fit_transform(df_subset), columns=df_subset.columns)

        score_list = []
        # Iterate over plain Python numbers rather than NumPy scalars: under NumPy >= 2
        # str([np.int64(1)]) renders as "[np.int64(1)]", which would corrupt the
        # stringified "top10_eps" column written to the CSV.
        for eps in eps_values_coarse.tolist():
            for min_samples in min_samples_values_coarse:
                try:
                    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(df_subset)
                    # Only score results with more than one distinct label and at least one non-noise point.
                    # NOTE(review): this also admits "one cluster + noise" (labels {-1, 0}), in which case
                    # silhouette_score treats the noise label as a cluster of its own - confirm this is intended.
                    if np.unique(labels).size > 1 and (labels != -1).any():
                        score = silhouette_score(df_subset, labels)
                        score_list.append((score, eps, min_samples))
                except Exception as exc:
                    # Best effort: one failing combination must not abort the run, but report it
                    # so failures are not silently mistaken for "no valid clustering".
                    print(f"Set {global_set_number}: skipped (eps: {eps}, min_samples: {min_samples}) - {type(exc).__name__}: {exc}")
                    continue

        if not score_list:
            results.append(_empty_result_row(global_set_number, feature_set))
            print(f"Set {global_set_number} evaluated. No valid clustering found.")
            continue

        score_list.sort(key=lambda x: x[0], reverse=True)                         # Highest silhouette score first
        top_10 = score_list[:10]
        top10_eps = [entry[1] for entry in top_10]
        top10_min_samples = [entry[2] for entry in top_10]
        best_score, best_eps, best_min_samples = top_10[0]

        results.append({
            "set_number": global_set_number,
            "features": feature_set,
            "best_eps": best_eps,
            "best_min_samples": best_min_samples,
            "best_silhouette_score": best_score,
            "top10_eps": str(top10_eps),
            "top10_min_samples": str(top10_min_samples),
            "top10_eps_min": min(top10_eps),
            "top10_eps_max": max(top10_eps),
            "top10_min_samples_min": min(top10_min_samples),
            "top10_min_samples_max": max(top10_min_samples)
        })
        print(f"Set {global_set_number} evaluated. Best: (eps: {best_eps}, min_samples: {best_min_samples}), Score: {best_score:.4f}")

    results_df = pd.DataFrame(results)                                            # Save results for the current chunk.
    output_filename = f"DBSCAN_Coarse_Top10_chunk_{chunk_index}.csv"
    results_df.to_csv(output_filename, index=False)
    print(f"Chunk {chunk_index} processed and saved to '{output_filename}'.")
    files.download(output_filename)                                               # Trigger a browser download of the chunk's CSV (Colab only)


Processing chunks from 16 to 20 (each with 143 sets)
Set 2146 evaluated. Best: (eps: 1, min_samples: 10), Score: 0.1655
Set 2147 evaluated. Best: (eps: 1, min_samples: 15), Score: -0.0360
Set 2148 evaluated. Best: (eps: 1, min_samples: 10), Score: -0.0129
Set 2149 evaluated. Best: (eps: 1, min_samples: 5), Score: 0.0016
Set 2150 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3792
Set 2151 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3461
Set 2152 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3461
Set 2153 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3199
Set 2154 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3842
Set 2155 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3508
Set 2156 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3507
Set 2157 evaluated. Best: (eps: 3, min_samples: 5), Score: 0.3531
Set 2158 evaluated. Best: (eps: 1, min_samples: 5), Score: 0.1515
Set 2159 evaluated. Best: (eps: 1, min_samples: 5), Score: -0.1079
Set 2160 evaluate

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Set 2289 evaluated. Best: (eps: 1, min_samples: 5), Score: 0.1086
Set 2290 evaluated. Best: (eps: 1, min_samples: 15), Score: 0.1525
Set 2291 evaluated. Best: (eps: 1, min_samples: 15), Score: 0.0649
Set 2292 evaluated. Best: (eps: 1, min_samples: 5), Score: 0.0615
Set 2293 evaluated. Best: (eps: 1, min_samples: 5), Score: 0.0713
Set 2294 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3865
Set 2295 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3517
Set 2296 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3516
Set 2297 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3540
Set 2298 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3916
Set 2299 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3564
Set 2300 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3562
Set 2301 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3587
Set 2302 evaluated. Best: (eps: 1, min_samples: 5), Score: 0.1771
Set 2303 evaluated. Best: (eps: 1, min_samples: 5), Score: 0.0472
Set 2304

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Set 2432 evaluated. Best: (eps: 1, min_samples: 5), Score: -0.0788
Set 2433 evaluated. Best: (eps: 1, min_samples: 15), Score: -0.0716
Set 2434 evaluated. Best: (eps: 1, min_samples: 10), Score: 0.0270
Set 2435 evaluated. Best: (eps: 1, min_samples: 15), Score: -0.1677
Set 2436 evaluated. Best: (eps: 1, min_samples: 5), Score: -0.1838
Set 2437 evaluated. Best: (eps: 1, min_samples: 15), Score: -0.2208
Set 2438 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3483
Set 2439 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3201
Set 2440 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3201
Set 2441 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.2949
Set 2442 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3531
Set 2443 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3245
Set 2444 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3245
Set 2445 evaluated. Best: (eps: 3, min_samples: 5), Score: 0.3268
Set 2446 evaluated. Best: (eps: 1, min_samples: 5), Score: -0.0540


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Set 2575 evaluated. Best: (eps: 1, min_samples: 10), Score: -0.1146
Set 2576 evaluated. Best: (eps: 1, min_samples: 10), Score: -0.0753
Set 2577 evaluated. Best: (eps: 1, min_samples: 5), Score: -0.1216
Set 2578 evaluated. Best: (eps: 1, min_samples: 5), Score: -0.0886
Set 2579 evaluated. Best: (eps: 2, min_samples: 15), Score: 0.1458
Set 2580 evaluated. Best: (eps: 2, min_samples: 10), Score: 0.1938
Set 2581 evaluated. Best: (eps: 2, min_samples: 10), Score: 0.1867
Set 2582 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3421
Set 2583 evaluated. Best: (eps: 2, min_samples: 10), Score: 0.2926
Set 2584 evaluated. Best: (eps: 2, min_samples: 10), Score: 0.2937
Set 2585 evaluated. Best: (eps: 2, min_samples: 10), Score: 0.2848
Set 2586 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3468
Set 2587 evaluated. Best: (eps: 3, min_samples: 5), Score: 0.3197
Set 2588 evaluated. Best: (eps: 3, min_samples: 5), Score: 0.3198
Set 2589 evaluated. Best: (eps: 3, min_samples: 5), Score: 0.322

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Set 2718 evaluated. Best: (eps: 1, min_samples: 15), Score: 0.1186
Set 2719 evaluated. Best: (eps: 1, min_samples: 15), Score: -0.1558
Set 2720 evaluated. Best: (eps: 1, min_samples: 10), Score: -0.1657
Set 2721 evaluated. Best: (eps: 1, min_samples: 10), Score: -0.1560
Set 2722 evaluated. Best: (eps: 1, min_samples: 5), Score: -0.2386
Set 2723 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.1734
Set 2724 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.1720
Set 2725 evaluated. Best: (eps: 2, min_samples: 10), Score: 0.1625
Set 2726 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3160
Set 2727 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.2691
Set 2728 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.2694
Set 2729 evaluated. Best: (eps: 2, min_samples: 15), Score: 0.2681
Set 2730 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3205
Set 2731 evaluated. Best: (eps: 3, min_samples: 5), Score: 0.2971
Set 2732 evaluated. Best: (eps: 3, min_samples: 5), Score: 0.2972


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>