<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_Frequency/blob/main/Code%20Sections/5.6.1%20DBSCAN%20EPS%20%2B%20Min%20Sample%20Tuning%20Stage%201.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **STAGE 1 - Coarse Tuning - wide steps over eps and min_samples**

In [None]:
                                                                                  # Parallel Processing Setup: Process 5 chunks per run
start_chunk = 26                                                                  # First chunk handled by this run (1-indexed). Edit for each run.
num_chunks_to_process = 5                                                         # Number of consecutive chunks handled by this run.
if num_chunks_to_process < 1:
    raise ValueError("num_chunks_to_process must be at least 1")
end_chunk = start_chunk + num_chunks_to_process - 1                               # Inclusive, 1-indexed; clamped to total_chunks below.

feature_df = pd.read_csv("Feature_Combo_All_with_set_number.csv")                 # Load the previously saved feature combinations CSV.

import ast
import math

# Each 'features' cell holds the string repr of a list. ast.literal_eval parses
# Python literals only, so a tampered CSV cannot execute code the way eval() would.
all_combinations = feature_df['features'].apply(ast.literal_eval).tolist()

# Split the list of feature sets into consecutive chunks of chunk_size sets
# (the last chunk may be shorter).
chunk_size = 143
chunks = [all_combinations[i:i + chunk_size] for i in range(0, len(all_combinations), chunk_size)]
total_chunks = len(chunks)

if not 1 <= start_chunk <= total_chunks:
    raise ValueError("start_chunk must be between 1 and total number of chunks")
end_chunk = min(end_chunk, total_chunks)                                          # Never run past the last available chunk.

print(f"Processing chunks from {start_chunk} to {end_chunk} (each with {chunk_size} sets)")

# Load the dataset from GitHub and keep a 10% random sample (fixed seed, so every run draws the same rows).
url = "https://raw.githubusercontent.com/mohammadbadi/Clustering_Frequency/refs/heads/main/Output_CSV/FE_Encoded_New.csv"
data = pd.read_csv(url).sample(frac=0.1, random_state=42)

import numpy as np                                                                # STAGE 1 - Coarse Tuning for Selected Chunks
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from google.colab import files

eps_values_coarse = np.array([1, 2, 3, 4, 5])                                     # Coarse eps grid (features are standardized before clustering).
min_samples_values_coarse = [5, 10, 15]                                           # Coarse min_samples grid.

# Iterate plain Python scalars instead of NumPy scalars: under NumPy >= 2 the repr
# of a NumPy scalar is e.g. "np.int64(2)", so str(top10_eps) would write
# "[np.int64(2), ...]" into the CSV instead of "[2, ...]".
_eps_grid = np.asarray(eps_values_coarse).tolist()

# Tuning columns of a result row, in the order they appear in the output CSV.
_TUNING_COLUMNS = (
    "best_eps",
    "best_min_samples",
    "best_silhouette_score",
    "top10_eps",
    "top10_min_samples",
    "top10_eps_min",
    "top10_eps_max",
    "top10_min_samples_min",
    "top10_min_samples_max",
)


def _empty_result(set_number, feature_set):
    """Build a result row whose tuning columns are all None.

    Used when a feature set cannot be evaluated (columns missing from the
    dataset) or when no (eps, min_samples) pair produced a scoreable clustering.

    Args:
        set_number: Global set number of the feature set.
        feature_set: List of feature names making up the set.

    Returns:
        dict with "set_number", "features" and every tuning column set to None.
    """
    row = {"set_number": set_number, "features": feature_set}
    row.update(dict.fromkeys(_TUNING_COLUMNS))
    return row


for chunk_index in range(start_chunk, end_chunk + 1):                             # Process each chunk in the specified range.
    chunk = chunks[chunk_index - 1]                                               # Convert the 1-indexed chunk number to a 0-indexed position.
    results = []
    for local_set_number, feature_set in enumerate(chunk, start=1):               # Process each set in the current chunk.
        # Position of the set across all chunks, assuming every preceding chunk is full.
        global_set_number = (chunk_index - 1) * chunk_size + local_set_number

        missing_features = [feat for feat in feature_set if feat not in data.columns]
        if missing_features:                                                      # Cannot cluster on absent columns: record an empty row.
            results.append(_empty_result(global_set_number, feature_set))
            print(f"Set {global_set_number} evaluated. Missing features: {missing_features}")
            continue

        # Standardize the selected columns so eps is expressed in z-score units.
        df_subset = data[feature_set]
        scaler = StandardScaler()
        df_subset = pd.DataFrame(scaler.fit_transform(df_subset), columns=df_subset.columns)

        score_list = []
        for eps in _eps_grid:
            for min_samples in min_samples_values_coarse:
                try:
                    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                    labels = dbscan.fit_predict(df_subset)
                    # Score only when there is more than one distinct label. The noise
                    # label (-1) counts as a label here, so a single cluster plus noise
                    # also passes. With more than one distinct label at least one is a
                    # non-noise label, so no separate non-noise check is needed.
                    if np.unique(labels).size > 1:
                        score = silhouette_score(df_subset, labels)
                        score_list.append((score, eps, min_samples))
                except Exception as exc:
                    # Best effort: skip this parameter pair, but say so instead of hiding the failure.
                    print(f"Set {global_set_number}: skipped eps={eps}, min_samples={min_samples} "
                          f"({type(exc).__name__}: {exc})")
                    continue

        if not score_list:
            results.append(_empty_result(global_set_number, feature_set))
            print(f"Set {global_set_number} evaluated. No valid clustering found.")
            continue

        # Rank by silhouette score (highest first) and summarize the ten best parameter pairs.
        score_list.sort(key=lambda x: x[0], reverse=True)
        top_10 = score_list[:10]
        top10_eps = [entry[1] for entry in top_10]
        top10_min_samples = [entry[2] for entry in top_10]
        eps_range = (min(top10_eps), max(top10_eps))
        min_samples_range = (min(top10_min_samples), max(top10_min_samples))
        best_score, best_eps, best_min_samples = top_10[0]

        results.append({
            "set_number": global_set_number,
            "features": feature_set,
            "best_eps": best_eps,
            "best_min_samples": best_min_samples,
            "best_silhouette_score": best_score,
            "top10_eps": str(top10_eps),
            "top10_min_samples": str(top10_min_samples),
            "top10_eps_min": eps_range[0],
            "top10_eps_max": eps_range[1],
            "top10_min_samples_min": min_samples_range[0],
            "top10_min_samples_max": min_samples_range[1],
        })
        print(f"Set {global_set_number} evaluated. Best: (eps: {best_eps}, min_samples: {best_min_samples}), Score: {best_score:.4f}")

    results_df = pd.DataFrame(results)                                            # Save results for the current chunk.
    output_filename = f"DBSCAN_Coarse_Top10_chunk_{chunk_index}.csv"
    results_df.to_csv(output_filename, index=False)
    print(f"Chunk {chunk_index} processed and saved to '{output_filename}'.")
    files.download(output_filename)                                               # Trigger a browser download of the CSV from Colab.


Processing chunks from 26 to 30 (each with 143 sets)
Set 3576 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3559
Set 3577 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.4278
Set 3578 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3864
Set 3579 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3861
Set 3580 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3614
Set 3581 evaluated. Best: (eps: 1, min_samples: 10), Score: 0.2094
Set 3582 evaluated. Best: (eps: 1, min_samples: 10), Score: 0.0189
Set 3583 evaluated. Best: (eps: 1, min_samples: 15), Score: 0.0256
Set 3584 evaluated. Best: (eps: 1, min_samples: 15), Score: 0.0233
Set 3585 evaluated. Best: (eps: 1, min_samples: 5), Score: 0.1400
Set 3586 evaluated. Best: (eps: 1, min_samples: 15), Score: -0.0169
Set 3587 evaluated. Best: (eps: 1, min_samples: 5), Score: -0.0925
Set 3588 evaluated. Best: (eps: 1, min_samples: 10), Score: -0.1536
Set 3589 evaluated. Best: (eps: 2, min_samples: 5), Score: 0.3803
Set 3590 evalu