<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_FE_MCA/blob/main/Code%20Sections/5.6.2%20DBSCAN%20EPS%20%2B%20Min%20Sample%20Tuning%20Stage%202.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **5.6.2 DBSCAN EPS + Min Sample Tuning: Stage 2 - Intermediate Tuning**
### **Narrower steps of eps and min_samples around the best values from Stage 1 - APPROACH_1**

In [None]:
import ast
import warnings

import numpy as np
import pandas as pd
from google.colab import files
from IPython.display import display, HTML
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Suppress deprecation / future-warning noise so the tuning log stays readable.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the encoded dataset and keep a reproducible 10% sample of it.
# Done as one expression so re-running the cell never re-samples an already-sampled frame.
url = "https://raw.githubusercontent.com/mohammadbadi/Clustering_FE_MCA/refs/heads/main/Output_CSV/FE_Encoded_New.csv" # URL of the Dataset
data = pd.read_csv(url).sample(frac=0.1, random_state=42)

# Inclusive range of coarse-tuning chunks to process in this run.
start_chunk = 20
end_chunk = 20

# One coarse-tuning CSV is expected per chunk number in the range.
expected_filenames = [f"DBSCAN_Coarse_Top10_chunk_{i}.csv" for i in range(start_chunk, end_chunk + 1)]

# Show the user which files to pick; the actual file chooser is rendered by files.upload() below.
file_items = "".join(f"<li><strong>{fname}</strong></li>" for fname in expected_filenames)
html_output = f"""
<p>Please upload the following coarse tuning CSV files:</p>
<ul>
{file_items}
</ul>
"""
display(HTML(html_output))

# files.upload() returns a mapping keyed by uploaded file name; use it to verify completeness.
uploaded_files = files.upload()
missing_files = [fname for fname in expected_filenames if fname not in uploaded_files]
if missing_files:
    print(f"Error: The following expected files were not uploaded: {missing_files}")
else:
    print("All files successfully uploaded. Processing...")

# Shape of the refined search window centred on each coarse optimum.
EPS_HALF_WIDTH = 0.5
EPS_STEP = 0.1
MIN_SAMPLES_HALF_WIDTH = 2

# Column order of the per-chunk result file (also used when a chunk yields no rows).
RESULT_COLUMNS = [
    "set_number",
    "features",
    "best_eps",
    "best_min_samples",
    "best_silhouette_score",
    "top10_eps",
    "top10_min_samples",
    "top10_eps_min",
    "top10_eps_max",
    "top10_min_samples_min",
    "top10_min_samples_max",
]


def _build_eps_grid(center):
    """Return eps candidates within +/- EPS_HALF_WIDTH of ``center`` in EPS_STEP increments.

    ``np.linspace`` with rounding is used instead of ``np.arange`` with a float step so the
    number of points is fixed and values come out as clean decimals (1.5 rather than
    1.4999999999999998). Non-positive candidates are dropped because DBSCAN rejects them.
    Values are returned as plain Python floats.
    """
    n_points = int(round(2 * EPS_HALF_WIDTH / EPS_STEP)) + 1
    grid = np.round(np.linspace(center - EPS_HALF_WIDTH, center + EPS_HALF_WIDTH, n_points), 6)
    return [float(eps) for eps in grid if eps > 0]


def _build_min_samples_grid(center):
    """Return integer min_samples candidates within +/- MIN_SAMPLES_HALF_WIDTH of ``center``.

    Candidates below 1 are dropped because they are not valid DBSCAN settings.
    """
    center = int(center)
    low = center - MIN_SAMPLES_HALF_WIDTH
    high = center + MIN_SAMPLES_HALF_WIDTH
    return [m for m in range(low, high + 1) if m >= 1]


def _score_grid(features_scaled, eps_grid, min_samples_grid, set_number):
    """Run DBSCAN for every (eps, min_samples) pair and score the resulting labelling.

    Returns a list of ``(silhouette, eps, min_samples)`` tuples sorted best-first.
    Combinations producing a single label are omitted. Combinations that raise are
    reported and skipped so one bad setting does not abort the whole search.
    """
    scored = []
    for eps in eps_grid:
        for min_samples in min_samples_grid:
            try:
                labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(features_scaled)
                # Silhouette needs at least two distinct labels. Two distinct labels also
                # guarantee at least one real cluster, since only one label can be noise.
                # NOTE(review): the noise label (-1) is scored as if it were a cluster.
                if np.unique(labels).size > 1:
                    score = float(silhouette_score(features_scaled, labels))
                    scored.append((score, eps, min_samples))
            except Exception as exc:
                print(f"Intermediate tuning: Set {set_number} eps={eps}, min_samples={min_samples} failed: {exc}")
    scored.sort(key=lambda entry: entry[0], reverse=True)
    return scored


for expected_filename in expected_filenames:
    # Chunks whose coarse results were not uploaded cannot be refined.
    if expected_filename not in uploaded_files:
        continue

    coarse_df = pd.read_csv(expected_filename)
    intermediate_results = []
    for _, row in coarse_df.iterrows():
        set_number = row["set_number"]

        # Without a coarse optimum there is nothing to centre the refined grid on.
        if pd.isnull(row["best_eps"]) or pd.isnull(row["best_min_samples"]):
            print(f"Intermediate tuning: Set {set_number} skipped. No coarse optimum available.")
            continue

        # The features cell is presumably the string form of a Python list (this stage writes
        # it that way); literal_eval parses it without executing arbitrary code from the CSV.
        feature_set = list(ast.literal_eval(row["features"]))

        missing_features = [feat for feat in feature_set if feat not in data.columns]
        if missing_features:
            print(f"Intermediate tuning: Set {set_number} skipped. Missing features: {missing_features}")
            continue

        eps_grid = _build_eps_grid(row["best_eps"])
        min_samples_grid = _build_min_samples_grid(row["best_min_samples"])

        # Standardise the selected columns before clustering.
        df_subset = data[feature_set]
        df_subset = pd.DataFrame(StandardScaler().fit_transform(df_subset), columns=df_subset.columns)

        score_list = _score_grid(df_subset, eps_grid, min_samples_grid, set_number)

        # Start from an all-empty record so both outcomes share one schema.
        result = dict.fromkeys(RESULT_COLUMNS)
        result["set_number"] = set_number
        result["features"] = feature_set

        if score_list:
            top_10 = score_list[:10]
            top10_eps = [entry[1] for entry in top_10]
            top10_min_samples = [entry[2] for entry in top_10]
            best_score, best_eps, best_min_samples = top_10[0]
            result.update({
                "best_eps": best_eps,
                "best_min_samples": best_min_samples,
                "best_silhouette_score": best_score,
                # Plain Python numbers keep these stringified lists free of NumPy scalar reprs.
                "top10_eps": str(top10_eps),
                "top10_min_samples": str(top10_min_samples),
                "top10_eps_min": min(top10_eps),
                "top10_eps_max": max(top10_eps),
                "top10_min_samples_min": min(top10_min_samples),
                "top10_min_samples_max": max(top10_min_samples),
            })
            print(f"Intermediate tuning: Set {set_number} processed. Best: (eps: {best_eps}, min_samples: {best_min_samples}), Score: {best_score:.4f}")
        else:
            print(f"Intermediate tuning: Set {set_number} processed. No valid clustering found.")

        intermediate_results.append(result)

    # Fixing the columns keeps the CSV header present even when no set produced a row.
    intermediate_df = pd.DataFrame(intermediate_results, columns=RESULT_COLUMNS)
    output_filename = expected_filename.replace("Coarse_Top10", "Intermediate_Top10")
    intermediate_df.to_csv(output_filename, index=False)
    print(f"\nIntermediate tuning for '{expected_filename}' complete. Results saved to '{output_filename}'.")
    files.download(output_filename)

Saving DBSCAN_Coarse_Top10_chunk_20.csv to DBSCAN_Coarse_Top10_chunk_20.csv
All files successfully uploaded. Processing...
Intermediate tuning: Set 2718 processed. Best: (eps: 1.4999999999999998, min_samples: 13), Score: 0.1492
Intermediate tuning: Set 2719 processed. Best: (eps: 1.4999999999999998, min_samples: 13), Score: 0.1164
Intermediate tuning: Set 2720 processed. Best: (eps: 1.4999999999999998, min_samples: 11), Score: 0.1197
Intermediate tuning: Set 2721 processed. Best: (eps: 1.4999999999999998, min_samples: 11), Score: 0.1268
Intermediate tuning: Set 2722 processed. Best: (eps: 1.4999999999999998, min_samples: 7), Score: 0.1150
Intermediate tuning: Set 2723 processed. Best: (eps: 2.0000000000000004, min_samples: 3), Score: 0.1734
Intermediate tuning: Set 2724 processed. Best: (eps: 2.1000000000000005, min_samples: 7), Score: 0.1996
Intermediate tuning: Set 2725 processed. Best: (eps: 2.1000000000000005, min_samples: 12), Score: 0.1974
Intermediate tuning: Set 2726 processed.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>