<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_Frequency/blob/main/Code%20Sections/Approach2%20Iterations%20eps%20and%20minsample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Approach 2: Iterations needed to find `eps` and `min_samples`**

In [43]:
import os
import pandas as pd
from google.colab import files
from IPython.display import display, HTML

# Step sizes (eps, min_samples) used by compute_values for the coarse and
# intermediate result files.
EPS_JUMP_COARSE, MINSAMPLE_JUMP_COARSE = 0.5, 2
EPS_JUMP_INTERMEDIATE, MINSAMPLE_JUMP_INTERMEDIATE = 0.1, 1

# Columns of the Top-10 result files that are checked for NaN / empty values.
required_columns = [
    "best_eps",
    "best_min_samples",
    "best_silhouette_score",
    "top10_eps",
    "top10_min_samples",
    "top10_eps_min",
    "top10_eps_max",
    "top10_min_samples_min",
    "top10_min_samples_max",
]

# Stage 1 (Coarse Tuning)
# Iterations per set = (eps steps) * (min_samples steps).
# NOTE(review): the literals look like eps 1..5 in steps of 1 and
# min_samples 5..15 in steps of 5 -- confirm against the Stage 1 search code.
eps_iter_stage1 = (5 - 1 + 1) / 1
min_iter_stage1 = (15 - 5 + 5) / 5
iter_per_set_stage1 = eps_iter_stage1 * min_iter_stage1

# Load whichever of the 30 possible coarse chunk files exist in the working
# directory and stack them into a single frame.
coarse_files = list(
    filter(
        os.path.exists,
        (f"DBSCAN_Coarse_Top10_chunk_{chunk}.csv" for chunk in range(1, 31)),
    )
)
df_coarse = pd.concat([pd.read_csv(path) for path in coarse_files], ignore_index=True)

# Stage 1 total = per-set iteration count times the number of distinct sets.
num_sets_coarse = df_coarse["set_number"].nunique()
stage1_total = iter_per_set_stage1 * num_sets_coarse

# Stage 2 (Intermediate Tuning)
def compute_values(row: pd.Series, eps_jump: float, minsample_jump: float) -> pd.Series:
    """Count the grid-search iterations needed to cover one row's top-10 range.

    The eps range (``top10_eps_max - top10_eps_min``) and the min_samples
    range (``top10_min_samples_max - top10_min_samples_min``) are each
    clamped to a minimum of 1, divided by their step size, and incremented
    by one to give the number of grid points per axis. The product of the
    two is the iteration count for the row.

    Args:
        row: One record of a Top-10 results frame (as passed by
            ``DataFrame.apply(..., axis=1)``).
        eps_jump: Step size used along the eps axis.
        minsample_jump: Step size used along the min_samples axis.

    Returns:
        A Series indexed by ``Diff_eps``, ``Diff_minsample``,
        ``Iteration_eps``, ``Iteration_minsamp`` and ``Iterations``.
        All five values are 0 when any of the four range columns is
        missing from the row or holds NaN.
    """
    labels = ["Diff_eps", "Diff_minsample", "Iteration_eps", "Iteration_minsamp", "Iterations"]
    range_columns = [
        "top10_eps_min", "top10_eps_max",
        "top10_min_samples_min", "top10_min_samples_max",
    ]

    # Guard on exactly the columns read below. reindex() turns an absent
    # column into NaN instead of raising KeyError, and checking "any"
    # prevents NaN from leaking into the results (max(nan, 1) is nan).
    bounds = row.reindex(range_columns)
    if bounds.isna().any():
        return pd.Series([0, 0, 0, 0, 0], index=labels)

    # A range narrower than 1 is widened to 1 so every row contributes
    # at least one full step on each axis.
    diff_eps = max(bounds["top10_eps_max"] - bounds["top10_eps_min"], 1)
    diff_minsample = max(bounds["top10_min_samples_max"] - bounds["top10_min_samples_min"], 1)

    # +1 because both end points of the range are grid points.
    iteration_eps = (diff_eps / eps_jump) + 1
    iteration_minsamp = (diff_minsample / minsample_jump) + 1
    iterations = iteration_eps * iteration_minsamp

    return pd.Series(
        [diff_eps, diff_minsample, iteration_eps, iteration_minsamp, iterations],
        index=labels,
    )

# Stage 2 (Intermediate Tuning): per-row iteration counts for the coarse results,
# computed with the coarse step sizes and appended as five new columns.
df_coarse[["Diff_eps", "Diff_minsample", "Iteration_eps", "Iteration_minsamp", "Iterations"]] = df_coarse.apply(
    compute_values, axis=1, args=(EPS_JUMP_COARSE, MINSAMPLE_JUMP_COARSE)
)
stage2_total = df_coarse["Iterations"].sum()

# Save the annotated coarse table and hand it to the browser as a download.
coarse_output_filename = "Approach2_Stage2.csv"
df_coarse.to_csv(coarse_output_filename, index=False)
files.download(coarse_output_filename)

# Stage 3 (Fine Tuning): same computation on the intermediate results,
# using the intermediate step sizes.
intermediate_files = list(
    filter(
        os.path.exists,
        (f"DBSCAN_Intermediate_Top10_chunk_{chunk}.csv" for chunk in range(1, 31)),
    )
)
df_intermediate = pd.concat([pd.read_csv(path) for path in intermediate_files], ignore_index=True)

# For Stage 3, only the four range columns are checked before computing.
required_columns_stage3 = ["top10_eps_max", "top10_eps_min", "top10_min_samples_max", "top10_min_samples_min"]

# Rows whose four range columns are all NaN get zeros directly; every other
# row is delegated to compute_values.
df_intermediate[["Diff_eps", "Diff_minsample", "Iteration_eps", "Iteration_minsamp", "Iterations"]] = \
    df_intermediate.apply(
        lambda record: pd.Series(
            [0, 0, 0, 0, 0],
            index=["Diff_eps", "Diff_minsample", "Iteration_eps", "Iteration_minsamp", "Iterations"],
        )
        if record[required_columns_stage3].isna().all()
        else compute_values(record, EPS_JUMP_INTERMEDIATE, MINSAMPLE_JUMP_INTERMEDIATE),
        axis=1,
    )
stage3_total = df_intermediate["Iterations"].sum()

# Save the annotated intermediate table and trigger its download.
intermediate_output_filename = "Approach2_Stage3.csv"
df_intermediate.to_csv(intermediate_output_filename, index=False)
files.download(intermediate_output_filename)

# Grand total across the three stages.
total_iterations = stage1_total + stage2_total + stage3_total

# Display the results in formatted HTML
explanation = f"""
<p style="color: black; font-size: 18px; font-weight: bold;">
    <u>Approach 2 Stage 1</u><br>
    Stage 1 total = <span style='color: blue;'>{int(stage1_total)}</span>
</p>
<p style="color: black; font-size: 18px; font-weight: bold;">
    <u>Approach 2 Stage 2</u><br>
    Stage 2 total = <span style='color: blue;'>{int(stage2_total)}</span>
</p>
<p style="color: black; font-size: 18px; font-weight: bold;">
    <u>Approach 2 Stage 3</u><br>
    Stage 3 total = <span style='color: blue;'>{int(stage3_total)}</span>
</p>
<p style="color: black; font-size: 18px; font-weight: bold;">
    <u>Approach 2 total iterations to find best eps and min sample:</u>
    <span style='color: blue;'>{int(total_iterations)}</span>
</p>
"""
display(HTML(explanation))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>