<a href="https://colab.research.google.com/github/mohammadbadi/Clustering_Frequency/blob/main/Code%20Sections/5.5%20Creating%20Combinations%20of%20different%20features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Creating all possible combinations of different features**

In [None]:
import os
import warnings
import itertools
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
from google.colab import files  # For automatic file download in Colab

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# ----- Helper function to split a list into chunks of given size -----
def chunk_list(lst, chunk_size):
    """Return an iterator over successive chunk_size-sized chunks of lst.

    Args:
        lst: A sliceable sequence (anything supporting len() and slicing).
        chunk_size: Maximum number of items per chunk; must be positive.

    Returns:
        A generator producing slices of ``lst``. Every chunk holds
        ``chunk_size`` items except possibly the last, which holds the
        remainder. An empty ``lst`` produces no chunks.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # Fail loudly up front: a negative step would make range() empty (silently
    # dropping all data) and a zero step raises an unrelated-looking error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return (lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size))

# -------------------------------
# Generate Feature Combinations
# -------------------------------
# Features that are combined freely: every subset (including the empty one)
# of this list is used as the starting point of a combination.
base_features = ["OCC_YEAR", "OCC_DAY", "OCC_DOY", "OCC_MONTH_Num", "OCC_HOUR"]

# Each groupN_options list holds mutually exclusive alternatives: a combination
# takes at most one option (a list of one or more columns) from each group.
group1_options = [
    ["OCC_DOW_Num"],
    ["DOW_Weekend", "DOW_Begin", "DOW_Mid"]
]

group2_options = [
    ["LONG_LAT_PCA"],
    ["LONG_WGS84", "LAT_WGS84"]
]

group3_options = [
    ["Division_Freq"],
    ["HOOD_Freq"],
    ["DIV_HOOD_Hier"]
]

group4_options = [
    ["LOCATION_Freq"],
    ["PREMISES_Freq"],
    ["Loca_Premi_Freq"]
]

# Smallest number of features a combination must contain to be kept.
MIN_FEATURES = 4

# None means "take nothing from this group". The choice lists are built once
# here rather than being re-created on every pass of the loops below.
group_choices = [
    [None] + options
    for options in (group1_options, group2_options, group3_options, group4_options)
]

all_combinations = []
for r in range(len(base_features) + 1):
    for base_subset in itertools.combinations(base_features, r):
        # product() varies the last group fastest, which is the same order the
        # equivalent nested loops over group1..group4 would produce.
        for picks in itertools.product(*group_choices):
            current_set = list(base_subset)
            for option in picks:
                if option is not None:
                    current_set += option
            # Remove duplicates while preserving order
            current_set = list(dict.fromkeys(current_set))
            if len(current_set) >= MIN_FEATURES:
                all_combinations.append(current_set)

# Create a DataFrame with an additional set_number column: a 1-based id for
# each feature set, assigned in the order the combinations were generated.
feature_df = pd.DataFrame({
    "set_number": range(1, len(all_combinations)+1),
    "features": all_combinations
})
all_combos_filename = "Feature_Combo_All_with_set_number.csv"
# NOTE(review): each "features" cell holds a Python list, so the CSV stores its
# string form; a reader presumably has to parse it back into a list — confirm.
feature_df.to_csv(all_combos_filename, index=False)
print(f"Saved all feature combinations to '{all_combos_filename}'. Total sets: {len(all_combinations)}")

# Split the combinations into fixed-size chunks; the last chunk may be shorter.
chunk_size = 143
chunks = list(chunk_list(all_combinations, chunk_size))
total_chunks = len(chunks)
# Report the size from chunk_size itself so the message cannot drift out of
# sync with the value actually used for splitting.
print(f"Total number of chunks ({chunk_size} sets each): {total_chunks}")
# Colab-only: triggers a browser download of the CSV written above.
files.download(all_combos_filename)

Saved all feature combinations to 'Feature_Combo_All_with_set_number.csv'. Total sets: 4284
Total number of chunks (143 sets each): 30


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>