# Training Model

## Importing packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
import implicit
from scipy.sparse import csr_matrix

## Loading datasets

In [3]:
ok=pd.read_csv('../raw_data/ok_clean.csv')

In [4]:
topics = pd.read_csv('../raw_data/text_and_topics_classified_encoded_all.csv', keep_default_na=False)

In [5]:
ok = pd.concat([ok, topics], axis=1)

In [96]:
#list(ok.columns)

## Defining X vector with features

In [97]:
num_cat_columns = ['age_scaled',
 'single',
 'female',
                   
 'orientation_bisexual',
 'orientation_gay',
 'orientation_straight',
                   
 'body_type_athletic',
 'body_type_average',
 'body_type_curvy',
 'body_type_fit',
 'body_type_not_disclosed',
 'body_type_other',
 'body_type_thin',
                   
 'diet_type_anything',
 'diet_type_not_disclosed',
 'diet_type_other',
 'diet_type_vegetarian',

 'drinks_not at all',
 'drinks_not_disclosed',
 'drinks_often',
 'drinks_rarely',
 'drinks_socially',

 'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed',
 'no_of_kids_one',
 'no_of_kids_zero',
                   
 'want_more_kids_maybe',
 'want_more_kids_no',
 'want_more_kids_not_disclosed',
 'want_more_kids_yes',
                   
 'has_dogs_no',
 'has_dogs_not_disclosed',
 'has_dogs_yes',
                   
 'like_dogs_no',
 'like_dogs_not_disclosed',
 'like_dogs_yes',
                   
 'has_cats_no',
 'has_cats_not_disclosed',
 'has_cats_yes',
                   
 'like_cats_no',
 'like_cats_not_disclosed',
 'like_cats_yes',
                   
 'religion_agnosticism',
 'religion_atheism',
 'religion_catholicism',
 'religion_christianity',
 'religion_judaism',
 'religion_not_disclosed',
 'religion_other',
                   
 'smokes_no',
 'smokes_not_disclosed',
 'smokes_yes',
]


text_columns = ['cluster_text']


X = ok[num_cat_columns + text_columns]

## Function to run several KNN Models

### K-Nearest Neighbors (KNN) Metrics Summary

| Metric                     | Description                                | Better When  |
|----------------------------|--------------------------------------------|-------------|
| **Sum of Squared Errors (SSE)**  | Measures overall spread                  | Lower       |
| **Mean Average Distance (MAD)**  | Avg. distance to k-nearest neighbors     | Lower       |
| **Maximum Distance (MaxD)**      | Largest distance found (outliers)        | Lower       |
| **Minimum Distance (MinD)**      | Smallest distance found (possible duplicates) | Higher*    |

**Interpretation**:  
- **Lower values** of *SSE, MAD, and MaxD* indicate tighter, more meaningful neighbor relationships.  
- **Higher MinD** values may indicate a lack of exact duplicates, but too high could suggest poor coverage.  


In [99]:
def run_unsupervised_knn(ok, X, k):
    """
    Runs unsupervised KNN to find the k nearest neighbors for all points in X,
    excluding the person itself. Uses Euclidean distance and adds the results
    directly to the input DataFrame `ok`. Also computes and prints the following
    evaluation metrics:

    Metrics computed:
        1️⃣ **Sum of Squared Errors (SSE)**:  
           - Measures how tightly the points are clustered.  
           - Formula: SSE = sum of squared distances to k-nearest neighbors.  
           - A lower SSE indicates more compact clusters.

        2️⃣ **Mean Average Distance (MAD)**:  
           - Computes the average neighbor distance per person.  
           - Formula: MAD = mean of all distances to k-nearest neighbors.  
           - Easier to interpret than SSE.

        3️⃣ **Maximum Distance (MaxD)**:  
           - Finds the farthest neighbor in the dataset.  
           - Helps detect **outliers** (large MaxD means someone has no close matches).  

        4️⃣ **Minimum Distance (MinD)**:  
           - Finds the closest neighbor distance in the dataset.  
           - Helps detect **duplicate or near-identical points** (very low MinD).  

    Parameters:
        ok (pd.DataFrame): The original DataFrame to which results will be added.
        X (pd.DataFrame or np.array): Feature matrix (already scaled).
        k (int): Number of neighbors to find (excluding the person itself).

    Returns:
        ok (pd.DataFrame): The original DataFrame with added columns for neighbor
                           indices and distances.
    """
    # Initialize the NearestNeighbors model with Euclidean distance
    knn = NearestNeighbors(n_neighbors=k + 1, algorithm='auto', metric='euclidean')
    knn.fit(X)

    # Find neighbors for all points in X
    distances, indices = knn.kneighbors(X)

    # Exclude the first neighbor (the person itself)
    indices = indices[:, 1:]  # Drop the first column (itself)
    distances = distances[:, 1:]  # Drop the first column (itself)

    # Compute metrics
    sse = np.sum(distances**2)  # Sum of Squared Errors (SSE)
    mad = np.mean(distances)  # Mean Average Distance (MAD)
    max_distance = np.max(distances)  # Maximum Distance (MaxD)
    min_distance = np.min(distances)  # Minimum Distance (MinD)

    # Print metrics
    print(f"📌 Sum of Squared Errors (SSE): {sse:.4f} → Measures overall spread")
    print(f"📌 Mean Average Distance (MAD): {mad:.4f} → Average distance to k-nearest neighbors")
    print(f"📌 Maximum Distance (MaxD): {max_distance:.4f} → Largest distance found (outlier detection)")
    print(f"📌 Minimum Distance (MinD): {min_distance:.4f} → Smallest distance found (possible duplicates)")

    # Add indices and distances to the original DataFrame `ok`
    for i in range(k):
        ok[f'neighbor_{i+1}_index'] = indices[:, i]
        ok[f'neighbor_{i+1}_distance'] = distances[:, i]

    return   # Keeps the function output exactly the same as before


### Baseline model: KNN 

In [131]:
%%time
run_unsupervised_knn(ok,X, 5)

📌 Sum of Squared Errors (SSE): 435436.8577 → Measures overall spread
📌 Mean Average Distance (MAD): 0.9998 → Average distance to k-nearest neighbors
📌 Maximum Distance (MaxD): 3.1361 → Largest distance found (outlier detection)
📌 Minimum Distance (MinD): 0.0000 → Smallest distance found (possible duplicates)
CPU times: user 20.4 s, sys: 164 ms, total: 20.5 s
Wall time: 2.79 s


## Function to run several Kmean Models

In [10]:
def cluster_and_find_similar(dataset, X, n_clusters=12, n_similar=10):
    """
    Runs K-Means clustering and finds the specified number of most similar individuals within each cluster.
    
    Parameters:
    - dataset: DataFrame, the dataset to use
    - X: DataFrame, the feature vector
    - n_clusters: int, number of clusters for K-Means (default=12)
    - n_similar: int, number of similar individuals to return per person (default=10)
    
    Returns:
    - Updated DataFrame with assigned clusters and specified number of most similar individuals per person
    """
    if X is None:
        raise ValueError("Feature vector X must be provided")
    
    # Fit K-Means model
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    dataset['cluster'] = kmeans.fit_predict(X)
    
    # Compute evaluation metrics
    silhouette_avg = silhouette_score(X, dataset['cluster'])
    inertia = kmeans.inertia_
    davies_bouldin = davies_bouldin_score(X, dataset['cluster'])
    calinski_harabasz = calinski_harabasz_score(X, dataset['cluster'])
    
    print(f"Silhouette Score: {silhouette_avg:.4f}")
    print(f"Inertia (SSE): {inertia:.4f}")
    print(f"Davies-Bouldin Score: {davies_bouldin:.4f}")
    print(f"Calinski-Harabasz Score: {calinski_harabasz:.4f}")
    
    # Create a dictionary to store the most similar people for each individual
    similar_people = {}
    avg_match_distances = []
    
    # Find the closest people within each cluster
    for cluster in range(n_clusters):
        cluster_indices = dataset[dataset['cluster'] == cluster].index
        cluster_features = X.loc[cluster_indices]
        distances = euclidean_distances(cluster_features)
        sorted_indices = np.argsort(distances, axis=1)[:, 1:n_similar+1]  # Exclude self (index 0)
        
        for i, index in enumerate(cluster_indices):
            similar_people[index] = cluster_indices[sorted_indices[i]].tolist()
            avg_match_distances.append(np.mean(distances[i][sorted_indices[i]]))
    
    # Compute overall Average Match Distance (AMD)
    overall_avg_match_distance = np.mean(avg_match_distances)
    print(f"Average Match Distance (AMD): {overall_avg_match_distance:.4f}")
    
    # Convert the dictionary into a DataFrame
    similar_df = pd.DataFrame.from_dict(similar_people, orient='index')
    similar_df.columns = [f'similar_{i+1}' for i in range(n_similar)]
    
    # Merge with the original dataset
    dataset = dataset.merge(similar_df, left_index=True, right_index=True)
    
    return 


## Testing better model for text dimensionality reduction

Let's re-run the section below with the final vector of X and see if we gat the same recomendation. We first ran with the vector num_cat_columns above. But we later did a detailed analysis to select the best features and we will re-run the analysis below.

In [None]:
num_cat_columns=['female', 'age_scaled', 'single','height_scaled','orientation_bisexual', 'orientation_gay',
     'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
     'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
     'education_status_not_disclosed', 'education_status_working',  'speaks_english', 'speaks_spanish', 
     'speaks_other''diet_type_vegetarian', 'has_dogs_yes', 'no_of_kids_more_than_one','no_of_kids_one',
     'text_length_scaled']

### Model 1: Text Kmeans Cluster

In [122]:
%%time
text_columns = ['cluster_text']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0711
Inertia (SSE): 308001.6869
Davies-Bouldin Score: 3.0502
Calinski-Harabasz Score: 3585.9392
Average Match Distance (AMD): 1.1903
CPU times: user 1min 20s, sys: 56.1 s, total: 2min 16s
Wall time: 47.8 s


### Model 2: Therapist

In [124]:
%%time
text_columns = [ 'therapist_Expressions of Happiness and Joy',
 'therapist_Managing and Increasing Energy Levels',
 'therapist_Other',
 'therapist_See and Understanding Conversations',
 'therapist_Time Up and Future Meetings']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0738
Inertia (SSE): 319298.1742
Davies-Bouldin Score: 2.8844
Calinski-Harabasz Score: 2297.6252
Average Match Distance (AMD): 1.1822
CPU times: user 2min 16s, sys: 10.3 s, total: 2min 26s
Wall time: 2min 11s


### Model 3: General

In [129]:
%%time
text_columns = ['general_Entertainment',
 'general_Home & Hobbies',
 'general_Literature',
 'general_Other',
 'general_Social Life']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0465
Inertia (SSE): 292765.0515
Davies-Bouldin Score: 3.0941
Calinski-Harabasz Score: 2159.6429
Average Match Distance (AMD): 1.0408
CPU times: user 1min 15s, sys: 1min, total: 2min 15s
Wall time: 36.9 s


### Model 4: 5 LDA Topics

In [113]:
%%time
text_columns = ['topic_0_from_five', 'topic_1_from_five', 
                'topic_2_from_five', 'topic_3_from_five', 'topic_4_from_five']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0536
Inertia (SSE): 320171.4776
Davies-Bouldin Score: 3.0366
Calinski-Harabasz Score: 1971.7674
Average Match Distance (AMD): 1.2594
CPU times: user 2min 12s, sys: 19.7 s, total: 2min 32s
Wall time: 1min 57s


### Model 5: 2 LDA Topics

In [117]:
%%time
text_columns = ['topic_0_from_two','topic_1_from_two']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0729
Inertia (SSE): 263936.2889
Davies-Bouldin Score: 3.0072
Calinski-Harabasz Score: 2408.2509
Average Match Distance (AMD): 0.9860
CPU times: user 2min 21s, sys: 20.2 s, total: 2min 41s
Wall time: 2min 21s


### Model 6: PCA SDV

In [118]:
%%time
text_columns = ['PCA_sdv_1','PCA_sdv_2']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0584
Inertia (SSE): 265009.0252
Davies-Bouldin Score: 2.8963
Calinski-Harabasz Score: 2318.5807
Average Match Distance (AMD): 0.9601
CPU times: user 2min 4s, sys: 24.5 s, total: 2min 29s
Wall time: 1min 35s


### Model 7: PCA Word2vec embeddings

In [119]:
%%time
text_columns = ['PCA_emb_1','PCA_emb_2']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0616
Inertia (SSE): 266141.2972
Davies-Bouldin Score: 2.8174
Calinski-Harabasz Score: 2304.3198
Average Match Distance (AMD): 0.9323
CPU times: user 1min 43s, sys: 33.7 s, total: 2min 16s
Wall time: 1min 20s


### Model 8: PCA TfidfVectorizer

In [121]:
%%time
text_columns = ['PCA_vect_1','PCA_vect_2']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0584
Inertia (SSE): 265009.3912
Davies-Bouldin Score: 2.8963
Calinski-Harabasz Score: 2318.5776
Average Match Distance (AMD): 0.9601
CPU times: user 1min 53s, sys: 35.9 s, total: 2min 29s
Wall time: 1min 26s


### Summary



| Metric                     | Description                         | Better When  |
|----------------------------|-------------------------------------|-------------|
| **Silhouette Score**       | Cluster separation quality         | Higher      |
| **Inertia (SSE)**          | Sum of squared distances to centers | Lower       |
| **Davies-Bouldin Score**   | Cluster similarity ratio           | Lower       |
| **Calinski-Harabasz Score** | Ratio of inter/intra-cluster variance | Higher      |
| **Average Match Distance (AMD)** | Avg. distance to closest matches | Lower       |

**Interpretation**:  
- **Higher values** of *Silhouette Score* and *Calinski-Harabasz Score* indicate better clustering.  
- **Lower values** of *Inertia*, *Davies-Bouldin Score*, and *AMD* indicate better cluster compactness and well-separated groups.

### Model Comparison Table

| Model                              | Silhouette ↑ | Inertia (SSE) ↓ | Davies-Bouldin ↓ | Calinski-Harabasz ↑ | AMD ↓  | Score (Lower is Better) |
|------------------------------------|-------------|----------------|----------------|------------------|------|---------------------|
| **Model 1: Text KMeans Cluster**  | 0.0711      | 308001.6869    | 3.0502         | 3585.9392       | 1.1903 | 4 |
| **Model 2: Therapist**            | 0.0738      | 319298.1742    | 2.8844         | 2297.6252       | 1.1822 | 5 |
| **Model 3: General**              | 0.0465      | 292765.0515    | 3.0941         | 2159.6429       | 1.0408 | 6 |
| **Model 4: 5 LDA Topics**         | 0.0536      | 320171.4776    | 3.0366         | 1971.7674       | 1.2594 | 7 |
| **Model 5: 2 LDA Topics**         | 0.0729      | 263936.2889    | 3.0072         | 2408.2509       | 0.9860 | **1** 🏆 |
| **Model 6: PCA SDV**              | 0.0584      | 265009.0252    | 2.8963         | 2318.5807       | 0.9601 | 3 |
| **Model 7: PCA Word2Vec**         | 0.0616      | 266141.2972    | 2.8174         | 2304.3198       | 0.9323 | **2** 🥈 |
| **Model 8: PCA TF-IDF**           | 0.0584      | 265009.3912    | 2.8963         | 2318.5776       | 0.9601 | 3 |

### Best Models:
1️⃣ **Model 5 (2 LDA Topics)** – Best overall, with a good balance of low AMD and SSE.  
2️⃣ **Model 7 (PCA Word2Vec)** – Close second, slightly better Davies-Bouldin but slightly higher SSE.  
3️⃣ **Models 6 & 8 (PCA SDV & PCA TF-IDF)** – Also strong contenders, but slightly higher AMD.

**How to choose?**  
- If you prioritize **tight clusters & best matches** → **Model 5 (2 LDA Topics)**.  
- If you want **word embeddings-based clusters** → **Model 7 (PCA Word2Vec)**.  
- If you prefer **statistical dimensionality reduction** → **Model 6 or 8 (PCA-based methods)**.

🚀 **Next step**: Try the best models with different `n_clusters` values for further optimization!  


## Testing which features to include in the model

### Adding features and analysing

In [19]:
%%time
new=['topic_0_from_two']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.5358
Inertia (SSE): 22.4772
Davies-Bouldin Score: 0.5047
Calinski-Harabasz Score: 495648.7711
Average Match Distance (AMD): 0.0000
CPU times: user 31.8 s, sys: 2.63 s, total: 34.5 s
Wall time: 33.7 s


In [16]:
%%time
new=['topic_0_from_two',
 'topic_1_from_two']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.5340
Inertia (SSE): 44.2808
Davies-Bouldin Score: 0.5072
Calinski-Harabasz Score: 503201.5262
Average Match Distance (AMD): 0.0000
CPU times: user 1min 2s, sys: 54 s, total: 1min 56s
Wall time: 36.1 s


In [18]:
%%time
new=['age_scaled', 'topic_0_from_two',
 'topic_1_from_two']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3392
Inertia (SSE): 626.4378
Davies-Bouldin Score: 0.8347
Calinski-Harabasz Score: 48362.3012
Average Match Distance (AMD): 0.0014
CPU times: user 1min 7s, sys: 52.1 s, total: 1min 59s
Wall time: 38.7 s


In [20]:
%%time
new=['age_scaled', 'single', 'topic_0_from_two',
 'topic_1_from_two']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3561
Inertia (SSE): 916.2438
Davies-Bouldin Score: 0.8427
Calinski-Harabasz Score: 54818.5556
Average Match Distance (AMD): 0.0022
CPU times: user 1min 7s, sys: 55 s, total: 2min 2s
Wall time: 42.6 s


In [21]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.4059
Inertia (SSE): 1646.2831
Davies-Bouldin Score: 0.7757
Calinski-Harabasz Score: 75799.0390
Average Match Distance (AMD): 0.0037
CPU times: user 1min 8s, sys: 53.7 s, total: 2min 2s
Wall time: 41.7 s


In [22]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two',  'orientation_bisexual', 'orientation_gay',
 'orientation_straight']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.4214
Inertia (SSE): 4042.7787
Davies-Bouldin Score: 0.8159
Calinski-Harabasz Score: 47682.7808
Average Match Distance (AMD): 0.0063
CPU times: user 1min 13s, sys: 51.2 s, total: 2min 5s
Wall time: 46 s


In [None]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two',  'orientation_bisexual', 'orientation_gay',
 'orientation_straight']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

In [24]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.4919
Inertia (SSE): 23761.3141
Davies-Bouldin Score: 1.1433
Calinski-Harabasz Score: 14937.8450
Average Match Distance (AMD): 0.0192
CPU times: user 1min 8s, sys: 53.7 s, total: 2min 1s
Wall time: 40 s


In [26]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.2924
Inertia (SSE): 54981.2490
Davies-Bouldin Score: 1.4925
Calinski-Harabasz Score: 6980.5454
Average Match Distance (AMD): 0.0364
CPU times: user 1min 1s, sys: 57.5 s, total: 1min 59s
Wall time: 37.3 s


In [27]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1905
Inertia (SSE): 86119.1756
Davies-Bouldin Score: 2.0949
Calinski-Harabasz Score: 4341.3648
Average Match Distance (AMD): 0.0774
CPU times: user 1min 7s, sys: 53.5 s, total: 2min 1s
Wall time: 39.1 s


In [28]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1375
Inertia (SSE): 108120.9289
Davies-Bouldin Score: 2.1793
Calinski-Harabasz Score: 3869.3748
Average Match Distance (AMD): 0.1267
CPU times: user 1min 23s, sys: 40.7 s, total: 2min 3s
Wall time: 1min 3s


In [29]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1022
Inertia (SSE): 125245.3447
Davies-Bouldin Score: 2.2350
Calinski-Harabasz Score: 3566.3926
Average Match Distance (AMD): 0.1921
CPU times: user 2min 19s, sys: 22.7 s, total: 2min 41s
Wall time: 2min 17s


In [31]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes',  'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0787
Inertia (SSE): 139426.0870
Davies-Bouldin Score: 2.4819
Calinski-Harabasz Score: 3256.4672
Average Match Distance (AMD): 0.2513
CPU times: user 1min 44s, sys: 25.9 s, total: 2min 10s
Wall time: 1min 29s


In [33]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes' , 'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes',
 'like_dogs_no', 'like_dogs_not_disclosed', 'like_dogs_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1049
Inertia (SSE): 155476.2675
Davies-Bouldin Score: 2.3097
Calinski-Harabasz Score: 3367.3100
Average Match Distance (AMD): 0.3092
CPU times: user 1min 58s, sys: 26.7 s, total: 2min 24s
Wall time: 1min 32s


In [34]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes',  'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes',
 'like_dogs_no', 'like_dogs_not_disclosed', 'like_dogs_yes',  'has_cats_no',
 'has_cats_not_disclosed','has_cats_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0918
Inertia (SSE): 173734.6288
Davies-Bouldin Score: 2.8097
Calinski-Harabasz Score: 2992.6794
Average Match Distance (AMD): 0.4030
CPU times: user 1min 59s, sys: 24 s, total: 2min 24s
Wall time: 1min 36s


In [35]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes',  'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes',
 'like_dogs_no', 'like_dogs_not_disclosed', 'like_dogs_yes',  'has_cats_no',
 'has_cats_not_disclosed','has_cats_yes',  'like_cats_no', 'like_cats_not_disclosed',
 'like_cats_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0835
Inertia (SSE): 192555.5129
Davies-Bouldin Score: 2.6658
Calinski-Harabasz Score: 3084.4088
Average Match Distance (AMD): 0.4575
CPU times: user 2min 3s, sys: 17.4 s, total: 2min 21s
Wall time: 1min 43s


In [36]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes',  'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes',
 'like_dogs_no', 'like_dogs_not_disclosed', 'like_dogs_yes',  'has_cats_no',
 'has_cats_not_disclosed','has_cats_yes',  'like_cats_no', 'like_cats_not_disclosed',
 'like_cats_yes',  'religion_agnosticism', 'religion_atheism', 'religion_catholicism',
 'religion_christianity','religion_judaism', 'religion_not_disclosed','religion_other']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0825
Inertia (SSE): 234397.6699
Davies-Bouldin Score: 2.9154
Calinski-Harabasz Score: 2680.5108
Average Match Distance (AMD): 0.7785
CPU times: user 1min 13s, sys: 50.8 s, total: 2min 4s
Wall time: 40.2 s


In [37]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes',  'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes',
 'like_dogs_no', 'like_dogs_not_disclosed', 'like_dogs_yes',  'has_cats_no',
 'has_cats_not_disclosed','has_cats_yes',  'like_cats_no', 'like_cats_not_disclosed',
 'like_cats_yes',  'religion_agnosticism', 'religion_atheism', 'religion_catholicism',
 'religion_christianity','religion_judaism', 'religion_not_disclosed','religion_other',
 'smokes_no', 'smokes_not_disclosed', 'smokes_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0702
Inertia (SSE): 260182.5113
Davies-Bouldin Score: 3.0105
Calinski-Harabasz Score: 2407.7145
Average Match Distance (AMD): 0.9226
CPU times: user 1min 11s, sys: 51.1 s, total: 2min 2s
Wall time: 38.9 s


| Model                               | Silhouette ↑ | Inertia (SSE) ↓ | Davies-Bouldin ↓ | Calinski-Harabasz ↑ | AMD ↓   | Score (Lower is Better) | Best Model Consideration                             | Compare to Previous Model                                      | Recommendation                             |
|-------------------------------------|--------------|-----------------|------------------|---------------------|---------|------------------------|-----------------------------------------------------|---------------------------------------------------------------|--------------------------------------------|
| **Model 1: First LDA Topic only**    | 0.5358       | 22.4772         | 0.5047           | 495648.7711         | 0.0000  | 1                      | Best Overall (Best Silhouette & low AMD)           | N/A                                                           | Keep (Strong performance)                  |
| **Model 2: Adding LDA 2 Topics**    | 0.5340       | 44.2808         | 0.5072           | 503201.5262         | 0.0000  | 2                      | Second Best (Good LDA performance)                 | Silhouette slightly decreased                                           | Keep (Still strong)                        |
| **Model 3: Adding Age**             | 0.3392       | 626.4378        | 0.8347           | 48362.3012          | 0.0014  | 3                      | Not Recommended (Lower Silhouette)                  | Significant drop in Silhouette                                        | Remove (Negative impact)                   |
| **Model 4: Adding Single**          | 0.3561       | 916.2438        | 0.8427           | 54818.5556          | 0.0022  | 4                      | Not Recommended (Lower Silhouette & higher AMD)     | Silhouette and AMD worsened                                            | Remove (Negative impact)                   |
| **Model 5: Adding Gender**          | 0.4059       | 1646.2831       | 0.7757           | 75799.0390          | 0.0037  | 5                      | Solid Option (Balanced performance)                 | Silhouette increased, AMD lower                                         | Keep (Positive effect)                     |
| **Model 6: Adding Orientation**     | 0.4214       | 4042.7787       | 0.8159           | 47682.7808          | 0.0063  | 6                      | Good Option (Strong Silhouette)                     | Good improvement in Silhouette                                          | Keep (Positive effect)                     |
| **Model 7: Adding Body Type**       | 0.4919       | 23761.3141      | 1.1433           | 14937.8450          | 0.0192  | 7                      | Not Recommended (High Davies-Bouldin & AMD)        | Silhouette increased, but high Davies-Bouldin                           | Remove (Negative impact)                   |
| **Model 8: Adding Diet Type**       | 0.2924       | 54981.2490      | 1.4925           | 6980.5454           | 0.0364  | 8                      | Not Recommended (Low Silhouette & High Davies-Bouldin) | Significant drop in Silhouette and high Davies-Bouldin                 | Remove (Negative impact)                   |
| **Model 9: Adding Drink**           | 0.1905       | 86119.1756      | 2.0949           | 4341.3648           | 0.0774  | 9                      | Not Recommended (Low Silhouette & High Davies-Bouldin) | Significant drop in Silhouette, High Davies-Bouldin                    | Remove (Negative impact)                   |
| **Model 10: Adding Has Kids**       | 0.1375       | 108120.9289     | 2.1793           | 3869.3748           | 0.1267  | 10                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Silhouette decreased, Davies-Bouldin increased                         | Remove (Negative impact)                   |
| **Model 11: Adding Wants Kids**     | 0.1022       | 125245.3447     | 2.2350           | 3566.3926           | 0.1921  | 11                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Significant drop in Silhouette, High Davies-Bouldin                    | Remove (Negative impact)                   |
| **Model 12: Adding Has Dog**        | 0.0787       | 139426.0870     | 2.4819           | 3256.4672           | 0.2513  | 12                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Further drop in Silhouette, High Davies-Bouldin                         | Remove (Negative impact)                   |
| **Model 13: Adding Likes Dog**      | 0.1049       | 155476.2675     | 2.3097           | 3367.3100           | 0.3092  | 13                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Drop in Silhouette and higher Davies-Bouldin                            | Remove (Negative impact)                   |
| **Model 14: Adding Likes Cat**      | 0.0835       | 192555.5129     | 2.6658           | 3084.4088           | 0.4575  | 14                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Silhouette slightly improved, but Inertia and Davies-Bouldin worsened   | Remove (Negative impact)                   |
| **Model 15: Adding Religion**       | 0.0825       | 234397.6699     | 2.9154           | 2680.5108           | 0.7785  | 15                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Silhouette decreased, Davies-Bouldin worsened                            | Remove (Negative impact)                   |
| **Model 16: Adding Smokes**         | 0.0702       | 260182.5113     | 3.0105           | 2407.7145           | 0.9226  | 16                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Further decrease in Silhouette, Davies-Bouldin worsened, AMD increased | Remove (Negative impact)                   |


### Keeping best perfomarnce and adding new ones

In [38]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.5743
Inertia (SSE): 386.3850
Davies-Bouldin Score: 0.4720
Calinski-Harabasz Score: 436722.6819
Average Match Distance (AMD): 0.0001
CPU times: user 1min 12s, sys: 54.7 s, total: 2min 6s
Wall time: 46 s


In [39]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.5987
Inertia (SSE): 18912.3684
Davies-Bouldin Score: 0.9301
Calinski-Harabasz Score: 23506.9959
Average Match Distance (AMD): 0.0011
CPU times: user 1min 10s, sys: 53.2 s, total: 2min 3s
Wall time: 42.7 s


In [42]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3864
Inertia (SSE): 38847.2780
Davies-Bouldin Score: 1.3281
Calinski-Harabasz Score: 11533.1687
Average Match Distance (AMD): 0.0065
CPU times: user 2min 22s, sys: 27.4 s, total: 2min 50s
Wall time: 2min 53s


In [43]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other', 'is_asian', 'is_white',
    'is_black','is_other','is_hispanic_latin','is_ethnicity_nan']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1988
Inertia (SSE): 85417.1991
Davies-Bouldin Score: 1.9556
Calinski-Harabasz Score: 5126.4877
Average Match Distance (AMD): 0.0656
CPU times: user 2min 2s, sys: 17.8 s, total: 2min 20s
Wall time: 1min 54s


In [45]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other', 'text_length_scaled']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3375
Inertia (SSE): 45733.0714
Davies-Bouldin Score: 1.3540
Calinski-Harabasz Score: 9249.1515
Average Match Distance (AMD): 0.0282
CPU times: user 1min 15s, sys: 55.7 s, total: 2min 11s
Wall time: 47.1 s


In [46]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other', 'text_length_scaled']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3375
Inertia (SSE): 45733.0714
Davies-Bouldin Score: 1.3540
Calinski-Harabasz Score: 9249.1515
Average Match Distance (AMD): 0.0282
CPU times: user 1min 56s, sys: 33.5 s, total: 2min 30s
Wall time: 1min 48s


In [51]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other', 'text_length_scaled',  'height_scaled']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3431
Inertia (SSE): 42265.7806
Davies-Bouldin Score: 1.3674
Calinski-Harabasz Score: 10623.3284
Average Match Distance (AMD): 0.0561
CPU times: user 1min 34s, sys: 48.2 s, total: 2min 23s
Wall time: 1min 1s


In [None]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other', 'text_length_scaled',  'height_scaled']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

In [66]:
%%time
new=['female', 'age_scaled', 'single','height_scaled','orientation_bisexual', 'orientation_gay',
     'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
     'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
     'education_status_not_disclosed', 'education_status_working',  'speaks_english', 'speaks_spanish', 
     'speaks_other''diet_type_vegetarian', 'has_dogs_yes', 'no_of_kids_more_than_one','no_of_kids_one',
     'topic_0_from_two','text_length_scaled']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.2209
Inertia (SSE): 67059.5664
Davies-Bouldin Score: 1.5017
Calinski-Harabasz Score: 6683.6448
Average Match Distance (AMD): 0.1647
CPU times: user 1min 6s, sys: 56.1 s, total: 2min 2s
Wall time: 39.9 s


| Model  | Silhouette ↑ | Inertia (SSE) ↓ | Davies-Bouldin ↓ | Calinski-Harabasz ↑ | AMD ↓ | Score (Lower is Better) | Best Model Consideration | Compare to Previous Model | Recommendation |
|--------|--------------|------------------|------------------|---------------------|-------|-------------------------|--------------------------|---------------------------|----------------|
| Model 1: LDA 1 + Gender + Orientation | 0.5743 | 386.3850 | 0.4720 | 436722.6819 | 0.0001 | 7 | Best Overall (Best Silhouette & low AMD) | Significant improvement | Keep (Strong performance) |
| Model 2: Adding Education | 0.5987 | 18912.3684 | 0.9301 | 23506.9959 | 0.0011 | 8 | Solid Option (Good improvement) | Silhouette increased | Keep (Positive effect) |
| Model 3: Adding Languages | 0.3864 | 38847.2780 | 1.3281 | 11533.1687 | 0.0065 | 9 | Positive Impact (Good Silhouette & AMD) | Silhouette increased, balanced performance | Keep (Positive effect) |
| Model 4: Adding Text Length | 0.3375 | 45733.0714 | 1.3540 | 9249.1515 | 0.0282 | 10 | Positive Impact (Good Silhouette & balanced performance) | Balanced performance | Keep (Positive effect) |
| Model 5: Adding Height | 0.3431 | 42265.7806 | 1.3674 | 10623.3284 | 0.0561 | 20 | Good Option (Improved Silhouette) | Silhouette increased | Keep (Positive effect) |
| Model 6: Adding Vegetarian | 0.2986 | 48124.1491 | 1.4833 | 9184.3819 | 0.0654 | 21 | Borderline, but Leaning Toward Keep | Better than some removed features | Keep (Conditional) |
| Model 7: Adding Has Dog | 0.2628 | 53159.2449 | 1.4843 | 8545.5971 | 0.0794 | 23 | Keep (Conditional) | Improved over previous evaluation | Keep (Positive effect) |
| Model 8: Adding Has Kids | 0.2665 | 57530.2317 | 1.4746 | 7929.0712 | 0.0959 | 25 | Keep (Conditional) | Slight improvement over 'Has Dog' | Keep (Positive effect) |
| Model 9: Adding Single | 0.2273 | 64094.6961 | 1.6373 | 6894.5560 | 0.1121 | 26 | Keep (Conditional) | Improved over 'Body Type - Athletic' | Keep (Positive effect) |
| Model 10: Adding Age | 0.2378 | 64639.9396 | 1.5695 | 6963.6418 | 0.1526 | 27 | Solid option (Lower Silhouette but improved Davies-Bouldin) | Slight drop in Silhouette | Keep (Conditional) |

## Function to run several Deep Collaborative Scoring Model (DCScam)

In [157]:
def run_dbscan_recommendation(ok, X, eps=0.5, min_samples=5):
    """
    Runs DBSCAN clustering and recommends the 5 most similar people per individual.
    Also computes evaluation metrics and visualizes clusters.

    Parameters:
        ok (pd.DataFrame): The original dataset.
        X (pd.DataFrame or np.array): Feature matrix (already scaled).
        eps (float): The maximum distance for points to be considered neighbors.
        min_samples (int): The minimum number of samples to form a core point.

    Returns:
        ok (pd.DataFrame): The original dataset with added recommended similar people.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
    labels = dbscan.fit_predict(X)
    ok['cluster'] = labels  # Assign clusters

    # Dictionary to store the most similar people for each individual
    similar_people = {}
    for i in range(len(X)):
        label = labels[i]
        if label == -1:
            similar_people[i] = []  # Noise points have no cluster
            continue

        cluster_indices = np.where(labels == label)[0]  # Get indices of people in the same cluster
        cluster_distances = np.linalg.norm(X.iloc[cluster_indices].values - X.iloc[i].values, axis=1)  # Compute distances
        sorted_indices = cluster_indices[np.argsort(cluster_distances)]  # Sort by similarity

        similar_people[i] = sorted_indices[1:6].tolist()  # Exclude self and take top 5

    # Convert to DataFrame and merge with original dataset
    similar_df = pd.DataFrame.from_dict(similar_people, orient='index')
    similar_df.columns = [f'similar_{i+1}' for i in range(5)]
    ok = ok.merge(similar_df, left_index=True, right_index=True)

    # Compute clustering evaluation metrics
    valid_mask = labels != -1  # Mask to filter out noise points
    X_valid = X.iloc[valid_mask] if isinstance(X, pd.DataFrame) else X[valid_mask]
    valid_labels = labels[valid_mask]  # Remove noise points
    
    if len(set(valid_labels)) > 1:  # Metrics require at least 2 clusters
        silhouette_avg = silhouette_score(X_valid, valid_labels)
        davies_bouldin = davies_bouldin_score(X_valid, valid_labels)
        calinski_harabasz = calinski_harabasz_score(X_valid, valid_labels)

        print(f"Silhouette Score: {silhouette_avg:.4f}")
        print(f"Davies-Bouldin Score: {davies_bouldin:.4f}")
        print(f"Calinski-Harabasz Score: {calinski_harabasz:.4f}")
    else:
        print("Not enough clusters to compute metrics.")

    return ok


In [161]:
def run_dbscan_recommendation(ok, X, eps=0.5, min_samples=5):
    """
    Runs DBSCAN clustering and recommends the 5 most similar people per individual.
    Also computes evaluation metrics and visualizes clusters.

    Parameters:
        ok (pd.DataFrame): The original dataset.
        X (pd.DataFrame or np.array): Feature matrix (already scaled).
        eps (float): The maximum distance for points to be considered neighbors.
        min_samples (int): The minimum number of samples to form a core point.

    Returns:
        ok (pd.DataFrame): The original dataset with added recommended similar people.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
    labels = dbscan.fit_predict(X)
    ok['cluster'] = labels  # Assign clusters

    # Dictionary to store the most similar people for each individual
    similar_people = {}
    for i in range(len(X)):
        label = labels[i]
        if label == -1:
            # Find closest points for noise points
            cluster_distances = np.linalg.norm(X.values - X.iloc[i].values, axis=1)
            sorted_indices = np.argsort(cluster_distances)[1:6]  # Exclude self
            similar_people[i] = sorted_indices.tolist()
            continue

        cluster_indices = np.where(labels == label)[0]  # Get indices of people in the same cluster
        cluster_distances = np.linalg.norm(X.iloc[cluster_indices].values - X.iloc[i].values, axis=1)  # Compute distances
        sorted_indices = cluster_indices[np.argsort(cluster_distances)]  # Sort by similarity

        similar_people[i] = sorted_indices[1:6].tolist()  # Exclude self and take top 5

    # Convert to DataFrame and merge with original dataset
    similar_df = pd.DataFrame.from_dict(similar_people, orient='index')
    similar_df.columns = [f'similar_{i+1}' for i in range(5)]
    ok = ok.merge(similar_df, left_index=True, right_index=True, how='left')

    # Compute clustering evaluation metrics
    valid_mask = labels != -1  # Mask to filter out noise points
    X_valid = X.iloc[valid_mask] if isinstance(X, pd.DataFrame) else X[valid_mask]
    valid_labels = labels[valid_mask]  # Remove noise points
    
    if len(set(valid_labels)) > 1:  # Metrics require at least 2 clusters
        silhouette_avg = silhouette_score(X_valid, valid_labels)
        davies_bouldin = davies_bouldin_score(X_valid, valid_labels)
        calinski_harabasz = calinski_harabasz_score(X_valid, valid_labels)

        print(f"Silhouette Score: {silhouette_avg:.4f}")
        print(f"Davies-Bouldin Score: {davies_bouldin:.4f}")
        print(f"Calinski-Harabasz Score: {calinski_harabasz:.4f}")
    else:
        print("Not enough clusters to compute metrics.")

    return ok


In [162]:
run_dbscan_recommendation(ok, X, eps=0.5, min_samples=5)

Silhouette Score: 0.8210
Davies-Bouldin Score: 0.2594
Calinski-Harabasz Score: 1399.5344


Unnamed: 0,age_scaled,single,female,orientation_bisexual,orientation_gay,orientation_straight,body_type_athletic,body_type_average,body_type_curvy,body_type_fit,...,neighbor_4_index,neighbor_4_distance,neighbor_5_index,neighbor_5_distance,dbscan_cluster,similar_1,similar_2,similar_3,similar_4,similar_5
0,0.078431,1,0,0,0,1,0,0,0,0,...,3222,1.415280,44367,1.415285,-1,39105,25734,27522,42728.0,31368.0
1,0.333333,1,0,0,0,1,0,1,0,0,...,2082,1.894438,15383,1.939377,-1,22966,43121,37779,2082.0,40710.0
2,0.392157,0,0,0,0,1,0,0,0,0,...,45678,1.427890,48910,1.438338,-1,17032,12569,43514,32456.0,45678.0
3,0.098039,1,0,0,0,1,0,0,0,0,...,43192,2.001398,32875,2.002026,-1,11733,47513,22386,7532.0,1578.0
4,0.215686,1,0,0,0,1,1,0,0,0,...,6184,0.106500,43804,0.125967,0,35439,12318,40912,9842.0,41258.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.803922,1,1,0,0,1,0,0,0,0,...,50716,1.431697,13194,1.440652,-1,28910,30363,7159,13194.0,53938.0
59942,0.117647,1,0,0,0,1,0,0,0,1,...,16953,1.066748,12533,1.414217,-1,37813,29789,16953,59568.0,5248.0
59943,0.470588,1,0,0,0,1,0,1,0,0,...,37566,1.458169,12652,1.462023,-1,25999,11757,40289,37456.0,38881.0
59944,0.176471,1,0,0,0,1,1,0,0,0,...,44242,1.044169,59098,1.070702,-1,29981,33406,44242,31961.0,59098.0


## Function to run several Bayesian Personalized Ranking (BPR) Model

In [12]:
x.shape

(59946, 111)

In [51]:
import implicit
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd

def run_bpr_and_recommend(X, n_users, n_items, factors=10, iterations=50):
    """
    Runs the Bayesian Personalized Ranking (BPR) model and recommends similar individuals.
    
    Parameters:
    - X: A user-item interaction DataFrame (users as rows, items as columns).
    - n_users: Number of unique users.
    - n_items: Number of unique items.
    - factors: Number of latent factors for the BPR model (default=10).
    - iterations: Number of iterations to train the BPR model (default=50).
    
    Returns:
    - recommendations: A DataFrame of user recommendations.
    """
    # Convert the DataFrame to a sparse CSR matrix
    interaction_matrix = csr_matrix(X.values.astype('float32'))

    # Debugging: Check the shape of the interaction matrix
    print("Shape of interaction_matrix:", interaction_matrix.shape)

    # Initialize the BPR model
    bpr_model = implicit.bpr.BayesianPersonalizedRanking(factors=factors, iterations=iterations)

    # Train the BPR model
    bpr_model.fit(interaction_matrix)

    # Generate recommendations for each user
    recommendations = {}
    for user in range(n_users):
        # Ensure the user_items matrix is in CSR format
        user_items_csr = interaction_matrix.tocsr()
        user_recommendations = bpr_model.recommend(user, user_items_csr, N=10)  # Top 10 recommendations
        recommendations[user] = user_recommendations[0]  # Extract item IDs

    # Convert recommendations to DataFrame
    recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index', columns=[f'Rec_{i+1}' for i in range(10)])

    return recommendations_df

In [52]:
run_bpr_and_recommend(X=x, n_users=59946, n_items=111, factors=10, iterations=50)

Shape of interaction_matrix: (59946, 111)


  0%|          | 0/50 [00:00<?, ?it/s]

ValueError: user_items must contain 1 row for every user in userids