# Training Model

## Importing packages

In [220]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
import implicit
from scipy.sparse import csr_matrix
from sklearn.cluster import DBSCAN

## Loading datasets

In [221]:
ok=pd.read_csv('../raw_data/ok_clean.csv')

In [222]:
topics = pd.read_csv('../raw_data/text_and_topics_classified_encoded_all.csv', keep_default_na=False)

In [223]:
ok = pd.concat([ok, topics], axis=1)

In [96]:
#list(ok.columns)

## Defining X vector with features

In [97]:
num_cat_columns = ['age_scaled',
 'single',
 'female',
                   
 'orientation_bisexual',
 'orientation_gay',
 'orientation_straight',
                   
 'body_type_athletic',
 'body_type_average',
 'body_type_curvy',
 'body_type_fit',
 'body_type_not_disclosed',
 'body_type_other',
 'body_type_thin',
                   
 'diet_type_anything',
 'diet_type_not_disclosed',
 'diet_type_other',
 'diet_type_vegetarian',

 'drinks_not at all',
 'drinks_not_disclosed',
 'drinks_often',
 'drinks_rarely',
 'drinks_socially',

 'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed',
 'no_of_kids_one',
 'no_of_kids_zero',
                   
 'want_more_kids_maybe',
 'want_more_kids_no',
 'want_more_kids_not_disclosed',
 'want_more_kids_yes',
                   
 'has_dogs_no',
 'has_dogs_not_disclosed',
 'has_dogs_yes',
                   
 'like_dogs_no',
 'like_dogs_not_disclosed',
 'like_dogs_yes',
                   
 'has_cats_no',
 'has_cats_not_disclosed',
 'has_cats_yes',
                   
 'like_cats_no',
 'like_cats_not_disclosed',
 'like_cats_yes',
                   
 'religion_agnosticism',
 'religion_atheism',
 'religion_catholicism',
 'religion_christianity',
 'religion_judaism',
 'religion_not_disclosed',
 'religion_other',
                   
 'smokes_no',
 'smokes_not_disclosed',
 'smokes_yes',
]


text_columns = ['cluster_text']


X = ok[num_cat_columns + text_columns]

## Function to run several KNN Models

### K-Nearest Neighbors (KNN) Metrics Summary

| Metric                     | Description                                | Better When  |
|----------------------------|--------------------------------------------|-------------|
| **Sum of Squared Errors (SSE)**  | Measures overall spread                  | Lower       |
| **Mean Average Distance (MAD)**  | Avg. distance to k-nearest neighbors     | Lower       |
| **Maximum Distance (MaxD)**      | Largest distance found (outliers)        | Lower       |
| **Minimum Distance (MinD)**      | Smallest distance found (possible duplicates) | Higher*    |

**Interpretation**:  
- **Lower values** of *SSE, MAD, and MaxD* indicate tighter, more meaningful neighbor relationships.  
- **Higher MinD** values may indicate a lack of exact duplicates, but too high could suggest poor coverage.  


In [218]:
def run_unsupervised_knn(ok, X, n_clusters=12, n_neighbors=6):
    """
    Runs K-Means clustering and finds the 5 most similar people within each cluster using KNN.
    Computes clustering and similarity metrics.

    Parameters:
    - ok (pd.DataFrame): The original dataset (modified in place).
    - X (pd.DataFrame or np.array): Feature matrix (already scaled).
    - n_clusters (int): Number of clusters for K-Means (default=12).
    - n_neighbors (int): The number of nearest neighbors to consider (default=6, including self).
    """
    if X is None:
        raise ValueError("Feature vector X must be provided")
    
    # Fit K-Means model
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    ok['cluster'] = kmeans.fit_predict(X)
    
    # Compute Sum of Squared Errors (SSE)
    sse = kmeans.inertia_
    
    # Initialize KNN
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(X)
    
    # Get the nearest neighbors (including self, so we exclude it)
    distances, indices = knn.kneighbors(X)
    
    # Compute distance-based metrics
    mad = np.mean(distances[:, 1:])  # Mean Average Distance (excluding self)
    maxd = np.max(distances[:, 1:])  # Maximum Distance (excluding self)
    mind = np.min(distances[:, 1:])  # Minimum Distance (excluding self)

    # Convert nearest neighbors to a DataFrame
    similar_df = pd.DataFrame(indices[:, 1:6], index=ok.index, columns=[f'knn_match_{i+1}' for i in range(5)])
    
    # Merge similar people indices with the original dataset in place
    ok.loc[:, similar_df.columns] = similar_df
    
    # Print structured output
    print(f"📌 Sum of Squared Errors (SSE): {sse:.4f} → Measures overall spread")
    print(f"📌 Mean Average Distance (MAD): {mad:.4f} → Average distance to k-nearest neighbors")
    print(f"📌 Maximum Distance (MaxD): {maxd:.4f} → Largest distance found (outlier detection)")
    print(f"📌 Minimum Distance (MinD): {mind:.4f} → Smallest distance found (possible duplicates)")

### Baseline model: KNN 

In [131]:
%%time
run_unsupervised_knn(ok,X, 5)

📌 Sum of Squared Errors (SSE): 435436.8577 → Measures overall spread
📌 Mean Average Distance (MAD): 0.9998 → Average distance to k-nearest neighbors
📌 Maximum Distance (MaxD): 3.1361 → Largest distance found (outlier detection)
📌 Minimum Distance (MinD): 0.0000 → Smallest distance found (possible duplicates)
CPU times: user 20.4 s, sys: 164 ms, total: 20.5 s
Wall time: 2.79 s


## Function to run several Kmean Models

In [219]:
def cluster_and_find_similar(dataset, X, n_clusters=12, n_similar=5):
    """
    Runs K-Means clustering and finds the specified number of most similar individuals within each cluster.
    Saves the 5 most similar individuals permanently to the dataset and prints key clustering metrics.

    Parameters:
    - dataset: DataFrame, the dataset to use (modified in place).
    - X: DataFrame, the feature vector.
    - n_clusters: int, number of clusters for K-Means (default=12).
    - n_similar: int, number of similar individuals to return per person (default=5).
    """
    if X is None:
        raise ValueError("Feature vector X must be provided")
    
    # Fit K-Means model
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    dataset['cluster'] = kmeans.fit_predict(X)
    
    # Compute clustering evaluation metrics
    silhouette_avg = silhouette_score(X, dataset['cluster'])
    sse = kmeans.inertia_
    davies_bouldin = davies_bouldin_score(X, dataset['cluster'])
    calinski_harabasz = calinski_harabasz_score(X, dataset['cluster'])
    
    # Print structured output
    print(f"📌 Silhouette Score: {silhouette_avg:.4f} → Measures clustering cohesion")
    print(f"📌 Sum of Squared Errors (SSE): {sse:.4f} → Measures overall spread")
    print(f"📌 Davies-Bouldin Score: {davies_bouldin:.4f} → Measures cluster separation (lower is better)")
    print(f"📌 Calinski-Harabasz Score: {calinski_harabasz:.4f} → Measures cluster quality (higher is better)")

    # Dictionary to store similar individuals
    similar_people = {}
    avg_match_distances = []

    # Find closest people within each cluster
    for cluster in range(n_clusters):
        cluster_indices = dataset[dataset['cluster'] == cluster].index
        cluster_features = X.loc[cluster_indices]

        # Compute pairwise distances within the cluster
        distances = euclidean_distances(cluster_features)
        sorted_indices = np.argsort(distances, axis=1)[:, 1:n_similar+1]  # Exclude self (index 0)

        for i, index in enumerate(cluster_indices):
            similar_people[index] = cluster_indices[sorted_indices[i]].tolist()
            avg_match_distances.append(np.mean(distances[i][sorted_indices[i]]))

    # Compute Average Match Distance (AMD)
    overall_avg_match_distance = np.mean(avg_match_distances)
    print(f"📌 Average Match Distance (AMD): {overall_avg_match_distance:.4f} → Measures how close recommended matches are")

    # Convert dictionary to DataFrame
    similar_df = pd.DataFrame.from_dict(similar_people, orient='index')
    similar_df.columns = [f'kmeans_match_{i+1}' for i in range(n_similar)]

    # Merge the similar individuals with the dataset in place
    dataset.loc[:, similar_df.columns] = similar_df

## Testing better model for text dimensionality reduction

Let's re-run the section below with the final vector of X and see if we gat the same recomendation. We first ran with the vector num_cat_columns above. But we later did a detailed analysis to select the best features and we will re-run the analysis below.

In [8]:
num_cat_columns=['female', 'age_scaled', 'single','height_scaled','orientation_bisexual', 'orientation_gay',
     'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
     'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
     'education_status_not_disclosed', 'education_status_working',  'speaks_english', 'speaks_spanish', 
     'speaks_other','diet_type_vegetarian', 'has_dogs_yes', 'no_of_kids_more_than_one','no_of_kids_one',
     'text_length_scaled']

### Model 1: Text Kmeans Cluster

In [9]:
%%time
text_columns = ['cluster_text']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1844
Inertia (SSE): 101693.6377
Davies-Bouldin Score: 1.7407
Calinski-Harabasz Score: 9744.4874
Average Match Distance (AMD): 0.1773
CPU times: user 1min 5s, sys: 58.4 s, total: 2min 4s
Wall time: 38.9 s


### Model 2: Therapist

In [10]:
%%time
text_columns = [ 'therapist_Expressions of Happiness and Joy',
 'therapist_Managing and Increasing Energy Levels',
 'therapist_Other',
 'therapist_See and Understanding Conversations',
 'therapist_Time Up and Future Meetings']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1686
Inertia (SSE): 100305.4067
Davies-Bouldin Score: 2.1442
Calinski-Harabasz Score: 4654.6152
Average Match Distance (AMD): 0.1835
CPU times: user 1min 10s, sys: 1min 7s, total: 2min 17s
Wall time: 44.5 s


### Model 3: General

In [11]:
%%time
text_columns = ['general_Entertainment',
 'general_Home & Hobbies',
 'general_Literature',
 'general_Other',
 'general_Social Life']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1982
Inertia (SSE): 81460.9735
Davies-Bouldin Score: 1.8188
Calinski-Harabasz Score: 5482.3137
Average Match Distance (AMD): 0.1543
CPU times: user 1min 45s, sys: 51.1 s, total: 2min 36s
Wall time: 1min 10s


### Model 4: 5 LDA Topics

In [12]:
%%time
text_columns = ['topic_0_from_five', 'topic_1_from_five', 
                'topic_2_from_five', 'topic_3_from_five', 'topic_4_from_five']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1453
Inertia (SSE): 95745.2212
Davies-Bouldin Score: 2.1358
Calinski-Harabasz Score: 4626.3059
Average Match Distance (AMD): 0.2095
CPU times: user 1min 38s, sys: 53.7 s, total: 2min 32s
Wall time: 1min 3s


### Model 5: 2 LDA Topics

In [13]:
%%time
text_columns = ['topic_0_from_two','topic_1_from_two']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.2209
Inertia (SSE): 67059.5664
Davies-Bouldin Score: 1.5017
Calinski-Harabasz Score: 6683.6448
Average Match Distance (AMD): 0.1647
CPU times: user 1min 35s, sys: 1min 3s, total: 2min 38s
Wall time: 1min 5s


### Model 6: PCA SDV

In [14]:
%%time
text_columns = ['PCA_sdv_1','PCA_sdv_2']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.2060
Inertia (SSE): 72612.6824
Davies-Bouldin Score: 1.6309
Calinski-Harabasz Score: 5854.6708
Average Match Distance (AMD): 0.1683
CPU times: user 1min 40s, sys: 55.7 s, total: 2min 36s
Wall time: 1min 7s


### Model 7: PCA Word2vec embeddings

In [15]:
%%time
text_columns = ['PCA_emb_1','PCA_emb_2']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.2075
Inertia (SSE): 70680.4075
Davies-Bouldin Score: 1.7744
Calinski-Harabasz Score: 6234.4083
Average Match Distance (AMD): 0.1404
CPU times: user 1min 27s, sys: 1min 9s, total: 2min 37s
Wall time: 1min 2s


### Model 8: PCA TfidfVectorizer

In [17]:
%%time
text_columns = ['PCA_vect_1','PCA_vect_2']
X = ok[num_cat_columns + text_columns]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.2060
Inertia (SSE): 72613.2315
Davies-Bouldin Score: 1.6309
Calinski-Harabasz Score: 5854.6146
Average Match Distance (AMD): 0.1683
CPU times: user 1min 41s, sys: 50.1 s, total: 2min 31s
Wall time: 1min 4s


### Summary



| Metric                     | Description                         | Better When  |
|----------------------------|-------------------------------------|-------------|
| **Silhouette Score**       | Cluster separation quality         | Higher      |
| **Inertia (SSE)**          | Sum of squared distances to centers | Lower       |
| **Davies-Bouldin Score**   | Cluster similarity ratio           | Lower       |
| **Calinski-Harabasz Score** | Ratio of inter/intra-cluster variance | Higher      |
| **Average Match Distance (AMD)** | Avg. distance to closest matches | Lower       |

**Interpretation**:  
- **Higher values** of *Silhouette Score* and *Calinski-Harabasz Score* indicate better clustering.  
- **Lower values** of *Inertia*, *Davies-Bouldin Score*, and *AMD* indicate better cluster compactness and well-separated groups.

### Model Comparison Table

| Model                              | Silhouette ↑ | Inertia (SSE) ↓ | Davies-Bouldin ↓ | Calinski-Harabasz ↑ | AMD ↓  | Score (Lower is Better) |
|------------------------------------|-------------|----------------|----------------|------------------|------|---------------------|
| **Model 1: Text KMeans Cluster**  | 0.1844      | 101693.6377    | 1.7407         | 9744.4874        | 0.1773 | 5 |
| **Model 2: Therapist**            | 0.1686      | 100305.4067    | 2.1442         | 4654.6152        | 0.1835 | 7 |
| **Model 3: General**              | 0.1982      | 81460.9735     | 1.8188         | 5482.3137        | 0.1543 | 6 |
| **Model 4: 5 LDA Topics**         | 0.1453      | 95745.2212     | 2.1358         | 4626.3059        | 0.2095 | 8 |
| **Model 5: 2 LDA Topics**         | 0.2209      | 67059.5664     | 1.5017         | 6683.6448        | 0.1647 | **1** 🏆 |
| **Model 6: PCA SDV**              | 0.2060      | 72612.6824     | 1.6309         | 5854.6708        | 0.1683 | **3** 🥉 |
| **Model 7: PCA Word2Vec**         | 0.2075      | 70680.4075     | 1.7744         | 6234.4083        | 0.1404 | 4 |
| **Model 8: PCA TF-IDF**           | 0.2060      | 72613.2315     | 1.6309         | 5854.6146        | 0.1683 | 4 |

---

### **Best Models:**
1️⃣ **Model 5 (2 LDA Topics)** – Best overall with the highest Silhouette score, a moderate Davies-Bouldin score, and relatively low AMD.  
2️⃣ **Model 6 (PCA SDV)** – Solid second choice with a good balance of performance and lower AMD compared to the others.  
3️⃣ **Model 7 (PCA Word2Vec)** – Strong contender with a slight increase in Silhouette and lower AMD than other models.  
4️⃣ **Model 8 (PCA TF-IDF)** – Same performance as Model 7, but more consistent.

### **How to Choose:**
- **For best balance of cluster cohesion and match quality** → **Model 5 (2 LDA Topics)**.  
- **For PCA-based dimensionality reduction** with balanced performance → **Model 6 (PCA SDV)** or **Model 7 (PCA Word2Vec)**.  
- **For word-based features** → **Model 7 (PCA Word2Vec)** or **Model 8 (PCA TF-IDF)**.

🚀 **Next step**: Based on these results, you might want to explore hyperparameter optimization for the top models (such as adjusting `n_clusters`) for further improvement.


## Testing which features to include in the model

### Adding features and analysing

In [19]:
%%time
new=['topic_0_from_two']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.5358
Inertia (SSE): 22.4772
Davies-Bouldin Score: 0.5047
Calinski-Harabasz Score: 495648.7711
Average Match Distance (AMD): 0.0000
CPU times: user 31.8 s, sys: 2.63 s, total: 34.5 s
Wall time: 33.7 s


In [16]:
%%time
new=['topic_0_from_two',
 'topic_1_from_two']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.5340
Inertia (SSE): 44.2808
Davies-Bouldin Score: 0.5072
Calinski-Harabasz Score: 503201.5262
Average Match Distance (AMD): 0.0000
CPU times: user 1min 2s, sys: 54 s, total: 1min 56s
Wall time: 36.1 s


In [18]:
%%time
new=['age_scaled', 'topic_0_from_two',
 'topic_1_from_two']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3392
Inertia (SSE): 626.4378
Davies-Bouldin Score: 0.8347
Calinski-Harabasz Score: 48362.3012
Average Match Distance (AMD): 0.0014
CPU times: user 1min 7s, sys: 52.1 s, total: 1min 59s
Wall time: 38.7 s


In [20]:
%%time
new=['age_scaled', 'single', 'topic_0_from_two',
 'topic_1_from_two']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3561
Inertia (SSE): 916.2438
Davies-Bouldin Score: 0.8427
Calinski-Harabasz Score: 54818.5556
Average Match Distance (AMD): 0.0022
CPU times: user 1min 7s, sys: 55 s, total: 2min 2s
Wall time: 42.6 s


In [21]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.4059
Inertia (SSE): 1646.2831
Davies-Bouldin Score: 0.7757
Calinski-Harabasz Score: 75799.0390
Average Match Distance (AMD): 0.0037
CPU times: user 1min 8s, sys: 53.7 s, total: 2min 2s
Wall time: 41.7 s


In [22]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two',  'orientation_bisexual', 'orientation_gay',
 'orientation_straight']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.4214
Inertia (SSE): 4042.7787
Davies-Bouldin Score: 0.8159
Calinski-Harabasz Score: 47682.7808
Average Match Distance (AMD): 0.0063
CPU times: user 1min 13s, sys: 51.2 s, total: 2min 5s
Wall time: 46 s


In [None]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two',  'orientation_bisexual', 'orientation_gay',
 'orientation_straight']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

In [24]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.4919
Inertia (SSE): 23761.3141
Davies-Bouldin Score: 1.1433
Calinski-Harabasz Score: 14937.8450
Average Match Distance (AMD): 0.0192
CPU times: user 1min 8s, sys: 53.7 s, total: 2min 1s
Wall time: 40 s


In [26]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.2924
Inertia (SSE): 54981.2490
Davies-Bouldin Score: 1.4925
Calinski-Harabasz Score: 6980.5454
Average Match Distance (AMD): 0.0364
CPU times: user 1min 1s, sys: 57.5 s, total: 1min 59s
Wall time: 37.3 s


In [27]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1905
Inertia (SSE): 86119.1756
Davies-Bouldin Score: 2.0949
Calinski-Harabasz Score: 4341.3648
Average Match Distance (AMD): 0.0774
CPU times: user 1min 7s, sys: 53.5 s, total: 2min 1s
Wall time: 39.1 s


In [28]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1375
Inertia (SSE): 108120.9289
Davies-Bouldin Score: 2.1793
Calinski-Harabasz Score: 3869.3748
Average Match Distance (AMD): 0.1267
CPU times: user 1min 23s, sys: 40.7 s, total: 2min 3s
Wall time: 1min 3s


In [29]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1022
Inertia (SSE): 125245.3447
Davies-Bouldin Score: 2.2350
Calinski-Harabasz Score: 3566.3926
Average Match Distance (AMD): 0.1921
CPU times: user 2min 19s, sys: 22.7 s, total: 2min 41s
Wall time: 2min 17s


In [31]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes',  'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0787
Inertia (SSE): 139426.0870
Davies-Bouldin Score: 2.4819
Calinski-Harabasz Score: 3256.4672
Average Match Distance (AMD): 0.2513
CPU times: user 1min 44s, sys: 25.9 s, total: 2min 10s
Wall time: 1min 29s


In [33]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes' , 'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes',
 'like_dogs_no', 'like_dogs_not_disclosed', 'like_dogs_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1049
Inertia (SSE): 155476.2675
Davies-Bouldin Score: 2.3097
Calinski-Harabasz Score: 3367.3100
Average Match Distance (AMD): 0.3092
CPU times: user 1min 58s, sys: 26.7 s, total: 2min 24s
Wall time: 1min 32s


In [34]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes',  'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes',
 'like_dogs_no', 'like_dogs_not_disclosed', 'like_dogs_yes',  'has_cats_no',
 'has_cats_not_disclosed','has_cats_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0918
Inertia (SSE): 173734.6288
Davies-Bouldin Score: 2.8097
Calinski-Harabasz Score: 2992.6794
Average Match Distance (AMD): 0.4030
CPU times: user 1min 59s, sys: 24 s, total: 2min 24s
Wall time: 1min 36s


In [35]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes',  'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes',
 'like_dogs_no', 'like_dogs_not_disclosed', 'like_dogs_yes',  'has_cats_no',
 'has_cats_not_disclosed','has_cats_yes',  'like_cats_no', 'like_cats_not_disclosed',
 'like_cats_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0835
Inertia (SSE): 192555.5129
Davies-Bouldin Score: 2.6658
Calinski-Harabasz Score: 3084.4088
Average Match Distance (AMD): 0.4575
CPU times: user 2min 3s, sys: 17.4 s, total: 2min 21s
Wall time: 1min 43s


In [36]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes',  'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes',
 'like_dogs_no', 'like_dogs_not_disclosed', 'like_dogs_yes',  'has_cats_no',
 'has_cats_not_disclosed','has_cats_yes',  'like_cats_no', 'like_cats_not_disclosed',
 'like_cats_yes',  'religion_agnosticism', 'religion_atheism', 'religion_catholicism',
 'religion_christianity','religion_judaism', 'religion_not_disclosed','religion_other']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0825
Inertia (SSE): 234397.6699
Davies-Bouldin Score: 2.9154
Calinski-Harabasz Score: 2680.5108
Average Match Distance (AMD): 0.7785
CPU times: user 1min 13s, sys: 50.8 s, total: 2min 4s
Wall time: 40.2 s


In [37]:
%%time
new=['age_scaled', 'single', 'female', 'topic_0_from_two',
 'topic_1_from_two' , 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'body_type_athletic','body_type_average',
 'body_type_curvy','body_type_fit', 'body_type_not_disclosed',
 'body_type_other','body_type_thin',  'diet_type_anything',
 'diet_type_not_disclosed', 'diet_type_other','diet_type_vegetarian',
 'drinks_not at all', 'drinks_not_disclosed', 'drinks_often',
 'drinks_rarely', 'drinks_socially',  'no_of_kids_more_than_one',
 'no_of_kids_not_disclosed', 'no_of_kids_one','no_of_kids_zero',  
 'want_more_kids_maybe', 'want_more_kids_no', 'want_more_kids_not_disclosed',
 'want_more_kids_yes',  'has_dogs_no', 'has_dogs_not_disclosed','has_dogs_yes',
 'like_dogs_no', 'like_dogs_not_disclosed', 'like_dogs_yes',  'has_cats_no',
 'has_cats_not_disclosed','has_cats_yes',  'like_cats_no', 'like_cats_not_disclosed',
 'like_cats_yes',  'religion_agnosticism', 'religion_atheism', 'religion_catholicism',
 'religion_christianity','religion_judaism', 'religion_not_disclosed','religion_other',
 'smokes_no', 'smokes_not_disclosed', 'smokes_yes']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.0702
Inertia (SSE): 260182.5113
Davies-Bouldin Score: 3.0105
Calinski-Harabasz Score: 2407.7145
Average Match Distance (AMD): 0.9226
CPU times: user 1min 11s, sys: 51.1 s, total: 2min 2s
Wall time: 38.9 s


| Model                               | Silhouette ↑ | Inertia (SSE) ↓ | Davies-Bouldin ↓ | Calinski-Harabasz ↑ | AMD ↓   | Score (Lower is Better) | Best Model Consideration                             | Compare to Previous Model                                      | Recommendation                             |
|-------------------------------------|--------------|-----------------|------------------|---------------------|---------|------------------------|-----------------------------------------------------|---------------------------------------------------------------|--------------------------------------------|
| **Model 1: First LDA Topic only**    | 0.5358       | 22.4772         | 0.5047           | 495648.7711         | 0.0000  | 1                      | Best Overall (Best Silhouette & low AMD)           | N/A                                                           | Keep (Strong performance)                  |
| **Model 2: Adding LDA 2 Topics**    | 0.5340       | 44.2808         | 0.5072           | 503201.5262         | 0.0000  | 2                      | Second Best (Good LDA performance)                 | Silhouette slightly decreased                                           | Keep (Still strong)                        |
| **Model 3: Adding Age**             | 0.3392       | 626.4378        | 0.8347           | 48362.3012          | 0.0014  | 3                      | Not Recommended (Lower Silhouette)                  | Significant drop in Silhouette                                        | Remove (Negative impact)                   |
| **Model 4: Adding Single**          | 0.3561       | 916.2438        | 0.8427           | 54818.5556          | 0.0022  | 4                      | Not Recommended (Lower Silhouette & higher AMD)     | Silhouette and AMD worsened                                            | Remove (Negative impact)                   |
| **Model 5: Adding Gender**          | 0.4059       | 1646.2831       | 0.7757           | 75799.0390          | 0.0037  | 5                      | Solid Option (Balanced performance)                 | Silhouette increased, AMD lower                                         | Keep (Positive effect)                     |
| **Model 6: Adding Orientation**     | 0.4214       | 4042.7787       | 0.8159           | 47682.7808          | 0.0063  | 6                      | Good Option (Strong Silhouette)                     | Good improvement in Silhouette                                          | Keep (Positive effect)                     |
| **Model 7: Adding Body Type**       | 0.4919       | 23761.3141      | 1.1433           | 14937.8450          | 0.0192  | 7                      | Not Recommended (High Davies-Bouldin & AMD)        | Silhouette increased, but high Davies-Bouldin                           | Remove (Negative impact)                   |
| **Model 8: Adding Diet Type**       | 0.2924       | 54981.2490      | 1.4925           | 6980.5454           | 0.0364  | 8                      | Not Recommended (Low Silhouette & High Davies-Bouldin) | Significant drop in Silhouette and high Davies-Bouldin                 | Remove (Negative impact)                   |
| **Model 9: Adding Drink**           | 0.1905       | 86119.1756      | 2.0949           | 4341.3648           | 0.0774  | 9                      | Not Recommended (Low Silhouette & High Davies-Bouldin) | Significant drop in Silhouette, High Davies-Bouldin                    | Remove (Negative impact)                   |
| **Model 10: Adding Has Kids**       | 0.1375       | 108120.9289     | 2.1793           | 3869.3748           | 0.1267  | 10                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Silhouette decreased, Davies-Bouldin increased                         | Remove (Negative impact)                   |
| **Model 11: Adding Wants Kids**     | 0.1022       | 125245.3447     | 2.2350           | 3566.3926           | 0.1921  | 11                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Significant drop in Silhouette, High Davies-Bouldin                    | Remove (Negative impact)                   |
| **Model 12: Adding Has Dog**        | 0.0787       | 139426.0870     | 2.4819           | 3256.4672           | 0.2513  | 12                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Further drop in Silhouette, High Davies-Bouldin                         | Remove (Negative impact)                   |
| **Model 13: Adding Likes Dog**      | 0.1049       | 155476.2675     | 2.3097           | 3367.3100           | 0.3092  | 13                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Drop in Silhouette and higher Davies-Bouldin                            | Remove (Negative impact)                   |
| **Model 14: Adding Likes Cat**      | 0.0835       | 192555.5129     | 2.6658           | 3084.4088           | 0.4575  | 14                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Silhouette slightly improved, but Inertia and Davies-Bouldin worsened   | Remove (Negative impact)                   |
| **Model 15: Adding Religion**       | 0.0825       | 234397.6699     | 2.9154           | 2680.5108           | 0.7785  | 15                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Silhouette decreased, Davies-Bouldin worsened                            | Remove (Negative impact)                   |
| **Model 16: Adding Smokes**         | 0.0702       | 260182.5113     | 3.0105           | 2407.7145           | 0.9226  | 16                     | Not Recommended (Low Silhouette & High Davies-Bouldin) | Further decrease in Silhouette, Davies-Bouldin worsened, AMD increased | Remove (Negative impact)                   |


### Keeping best perfomarnce and adding new ones

In [38]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.5743
Inertia (SSE): 386.3850
Davies-Bouldin Score: 0.4720
Calinski-Harabasz Score: 436722.6819
Average Match Distance (AMD): 0.0001
CPU times: user 1min 12s, sys: 54.7 s, total: 2min 6s
Wall time: 46 s


In [39]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.5987
Inertia (SSE): 18912.3684
Davies-Bouldin Score: 0.9301
Calinski-Harabasz Score: 23506.9959
Average Match Distance (AMD): 0.0011
CPU times: user 1min 10s, sys: 53.2 s, total: 2min 3s
Wall time: 42.7 s


In [42]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3864
Inertia (SSE): 38847.2780
Davies-Bouldin Score: 1.3281
Calinski-Harabasz Score: 11533.1687
Average Match Distance (AMD): 0.0065
CPU times: user 2min 22s, sys: 27.4 s, total: 2min 50s
Wall time: 2min 53s


In [43]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other', 'is_asian', 'is_white',
    'is_black','is_other','is_hispanic_latin','is_ethnicity_nan']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.1988
Inertia (SSE): 85417.1991
Davies-Bouldin Score: 1.9556
Calinski-Harabasz Score: 5126.4877
Average Match Distance (AMD): 0.0656
CPU times: user 2min 2s, sys: 17.8 s, total: 2min 20s
Wall time: 1min 54s


In [45]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other', 'text_length_scaled']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3375
Inertia (SSE): 45733.0714
Davies-Bouldin Score: 1.3540
Calinski-Harabasz Score: 9249.1515
Average Match Distance (AMD): 0.0282
CPU times: user 1min 15s, sys: 55.7 s, total: 2min 11s
Wall time: 47.1 s


In [46]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other', 'text_length_scaled']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3375
Inertia (SSE): 45733.0714
Davies-Bouldin Score: 1.3540
Calinski-Harabasz Score: 9249.1515
Average Match Distance (AMD): 0.0282
CPU times: user 1min 56s, sys: 33.5 s, total: 2min 30s
Wall time: 1min 48s


In [51]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other', 'text_length_scaled',  'height_scaled']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.3431
Inertia (SSE): 42265.7806
Davies-Bouldin Score: 1.3674
Calinski-Harabasz Score: 10623.3284
Average Match Distance (AMD): 0.0561
CPU times: user 1min 34s, sys: 48.2 s, total: 2min 23s
Wall time: 1min 1s


In [None]:
%%time
new=['female', 'topic_0_from_two', 'orientation_bisexual', 'orientation_gay',
 'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
    'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
    'education_status_not_disclosed', 'education_status_working',  'speaks_english',
    'speaks_spanish', 'speaks_other', 'text_length_scaled',  'height_scaled']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

In [66]:
%%time
new=['female', 'age_scaled', 'single','height_scaled','orientation_bisexual', 'orientation_gay',
     'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
     'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
     'education_status_not_disclosed', 'education_status_working',  'speaks_english', 'speaks_spanish', 
     'speaks_other''diet_type_vegetarian', 'has_dogs_yes', 'no_of_kids_more_than_one','no_of_kids_one',
     'topic_0_from_two','text_length_scaled']
X = ok[new]
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

Silhouette Score: 0.2209
Inertia (SSE): 67059.5664
Davies-Bouldin Score: 1.5017
Calinski-Harabasz Score: 6683.6448
Average Match Distance (AMD): 0.1647
CPU times: user 1min 6s, sys: 56.1 s, total: 2min 2s
Wall time: 39.9 s


| Model  | Silhouette ↑ | Inertia (SSE) ↓ | Davies-Bouldin ↓ | Calinski-Harabasz ↑ | AMD ↓ | Score (Lower is Better) | Best Model Consideration | Compare to Previous Model | Recommendation |
|--------|--------------|------------------|------------------|---------------------|-------|-------------------------|--------------------------|---------------------------|----------------|
| Model 1: LDA 1 + Gender + Orientation | 0.5743 | 386.3850 | 0.4720 | 436722.6819 | 0.0001 | 7 | Best Overall (Best Silhouette & low AMD) | Significant improvement | Keep (Strong performance) |
| Model 2: Adding Education | 0.5987 | 18912.3684 | 0.9301 | 23506.9959 | 0.0011 | 8 | Solid Option (Good improvement) | Silhouette increased | Keep (Positive effect) |
| Model 3: Adding Languages | 0.3864 | 38847.2780 | 1.3281 | 11533.1687 | 0.0065 | 9 | Positive Impact (Good Silhouette & AMD) | Silhouette increased, balanced performance | Keep (Positive effect) |
| Model 4: Adding Text Length | 0.3375 | 45733.0714 | 1.3540 | 9249.1515 | 0.0282 | 10 | Positive Impact (Good Silhouette & balanced performance) | Balanced performance | Keep (Positive effect) |
| Model 5: Adding Height | 0.3431 | 42265.7806 | 1.3674 | 10623.3284 | 0.0561 | 20 | Good Option (Improved Silhouette) | Silhouette increased | Keep (Positive effect) |
| Model 6: Adding Vegetarian | 0.2986 | 48124.1491 | 1.4833 | 9184.3819 | 0.0654 | 21 | Borderline, but Leaning Toward Keep | Better than some removed features | Keep (Conditional) |
| Model 7: Adding Has Dog | 0.2628 | 53159.2449 | 1.4843 | 8545.5971 | 0.0794 | 23 | Keep (Conditional) | Improved over previous evaluation | Keep (Positive effect) |
| Model 8: Adding Has Kids | 0.2665 | 57530.2317 | 1.4746 | 7929.0712 | 0.0959 | 25 | Keep (Conditional) | Slight improvement over 'Has Dog' | Keep (Positive effect) |
| Model 9: Adding Single | 0.2273 | 64094.6961 | 1.6373 | 6894.5560 | 0.1121 | 26 | Keep (Conditional) | Improved over 'Body Type - Athletic' | Keep (Positive effect) |
| Model 10: Adding Age | 0.2378 | 64639.9396 | 1.5695 | 6963.6418 | 0.1526 | 27 | Solid option (Lower Silhouette but improved Davies-Bouldin) | Slight drop in Silhouette | Keep (Conditional) |

## Function to run several Deep Collaborative Scoring Model (DBScam)

In [207]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import numpy as np
import pandas as pd

def run_dbscan_recommendation(ok, X, eps=0.5, min_samples=5):
    """
    Runs DBSCAN clustering and recommends the 5 most similar people per individual.
    Also computes evaluation metrics and visualizes clusters.

    Parameters:
        ok (pd.DataFrame): The original dataset (modified in place).
        X (pd.DataFrame or np.array): Feature matrix (already scaled).
        eps (float): The maximum distance for points to be considered neighbors.
        min_samples (int): The minimum number of samples to form a core point.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
    labels = dbscan.fit_predict(X)
    ok['cluster'] = labels  # Assign clusters

    # Dictionary to store the most similar people for each individual
    similar_people = {}
    for i in range(len(X)):
        label = labels[i]
        if label == -1:
            similar_people[i] = []  # Noise points have no cluster
            continue

        cluster_indices = np.where(labels == label)[0]  # Get indices of people in the same cluster
        cluster_distances = np.linalg.norm(X.iloc[cluster_indices].values - X.iloc[i].values, axis=1)  # Compute distances
        sorted_indices = cluster_indices[np.argsort(cluster_distances)]  # Sort by similarity

        similar_people[i] = sorted_indices[1:6].tolist()  # Exclude self and take top 5

    # Convert to DataFrame and save permanently
    similar_df = pd.DataFrame.from_dict(similar_people, orient='index')
    similar_df.columns = [f'db_match_{i+1}' for i in range(5)]
    
    ok.loc[:, similar_df.columns] = similar_df  # Save columns permanently to the dataset

    # Compute clustering evaluation metrics
    valid_mask = labels != -1  # Mask to filter out noise points
    X_valid = X.iloc[valid_mask] if isinstance(X, pd.DataFrame) else X[valid_mask]
    valid_labels = labels[valid_mask]  # Remove noise points
    
    if len(set(valid_labels)) > 1:  # Metrics require at least 2 clusters
        silhouette_avg = silhouette_score(X_valid, valid_labels)
        davies_bouldin = davies_bouldin_score(X_valid, valid_labels)
        calinski_harabasz = calinski_harabasz_score(X_valid, valid_labels)

        print(f"📌 Silhouette Score: {silhouette_avg:.4f} → Measures clustering cohesion")
        print(f"📌 Davies-Bouldin Score: {davies_bouldin:.4f} → Measures cluster separation (lower is better)")
        print(f"📌 Calinski-Harabasz Score: {calinski_harabasz:.4f} → Measures cluster quality (higher is better)")
    else:
        print("⚠️ Not enough clusters to compute metrics.")


In [208]:
run_dbscan_recommendation(ok, X, eps=0.5, min_samples=5)

📌 Silhouette Score: 0.6875 → Measures clustering cohesion
📌 Davies-Bouldin Score: 0.4939 → Measures cluster separation (lower is better)
📌 Calinski-Harabasz Score: 2607.6688 → Measures cluster quality (higher is better)


In [209]:
ok

Unnamed: 0,age_scaled,single,female,orientation_bisexual,orientation_gay,orientation_straight,body_type_athletic,body_type_average,body_type_curvy,body_type_fit,...,therapist_Managing and Increasing Energy Levels,therapist_Other,therapist_See and Understanding Conversations,therapist_Time Up and Future Meetings,cluster,db_match_1,db_match_2,db_match_3,db_match_4,db_match_5
0,0.078431,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,22670.0,36389.0,4295.0,44270.0,136.0
1,0.333333,1,0,0,0,1,0,1,0,0,...,1,0,0,0,1,36507.0,26895.0,58469.0,39768.0,55205.0
2,0.392157,0,0,0,0,1,0,0,0,0,...,0,1,0,0,2,50006.0,48841.0,12933.0,46852.0,42873.0
3,0.098039,1,0,0,0,1,0,0,0,0,...,0,1,0,0,3,7635.0,3356.0,48076.0,7123.0,28862.0
4,0.215686,1,0,0,0,1,1,0,0,0,...,0,0,0,0,4,109.0,4427.0,23631.0,33688.0,40679.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.803922,1,1,0,0,1,0,0,0,0,...,0,0,0,0,190,55574.0,33394.0,38258.0,39818.0,47175.0
59942,0.117647,1,0,0,0,1,0,0,0,1,...,0,0,0,1,0,28719.0,47895.0,41669.0,43969.0,31827.0
59943,0.470588,1,0,0,0,1,0,1,0,0,...,0,0,0,1,15,59176.0,5435.0,23857.0,10048.0,52493.0
59944,0.176471,1,0,0,0,1,1,0,0,0,...,0,1,0,0,0,52506.0,9766.0,40937.0,43761.0,55271.0


## Comparing models (KNN, Kmeans, DB Scam)

In [162]:
features=['female', 'age_scaled', 'single','height_scaled','orientation_bisexual', 'orientation_gay',
     'orientation_straight', 'education_type_college_univ', 'education_type_grad_or_professional_edu',
     'education_type_not_disclosed','education_type_two_year_college_or_less', 'education_status_graduated',
     'education_status_not_disclosed', 'education_status_working',  'speaks_english', 'speaks_spanish', 
     'speaks_other','diet_type_vegetarian', 'has_dogs_yes', 'no_of_kids_more_than_one','no_of_kids_one',
     'text_length_scaled']

In [163]:
X = ok[features]

In [164]:
ok

Unnamed: 0,age_scaled,single,female,orientation_bisexual,orientation_gay,orientation_straight,body_type_athletic,body_type_average,body_type_curvy,body_type_fit,...,general_Other,general_Social Life,therapis_label,therapist_real_label,therapist_grouped_label,therapist_Expressions of Happiness and Joy,therapist_Managing and Increasing Energy Levels,therapist_Other,therapist_See and Understanding Conversations,therapist_Time Up and Future Meetings
0,0.078431,1,0,0,0,1,0,0,0,0,...,0,1,0,Time Up and Future Meetings,Time Up and Future Meetings,0,0,0,0,1
1,0.333333,1,0,0,0,1,0,1,0,0,...,0,1,34,Managing and Increasing Energy Levels,Managing and Increasing Energy Levels,0,1,0,0,0
2,0.392157,0,0,0,0,1,0,0,0,0,...,0,1,18,Voices and Perception of Sound,Other,0,0,1,0,0
3,0.098039,1,0,0,0,1,0,0,0,0,...,0,0,14,Struggles and Desires in Learning,Other,0,0,1,0,0
4,0.215686,1,0,0,0,1,1,0,0,0,...,0,1,31,Expressions of Happiness and Joy,Expressions of Happiness and Joy,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.803922,1,1,0,0,1,0,0,0,0,...,0,1,31,Expressions of Happiness and Joy,Expressions of Happiness and Joy,1,0,0,0,0
59942,0.117647,1,0,0,0,1,0,0,0,1,...,0,1,0,Time Up and Future Meetings,Time Up and Future Meetings,0,0,0,0,1
59943,0.470588,1,0,0,0,1,0,1,0,0,...,0,1,0,Time Up and Future Meetings,Time Up and Future Meetings,0,0,0,0,1
59944,0.176471,1,0,0,0,1,1,0,0,0,...,0,1,33,Drinking Habits and Concerns,Other,0,0,1,0,0


### KNN

In [186]:
run_unsupervised_knn(ok, X, n_clusters=12, n_neighbors=6)

📌 Sum of Squared Errors (SSE): 65087.3645 → Measures overall spread
📌 Mean Average Distance (MAD): 0.1097 → Average distance to k-nearest neighbors
📌 Maximum Distance (MaxD): 1.7540 → Largest distance found (outlier detection)
📌 Minimum Distance (MinD): 0.0000 → Smallest distance found (possible duplicates)


### Kmeans

In [190]:
cluster_and_find_similar(dataset=ok, X=X, n_clusters=12, n_similar=5)

📌 Silhouette Score: 0.2430 → Measures clustering cohesion
📌 Sum of Squared Errors (SSE): 65087.3645 → Measures overall spread
📌 Davies-Bouldin Score: 1.7160 → Measures cluster separation (lower is better)
📌 Calinski-Harabasz Score: 6705.3753 → Measures cluster quality (higher is better)
📌 Average Match Distance (AMD): 0.1099 → Measures how close recommended matches are


### DB Scam

In [174]:
run_dbscan_recommendation(ok, X, eps=0.5, min_samples=5)

Silhouette Score: 0.6875
Davies-Bouldin Score: 0.4939
Calinski-Harabasz Score: 2607.6688


Unnamed: 0,age_scaled,single,female,orientation_bisexual,orientation_gay,orientation_straight,body_type_athletic,body_type_average,body_type_curvy,body_type_fit,...,kmeans_match_1,kmeans_match_2,kmeans_match_3,kmeans_match_4,kmeans_match_5,similar_1,similar_2,similar_3,similar_4,similar_5
0,0.078431,1,0,0,0,1,0,0,0,0,...,22670,36389,4295,44270,136,22670.0,36389.0,4295.0,44270.0,136.0
1,0.333333,1,0,0,0,1,0,1,0,0,...,36507,26895,58469,39768,55205,36507.0,26895.0,58469.0,39768.0,55205.0
2,0.392157,0,0,0,0,1,0,0,0,0,...,50006,48841,12933,46852,42873,50006.0,48841.0,12933.0,46852.0,42873.0
3,0.098039,1,0,0,0,1,0,0,0,0,...,7635,3356,48076,7123,28862,7635.0,3356.0,48076.0,7123.0,28862.0
4,0.215686,1,0,0,0,1,1,0,0,0,...,109,4427,23631,33688,40679,109.0,4427.0,23631.0,33688.0,40679.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.803922,1,1,0,0,1,0,0,0,0,...,55574,33394,38258,39818,47175,55574.0,33394.0,38258.0,39818.0,47175.0
59942,0.117647,1,0,0,0,1,0,0,0,1,...,28719,47895,41669,43969,31827,28719.0,47895.0,41669.0,43969.0,31827.0
59943,0.470588,1,0,0,0,1,0,1,0,0,...,59176,5435,23857,10048,52493,59176.0,5435.0,23857.0,10048.0,52493.0
59944,0.176471,1,0,0,0,1,1,0,0,0,...,52506,9766,40937,43761,55271,52506.0,9766.0,40937.0,43761.0,55271.0


In [None]:
ok

### Model Comparison Table

| Model             | Silhouette ↑ | Inertia (SSE) ↓ | Davies-Bouldin ↓ | Calinski-Harabasz ↑ | MAD ↓  | MaxD ↓  | MinD ↓  | Score (Lower is Better) |
|-------------------|--------------|-----------------|------------------|---------------------|--------|---------|---------|------------------------|
| **Model 1: KNN**  | N/A          | 13462.4987      | N/A              | N/A                 | 0.1097 | 1.7540  | 0.0000  | **5** (Outlier Detection) |
| **Model 2: KMeans** | 0.2430      | 65087.3645      | 1.7160           | 6705.3753           | N/A    | N/A     | N/A     | **6** (Cluster Quality)    |
| **Model 3: DBSCAN** | 0.6875      | N/A             | 0.4939           | 2607.6688           | N/A    | N/A     | N/A     | **4** (Best Overall)       |

### Best Model:
1️⃣ **Model 3 (DBSCAN)** – Best overall, with the highest Silhouette score and lowest Davies-Bouldin score.  
2️⃣ **Model 2 (KMeans)** – A solid choice, though with lower performance than DBSCAN.  
3️⃣ **Model 1 (KNN)** – The least favorable based on clustering metrics (e.g., no Silhouette or Davies-Bouldin score).

### How to choose?  
- If you prioritize **tight clusters & balanced performance** → **Model 3 (DBSCAN)**.  
- If you prefer **a more traditional clustering method** with more interpretable metrics → **Model 2 (KMeans)**.  
- If you are focused on **outlier detection and KNN** → **Model 1 (KNN)**.


## Function to run several Bayesian Personalized Ranking (BPR) Model

In [12]:
x.shape

(59946, 111)

In [51]:
import implicit
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd

def run_bpr_and_recommend(X, n_users, n_items, factors=10, iterations=50):
    """
    Runs the Bayesian Personalized Ranking (BPR) model and recommends similar individuals.
    
    Parameters:
    - X: A user-item interaction DataFrame (users as rows, items as columns).
    - n_users: Number of unique users.
    - n_items: Number of unique items.
    - factors: Number of latent factors for the BPR model (default=10).
    - iterations: Number of iterations to train the BPR model (default=50).
    
    Returns:
    - recommendations: A DataFrame of user recommendations.
    """
    # Convert the DataFrame to a sparse CSR matrix
    interaction_matrix = csr_matrix(X.values.astype('float32'))

    # Debugging: Check the shape of the interaction matrix
    print("Shape of interaction_matrix:", interaction_matrix.shape)

    # Initialize the BPR model
    bpr_model = implicit.bpr.BayesianPersonalizedRanking(factors=factors, iterations=iterations)

    # Train the BPR model
    bpr_model.fit(interaction_matrix)

    # Generate recommendations for each user
    recommendations = {}
    for user in range(n_users):
        # Ensure the user_items matrix is in CSR format
        user_items_csr = interaction_matrix.tocsr()
        user_recommendations = bpr_model.recommend(user, user_items_csr, N=10)  # Top 10 recommendations
        recommendations[user] = user_recommendations[0]  # Extract item IDs

    # Convert recommendations to DataFrame
    recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index', columns=[f'Rec_{i+1}' for i in range(10)])

    return recommendations_df

In [52]:
run_bpr_and_recommend(X=x, n_users=59946, n_items=111, factors=10, iterations=50)

Shape of interaction_matrix: (59946, 111)


  0%|          | 0/50 [00:00<?, ?it/s]

ValueError: user_items must contain 1 row for every user in userids