# Training Model

## Importing packages

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

## Loading datasets

In [62]:
# Cleaned profile table; the relative path is resolved from the notebook's working directory.
ok = pd.read_csv('../raw_data/ok_clean.csv')

In [63]:
# keep_default_na=False: empty cells and "NA"-like strings stay as text instead of becoming NaN.
topics = pd.read_csv(
    '../raw_data/text_and_topics.csv',
    keep_default_na=False,
)

In [64]:
# Column-wise join of the two tables. pd.concat aligns rows on the index, so both frames
# are expected to share it (presumably the default RangeIndex from read_csv — TODO confirm
# that both files have the same number of rows).
# NOTE(review): re-running this cell appends the `topics` columns a second time.
ok = pd.concat([ok, topics], axis='columns')

## Defining X vector with features

In [26]:
# Column positions 1 through 106 (position 0 is left out).
num_cat = ok.iloc[:, 1:107]

In [27]:
# Column positions 116 through 120 (five columns; presumably the 5 LDA topic weights — TODO confirm).
text = ok.iloc[:, 116:121]

In [28]:
# Feature matrix for the neighbour search: both column blocks side by side.
x = pd.concat([num_cat, text], axis='columns')

## Baseline KNN Model

### Define KNN (nearest-neighbour search)

#### Not considering sexual orientation

In [32]:
# `x` is the combined feature matrix built above (both column blocks side by side).
n_matches = 5  # Number of matches stored per person

# Ask for exactly `n_matches` neighbours. Calling kneighbors() WITHOUT a query matrix makes
# scikit-learn return the neighbours of every fitted point while leaving the point itself
# out. The previous "request 6 and drop column 0" approach assumed each row is always its
# own first neighbour, which is not guaranteed when several rows tie at distance 0
# (e.g. duplicated profiles) — in that case a person could be stored as their own match.
knn = NearestNeighbors(n_neighbors=n_matches, metric='cosine', n_jobs=-1)
knn.fit(x)

# Both arrays have one row per person and `n_matches` columns, nearest neighbour first.
distances, indices = knn.kneighbors()

# Store the closest matches: row position of the neighbour in `x` and its cosine distance.
for i in range(n_matches):
    ok[f'match_{i+1}_index'] = indices[:, i]  # Row index
    ok[f'match_{i+1}_distance'] = distances[:, i]  # Distance score

#### Taking orientation into account

In [39]:
n_neighbors = 20  # Candidate pool per user (self included); larger than 5 because the filter below discards candidates
n_matches = 5     # Matches stored per user

# Fit KNN model
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
knn.fit(x)

# Nearest neighbours of every row. Column 0 is treated as the row itself and skipped below.
# NOTE(review): that holds as long as no two rows are identical; with exact duplicates the
# row itself may not be in column 0.
distances, indices = knn.kneighbors(x)

# Read the profile columns once as plain arrays. KNN returns row POSITIONS, so positional
# array access is the matching lookup (label-based .loc only agrees with it on a default
# RangeIndex) and it avoids one slow scalar DataFrame lookup per candidate.
female_arr = ok['female'].to_numpy()  # 1 = Female, 0 = Male
bisexual_arr = ok['orientation_bisexual'].to_numpy() == 1
gay_arr = ok['orientation_gay'].to_numpy() == 1
straight_arr = ok['orientation_straight'].to_numpy() == 1


def get_valid_matches(idx):
    """Return up to `n_matches` (neighbour_position, distance) pairs for the user at row `idx`.

    Candidates are visited nearest-first. A candidate is kept when the user is bisexual,
    or straight and the candidate has a different `female` value, or gay and the candidate
    has the same `female` value.

    NOTE(review): only the user's own orientation is checked; the candidate's orientation
    is not consulted, so a match is not guaranteed to be mutual.
    """
    valid_matches = []
    for neighbor_idx, dist in zip(indices[idx, 1:], distances[idx, 1:]):  # Exclude self
        same_gender = female_arr[idx] == female_arr[neighbor_idx]
        if bisexual_arr[idx] or (straight_arr[idx] and not same_gender) or (gay_arr[idx] and same_gender):
            valid_matches.append((neighbor_idx, dist))
            if len(valid_matches) == n_matches:  # Stop once enough valid matches are found
                break
    return valid_matches


# Pre-fill with NaN so users with fewer than `n_matches` valid candidates keep NaN in the
# unused slots, and every column is numeric even if a slot is empty for all users.
match_indices = np.full((len(ok), n_matches), np.nan)
match_distances = np.full((len(ok), n_matches), np.nan)

for idx in range(len(ok)):
    for slot, (neighbor_idx, dist) in enumerate(get_valid_matches(idx)):
        match_indices[idx, slot] = neighbor_idx
        match_distances[idx, slot] = dist

# Save matches to the dataset
for i in range(n_matches):
    ok[f'match_{i+1}_index'] = match_indices[:, i]
    ok[f'match_{i+1}_distance'] = match_distances[:, i]

In [41]:
ok

Unnamed: 0,age_scaled,single,female,orientation_bisexual,orientation_gay,orientation_straight,body_type_athletic,body_type_average,body_type_curvy,body_type_fit,...,match_1_index,match_1_distance,match_2_index,match_2_distance,match_3_index,match_3_distance,match_4_index,match_4_distance,match_5_index,match_5_distance
0,0.078431,1,0,0,0,1,0,0,0,0,...,51038.0,3.811792,11098.0,3.918768,,,,,,
1,0.333333,1,0,0,0,1,0,1,0,0,...,55336.0,3.679834,46448.0,3.905600,,,,,,
2,0.392157,0,0,0,0,1,0,0,0,0,...,12169.0,3.769973,31921.0,3.792117,30799.0,3.907568,38045.0,3.919192,56791.0,3.948018
3,0.098039,1,0,0,0,1,0,0,0,0,...,52475.0,3.240542,1712.0,3.475727,34137.0,3.653172,53227.0,3.663476,,
4,0.215686,1,0,0,0,1,1,0,0,0,...,36386.0,3.106583,34667.0,3.271129,32466.0,3.298720,44081.0,3.375794,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.803922,1,1,0,0,1,0,0,0,0,...,50126.0,3.353490,48181.0,3.401199,36020.0,3.477639,,,,
59942,0.117647,1,0,0,0,1,0,0,0,1,...,59896.0,3.467010,52117.0,3.613204,16296.0,3.614896,46448.0,3.618499,59387.0,3.621909
59943,0.470588,1,0,0,0,1,0,1,0,0,...,49716.0,3.409213,51574.0,3.487132,18765.0,3.546465,,,,
59944,0.176471,1,0,0,0,1,1,0,0,0,...,17656.0,3.733573,47273.0,3.823109,27294.0,3.825668,11418.0,3.831555,3913.0,3.844830


### Evaluating the model

#### Internal Clustering Metrics (to evaluate similarity structure)

##### Silhouette Score, Davies-Bouldin Index, Calinski-Harabasz Index

In [51]:
# All three scores are computed from the feature matrix `x` and the labels in ok['cluster'];
# the KNN matches are not an input to any of them.
# NOTE(review): ok['cluster'] is not created in any cell shown above — presumably it comes
# from one of the loaded CSV files or from a removed cell; confirm before a fresh run.
cluster_labels = ok['cluster']

# Silhouette: how well each point sits inside its own cluster, from -1 to 1 (higher is better).
silhouette_avg = silhouette_score(x, cluster_labels)

# Davies-Bouldin: average similarity between clusters (lower is better).
db_index = davies_bouldin_score(x, cluster_labels)

# Calinski-Harabasz: separation between clusters (higher is better).
ch_score = calinski_harabasz_score(x, cluster_labels)

print(f'Silhouette Score: {silhouette_avg:.4f}')
print(f'Davies-Bouldin Index: {db_index:.4f}')
print(f'Calinski-Harabasz Score: {ch_score:.4f}')

Silhouette Score: 0.0206
Davies-Bouldin Index: 3.6780
Calinski-Harabasz Score: 688.5955


#### Recommendation-Based Metrics (to evaluate user matches)

##### Average Distance of Matches

In [53]:
# How close the stored matches are in feature space (lower = more similar users).
# Each slot's column mean is taken first (pandas skips NaN there), then the five
# slot means are averaged with equal weight.
distance_columns = [f'match_{slot}_distance' for slot in range(1, 6)]
per_slot_means = [ok[column].mean() for column in distance_columns]

avg_match_distance = np.mean(per_slot_means)
print(f'Average Match Distance: {avg_match_distance:.4f}')

Average Match Distance: 3.2456


##### Gender & Orientation Consistency

In [45]:
# Share of stored matches that satisfy the gender/orientation rule.
# NOTE: this re-applies the same rule that was used to select the matches, so it is a
# sanity check on the stored columns (100% is expected), not an independent quality metric.

# Matches are stored as row positions, so look the flags up positionally in plain arrays.
female_flags = ok['female'].to_numpy()
bisexual_flags = ok['orientation_bisexual'].to_numpy() == 1
gay_flags = ok['orientation_gay'].to_numpy() == 1
straight_flags = ok['orientation_straight'].to_numpy() == 1

valid_suggestions = 0
total_suggestions = 0

for j in range(5):  # Check all 5 match slots
    # Empty slots may be stored as NaN or None; coerce so both end up as NaN.
    match_pos = pd.to_numeric(ok[f'match_{j+1}_index'], errors='coerce').to_numpy(dtype=float)

    user_pos = np.flatnonzero(~np.isnan(match_pos))  # Users that have a match in this slot
    partner_pos = match_pos[user_pos].astype(int)

    same_gender = female_flags[user_pos] == female_flags[partner_pos]
    follows_rule = (
        bisexual_flags[user_pos]
        | (straight_flags[user_pos] & ~same_gender)
        | (gay_flags[user_pos] & same_gender)
    )

    valid_suggestions += int(follows_rule.sum())
    total_suggestions += len(user_pos)

# Avoid ZeroDivisionError when no match was stored at all; report NaN in that case.
match_accuracy = valid_suggestions / total_suggestions if total_suggestions else float('nan')
print(f'Gender/Orientation Match Accuracy: {match_accuracy:.2%}')


Gender/Orientation Match Accuracy: 100.00%


#### Summarize metric results

Metrics Breakdown
1️⃣ Silhouette Score: 0.0206 (Poor)
This is very low, meaning clusters are not well-separated and may have significant overlap.
Ideal values are closer to 1.0 (well-clustered) and should at least be above 0.2–0.3 for meaningful separation.
2️⃣ Davies-Bouldin Index: 3.6780 (High = Bad)
Lower is better. A high DB index means clusters are not distinct, indicating high within-cluster variance.
Values closer to 1.0 or lower are preferable.
3️⃣ Calinski-Harabasz Score: 688.5955 (Higher is Better)
Measures cluster separation & cohesion (higher is better).
The value itself isn't bad, but it should be interpreted alongside Silhouette & DB Index.
4️⃣ Average Match Distance: 3.2456
Measures how far suggested matches are in feature space.
Lower is better, ideally below 2.0.


## Function to run several KNN Models

In [8]:
def _orientation_rule(user_pos, partner_pos, female, bisexual, gay, straight):
    """Vectorised matching rule: True where `partner_pos` is acceptable for `user_pos`.

    A partner is accepted when the user is bisexual, or straight and the partner has a
    different `female` value, or gay and the partner has the same `female` value.
    Only the user's orientation is consulted. The position arrays must broadcast together.
    """
    same_gender = female[user_pos] == female[partner_pos]
    return bisexual[user_pos] | (straight[user_pos] & ~same_gender) | (gay[user_pos] & same_gender)


def run_knn_and_evaluate(k, x, metric='euclidean', data=None, n_matches=5):
    """
    Runs a KNN model, keeps the nearest neighbours that satisfy the gender/orientation
    rule, stores them on the profile table and calculates evaluation metrics.

    Parameters:
        k (int): Number of neighbours requested from KNN. The first neighbour of each row
            is treated as the row itself and dropped, so at most k - 1 candidates are
            filtered per user.
        x (array-like): Feature vectors for the KNN model, one row per row of `data`.
        metric (str): Distance metric for KNN (default: 'euclidean').
        data (DataFrame, optional): Profile table with the columns 'female',
            'orientation_bisexual', 'orientation_gay', 'orientation_straight' and
            'cluster'. Defaults to the notebook-level `ok` frame.
        n_matches (int): Number of matches stored per user (default: 5).

    Side effects:
        Overwrites the columns `match_<i>_index` / `match_<i>_distance`
        (i = 1..n_matches) on `data`. Unfilled slots hold NaN.

    Returns:
        dict: A dictionary containing evaluation metrics.

    Raises:
        ValueError: If `x` and `data` do not have the same number of rows.

    Notes:
        The three clustering scores are computed from `x` and data['cluster'] only, so
        they do not depend on `k` or on the matches. The match accuracy re-applies the
        rule used for filtering, so it is a sanity check rather than an independent metric.
    """
    frame = ok if data is None else data

    # Fit KNN model and get the nearest neighbours (including self)
    knn = NearestNeighbors(n_neighbors=k, metric=metric)
    knn.fit(x)
    distances, indices = knn.kneighbors(x)

    if indices.shape[0] != len(frame):
        raise ValueError(
            f'x has {indices.shape[0]} rows but the profile table has {len(frame)} rows'
        )

    # KNN returns row positions, so the flags are read once as arrays and indexed
    # positionally instead of doing one scalar DataFrame lookup per candidate.
    female = frame['female'].to_numpy()  # 1 = Female, 0 = Male
    bisexual = frame['orientation_bisexual'].to_numpy() == 1
    gay = frame['orientation_gay'].to_numpy() == 1
    straight = frame['orientation_straight'].to_numpy() == 1

    # Drop column 0 (self); candidates stay ordered nearest-first within each row.
    candidates = indices[:, 1:]
    candidate_distances = distances[:, 1:]
    all_users = np.arange(len(frame))[:, None]
    compatible = _orientation_rule(all_users, candidates, female, bisexual, gay, straight)

    # 0-based rank of every compatible candidate within its row; keep the first n_matches.
    rank = np.cumsum(compatible, axis=1) - 1
    rows, cols = np.nonzero(compatible & (rank < n_matches))
    slots = rank[rows, cols]

    # Pre-fill with NaN so users with too few valid candidates keep NaN in unused slots.
    match_indices = np.full((len(frame), n_matches), np.nan)
    match_distances = np.full((len(frame), n_matches), np.nan)
    match_indices[rows, slots] = candidates[rows, cols]
    match_distances[rows, slots] = candidate_distances[rows, cols]

    # Save matches to the dataset
    for i in range(n_matches):
        frame[f'match_{i+1}_index'] = match_indices[:, i]
        frame[f'match_{i+1}_distance'] = match_distances[:, i]

    # Evaluation Metrics
    cluster_labels = frame['cluster']
    silhouette_avg = silhouette_score(x, cluster_labels)
    db_index = davies_bouldin_score(x, cluster_labels)
    ch_score = calinski_harabasz_score(x, cluster_labels)
    # Mean of the per-slot means (each slot's mean skips its NaN entries).
    avg_match_distance = np.mean(
        [frame[f'match_{i+1}_distance'].mean() for i in range(n_matches)]
    )

    # Gender/Orientation Match Accuracy over every stored (non-NaN) match
    matched_users, matched_slots = np.nonzero(~np.isnan(match_indices))
    partners = match_indices[matched_users, matched_slots].astype(int)
    valid_suggestions = int(
        _orientation_rule(matched_users, partners, female, bisexual, gay, straight).sum()
    )
    total_suggestions = len(matched_users)

    # Guard against ZeroDivisionError when no user received any match (e.g. k == 1).
    if total_suggestions:
        gender_orientation_accuracy = valid_suggestions / total_suggestions
    else:
        gender_orientation_accuracy = float('nan')

    # Return metrics as a dictionary
    metrics = {
        'Silhouette Score': silhouette_avg,
        'Davies-Bouldin Index': db_index,
        'Calinski-Harabasz Score': ch_score,
        'Average Match Distance': avg_match_distance,
        'Gender/Orientation Match Accuracy': gender_orientation_accuracy
    }

    return metrics

### Model 1: Baseline 5 LDA Topics, all 111 features

In [56]:
# 20 neighbours requested per user, Euclidean distance, on the current feature matrix `x`.
run_knn_and_evaluate(k=20, x=x, metric='euclidean')

{'Silhouette Score': 0.020581739133098898,
 'Davies-Bouldin Index': 3.6779848197346308,
 'Calinski-Harabasz Score': 688.5955142538597,
 'Average Match Distance': 3.245550940371514,
 'Gender/Orientation Match Accuracy': 1.0}

### Model 2: 2 LDA Topics, all 111 features

In [62]:
# Same numeric/categorical block as before (column positions 1 through 106) ...
num_cat = ok.iloc[:, 1:107]
# ... combined with the two columns at positions 121 and 122
# (presumably the 2-topic LDA weights — TODO confirm against the CSV header).
text = ok.iloc[:, 121:123]
x = pd.concat([num_cat, text], axis='columns')

In [63]:
# Same settings as Model 1; only the feature matrix `x` differs.
run_knn_and_evaluate(k=20, x=x, metric='euclidean')

{'Silhouette Score': 0.020845198435285095,
 'Davies-Bouldin Index': 3.6294405236603535,
 'Calinski-Harabasz Score': 696.1035073024036,
 'Average Match Distance': 3.1157450996331484,
 'Gender/Orientation Match Accuracy': 1.0}

### Model 3: 2 Word2vec_embeddings_scaled, all 111 features

In [66]:
# Clean the string representation
ok['word2vec_embeddings_scaled'] = (
    ok['word2vec_embeddings_scaled']
    .str.replace(r'\s+', ',', regex=True)  # Replace multiple spaces with a single comma
    .str.replace(',,', ',')  # Replace double commas with a single comma
    .str.strip('[]')  # Remove square brackets
)

In [67]:
from ast import literal_eval

# Parse each cleaned string as a Python literal. A bare comma-separated run of numbers
# evaluates to a tuple of floats.
embedding_strings = ok['word2vec_embeddings_scaled']
ok['word2vec_embeddings_scaled'] = embedding_strings.map(literal_eval)

In [68]:
# Convert the list to a NumPy array
# One NumPy array per row, built from the parsed sequence of floats.
parsed_embeddings = ok['word2vec_embeddings_scaled']
ok['word2vec_embeddings_scaled'] = parsed_embeddings.map(np.array)

In [69]:
 # Replace with your actual feature names
# Stack the per-row embedding arrays into one 2D matrix (rows = users).
embedding_rows = ok['word2vec_embeddings_scaled'].tolist()
X_word2vec = np.array(embedding_rows)

# Remaining features come from the current `x` frame, as a plain array.
X_other = x.values

# Embedding columns first, then the other features, side by side.
X_combined = np.hstack((X_word2vec, X_other))

In [55]:
# NOTE(review): this cell's execution count (55) is lower than that of the cell above that
# builds X_combined (69), so it appears to have been run BEFORE it. On a top-to-bottom run
# the `x` used for X_other would be a different one — consider moving this cell up.
num_cat = ok.iloc[:, 1:107]
# Single column at position 112 (contents not visible from here — TODO confirm which feature this is).
text = ok.iloc[:, 112:113]
x = pd.concat([num_cat, text], axis='columns')

In [74]:
#run_knn_and_evaluate(20, X_combined, metric='euclidean')

In [71]:
# Both blocks must have one row per user before they are stacked side by side.
# `X_word2vec_padded` is not defined anywhere in this notebook (NameError on a fresh
# kernel); the embedding matrix that is actually built and stacked is `X_word2vec`.
print(X_word2vec.shape[0] == X_other.shape[0])

True


In [72]:
print(X_combined.shape)

(59946, 407)


In [73]:
print(ok['word2vec_embeddings_scaled'].head())

0    [0.76321473, 0.64034454, 0.58260717, 0.1941664...
1    [0.65697592, 0.62005705, 0.59067174, 0.1890259...
2    [0.70801798, 0.64246498, 0.52894441, 0.1918418...
3    [0.65878697, 0.67627868, 0.53726614, 0.2329434...
4    [0.62431777, 0.69718922, 0.49372216, 0.2371102...
Name: word2vec_embeddings_scaled, dtype: object
