# Training Model

## Importing packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
import implicit
from scipy.sparse import csr_matrix

## Loading datasets

In [2]:
ok=pd.read_csv('../raw_data/ok_clean.csv')

In [3]:
topics = pd.read_csv('../raw_data/text_and_topics.csv', keep_default_na=False)

In [4]:
ok = pd.concat([ok, topics], axis=1)

## Defining X vector with features

In [5]:
num_cat=ok.iloc[:,1:107]

In [6]:
text=ok.iloc[:,116:121]

In [7]:
x = pd.concat([num_cat, text], axis=1)

In [None]:
n_neighbors = 20  # Get more matches to filter by gender & orientation

# Fit KNN model
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
knn.fit(x)

# Get the nearest neighbors (including self)
distances, indices = knn.kneighbors(x)

# Function to filter based on orientation & gender
def get_valid_matches(idx):
    user_gender = ok.loc[idx, 'female']  # 1 = Female, 0 = Male
    is_bisexual = ok.loc[idx, 'orientation_bisexual'] == 1
    is_gay = ok.loc[idx, 'orientation_gay'] == 1
    is_straight = ok.loc[idx, 'orientation_straight'] == 1

    valid_matches = []
    count = 0

    for neighbor_idx, dist in zip(indices[idx, 1:], distances[idx, 1:]):  # Exclude self
        neighbor_gender = ok.loc[neighbor_idx, 'female']

        # Matching logic
        if is_bisexual or (is_straight and user_gender != neighbor_gender) or (is_gay and user_gender == neighbor_gender):
            valid_matches.append((neighbor_idx, dist))
            count += 1
            if count == 5:  # Stop after 5 valid matches
                break
    
    return valid_matches

# Apply filtering for all users
match_indices = []
match_distances = []

for idx in range(len(ok)):
    matches = get_valid_matches(idx)
    
    # If not enough matches, fill with NaN
    while len(matches) < 5:
        matches.append((None, None))

    match_indices.append([m[0] for m in matches])
    match_distances.append([m[1] for m in matches])

# Save matches to the dataset
for i in range(5):
    ok[f'match_{i+1}_index'] = [row[i] for row in match_indices]
    ok[f'match_{i+1}_distance'] = [row[i] for row in match_distances]

In [41]:
ok

Unnamed: 0,age_scaled,single,female,orientation_bisexual,orientation_gay,orientation_straight,body_type_athletic,body_type_average,body_type_curvy,body_type_fit,...,match_1_index,match_1_distance,match_2_index,match_2_distance,match_3_index,match_3_distance,match_4_index,match_4_distance,match_5_index,match_5_distance
0,0.078431,1,0,0,0,1,0,0,0,0,...,51038.0,3.811792,11098.0,3.918768,,,,,,
1,0.333333,1,0,0,0,1,0,1,0,0,...,55336.0,3.679834,46448.0,3.905600,,,,,,
2,0.392157,0,0,0,0,1,0,0,0,0,...,12169.0,3.769973,31921.0,3.792117,30799.0,3.907568,38045.0,3.919192,56791.0,3.948018
3,0.098039,1,0,0,0,1,0,0,0,0,...,52475.0,3.240542,1712.0,3.475727,34137.0,3.653172,53227.0,3.663476,,
4,0.215686,1,0,0,0,1,1,0,0,0,...,36386.0,3.106583,34667.0,3.271129,32466.0,3.298720,44081.0,3.375794,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.803922,1,1,0,0,1,0,0,0,0,...,50126.0,3.353490,48181.0,3.401199,36020.0,3.477639,,,,
59942,0.117647,1,0,0,0,1,0,0,0,1,...,59896.0,3.467010,52117.0,3.613204,16296.0,3.614896,46448.0,3.618499,59387.0,3.621909
59943,0.470588,1,0,0,0,1,0,1,0,0,...,49716.0,3.409213,51574.0,3.487132,18765.0,3.546465,,,,
59944,0.176471,1,0,0,0,1,1,0,0,0,...,17656.0,3.733573,47273.0,3.823109,27294.0,3.825668,11418.0,3.831555,3913.0,3.844830


## Function to run several KNN Models

In [9]:
def knn_and_evaluate(dataset=ok, X=None, n_neighbors=10):
    """
    Runs a KNN model and calculates evaluation metrics, including average match distance
    and clustering metrics based on pseudo-clusters.
    
    Parameters:
    - dataset: DataFrame, the dataset to use (default is 'ok')
    - X: DataFrame, the feature vector
    - n_neighbors: int, number of similar individuals to return per person (default=10)
    
    Returns:
    - Updated DataFrame with the most similar individuals per person
    - Prints model evaluation metrics
    """
    if X is None:
        raise ValueError("Feature vector X must be provided")
    
    # Fit KNN model
    knn = NearestNeighbors(n_neighbors=n_neighbors + 1, metric='euclidean')
    knn.fit(X)  # Fit on the feature vectors
    
    # Find the nearest neighbors
    distances, indices = knn.kneighbors(X)
    
    # Create a dictionary to store the most similar people for each individual
    similar_people = {}
    avg_match_distances = []
    
    for i, index in enumerate(dataset.index):
        # Exclude self (index 0), as it's included in the nearest neighbors
        similar_people[index] = indices[i][1:].tolist()  # Exclude the person itself
        
        # Compute average match distance for each person
        avg_distance = np.mean(distances[i][1:])  # Exclude self
        avg_match_distances.append(avg_distance)
    
    # Compute overall Average Match Distance (AMD)
    overall_avg_match_distance = np.mean(avg_match_distances)
    print(f"Average Match Distance (AMD): {overall_avg_match_distance:.4f}")
    
    # Convert the dictionary into a DataFrame
    similar_df = pd.DataFrame.from_dict(similar_people, orient='index')
    similar_df.columns = [f'similar_{i+1}' for i in range(n_neighbors)]
    
    # Merge with the original dataset
    dataset = dataset.merge(similar_df, left_index=True, right_index=True)
    
    # Use the nearest neighbor indices to create pseudo-clusters
    pseudo_clusters = indices[:, 0]  # Use the nearest neighbor as the pseudo-cluster label
    
    # Compute clustering metrics
    silhouette_avg = silhouette_score(X, pseudo_clusters)
    davies_bouldin = davies_bouldin_score(X, pseudo_clusters)
    calinski_harabasz = calinski_harabasz_score(X, pseudo_clusters)
    
    print(f"Silhouette Score: {silhouette_avg:.4f}")
    print(f"Davies-Bouldin Score: {davies_bouldin:.4f}")
    print(f"Calinski-Harabasz Score: {calinski_harabasz:.4f}")
    
    # Return updated dataset and evaluation metrics
    metrics = {
        'Silhouette Score': silhouette_avg,
        'Davies-Bouldin Score': davies_bouldin,
        'Calinski-Harabasz Score': calinski_harabasz,
        'Average Match Distance': overall_avg_match_distance
    }
    
    return 

In [None]:
%%time
knn_and_evaluate(dataset=ok, X=x, n_neighbors=5)

Average Match Distance (AMD): 2.9985


### Model 1: Baseline 5 LDA Topics, all 111 features

In [56]:
run_knn_and_evaluate(20, x, metric='euclidean')

{'Silhouette Score': 0.020581739133098898,
 'Davies-Bouldin Index': 3.6779848197346308,
 'Calinski-Harabasz Score': 688.5955142538597,
 'Average Match Distance': 3.245550940371514,
 'Gender/Orientation Match Accuracy': 1.0}

### Model 2: 2 LDA Topics, all 111 features)

In [62]:
text=ok.iloc[:,121:123]
num_cat=ok.iloc[:,1:107]
x = pd.concat([num_cat, text], axis=1)

In [63]:
run_knn_and_evaluate(20, x, metric='euclidean')

{'Silhouette Score': 0.020845198435285095,
 'Davies-Bouldin Index': 3.6294405236603535,
 'Calinski-Harabasz Score': 696.1035073024036,
 'Average Match Distance': 3.1157450996331484,
 'Gender/Orientation Match Accuracy': 1.0}

### Model 3: 2 Word2vec_embeddings_scaled, all 111 features)

In [66]:
# Clean the string representation
ok['word2vec_embeddings_scaled'] = (
    ok['word2vec_embeddings_scaled']
    .str.replace(r'\s+', ',', regex=True)  # Replace multiple spaces with a single comma
    .str.replace(',,', ',')  # Replace double commas with a single comma
    .str.strip('[]')  # Remove square brackets
)

In [67]:
from ast import literal_eval

# Convert the string representations to lists of floats
ok['word2vec_embeddings_scaled'] = ok['word2vec_embeddings_scaled'].apply(literal_eval)

In [68]:
# Convert the list to a NumPy array
ok['word2vec_embeddings_scaled'] = ok['word2vec_embeddings_scaled'].apply(np.array)

In [69]:
 # Replace with your actual feature names
X_word2vec = np.array(ok['word2vec_embeddings_scaled'].tolist())  # Convert to 2D array
X_other = x.values 
X_combined = np.hstack((X_word2vec, X_other))

In [55]:
num_cat=ok.iloc[:,1:107]
text=ok.iloc[:,112:113]
x = pd.concat([num_cat, text], axis=1)

In [74]:
#run_knn_and_evaluate(20, X_combined, metric='euclidean')

In [71]:
print(X_word2vec_padded.shape[0] == X_other.shape[0])

True


In [72]:
print(X_combined.shape)

(59946, 407)


In [73]:
print(ok['word2vec_embeddings_scaled'].head())

0    [0.76321473, 0.64034454, 0.58260717, 0.1941664...
1    [0.65697592, 0.62005705, 0.59067174, 0.1890259...
2    [0.70801798, 0.64246498, 0.52894441, 0.1918418...
3    [0.65878697, 0.67627868, 0.53726614, 0.2329434...
4    [0.62431777, 0.69718922, 0.49372216, 0.2371102...
Name: word2vec_embeddings_scaled, dtype: object


## Function to run several Kmean Models

In [10]:
def cluster_and_find_similar(dataset=ok, X=None, n_clusters=12, n_similar=10):
    """
    Runs K-Means clustering and finds the specified number of most similar individuals within each cluster.
    
    Parameters:
    - dataset: DataFrame, the dataset to use (default is 'ok')
    - X: DataFrame, the feature vector
    - n_clusters: int, number of clusters for K-Means (default=12)
    - n_similar: int, number of similar individuals to return per person (default=10)
    
    Returns:
    - Updated DataFrame with assigned clusters and specified number of most similar individuals per person
    - Prints model evaluation metrics
    """
    if X is None:
        raise ValueError("Feature vector X must be provided")
    
    # Fit K-Means model
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    dataset['cluster'] = kmeans.fit_predict(X)
    
    # Compute evaluation metrics
    silhouette_avg = silhouette_score(X, dataset['cluster'])
    davies_bouldin = davies_bouldin_score(X, dataset['cluster'])
    calinski_harabasz = calinski_harabasz_score(X, dataset['cluster'])
    
    print(f"Silhouette Score: {silhouette_avg:.4f}")
    print(f"Davies-Bouldin Score: {davies_bouldin:.4f}")
    print(f"Calinski-Harabasz Score: {calinski_harabasz:.4f}")
    
    # Create a dictionary to store the most similar people for each individual
    similar_people = {}
    avg_match_distances = []
    
    # Find the closest people within each cluster
    for cluster in range(n_clusters):
        cluster_indices = dataset[dataset['cluster'] == cluster].index
        cluster_features = X.loc[cluster_indices]
        distances = euclidean_distances(cluster_features)
        
        for i, index in enumerate(cluster_indices):
            sorted_indices = np.argsort(distances[i])[1:n_similar+1]  # Exclude self (index 0)
            similar_people[index] = cluster_indices[sorted_indices].tolist()
            
            # Compute average match distance for each person
            avg_distance = np.mean(distances[i][sorted_indices])
            avg_match_distances.append(avg_distance)
    
    # Compute overall Average Match Distance (AMD)
    overall_avg_match_distance = np.mean(avg_match_distances)
    print(f"Average Match Distance (AMD): {overall_avg_match_distance:.4f}")
    
    # Convert the dictionary into a DataFrame
    similar_df = pd.DataFrame.from_dict(similar_people, orient='index')
    similar_df.columns = [f'similar_{i+1}' for i in range(n_similar)]
    
    # Merge with the original dataset
    dataset = dataset.merge(similar_df, left_index=True, right_index=True)
    
    return 

In [12]:
%%time
cluster_and_find_similar(dataset=ok, X=x, n_clusters=12, n_similar=5)

Silhouette Score: 0.0300
Davies-Bouldin Score: 3.9053
Calinski-Harabasz Score: 1256.3048
Average Match Distance (AMD): 3.0591
CPU times: user 2min, sys: 51.2 s, total: 2min 51s
Wall time: 51.5 s


## Function to run several Deep Collaborative Scoring Model (DCScam)

In [60]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate

def run_dcscam_and_recommend(X, n_users, n_items, n_latent_factors=10, epochs=10, batch_size=64):
    """
    Runs the Deep Collaborative Scoring Model (DCScam) and recommends similar individuals.
    
    Parameters:
    - X: A user-item matrix or feature matrix (users as rows, items as columns).
    - n_users: Number of unique users.
    - n_items: Number of unique items.
    - n_latent_factors: Number of latent factors for embedding layers (default=10).
    - epochs: Number of epochs to train the model (default=10).
    - batch_size: Size of the batch for training (default=64).
    
    Returns:
    - recommendations: A DataFrame of user recommendations.
    """
    # Debugging: Check the input data
    print("Input data (X):")
    print(X[:5])  # Print the first 5 rows of X

    # Ensure X is a list of tuples (user, item, rating)
    if isinstance(X, pd.DataFrame):
        X = list(zip(X.iloc[:, 0], X.iloc[:, 1], X.iloc[:, 2]))  # Convert DataFrame to list of tuples
    elif isinstance(X, np.ndarray):
        X = list(zip(X[:, 0], X[:, 1], X[:, 2]))  # Convert NumPy array to list of tuples
    elif isinstance(X, list):
        pass  # X is already in the correct format
    else:
        raise ValueError("X must be a DataFrame, NumPy array, or list of tuples.")

    # Extract user, item, and rating data
    user_data = np.array([x[0] for x in X], dtype=np.int32)  # Ensure integer type
    item_data = np.array([x[1] for x in X], dtype=np.int32)  # Ensure integer type
    ratings = np.array([x[2] for x in X], dtype=np.float32)  # Ensure float type

    # Debugging: Check the extracted data
    print("User data:", user_data[:5])
    print("Item data:", item_data[:5])
    print("Ratings:", ratings[:5])

    # Define the model architecture
    user_input = Input(shape=(1,), name='user_input')
    item_input = Input(shape=(1,), name='item_input')

    user_embedding = Embedding(input_dim=n_users, output_dim=n_latent_factors, name='user_embedding')(user_input)
    item_embedding = Embedding(input_dim=n_items, output_dim=n_latent_factors, name='item_embedding')(item_input)

    user_flat = Flatten()(user_embedding)
    item_flat = Flatten()(item_embedding)

    concat = Concatenate()([user_flat, item_flat])
    dense = Dense(128, activation='relu')(concat)
    output = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=[user_input, item_input], outputs=output)
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train the model
    model.fit([user_data, item_data], ratings, epochs=epochs, batch_size=batch_size)
    
    # Predict the scores for all user-item pairs
    predictions = model.predict([user_data, item_data])
    
    # Recommendation: Get top N items for each user
    recommendations = {}
    for user in range(n_users):
        user_indices = np.where(user_data == user)[0]  # Find indices where user_data == user
        user_predictions = predictions[user_indices]
        top_items = item_data[user_indices][np.argsort(user_predictions.flatten())[::-1][:10]]  # Top 10 recommendations
        recommendations[user] = top_items
    
    # Return recommendations as DataFrame
    recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index', columns=[f'Rec_{i+1}' for i in range(10)])
    
    return recommendations_df

# Example usage
# run_dcscam_and_recommend(X=x, n_users=59946, n_items=111, n_latent_factors=10, epochs=10, batch_size=64)

In [54]:
x.shape

(59946, 111)

In [61]:
run_dcscam_and_recommend(X=x, n_users=59946, n_items=111, n_latent_factors=10, epochs=10, batch_size=64)

Input data (X):
   single  female  orientation_bisexual  orientation_gay  \
0       1       0                     0                0   
1       1       0                     0                0   
2       0       0                     0                0   
3       1       0                     0                0   
4       1       0                     0                0   

   orientation_straight  body_type_athletic  body_type_average  \
0                     1                   0                  0   
1                     1                   0                  1   
2                     1                   0                  0   
3                     1                   0                  0   
4                     1                   1                  0   

   body_type_curvy  body_type_fit  body_type_not_disclosed  ...  smokes_yes  \
0                0              0                        0  ...           1   
1                0              0                        0  ...     

Unnamed: 0,Rec_1,Rec_2,Rec_3,Rec_4,Rec_5,Rec_6,Rec_7,Rec_8,Rec_9,Rec_10
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
59941,,,,,,,,,,
59942,,,,,,,,,,
59943,,,,,,,,,,
59944,,,,,,,,,,


## Function to run several Bayesian Personalized Ranking (BPR) Model

In [12]:
x.shape

(59946, 111)

In [51]:
import implicit
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd

def run_bpr_and_recommend(X, n_users, n_items, factors=10, iterations=50):
    """
    Runs the Bayesian Personalized Ranking (BPR) model and recommends similar individuals.
    
    Parameters:
    - X: A user-item interaction DataFrame (users as rows, items as columns).
    - n_users: Number of unique users.
    - n_items: Number of unique items.
    - factors: Number of latent factors for the BPR model (default=10).
    - iterations: Number of iterations to train the BPR model (default=50).
    
    Returns:
    - recommendations: A DataFrame of user recommendations.
    """
    # Convert the DataFrame to a sparse CSR matrix
    interaction_matrix = csr_matrix(X.values.astype('float32'))

    # Debugging: Check the shape of the interaction matrix
    print("Shape of interaction_matrix:", interaction_matrix.shape)

    # Initialize the BPR model
    bpr_model = implicit.bpr.BayesianPersonalizedRanking(factors=factors, iterations=iterations)

    # Train the BPR model
    bpr_model.fit(interaction_matrix)

    # Generate recommendations for each user
    recommendations = {}
    for user in range(n_users):
        # Ensure the user_items matrix is in CSR format
        user_items_csr = interaction_matrix.tocsr()
        user_recommendations = bpr_model.recommend(user, user_items_csr, N=10)  # Top 10 recommendations
        recommendations[user] = user_recommendations[0]  # Extract item IDs

    # Convert recommendations to DataFrame
    recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index', columns=[f'Rec_{i+1}' for i in range(10)])

    return recommendations_df

In [52]:
run_bpr_and_recommend(X=x, n_users=59946, n_items=111, factors=10, iterations=50)

Shape of interaction_matrix: (59946, 111)


  0%|          | 0/50 [00:00<?, ?it/s]

ValueError: user_items must contain 1 row for every user in userids