# 04. Similarity Matching

This notebook implements similarity matching algorithms to find users with similar music tastes and generate recommendations.

In [None]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.neighbors import NearestNeighbors
# Silence every warning for cleaner notebook output.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# later cells (e.g. NumPy's warning when averaging an empty selection) —
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Apply a dark-grid plotting theme to all figures created below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm the minimum version this project supports.
plt.style.use('seaborn-v0_8-darkgrid')

## Load Data

In [None]:
# Read the per-user feature table and the cluster assignments
user_features = pd.read_csv('../data/processed/user_features_selected.csv')
user_clusters = pd.read_csv('../data/processed/user_clusters.csv')

# Read the listening events and track metadata used by the recommender below
listening_history = pd.read_csv('../data/raw/listening_history.csv')
track_features = pd.read_csv('../data/raw/track_features.csv')

# Attach each user's k-means label. The merge uses pandas' default (inner)
# join, so only users present in both tables are kept.
cluster_labels_by_user = user_clusters.loc[:, ['user_id', 'kmeans_cluster']]
user_features = pd.merge(user_features, cluster_labels_by_user, on='user_id')

# Two of the columns (user_id and kmeans_cluster) are not features
n_rows, n_cols = user_features.shape
print(f"Users: {n_rows}")
print(f"Features: {n_cols - 2}")
print(f"Clusters: {user_features['kmeans_cluster'].nunique()}")

## Similarity Computation

In [None]:
# Every column except the identifier and the cluster label is treated as a feature
non_feature_cols = {'user_id', 'kmeans_cluster'}
feature_cols = [col for col in user_features.columns if col not in non_feature_cols]
X = user_features[feature_cols].values
user_ids = user_features['user_id'].values

print("Computing similarity matrices...")

# Pairwise cosine similarity between all rows of X (dense, users x users)
cosine_sim = cosine_similarity(X)
print(f"Cosine similarity shape: {cosine_sim.shape}")

# Pairwise Euclidean distance d, mapped to a similarity 1 / (1 + d):
# distance 0 maps to 1 and larger distances approach 0
euclidean_dist = euclidean_distances(X)
euclidean_sim = np.reciprocal(1 + euclidean_dist)
print(f"Euclidean similarity shape: {euclidean_sim.shape}")

In [None]:
# Both matrices are pairwise over the same rows of X, so a single set of
# strictly-upper-triangle indices selects every unordered user pair exactly
# once (and skips the diagonal) in either of them.
pair_idx = np.triu_indices_from(cosine_sim, k=1)
cosine_upper = cosine_sim[pair_idx]
euclidean_upper = euclidean_sim[pair_idx]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One histogram per metric, each with a dashed line marking the mean
panels = [
    (axes[0], cosine_upper, 'Cosine Similarity', 'Distribution of Cosine Similarities'),
    (axes[1], euclidean_upper, 'Euclidean Similarity', 'Distribution of Euclidean Similarities'),
]
for ax, values, xlabel, title in panels:
    avg = values.mean()
    ax.hist(values, bins=50, edgecolor='black', alpha=0.7)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.axvline(x=avg, color='red', linestyle='--', label=f'Mean: {avg:.3f}')
    ax.legend()

plt.tight_layout()
plt.show()

## K-Nearest Neighbors

In [None]:
# Build KNN model.
# The estimator is configured with one extra neighbor so that a later query
# made with a training user's own vector can still yield n_neighbors *other*
# users once that user is discarded.
n_neighbors = 10
knn = NearestNeighbors(n_neighbors=n_neighbors + 1, metric='cosine')
knn.fit(X)

# Find neighbors for all users.
# Calling kneighbors() without a query matrix returns the neighbors of the
# fitted points themselves and never counts a point as its own neighbor.
# This replaces slicing off column 0, which is only correct when "self" is
# ranked first — not guaranteed when several users share an identical feature
# vector (a duplicate can be ranked ahead of the user at distance 0).
distances, indices = knn.kneighbors(n_neighbors=n_neighbors)

print(f"Found {n_neighbors} nearest neighbors for each user")

## Find Similar Users Function

In [None]:
def find_similar_users(user_id, n_similar=5, method='cosine', same_cluster_only=False):
    """
    Find the users most similar to a given user.

    Parameters
    ----------
    user_id : scalar
        Identifier to look up in the module-level ``user_ids`` array. If it
        occurs more than once, the first occurrence is used.
    n_similar : int, default 5
        Maximum number of similar users to return. Fewer rows are returned
        when fewer candidates exist (e.g. a small cluster with
        ``same_cluster_only=True``).
    method : str, default 'cosine'
        ``'cosine'`` reads from ``cosine_sim``; any other value reads from
        ``euclidean_sim``.
    same_cluster_only : bool, default False
        When True, only users whose ``kmeans_cluster`` equals the query
        user's are considered.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``similarity`` and ``cluster``, sorted by
        decreasing similarity. The query user is never included.

    Raises
    ------
    IndexError
        If ``user_id`` is not present in ``user_ids``.
    """
    matches = np.flatnonzero(user_ids == user_id)
    if matches.size == 0:
        raise IndexError(f"user_id {user_id!r} not found in user_ids")
    user_idx = matches[0]

    # Pick the requested similarity matrix and read this user's row
    sim_matrix = cosine_sim if method == 'cosine' else euclidean_sim
    sim_scores = np.asarray(sim_matrix[user_idx], dtype=float)

    # Exclude the query user explicitly instead of assuming it sorts first:
    # a user with an identical feature vector ties with "self" and could
    # otherwise push the query user into the results.
    is_candidate = np.ones(sim_scores.shape[0], dtype=bool)
    is_candidate[user_idx] = False

    clusters = user_features['kmeans_cluster'].values
    if same_cluster_only:
        # Filter by membership rather than multiplying scores by a 0/1 mask:
        # a zeroed out-of-cluster score would still outrank in-cluster users
        # whose cosine similarity is negative.
        is_candidate &= clusters == clusters[user_idx]

    candidate_idx = np.flatnonzero(is_candidate)

    # Stable sort on the negated scores: descending similarity, ties kept in
    # original row order
    order = np.argsort(-sim_scores[candidate_idx], kind='stable')[:max(n_similar, 0)]
    similar_indices = candidate_idx[order]

    return pd.DataFrame({
        'user_id': user_ids[similar_indices],
        'similarity': sim_scores[similar_indices],
        'cluster': clusters[similar_indices],
    })

# Demo: draw one user at random and list the ten users closest to them
sample_user = np.random.choice(user_ids)
similar_users = find_similar_users(sample_user, 10)

print(f"Top 10 users similar to user {sample_user}:", similar_users, sep="\n")

## Music Taste Twin Matching

In [None]:
def find_music_twins(similarity_threshold=0.9, max_twins=5):
    """
    Find music taste twins - pairs of users with very high cosine similarity.

    Parameters
    ----------
    similarity_threshold : float, default 0.9
        Only pairs whose ``cosine_sim`` value is strictly greater than this
        are kept.
    max_twins : int, default 5
        Controls the size of the result: at most ``max_twins * 10`` pairs
        are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``user1``, ``user2``, ``similarity`` and ``same_cluster``,
        sorted by decreasing similarity. Each unordered pair appears once and
        no user is paired with themselves. The frame is empty (but still has
        these columns) when no pair exceeds the threshold.
    """
    # Keep only the strict upper triangle so each pair (i < j) is seen once
    above_threshold = np.triu(cosine_sim > similarity_threshold, k=1)
    first_idx, second_idx = np.nonzero(above_threshold)

    clusters = user_features['kmeans_cluster'].values

    # Building the frame from arrays keeps the column set fixed even when no
    # pair qualifies, so the sort below cannot fail on a missing column.
    twins_df = pd.DataFrame({
        'user1': user_ids[first_idx],
        'user2': user_ids[second_idx],
        'similarity': cosine_sim[first_idx, second_idx],
        'same_cluster': clusters[first_idx] == clusters[second_idx],
    })

    return twins_df.sort_values('similarity', ascending=False).head(max_twins * 10)

# Find music twins
twin_threshold = 0.95
music_twins = find_music_twins(similarity_threshold=twin_threshold)

# find_music_twins() returns only the top-ranked pairs (its result is capped),
# so the length of that table is not the number of qualifying pairs. Count
# them directly on the strict upper triangle of the similarity matrix.
total_twin_pairs = int(np.count_nonzero(np.triu(cosine_sim > twin_threshold, k=1)))
print(f"Found {total_twin_pairs} music twin pairs (keeping the top {len(music_twins)})")
print("\nTop music twins:")
print(music_twins.head(10))

## Cluster-Based Similarity

In [None]:
# Analyze similarity within vs between clusters.
# Iterate over the cluster labels that actually occur instead of assuming
# they are exactly 0..n-1: with a gap in the labels, range(n_clusters) would
# select empty masks and mislabel the heatmap axes.
cluster_column = user_features['kmeans_cluster']
cluster_labels = np.sort(cluster_column.dropna().unique())
n_clusters = len(cluster_labels)
cluster_masks = [(cluster_column == label).to_numpy() for label in cluster_labels]

# NaN marks cells with no user pairs to average (e.g. a single-user cluster)
cluster_similarities = np.full((n_clusters, n_clusters), np.nan)

for i, mask_i in enumerate(cluster_masks):
    for j, mask_j in enumerate(cluster_masks):
        pair_sims = cosine_sim[np.ix_(mask_i, mask_j)]
        if i == j:
            # Within-cluster: strict upper triangle only, which drops the
            # self-similarities on the diagonal and counts each pair once
            pair_sims = pair_sims[np.triu_indices_from(pair_sims, k=1)]
        if pair_sims.size:
            cluster_similarities[i, j] = pair_sims.mean()

# Visualize cluster similarities
tick_labels = [f'C{label}' for label in cluster_labels]
plt.figure(figsize=(10, 8))
sns.heatmap(cluster_similarities, annot=True, fmt='.3f', cmap='YlOrRd',
            xticklabels=tick_labels,
            yticklabels=tick_labels)
plt.title('Average Similarity Between Clusters')
plt.tight_layout()
plt.show()

# Print insights: the diagonal holds each cluster's within-cluster average
print("Cluster cohesion (within-cluster similarity):")
for position, label in enumerate(cluster_labels):
    print(f"  Cluster {label}: {cluster_similarities[position, position]:.3f}")

## Track Recommendation System

In [None]:
def get_user_tracks(user_id):
    """
    Return the set of distinct track ids that appear in ``listening_history``
    for the given user (an empty set if the user has no rows).
    """
    belongs_to_user = listening_history['user_id'] == user_id
    track_ids = listening_history.loc[belongs_to_user, 'track_id']
    return set(track_ids.unique())

def recommend_tracks_from_similar_users(user_id, n_recommendations=10, n_similar_users=5):
    """
    Recommend tracks based on what similar users listen to.

    Each row of ``listening_history`` belonging to a similar user adds that
    user's similarity to the track's score, so a track scores higher when it
    is played by more similar users or appears in more of their rows. Tracks
    the target user has already listened to are skipped.

    Parameters
    ----------
    user_id : scalar
        User to generate recommendations for.
    n_recommendations : int, default 10
        Maximum number of tracks to return.
    n_similar_users : int, default 5
        Number of similar users (from ``find_similar_users``) to draw from.

    Returns
    -------
    pandas.DataFrame
        Columns ``track_id``, ``track_name``, ``artist_name`` and ``score``,
        ordered by decreasing score. Name fields are ``'Unknown'`` when the
        track is missing from ``track_features`` or the column is absent. The
        frame is empty (but keeps these columns) when nothing can be
        recommended.
    """
    # Get similar users and the tracks the target user already knows
    similar_users = find_similar_users(user_id, n_similar=n_similar_users)
    user_tracks = get_user_tracks(user_id)

    # Accumulate a similarity-weighted score per unseen track.
    # Iterating the columns directly (rather than DataFrame.iterrows) keeps
    # each id's original dtype and avoids building a Series per row.
    track_scores = {}
    for similar_user_id, similarity in zip(similar_users['user_id'], similar_users['similarity']):
        is_similar_user = listening_history['user_id'] == similar_user_id
        for track_id in listening_history.loc[is_similar_user, 'track_id']:
            # Skip if user already knows this track
            if track_id in user_tracks:
                continue
            track_scores[track_id] = track_scores.get(track_id, 0) + similarity

    # Sort and get top recommendations
    recommended_tracks = sorted(
        track_scores.items(), key=lambda item: item[1], reverse=True
    )[:n_recommendations]

    # Get track details
    recommendations = []
    for track_id, score in recommended_tracks:
        matching_rows = track_features[track_features['track_id'] == track_id]
        # A track can be present in the listening history but absent from the
        # metadata table; fall back to an empty mapping so .get() supplies
        # 'Unknown' instead of .iloc[0] raising IndexError.
        track_info = matching_rows.iloc[0] if len(matching_rows) else {}
        recommendations.append({
            'track_id': track_id,
            'track_name': track_info.get('track_name', 'Unknown'),
            'artist_name': track_info.get('artist_name', 'Unknown'),
            'score': score
        })

    return pd.DataFrame(
        recommendations,
        columns=['track_id', 'track_name', 'artist_name', 'score'],
    )

# Demo: recommendations for one user drawn at random
sample_user = np.random.choice(user_ids)
recommendations = recommend_tracks_from_similar_users(sample_user)

print(f"\nTrack recommendations for user {sample_user}:", recommendations, sep="\n")

## Similarity Network Visualization

In [None]:
# Draw a random subset of users (at most 100) and slice out their pairwise
# cosine similarities
n_sample = min(100, len(user_ids))
sample_indices = np.random.choice(len(user_ids), n_sample, replace=False)
sample_sim = cosine_sim[np.ix_(sample_indices, sample_indices)]

# Only similarities strictly above this value become edges
threshold = 0.8
G = nx.Graph()

# One node per sampled user, keyed by position in the sample and annotated
# with the user's id and cluster
for node, idx in enumerate(sample_indices):
    G.add_node(node, user_id=user_ids[idx],
               cluster=user_features.iloc[idx]['kmeans_cluster'])

# One weighted edge per unordered pair whose similarity exceeds the threshold
strong_pairs = [
    (i, j, sample_sim[i, j])
    for i in range(n_sample)
    for j in range(i + 1, n_sample)
    if sample_sim[i, j] > threshold
]
G.add_weighted_edges_from(strong_pairs)

# Lay the graph out and draw it, coloring nodes by cluster
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, k=1/np.sqrt(n_sample), iterations=50)

node_colors = [attrs['cluster'] for _, attrs in G.nodes(data=True)]

nx.draw_networkx_nodes(G, pos, node_color=node_colors, cmap='viridis',
                      node_size=300, alpha=0.8)
nx.draw_networkx_edges(G, pos, alpha=0.2)

plt.title(f'User Similarity Network (threshold > {threshold})')
plt.axis('off')
plt.tight_layout()
plt.show()

# Basic size/connectivity figures for the sampled graph
node_degrees = [degree for _, degree in G.degree()]
print(f"Network statistics:")
print(f"Nodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")
print(f"Average degree: {np.mean(node_degrees):.2f}")

## Performance Metrics

In [None]:
# Evaluate similarity metrics performance
def evaluate_similarity_within_clusters():
    """
    Evaluate how well cosine similarity aligns with the cluster assignment.

    For every cluster label present in ``user_features['kmeans_cluster']``
    that has at least two members, compares the average similarity between
    members of the cluster with the average similarity between members and
    non-members.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated cluster, in ascending label order, with columns
        ``cluster``, ``size``, ``avg_within_similarity``,
        ``avg_outside_similarity`` and ``separation`` (within minus outside).
        ``avg_outside_similarity`` is NaN when every user belongs to the
        cluster. The frame is empty (but keeps these columns) when no cluster
        has two or more members.
    """
    columns = ['cluster', 'size', 'avg_within_similarity',
               'avg_outside_similarity', 'separation']

    cluster_values = user_features['kmeans_cluster'].values
    # Use the labels that actually occur rather than range(n_clusters), so the
    # function neither depends on a global from another cell nor silently
    # skips clusters when the labels are not exactly 0..n-1.
    labels = np.sort(user_features['kmeans_cluster'].dropna().unique())

    results = []
    for cluster in labels:
        in_cluster = cluster_values == cluster
        cluster_indices = np.flatnonzero(in_cluster)

        # A within-cluster pair needs at least two members
        if len(cluster_indices) < 2:
            continue

        # Within-cluster similarities: each unordered pair once, no diagonal
        within_block = cosine_sim[np.ix_(cluster_indices, cluster_indices)]
        within_sim = within_block[np.triu_indices_from(within_block, k=1)]

        # Outside-cluster similarities: every member against every non-member
        outside_indices = np.flatnonzero(~in_cluster)
        outside_sim = cosine_sim[np.ix_(cluster_indices, outside_indices)]

        avg_within = within_sim.mean()
        avg_outside = outside_sim.mean() if outside_sim.size else np.nan

        results.append({
            'cluster': cluster,
            'size': len(cluster_indices),
            'avg_within_similarity': avg_within,
            'avg_outside_similarity': avg_outside,
            'separation': avg_within - avg_outside
        })

    return pd.DataFrame(results, columns=columns)

cluster_eval = evaluate_similarity_within_clusters()
print("Cluster similarity evaluation:")
print(cluster_eval)

# Grouped bar chart: for each cluster, the within-cluster average sits to the
# left of the tick and the outside-cluster average to the right
plt.figure(figsize=(10, 6))
x = cluster_eval['cluster']
width = 0.35

bar_specs = [
    (-width / 2, 'avg_within_similarity', 'Within cluster'),
    (width / 2, 'avg_outside_similarity', 'Outside cluster'),
]
for offset, column, label in bar_specs:
    plt.bar(x + offset, cluster_eval[column], width, label=label)

plt.xlabel('Cluster')
plt.ylabel('Average Similarity')
plt.title('Within vs Outside Cluster Similarities')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Export Similarity Data

In [None]:
# Save top similar users for each user
similar_users_data = []

for user_id in user_ids[:1000]:  # Limit to first 1000 users for file size
    # The user's own cluster does not change across their matches, so look it
    # up once per user instead of once per match.
    user_cluster = user_features.loc[
        user_features['user_id'] == user_id, 'kmeans_cluster'
    ].iloc[0]

    similar = find_similar_users(user_id, n_similar=5)
    # itertuples keeps each column's dtype; iterrows would coerce every row
    # to a single dtype, turning integer ids into floats in the exported CSV.
    for match in similar.itertuples(index=False):
        similar_users_data.append({
            'user_id': user_id,
            'similar_user_id': match.user_id,
            'similarity': match.similarity,
            'same_cluster': match.cluster == user_cluster
        })

# Fixing the columns keeps the CSV header present even if no rows were collected
similar_users_df = pd.DataFrame(
    similar_users_data,
    columns=['user_id', 'similar_user_id', 'similarity', 'same_cluster'],
)
similar_users_df.to_csv('../data/processed/user_similarities.csv', index=False)

# Save music twins
music_twins.to_csv('../data/processed/music_twins.csv', index=False)

# Save KNN model. Nothing earlier in this notebook creates or reads the
# models directory, so make sure it exists before writing into it.
os.makedirs('../models', exist_ok=True)
joblib.dump(knn, '../models/knn_similarity_model.pkl')

print("Similarity data saved!")
print("\nSaved files:")
print("- user_similarities.csv: Top similar users for each user")
print("- music_twins.csv: Highly similar user pairs")
print("\nSaved models:")
print("- knn_similarity_model.pkl")

## Summary and Next Steps

In [None]:
# cosine_upper holds one similarity per unordered user pair, so counts taken
# from it are counts of *pairs*, not of users.
n_pairs_above_090 = int((cosine_upper > 0.9).sum())
# Count twin pairs from the full pair list: the music_twins table is capped by
# find_music_twins() and would under-report when many pairs qualify.
n_twin_pairs = int((cosine_upper > 0.95).sum())

print("=== SIMILARITY MATCHING SUMMARY ===")
print(f"\nDataset: {len(user_features)} users")
print(f"Features used: {len(feature_cols)} dimensions")
print(f"\nSimilarity metrics:")
print(f"- Average cosine similarity: {cosine_upper.mean():.3f}")
print(f"- User pairs with >0.9 similarity: {n_pairs_above_090}")
print(f"- Music twin pairs (>0.95): {n_twin_pairs}")
print(f"\nCluster analysis:")
print(f"- Average within-cluster similarity: {cluster_eval['avg_within_similarity'].mean():.3f}")
print(f"- Average between-cluster similarity: {cluster_eval['avg_outside_similarity'].mean():.3f}")
print(f"- Average separation: {cluster_eval['separation'].mean():.3f}")

print("\n=== NEXT STEPS ===")
print("1. Deploy similarity matching API")
print("2. A/B test recommendation algorithms")
print("3. Collect user feedback on matches")
print("4. Optimize for real-time performance")
print("5. Add explanation for why users are similar")