In [205]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors, KNeighborsTransformer

### Towns recommended based on Jaccard similarity

Jaccard similarity measures how similar two finite sets are: the number of elements they have in common divided by the number of distinct elements across both sets. Mathematically, it is defined as the size of the intersection divided by the size of the union of the sets:

J(A,B) = |A ∩ B| / |A ∪ B|

For town recommendations, this is particularly useful when comparing binary or categorical features (like amenities, services, or attributes that towns either have or don't have). The score ranges from 0 (no similarity) to 1 (identical).

Unlike cosine similarity, which works well with continuous numerical data, Jaccard similarity is suited to binary/categorical data: it ignores the magnitude of features and considers only their presence or absence.

In [206]:
import os

# Directory holding one CSV of selected towns per user
USER_OUTPUT_DIR = "../../data/user_output/"

# os.listdir returns entries in arbitrary order, and the position of each
# DataFrame in user_choices is what later cells use to number the users,
# so sort the names to keep that numbering stable between runs and machines.
# Hidden entries and sub-directories (e.g. .DS_Store, .ipynb_checkpoints)
# are skipped because they are not user selection files.
user_files = sorted(
    name
    for name in os.listdir(USER_OUTPUT_DIR)
    if not name.startswith(".")
    and os.path.isfile(os.path.join(USER_OUTPUT_DIR, name))
)

# One DataFrame per user file, in the same order as user_files
user_choices = [
    pd.read_csv(os.path.join(USER_OUTPUT_DIR, name)) for name in user_files
]

# Towns table used as the recommendation catalogue
df_pueblos = pd.read_csv("../../data/interim/pueblos_recommender.csv")

### We have to make sure to load the user choices from the same set of encoded towns.

In [207]:
# Inspect the loaded user selections: a list holding one DataFrame per file read from user_output
user_choices

[   Unnamed: 0   cmun   province            municipality  altitude  longitude  \
 0        6618  29020     Málaga                 Arriate  596.1475  -5.140064   
 1        2208   8258  Barcelona       Santa Maria d'Oló  515.0869   2.035291   
 2        3568  44208     Teruel  Santa Cruz de Nogueras  888.4601  -1.088758   
 3          45  48008    Bizkaia             Artzentales  245.3207  -3.241389   
 4         254  39027  Cantabria       Campoo de Enmedio  878.4844  -4.150000   
 
    latitude  0-17  18-24  25-34  ...  enc_75    enc_76    enc_77    enc_78  \
 0  36.79936   678    331    425  ...     0.0  1.841297  0.721064  1.336097   
 1  41.87207   176     64     80  ...     0.0  0.284170  0.721064 -0.729801   
 2  41.11458     0      0      1  ...     0.0 -0.883675 -1.242798  0.303148   
 3  43.24055    97     26     58  ...     0.0  0.284170  0.721064 -0.213326   
 4  42.96667   604    216    284  ...     0.0 -0.883675 -1.242798 -0.213326   
 
      enc_79    enc_80    enc_81    

In [208]:
# Inspect the towns table loaded from pueblos_recommender.csv
df_pueblos

Unnamed: 0.1,Unnamed: 0,cmun,province,municipality,altitude,longitude,latitude,0-17,18-24,25-34,...,enc_75,enc_76,enc_77,enc_78,enc_79,enc_80,enc_81,enc_82,enc_83,enc_84
0,0,48067,Bizkaia,Muxika,18.95764,-2.692941,43.28967,302,96,101,...,0.0,1.062733,0.721064,-0.213326,0.521190,0.386912,1.185980,0.750134,0.143118,0.482462
1,1,15064,A Coruña,Paderne,151.44100,-8.176355,43.28677,265,119,160,...,1.0,0.284170,0.721064,2.110809,1.199804,0.731121,0.865604,-0.135614,0.767742,1.260643
2,2,15039,A Coruña,Irixoa,362.87660,-8.058925,43.28474,119,61,95,...,0.0,-0.105112,0.721064,1.077860,0.350131,-0.397119,0.279576,-0.236198,0.148186,0.371063
3,3,48062,Bizkaia,Mendata,107.88860,-2.633333,43.28333,58,28,32,...,0.0,0.284170,0.721064,-0.213326,-0.413611,-0.397119,0.994147,0.732753,-0.580331,-0.413576
4,4,48905,Bizkaia,Zamudio,44.55854,-2.866667,43.28333,528,226,291,...,0.0,2.619860,0.721064,0.819623,1.939453,6.343633,2.815453,0.890158,1.046479,1.977473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6665,6665,29057,Málaga,Genalguacil,508.32510,-5.235879,36.54427,44,16,32,...,0.0,1.062733,0.721064,0.561385,-0.395943,-0.397119,1.096759,-0.174132,-0.097609,-0.374828
6666,6666,11025,Cádiz,Paterna de Rivera,125.97050,-5.866053,36.52333,966,460,630,...,0.0,-0.494394,0.721064,-0.729801,3.709472,2.413918,7.126529,1.572083,2.171563,3.574197
6667,6667,29056,Málaga,Gaucín,609.39610,-5.317524,36.51830,250,101,139,...,0.0,0.284170,0.721064,1.077860,0.558132,0.511210,1.842909,0.602008,0.915980,0.532511
6668,6668,11001,Cádiz,Alcalá de los Gazules,158.39630,-5.723718,36.46045,793,445,670,...,0.0,-0.883675,0.721064,-0.471564,3.480590,2.413918,3.239257,1.345819,3.476559,3.398218


In [209]:
# List the distinct values of the 'cluster' column in the towns table
df_pueblos['cluster'].unique()

array([2, 4, 1, 3, 0])

In [None]:
# Split the towns table into one DataFrame per cluster label, keyed "cluster_<label>"
clusters = {
    f"cluster_{label}": df_pueblos[df_pueblos['cluster'] == label]
    for label in df_pueblos['cluster'].unique()
}

# For every cluster keep only the columns whose name starts with "enc_"
pueblo_clusters_encoded_features = {
    name: frame.filter(regex=r"^enc_", axis=1)
    for name, frame in clusters.items()
}

# Same column selection for each user's choices; used in the cells below
users_encoded_features = []
for df_user_choice in user_choices:
    users_encoded_features.append(df_user_choice.filter(regex=r"^enc_", axis=1))

In [211]:
# Fixed Jaccard similarity calculation function
def calculate_jaccard_similarity(user_features, town_features, n_features=16):
    """
    Calculate Jaccard similarity between user preferences and town features.

    Only the first ``n_features`` positions of each input are compared. Every
    compared value is cast to bool, so any non-zero entry counts as "present".

    Parameters:
    -----------
    user_features : pandas.Series or pandas.DataFrame
        Features representing user preferences. When a DataFrame is passed,
        only its first row is used; any further rows are ignored.
    town_features : pandas.Series or pandas.DataFrame
        Features representing town characteristics. When a DataFrame is
        passed, only its first row is used.
    n_features : int, default 16
        Number of leading features to compare.

    Returns:
    --------
    float
        Jaccard similarity score (0-1). 0.0 when neither input has any of
        the compared features set.
    """
    def _leading_flags(features):
        # A DataFrame contributes its first row only; a Series is used as is.
        if isinstance(features, pd.DataFrame):
            leading = features.iloc[0, :n_features]
        else:
            leading = features.iloc[:n_features]
        return leading.astype(bool)

    user_binary = _leading_flags(user_features)
    town_binary = _leading_flags(town_features)

    # Vectorised counts of features present in both / in either input
    intersection = int((user_binary & town_binary).sum())
    union = int((user_binary | town_binary).sum())

    # Avoid division by zero: two empty feature sets share nothing
    if union == 0:
        return 0.0

    return intersection / union

# Build one similarity profile per user: for every cluster, the average
# Jaccard score over its towns and the five closest towns, plus the cluster
# with the highest average.
user_similarity_profiles = []

# NOTE(review): calculate_jaccard_similarity reads only the first row of a
# DataFrame, so each user is scored on the first row of their selection
# file only - confirm this is intended.
for user_idx, user_features in enumerate(users_encoded_features):
    # Per-cluster results for this user, keyed by cluster name
    cluster_results = {}

    for cluster_name, cluster_features in pueblo_clusters_encoded_features.items():
        # Score every town of the cluster, then order from most to least similar
        town_similarities = sorted(
            [
                {
                    'town_idx': idx,
                    'similarity': calculate_jaccard_similarity(user_features, town_features),
                    'cmun': df_pueblos.loc[idx, 'cmun'],
                    'municipality': df_pueblos.loc[idx, 'municipality'],
                }
                for idx, town_features in cluster_features.iterrows()
            ],
            key=lambda entry: entry['similarity'],
            reverse=True,
        )

        # Five best-matching towns of this cluster
        top_towns = town_similarities[:5]

        # Mean similarity between the user and the whole cluster
        avg_similarity = (
            sum(entry['similarity'] for entry in town_similarities)
            / len(town_similarities)
        )

        cluster_results[cluster_name] = {
            'avg_similarity': avg_similarity,
            'top_towns': top_towns
        }

    # Cluster with the highest average similarity.
    # NOTE(review): on ties (e.g. every average is 0) max() keeps the first
    # cluster in iteration order, so the "best" cluster is then arbitrary.
    most_similar_cluster = max(
        cluster_results.items(),
        key=lambda pair: pair[1]['avg_similarity']
    )
    best_name, best_stats = most_similar_cluster

    similarity_profile = {
        'user_idx': user_idx,
        'cluster_similarities': cluster_results,
        'best_cluster': {
            'name': best_name,
            'avg_similarity': best_stats['avg_similarity']
        }
    }

    user_similarity_profiles.append(similarity_profile)

# Summarise every user's average similarity to each cluster in one table:
# one row per user, one column per cluster label (in order of first
# appearance in df_pueblos), plus the name of the best-matching cluster.
cluster_labels = df_pueblos['cluster'].unique()

summary_rows = []
for profile in user_similarity_profiles:
    cluster_scores = profile['cluster_similarities']
    summary_rows.append(
        [cluster_scores[f'cluster_{c}']['avg_similarity'] for c in cluster_labels]
    )

user_cluster_preferences = pd.DataFrame(
    summary_rows,
    columns=[f'Cluster {c}' for c in cluster_labels]
)

# Attach each user's best cluster and label the rows "User 1", "User 2", ...
user_cluster_preferences['Best Cluster'] = [
    profile['best_cluster']['name'] for profile in user_similarity_profiles
]
user_cluster_preferences.index = [
    f'User {i+1}' for i in range(len(user_cluster_preferences))
]

# Show the summary table, then its (rows, columns) shape as the cell result
display(user_cluster_preferences)

user_cluster_preferences.shape

Unnamed: 0,Cluster 2,Cluster 4,Cluster 1,Cluster 3,Cluster 0,Best Cluster
User 1,0.0,0.0,0.0,0.0,0.0,cluster_2
User 2,0.031558,0.017005,0.015351,0.0,0.057154,cluster_0
User 3,0.055227,0.030085,0.039474,0.0,0.011591,cluster_2
User 4,0.0,0.0,0.0,0.0,0.0,cluster_2
User 5,0.02071,0.000654,0.010965,0.027374,0.007994,cluster_3
User 6,0.0,0.0,0.0,0.0,0.0,cluster_2


(6, 6)