In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

# Read the raw user profile table that every later step works from.
user_df = pd.read_csv(filepath_or_buffer='user_profiles.csv')

# Preprocess
def preprocess_data(df):
    """Build the numeric feature matrix used for user-to-user similarity.

    The input frame is NOT modified: the integer casts are applied to a
    column selection rather than written back into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``budget_min``, ``budget_max``,
        ``cleanliness_level``, ``noise_tolerance``, ``smoking_tolerance``,
        ``pets_tolerance``, ``party_tolerance`` and ``gender_preference``.

    Returns
    -------
    numpy.ndarray
        One row per row of ``df``: the seven standardized numeric columns
        (in the order listed above) followed by the one-hot columns for
        ``gender_preference`` with the first category dropped.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    ValueError
        If a tolerance column holds values that cannot be cast to ``int``
        (for example missing values).
    """
    # Presumably boolean-like flags; they are cast to int before scaling.
    tolerance_columns = ['smoking_tolerance', 'pets_tolerance', 'party_tolerance']
    numeric_columns = ['budget_min', 'budget_max', 'cleanliness_level',
                       'noise_tolerance', *tolerance_columns]

    # Selecting with a list and calling astype both return new objects, so the
    # caller's DataFrame keeps its original dtypes.
    numerical_features = df[numeric_columns].astype(
        {column: int for column in tolerance_columns}
    )

    # Standardize numerical features (zero mean, unit variance per column).
    # NOTE: the scaler is fitted on this frame and discarded afterwards.
    scaler = StandardScaler()
    scaled_numerical = scaler.fit_transform(numerical_features)

    # One-hot encode gender_preference; drop='first' removes the redundant
    # first category column. The encoder is likewise fitted here and discarded.
    encoder = OneHotEncoder(drop='first')
    gender_encoded = encoder.fit_transform(df[['gender_preference']]).toarray()

    # Combine numerical + categorical features column-wise.
    combined_features = np.hstack((scaled_numerical, gender_encoded))

    return combined_features

# Turn the loaded profile table into the numeric feature matrix.
user_features = preprocess_data(df=user_df)

# Similarity
def calculate_similarity(features):
    """Return the pairwise cosine similarity between the rows of ``features``.

    The result is whatever ``cosine_similarity`` produces when called with
    ``features`` as its only argument.
    """
    pairwise_scores = cosine_similarity(features)
    return pairwise_scores

similarity_matrix = calculate_similarity(features=user_features)

# Recommendation Function
def recommend(user_id, n=5, similarity=None, users=None):
    """Return the ids of the ``n`` users most similar to ``user_id``.

    The user's row is located by matching ``user_id`` against the
    ``user_id`` column (not by assuming ids are 1..N in row order), and that
    row is excluded explicitly, so a user is never recommended to themselves
    even when another profile ties with a similarity of 1.0.

    Parameters
    ----------
    user_id
        Value to look up in the ``user_id`` column. If the id occurs more
        than once, the first matching row is used.
    n : int, default 5
        Maximum number of recommendations. Fewer are returned when there are
        not enough other users; values <= 0 yield an empty array.
    similarity : array-like, optional
        Square similarity matrix aligned with the rows of ``users``.
        Defaults to the module-level ``similarity_matrix``.
    users : pandas.DataFrame, optional
        Frame with a ``user_id`` column. Defaults to the module-level
        ``user_df``.

    Returns
    -------
    numpy.ndarray
        User ids ordered from most to least similar; ties keep row order.

    Raises
    ------
    IndexError
        If ``user_id`` is not present in the ``user_id`` column.
    """
    if similarity is None:
        similarity = similarity_matrix
    if users is None:
        users = user_df

    all_ids = users['user_id'].to_numpy()
    matches = np.flatnonzero(all_ids == user_id)
    if matches.size == 0:
        raise IndexError(f"user_id {user_id!r} not found in user table")
    user_index = matches[0]

    similarity_scores = np.asarray(similarity[user_index], dtype=float)

    # Negating the scores lets a stable sort rank them in descending order
    # while keeping tied users in their original row order.
    ranked = np.argsort(-similarity_scores, kind='stable')

    # Drop the user's own row by position instead of assuming it sorts last.
    ranked = ranked[ranked != user_index]

    return all_ids[ranked[:max(n, 0)]]




In [3]:
import pickle

# Persist the feature matrix so it can be reloaded without re-running preprocessing.
with open('user_features.pkl', mode='wb') as features_file:
    pickle.dump(user_features, features_file)

# Persist the pairwise similarity matrix alongside it.
with open('similarity_matrix.pkl', mode='wb') as matrix_file:
    pickle.dump(similarity_matrix, matrix_file)


In [None]:
# NOTE: this cell is a single bare triple-quoted string, so nothing inside it
# is executed. It holds an unfinished sketch for extending preprocess_data and
# calculate_similarity. If it is ever turned into live code as written, it
# would redefine both functions of the same name, and the sketched
# preprocess_data returns `scaled_features`, which the sketch never defines.
'''# Update preprocess_data function to include new features if needed
def preprocess_data(df):
    # Existing preprocessing steps
    
    # Additional processing for new features (e.g., TF-IDF or word embeddings)
    
    return scaled_features

# Update similarity calculation to include new features
def calculate_similarity(features):
    # Adjust cosine_similarity calculation to include new features if applicable
    
    return cosine_similarity(features)'''