In [5]:
import pandas as pd

In [6]:
df  = pd.read_csv('final_preprocessed_df.csv')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
import numpy as np
from textblob import TextBlob

def get_listing_index(listing_id, df_grouped):
    try:
        return df_grouped[df_grouped['listing_id'] == listing_id].index[0]
    except IndexError:
        return None 

df_grouped = df.groupby('listing_id').agg({  
    'comments': lambda x: ' '.join(x),
    'name': 'first',
    'description': 'first',
    'neighbourhood_cleansed': 'first',
    'property_type': 'first',
    'price': 'mean',
    'review_scores_rating': 'mean',       
    'accommodates': 'mean',
    'bathrooms_text': 'first', 
    'bedrooms': 'mean',
    'beds': 'mean',
    'minimum_nights': 'mean',
    'maximum_nights': 'mean',
    'distance_to_center': 'mean',
    'sentiment': 'mean',
    'amenities': lambda x: ','.join(set(','.join(x).split(',')))
}).reset_index()

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(df_grouped['name'] + " " + df_grouped['description'] + " " + df_grouped['comments'])

encoder = OneHotEncoder()

neighbourhood_encoded = encoder.fit_transform(df_grouped[['neighbourhood_cleansed']])

property_type_encoded = encoder.fit_transform(df_grouped[['property_type']])

df_grouped['amenities'] = df_grouped['amenities'].apply(lambda x: ','.join(x.split(',')))

vectorizer_amenities = TfidfVectorizer(tokenizer=lambda x: x.split(','), lowercase=False)

amenities_matrix = vectorizer_amenities.fit_transform(df_grouped['amenities'])

price_weight = 5

structured_features = np.array(df_grouped[['price', 'review_scores_rating', 'accommodates', 'bedrooms', 'beds', 
                                           'minimum_nights', 'maximum_nights', 'distance_to_center', 'sentiment']])

structured_features[:, 0] = structured_features[:, 0] * price_weight

scaler = StandardScaler()

structured_features_scaled = scaler.fit_transform(structured_features)

combined_features = hstack([tfidf_matrix, neighbourhood_encoded, property_type_encoded, amenities_matrix, structured_features_scaled])

cosine_sim = cosine_similarity(combined_features)

def get_similar_listings(listing_id, num_recommendations=3):
    
    listing_index = get_listing_index(listing_id, df_grouped)
    
    if listing_index is None:
        
        return pd.DataFrame()

    sim_scores = list(enumerate(cosine_sim[listing_index]))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_scores = sim_scores[1:num_recommendations + 1]
    
    similar_indices = [i[0] for i in sim_scores]
    
    return df_grouped.iloc[similar_indices]

synthetic_listing_id = 9999

recommended_listings = get_similar_listings(synthetic_listing_id, num_recommendations=3)


KeyError: "Column(s) ['distance_to_center', 'neighbourhood_cleansed'] do not exist"

In [4]:
import lenskit
from lenskit.algorithms.basic import Bias
from lenskit.algorithms.item_knn import ItemItem
from lenskit import batch, topn
import pandas as pd

ratings_data = df[['reviewer_id', 'listing_id', 'synthetic_rating']].rename(columns={
    'reviewer_id': 'user',
    'listing_id': 'item',
    'synthetic_rating': 'rating'
})

bias = Bias()

cf_model = ItemItem(20)  

cf_model.fit(ratings_data)

def get_cf_recommendations(user_id, num_recommendations=5):
   
    recs = batch.recommend(cf_model, ratings_data, user=user_id, n=num_recommendations)
    return df_filtered[df_filtered['listing_id'].isin(recs['item'])]

user_id = 9999  # Replace with an actual user ID
recommendations = get_cf_recommendations(user_id, num_recommendations=5)
print(recommendations)


KeyError: "['synthetic_rating'] not in index"

In [None]:
def hybrid_recommendation(user_id, listing_id, num_recommendations=5, weight_content=0.5, weight_cf=0.5):

    content_based_recs = get_content_based_recommendations(listing_id, num_recommendations)
    
    cf_recs = get_cf_recommendations(user_id, num_recommendations)
    
    combined_recs = pd.concat([content_based_recs, cf_recs]).drop_duplicates(subset='listing_id', keep='first')

    combined_recs['score'] = (weight_content * combined_recs['content_score']) + (weight_cf * combined_recs['cf_score'])
    
    final_recommendations = combined_recs.sort_values(by='score', ascending=False).head(num_recommendations)
    
    return final_recommendations
