In [53]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from ast import literal_eval

In [None]:
listing = pd.read_csv('drive/MyDrive/Colab Notebooks/data/listing_with_visitors.csv')

In [None]:
listing

Unnamed: 0,listing_id,name,host_id,host_since,host_location,host_response_time,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,...,Lock on Bedroom Door,Wheelchair Accessible,Indoor Fireplace,Smoking Allowed,Kitchen,First Aid Kit,Air Conditioning,Family/Kid Friendly,Carbon Monoxide Detector,visitors
0,35001175,Cozy Brickwall Loft in Lower East Side Manhattan,86982839,2016.7.31,"Paris, Ile-de-France, France",a few days or more,0%,f,1,t,...,0,0,0,0,1,0,1,0,1,"[5127407, 176056033, 106157270, 25522657, 3332..."
1,41893558,Cute 4 Bedroom in Hamilton Heights,154741428,2017.10.15,"Paris, Ile-de-France, France",a few days or more,0%,f,1,t,...,0,0,0,0,1,0,1,0,1,[63395736]
2,37102870,SeaSide Hong Kong,107359991,2016.12.15,"Paris, Ile-de-France, France",a few days or more,0%,f,1,t,...,0,0,0,0,1,0,1,0,1,[294492520]
3,34074389,"Vue sur le Bosphore, jardin magnifique, 3 pi__...",3135679,2012.8.2,"Paris, Ile-de-France, France",a few days or more,0%,f,1,t,...,0,0,0,0,1,0,0,0,0,"[245789218, 162337174, 204713793]"
4,10669776,"Chiara House, near Central Station",55114051,2016.1.18,"Paris, Ile-de-France, France",a few days or more,0%,f,1,t,...,0,0,0,0,0,0,0,0,0,"[117957721, 258534733, 236158497, 237131457, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108956,31830688,Near the Eiffel Tower!,6012581,2013.4.20,"Paris, Ile-de-France, France",within an hour,100%,t,1,t,...,0,0,0,0,1,0,0,0,1,"[163570168, 287867129, 123897701, 262214893, 1..."
108957,34729731,Studio cozy,25682201,2015.1.6,"Paris, Ile-de-France, France",within an hour,100%,f,1,t,...,0,0,0,0,1,0,0,0,1,"[98007774, 169671586, 337409991, 31460017]"
108958,44273734,Charming duplex near JARDIN DU LUXEMBOURG,356893366,2020.7.17,"Paris, Ile-de-France, France",within an hour,100%,f,1,t,...,0,0,0,0,0,0,0,0,1,"[155108254, 12033233]"
108959,46673032,Moderne-ancien- jungle urbaine - proche Montma...,2470431,2012.5.26,"Paris, Ile-de-France, France",within an hour,100%,f,1,t,...,0,0,0,0,0,0,0,0,1,"[103362137, 196763688]"


In [None]:
listing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108961 entries, 0 to 108960
Data columns (total 68 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   listing_id                   108961 non-null  int64 
 1   name                         108960 non-null  object
 2   host_id                      108961 non-null  int64 
 3   host_since                   108961 non-null  object
 4   host_location                108847 non-null  object
 5   host_response_time           108961 non-null  object
 6   host_response_rate           108961 non-null  object
 7   host_is_superhost            108961 non-null  object
 8   host_total_listings_count    108961 non-null  int64 
 9   host_has_profile_pic         108961 non-null  object
 10  host_identity_verified       108961 non-null  object
 11  neighbourhood                108961 non-null  object
 12  city                         108961 non-null  object
 13  property_type 

Content-based filtering

In [None]:
listing['visitors'] = listing['visitors'].apply(literal_eval)

In [None]:
df_relevant = listing[['listing_id', 'property_type', 'room_type', 'accommodates', 'bedrooms', 'price', 'city', 'visitors']]

In [None]:
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(df_relevant[['property_type', 'room_type']])

In [None]:
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['property_type', 'room_type']))
encoded_df = pd.concat([df_relevant.reset_index(drop=True), encoded_df], axis=1)
encoded_df = encoded_df.drop(columns=['property_type', 'room_type'])

In [None]:
encoded_df['price'] = encoded_df['price'].replace({'\\$': ''}, regex=True).astype(float)

In [None]:
features = encoded_df.drop(columns=['listing_id', 'city', 'visitors'])

In [None]:
def calculate_precision_recall(recommended_ids, target_visitor_list, all_visited_listings):
    """
    추천된 숙소들의 precision과 recall을 계산합니다.

    Parameters:
    recommended_ids (list): 추천된 숙소들의 listing_id 리스트
    target_visitor_list (list): 대상 숙소를 방문한 방문자 리스트
    all_visited_listings (dict): 각 방문자가 방문한 모든 숙소 리스트

    Returns:
    tuple: (precision, recall) 값
    """
    # 대상 숙소의 방문자들이 방문한 모든 숙소를 relevant items로 간주
    relevant_items = set()
    for visitor in target_visitor_list:
        relevant_items.update(all_visited_listings.get(visitor, []))

    # 추천된 아이템 중 relevant한 아이템 수 계산
    recommended_set = set(recommended_ids)
    relevant_and_recommended = relevant_items.intersection(recommended_set)

    # Precision과 Recall 계산
    precision = len(relevant_and_recommended) / len(recommended_set) if recommended_set else 0
    recall = len(relevant_and_recommended) / len(relevant_items) if relevant_items else 0

    return precision, recall

In [None]:
def get_recommendations_with_eval(listing_id, topn=10):
    """
    주어진 숙소에 대해 city가 같은 topn개의 유사한 숙소를 추천하고 평가 메트릭을 계산합니다.

    Parameters:
    listing_id (int): 대상 숙소의 listing_id
    topn (int): 상위 몇 개의 숙소를 추천할지 결정

    Returns:
    tuple: (추천 숙소 DataFrame, precision, recall)
    """
    # 대상 숙소의 인덱스 및 city 찾기
    idx = listing[listing['listing_id'] == listing_id].index[0]
    target_city = listing.loc[idx, 'city']

    # 같은 city에 속한 숙소만 필터링
    listings_same_city = encoded_df[encoded_df['city'] == target_city].reset_index(drop=True)
    listings_original_same_city = df_relevant[df_relevant['city'] == target_city].reset_index(drop=True)

    # 대상 숙소의 특징 벡터
    target_features = features.loc[idx].values.reshape(1, -1)

    # 같은 city에 있는 숙소들에 대한 특징 벡터
    city_features = listings_same_city.drop(columns=['listing_id', 'city', 'visitors']).values

    # 코사인 유사도 계산
    cosine_sim = cosine_similarity(target_features, city_features).flatten()

    # 유사도 점수와 인덱스를 결합하여 정렬
    sim_scores = sorted(list(enumerate(cosine_sim)), key=lambda x: x[1], reverse=True)

    # 자기 자신 제외한 상위 N개의 추천 숙소 선택
    sim_scores = sim_scores[1:topn+1]
    recommended_indices = [i[0] for i in sim_scores]

    # 추천 숙소 정보
    recommended_listings = listings_original_same_city.iloc[recommended_indices]

    # 평가 메트릭 계산을 위한 데이터 준비
    target_visitors = listing.loc[idx, 'visitors']
    recommended_ids = recommended_listings['listing_id'].tolist()

    # 각 방문자가 방문한 숙소 매핑 생성
    visitor_to_listings = {}
    for _, row in listing.iterrows():
        for visitor in row['visitors']:
            if visitor not in visitor_to_listings:
                visitor_to_listings[visitor] = []
            visitor_to_listings[visitor].append(row['listing_id'])

    # Precision과 Recall 계산
    precision, recall = calculate_precision_recall(
        recommended_ids,
        target_visitors,
        visitor_to_listings
    )

    return recommended_listings[['listing_id', 'property_type', 'room_type', 'accommodates', 'bedrooms', 'price', 'city']], precision, recall


In [None]:
# 추천 및 평가 실행
recommendations, precision, recall = get_recommendations_with_eval(14130853, topn=10)
print("\nRecommended Listings:")
print(recommendations)
print(f"\nPrecision: {precision:.3f}")
print(f"Recall: {recall:.3f}")


Recommended Listings:
      listing_id property_type     room_type  accommodates  bedrooms    price  \
1766    14130853     Apartment  Private room             2         1  258.00$   
388     10133631     Apartment  Private room             2         1  257.00$   
986     24566582     Apartment  Private room             2         1  260.00$   
2273     4193855     Apartment  Private room             2         1  263.00$   
1004     8768806     Apartment  Private room             2         1  253.00$   
2196    15527609     Apartment  Private room             2         1  252.00$   
2204    10514355     Apartment  Private room             2         1  252.00$   
2208    16849098     Apartment  Private room             2         1  252.00$   
1694     2217839     Apartment  Private room             2         1  250.00$   
2007     9397325     Apartment  Private room             2         1  250.00$   

           city  
1766  Hong Kong  
388   Hong Kong  
986   Hong Kong  
2273  Hong Ko

In [None]:
# 추천 및 평가 실행
recommendations, precision, recall = get_recommendations_with_eval(453168, topn=10)
print("\nRecommended Listings:")
print(recommendations)
print(f"\nPrecision: {precision:.3f}")
print(f"Recall: {recall:.3f}")


Recommended Listings:
       listing_id property_type     room_type  accommodates  bedrooms  \
10172     8479088     Apartment  Entire place             6         3   
12367    35891998     Apartment  Entire place             6         3   
13374    23250677     Apartment  Entire place             6         3   
19744     1329663     Apartment  Entire place             6         3   
2482     23184112     Apartment  Entire place             6         3   
14169    41407089     Apartment  Entire place             6         3   
12028    24006276     Apartment  Entire place             6         3   
13457    16609648     Apartment  Entire place             6         3   
13970     5981388     Apartment  Entire place             6         3   
13288     7598590     Apartment  Entire place             6         3   

         price   city  
10172  160.00$  Paris  
12367  160.00$  Paris  
13374  160.00$  Paris  
19744  160.00$  Paris  
2482   163.00$  Paris  
14169  156.00$  Paris  
12028

In [None]:
# 전체 데이터셋에 대한 평균 성능 계산
def calculate_average_metrics(sample_size=100):
    """
    전체 데이터셋에서 무작위로 sample_size개의 숙소를 선택하여 평균 성능을 계산합니다.
    """
    all_precisions = []
    all_recalls = []

    # visitors가 있는 숙소만 선택
    valid_listings = listing[listing['visitors'].apply(len) > 0]['listing_id'].tolist()

    # sample_size개의 숙소를 무작위로 선택
    if len(valid_listings) > sample_size:
        sample_listings = np.random.choice(valid_listings, sample_size, replace=False)
    else:
        sample_listings = valid_listings

    for lid in sample_listings:
        try:
            _, precision, recall = get_recommendations_with_eval(lid, topn=10)
            all_precisions.append(precision)
            all_recalls.append(recall)
        except Exception as e:
            print(f"Error processing listing {lid}: {str(e)}")
            continue

    return {
        'mean_precision': np.mean(all_precisions),
        'mean_recall': np.mean(all_recalls),
        'std_precision': np.std(all_precisions),
        'std_recall': np.std(all_recalls)
    }

# 평균 성능 계산
print("\nCalculating average metrics...")
metrics = calculate_average_metrics(sample_size=100)
print("\nAverage Performance Metrics:")
print(f"Mean Precision: {metrics['mean_precision']:.3f} (±{metrics['std_precision']:.3f})")
print(f"Mean Recall: {metrics['mean_recall']:.3f} (±{metrics['std_recall']:.3f})")


Calculating average metrics...

Average Performance Metrics:
Mean Precision: 0.049 (±0.070)
Mean Recall: 0.116 (±0.254)
