# SI650 Final Project
Michelle Cheng (michengz@umich.edu)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', 150)
import pyterrier as pt
from pyterrier.measures import *
import os
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk-19.jdk/Contents/Home"
import warnings
warnings.filterwarnings('ignore')

#from pyterrier.batchretrieve import TextScorer

## Data Preparation

In [2]:
df_reviews = pd.read_csv("reviews.csv")
df_reviews.head(2)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2539,55688172,2015-12-04,25160947,Peter,Great host
1,2539,97474898,2016-08-27,91513326,Liz,Nice room for the price. Great neighborhood. John was very accommodating. Bottles of water in the room were a nice touch and very much appreciated.


In [3]:
df_listings = pd.read_csv("listings.csv")
df_listings = df_listings.rename(columns={"id": "listing_id"})
df_listings.head(1)

Unnamed: 0,listing_id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2539,https://www.airbnb.com/rooms/2539,20220907064715,2022-09-07,city scrape,Clean & quiet apt home by the park,"Renovated apt home in elevator building.<br /><br /><b>The space</b><br />Spacious, renovated, and clean apt home, one block to F train, 25 minute...",Close to Prospect Park and Historic Ditmas Park,https://a0.muscache.com/pictures/3949d073-a02e-4ebc-aa9c-ac74f00eaa1f.jpg,2787,...,5.0,4.75,4.88,,f,9,1,6,2,0.11


In [4]:
print(f'df_reviews shape:{df_reviews.shape}')
print(f'df_listings shape:{df_listings.shape}')
print(f'df_reviews unique listings:{df_reviews["listing_id"].nunique()}')

df_reviews shape:(1064458, 6)
df_listings shape:(39881, 75)
df_reviews unique listings:31519


In [5]:
# Concatenating Comments by Listings
df_reviews_concatenated = pd.DataFrame()
df_reviews['comments'] = df_reviews['comments'].apply(lambda x: str(x))
df_reviews_concatenated['listing_id'] = df_reviews.groupby(['listing_id'])['comments'].apply('\n'.join).index
df_reviews_concatenated['comments'] = df_reviews.groupby(['listing_id'])['comments'].apply('\n'.join).tolist()
print(df_reviews_concatenated.shape)
df_reviews_concatenated.head(3)

(31519, 2)


Unnamed: 0,listing_id,comments
0,2539,Great host \nNice room for the price. Great neighborhood. John was very accommodating. Bottles of water in the room were a nice touch and very muc...
1,2595,"Notre séjour de trois nuits.\r<br/>Nous avons apprécier L'appartement qui est très bien situé. Agréable, propre et bien soigné. C'est idéal pour u..."
2,5121,"Simple place, super nice guy.\nGreat guy with a basic room. Clean sheets and towels. No hangs-ups, come and go as you please. The neighborhood was..."


In [6]:
df_joined = df_listings.join(df_reviews_concatenated.set_index('listing_id'), on = 'listing_id')
df = df_joined[['listing_id','name','description','neighborhood_overview','comments','neighbourhood',
                'neighbourhood_cleansed','neighbourhood_group_cleansed','property_type','room_type',
                'accommodates','beds','amenities','price','review_scores_rating','review_scores_cleanliness', 
                'review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value', 
                'host_response_rate','listing_url','picture_url','host_url','host_name']]
df[['name', 'description','neighborhood_overview','comments']] = df[['name', 'description','neighborhood_overview','comments']].fillna('')
df['text'] = df['name'] + '; ' + df['description'] + '; ' + df['neighborhood_overview']+ '; ' + df['comments']

df = df.rename(columns={'listing_id':"docno"})
df['docno'] = df['docno'].apply(lambda x: str(x))

print(f'df shape: {df.shape}')
print(f'Number of nulls in text column: {df["text"].isnull().sum()}')
df.head(1)

df shape: (39881, 26)
Number of nulls in text column: 0


Unnamed: 0,docno,name,description,neighborhood_overview,comments,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,property_type,room_type,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_response_rate,listing_url,picture_url,host_url,host_name,text
0,2539,Clean & quiet apt home by the park,"Renovated apt home in elevator building.<br /><br /><b>The space</b><br />Spacious, renovated, and clean apt home, one block to F train, 25 minute...",Close to Prospect Park and Historic Ditmas Park,Great host \nNice room for the price. Great neighborhood. John was very accommodating. Bottles of water in the room were a nice touch and very muc...,"Brooklyn , New York, United States",Kensington,Brooklyn,Private room in rental unit,Private room,...,5.0,5.0,4.75,4.88,100%,https://www.airbnb.com/rooms/2539,https://a0.muscache.com/pictures/3949d073-a02e-4ebc-aa9c-ac74f00eaa1f.jpg,https://www.airbnb.com/users/show/2787,John,"Clean & quiet apt home by the park; Renovated apt home in elevator building.<br /><br /><b>The space</b><br />Spacious, renovated, and clean apt h..."


In [7]:
def airbnb_filter(location, room_type, accommodates, beds):
    return df[(df['neighbourhood_group_cleansed'] == location) & #'Manhattan'
              (df['room_type'] == room_type) & #'Private room'
              (df['accommodates'] == accommodates) & #2
              (df['beds'] == beds)] #1

In [8]:
df_filtered = airbnb_filter('Manhattan', 'Private room', 2, 1)
df_filtered.shape

(2858, 26)

In [9]:
df_filtered.columns

Index(['docno', 'name', 'description', 'neighborhood_overview', 'comments',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'property_type', 'room_type',
       'accommodates', 'beds', 'amenities', 'price', 'review_scores_rating',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'host_response_rate', 'listing_url',
       'picture_url', 'host_url', 'host_name', 'text'],
      dtype='object')

## Pyterrier Indexing

In [11]:
if not pt.started():
    pt.init(tqdm = 'notebook', 
            logging='ERROR', 
            boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

terrier-prf -SNAPSHOT jar not found, downloading to /Users/michellecheng/.pyterrier...
100% [..............................................................................] 21842 / 21842Done


PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [12]:
docs_df = df_filtered[['docno','text','description','amenities','review_scores_rating','review_scores_cleanliness','review_scores_value','review_scores_communication']]
docs_df['docno'] = docs_df['docno'].apply(lambda x: str(x))
docs_df['review_scores_rating'] = docs_df['review_scores_rating'].apply(lambda x: str(x))
docs_df['review_scores_cleanliness'] = docs_df['review_scores_cleanliness'].apply(lambda x: str(x))
docs_df['review_scores_value'] = docs_df['review_scores_value'].apply(lambda x: str(x))
docs_df['review_scores_communication'] = docs_df['review_scores_communication'].apply(lambda x: str(x))
# docs_df['description'] = docs_df['description'].apply(lambda x: x[:50])
print(docs_df.shape)
docs_df.head(2)

(2858, 8)


Unnamed: 0,docno,text,description,amenities,review_scores_rating,review_scores_cleanliness,review_scores_value,review_scores_communication
7,5178,Large Furnished Room Near B'way　; Please don’t expect the luxury here just a basic room in the center of Manhattan.<br /><br /><b>The space</b><br...,"Please don’t expect the luxury here just a basic room in the center of Manhattan.<br /><br /><b>The space</b><br />You will use one large, furnish...","[""Body soap"", ""Microwave"", ""Laundromat nearby"", ""Mini fridge"", ""Essentials"", ""Lock on bedroom door"", ""Hot water"", ""Heating"", ""Smoke alarm"", ""Free ...",4.23,3.75,4.4,4.46
11,45936,Couldn't Be Closer To Columbia Uni; <b>The space</b><br />Hi and Thank you for checking out this unique and lovely space! <br /> <br />If you want...,<b>The space</b><br />Hi and Thank you for checking out this unique and lovely space! <br /> <br />If you want to be in an absolutely safe and sup...,"[""Breakfast"", ""Cable TV"", ""Elevator"", ""Heating"", ""Kitchen"", ""Dryer"", ""Air conditioning"", ""Washer"", ""Wifi"", ""Long term stays allowed"", ""TV with sta...",4.65,4.47,4.61,4.87


In [13]:
index_dir = './airbnb_index'
if not os.path.exists(index_dir + "/data.properties"):
    pt.set_property("termpipelines", "Stopwords,PorterStemmer")
    indexer = pt.DFIndexer(index_dir, overwrite = True, stemmer = 'PorterStemmer',stopwords = 'Stopwords', tokeniser="UTFTokeniser")
    #indexer.setProperty("termpipelines", "Stopwords, PorterStemmer")
    #   indexer.setProperty("termpipelines","Stopwords,PorterStemmer")
    indexer.setProperties(**{
            "indexer.meta.forward.keylens":"26,2048",
            'metaindex.compressed.crop.long' : 'true'
        })
    index_ref = indexer.index(docs_df["text"], docs_df["docno"], docs_df["description"],docs_df["amenities"],docs_df["review_scores_rating"],docs_df["review_scores_cleanliness"],docs_df["review_scores_value"],docs_df["review_scores_communication"])
    
else:
    index_ref = pt.IndexRef.of(index_dir + "/data.properties")

index = pt.IndexFactory.of(index_ref)

In [14]:
print(index.getCollectionStatistics().toString())

Number of documents: 2858
Number of terms: 52843
Number of postings: 892687
Number of fields: 0
Number of tokens: 2403066
Field names: []
Positions:   false



## Preparing for Evaluation - Annotation

In [15]:
# Loading Queries
queries_df = pd.read_csv("queries.csv")
print(queries_df.shape)
queries_df.head()

(20, 2)


Unnamed: 0,qid,query
0,1,cozy and peaceful loft with modern style for minimalists
1,2,cheap apartment for student staying for a month
2,3,nice place to spend holidays with cute christmas decorations
3,4,close to the subway and supermarket but affordable for young adults
4,5,apartment with hardwood floors that feels like home


In [16]:
def remove_punc(q):
    return "".join([x if x!=',' else "" for x in q])
queries_df['query'] = queries_df['query'].apply(remove_punc)

In [17]:
# Retrieving documents using multiple models
models = ['BM25','PL2','TF_IDF','DPH']
df_retrieved = pd.DataFrame()
for model in models:
    br = pt.BatchRetrieve(index, wmodel = model, num_results = 100)
    temp_df = br(queries_df)
    df_retrieved = pd.concat([df_retrieved, temp_df])
df_retrieved

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,249,4766284,0,16.407905,cozy and peaceful loft with modern style for minimalists
1,1,1278,28989914,1,15.469657,cozy and peaceful loft with modern style for minimalists
2,1,2424,566657891382493833,2,14.458052,cozy and peaceful loft with modern style for minimalists
3,1,789,15141938,3,14.430033,cozy and peaceful loft with modern style for minimalists
4,1,262,4965152,4,14.183009,cozy and peaceful loft with modern style for minimalists
...,...,...,...,...,...,...
1995,20,185,3654510,95,4.519015,has elevator and warm during the winter family with elders
1996,20,2043,49792993,96,4.512133,has elevator and warm during the winter family with elders
1997,20,973,19568889,97,4.491938,has elevator and warm during the winter family with elders
1998,20,544,9698992,98,4.490586,has elevator and warm during the winter family with elders


In [18]:
# Sort and retrieve the top 100 docs
df_retrieved = df_retrieved.join(df_filtered[['docno','text','listing_url','price','host_name']].set_index('docno'), on = 'docno')
df_retrieved_cleansed = df_retrieved.drop_duplicates(subset=['qid','docno'], keep="first").drop_duplicates(subset=['qid','score','host_name'], keep="first")
df_retrieved_cleansed['qid'] = df_retrieved_cleansed['qid'].apply(lambda x:int(x))
df_annotation = df_retrieved_cleansed.sort_values(by=['qid','rank']).groupby('qid').head(100)
print(df_annotation.shape)
df_annotation.head(3)

(2000, 10)


Unnamed: 0,qid,docid,docno,rank,score,query,text,listing_url,price,host_name
0,1,249,4766284,0,16.407905,cozy and peaceful loft with modern style for minimalists,Large size room in loft - for females and LGBTQ; Large size room we are only renting to females/ LGTBQ friendly . Very close to public transport a...,https://www.airbnb.com/rooms/4766284,$80.00,Angie
1,1,1278,28989914,1,15.469657,cozy and peaceful loft with modern style for minimalists,Private room in bright and spacious loft; Welcome to our bright and cozy shared 2 bed/2 bath loft apartment in the heart of East Village. This is ...,https://www.airbnb.com/rooms/28989914,$130.00,Victoria
2,1,2424,566657891382493833,2,14.458052,cozy and peaceful loft with modern style for minimalists,"Clean & Quiet light filled Queen Bedroom; Private bedroom with a queen-sized bed that sleeps two in a 2nd floor brownstone . Located in a quiet, s...",https://www.airbnb.com/rooms/566657891382493833,$135.00,Ali


In [19]:
# Export annotation file
df_annotation.to_csv("annotation.csv", index = False)

In [20]:
# Query-doc pairs
n = 1
docs_df_dup = docs_df
while n < 20:
    docs_df_dup = pd.concat([docs_df_dup,docs_df], axis = 0)
    n += 1
queries_df_dup = queries_df.loc[queries_df.index.repeat(len(docs_df))]
query_doc_df = pd.concat([queries_df_dup.reset_index(),docs_df_dup.reset_index()],axis = 1)[['qid','query','docno','text']]

# Load annotated labels
annotated_df = pd.read_csv('annotated.csv')
annotated_df['docno'] = annotated_df['docno'].apply(lambda x: str(x))
annotated_df['qid'] = annotated_df['qid'].apply(lambda x: str(x))

# Construct qrels dataframe
qrels = query_doc_df.merge(annotated_df, on=['docno','qid'],how = 'left')[['qid','docno','label']]
qrels['label'] = qrels['label'].fillna(1)
qrels['label'] = qrels['label'].apply(lambda x: 1 if x==0 else x)
qrels['label'] = qrels['label'].astype(int)
qrels

Unnamed: 0,qid,docno,label
0,1,5178,1
1,1,45936,1
2,1,54466,1
3,1,9704,1
4,1,54860,1
...,...,...,...
57155,20,594782494398949034,4
57156,20,40986181,1
57157,20,1185357,1
57158,20,35372621,1


In [21]:
# Train-test Split
# topics_train = queries_df[:10]
# topics_test = queries_df[10:]
# qrels_train = qrels[qrels['qid'].apply(lambda x:int(x)) < 11]
# qrels_test = qrels[qrels['qid'].apply(lambda x:int(x)) > 10]

In [22]:
# Train-test Split
from sklearn.model_selection import train_test_split

tr_va_topics, test_topics = train_test_split(queries_df, test_size=0.5, random_state=42)
train_topics, valid_topics =  train_test_split(tr_va_topics, test_size=0.33, random_state=42)

### Baseline Model

In [23]:
bm25 = pt.BatchRetrieve(index, wmodel = 'BM25')

### Naive System

In [24]:
def naive(keyFreq, posting, entryStats, collStats):
    dl = posting.getDocumentLength() # document length
    return dl

In [25]:
naive_model = pt.BatchRetrieve(index, wmodel = naive)

In [26]:
pt.Experiment(
    [bm25,naive_model],
    test_topics,
    qrels,
    eval_metrics=["map", "ndcg",nDCG@5,nDCG@10,nDCG@50,nDCG@100],
    names=["BM25","Naive"])

Unnamed: 0,name,map,ndcg,nDCG@5,nDCG@10,nDCG@50,nDCG@100
0,BM25,0.341183,0.451794,0.621385,0.599905,0.660884,0.686614
1,Naive,0.341183,0.402772,0.297628,0.279068,0.32052,0.391667


In [27]:
pd.DataFrame({'relevance':qrels['label'].value_counts().index,'count':qrels['label'].value_counts()})

Unnamed: 0,relevance,count
1,1,55570
3,3,515
4,4,491
2,2,424
5,5,160


In [28]:
queries_df

Unnamed: 0,qid,query
0,1,cozy and peaceful loft with modern style for minimalists
1,2,cheap apartment for student staying for a month
2,3,nice place to spend holidays with cute christmas decorations
3,4,close to the subway and supermarket but affordable for young adults
4,5,apartment with hardwood floors that feels like home
5,6,fun place for families with children safe and quiet neighborhood
6,7,aesthetic interior design host with great taste
7,8,perfect location for tourists that is worth the money and easy to find
8,9,apartment that looks new with great city view
9,10,friendly host that responds quickly and is trustworthy


# Models

### Query Expansion Techiniques Comparison

In [44]:
bm25 = pt.BatchRetrieve(index, wmodel = 'BM25')
bm25_bo1 = pt.BatchRetrieve(index, wmodel = 'BM25', controls={"qe":"on", "qemodel" : "Bo1"})
bm25_rm3 = bm25 >> pt.rewrite.RM3(index) >> bm25
bm25_kl = bm25 >> pt.rewrite.KLQueryExpansion(index) >> bm25

In [45]:
pt.Experiment(
    [bm25, bm25_bo1, bm25_rm3, bm25_kl],
    test_topics,
    qrels,
    eval_metrics=["ndcg",nDCG@5,nDCG@10,nDCG@50,nDCG@100],
    names=["BM25", 
           "BM25(Bo1)",
           "BM25(RM3)",
           "BM25(KL)"])

Unnamed: 0,name,ndcg,nDCG@5,nDCG@10,nDCG@50,nDCG@100
0,BM25,0.451794,0.621385,0.599905,0.660884,0.686614
1,BM25(Bo1),0.455838,0.586473,0.575748,0.635612,0.67928
2,BM25(RM3),0.453532,0.579263,0.550944,0.587094,0.650791
3,BM25(KL),0.457631,0.626754,0.604719,0.659981,0.690862


### Custom BM25

In [47]:
def bm25_custom_weighting(keyFreq, posting, entryStats, collStats):

    N = collStats.getNumberOfDocuments() # number of documents
    df = entryStats.getDocumentFrequency() # number of documents that contain the term
    tf = posting.getFrequency() # term frequency in document
    dl = posting.getDocumentLength() # document length
    avdl = collStats.getAverageDocumentLength() # average document length
    qtf = keyFreq # term frequency in query
    
    mf = entryStats.getMaxFrequencyInDocuments() # maximum in-document term frequency of the term among all documents
    tt = entryStats.getFrequency() # total number of occurrences of the term
    W = collStats.getNumberOfTokens() # total number of tokens
    avtf = (tt/df)/(W/N) # average term frequency
    
    k1 = 1.2
    k3 = 8
    b = 0.75
    a = 0.5
    c = 1

    idf = np.log((N-df+0.5)/(df+0.5))
    normalized_qtf = ((k3+1)*qtf)/(k3+qtf)
    normalized_tf  = ((k1+1)*tf)/(k1*(3-(a+b+c)+b*(dl/avdl)+c*(avtf/(mf-avtf))+a*(avtf/tf))+tf) 
    
    score =  idf * normalized_qtf * normalized_tf
    
    return score

In [81]:
custom_bm25 = pt.BatchRetrieve(index, wmodel = bm25_custom_weighting)
custom_bm25_kl = custom_bm25 >> pt.rewrite.KLQueryExpansion(index) >> custom_bm25
custom_bm25_rm3 = custom_bm25 >> pt.rewrite.RM3(index) >> custom_bm25
custom_bm25_bo1 = bm25_bo1 = pt.BatchRetrieve(index, wmodel = bm25_custom_weighting, controls={"qe":"on", "qemodel" : "Bo1"})
bm25_and_custom = pt.Experiment(
                        [bm25, bm25_bo1, bm25_rm3, bm25_kl, custom_bm25,custom_bm25_bo1, custom_bm25_rm3,custom_bm25_kl],
                        test_topics,
                        qrels,
                        eval_metrics=["ndcg",nDCG@5,nDCG@10,nDCG@50,nDCG@100],
                        names=['BM25','BM25(Bo1)','BM25(RM3)','BM25(KL)','Custom BM25','Custom BM25(Bo1)','Custom BM25(RM3)','Custom BM25(KL)'])
bm25_and_custom

Unnamed: 0,name,ndcg,nDCG@5,nDCG@10,nDCG@50,nDCG@100
0,BM25,0.451794,0.621385,0.599905,0.660884,0.686614
1,BM25(Bo1),0.456239,0.65606,0.600697,0.632943,0.682405
2,BM25(RM3),0.453532,0.579263,0.550944,0.587094,0.650791
3,BM25(KL),0.457631,0.626754,0.604719,0.659981,0.690862
4,Custom BM25,0.452461,0.66655,0.632331,0.674363,0.697204
5,Custom BM25(Bo1),0.456239,0.65606,0.600697,0.632943,0.682405
6,Custom BM25(RM3),0.45334,0.597681,0.581961,0.59284,0.655319
7,Custom BM25(KL),0.457431,0.673637,0.627158,0.653197,0.69626


### Learning to Rank

In [50]:
import re
def check_amenities(row):
    s = re.sub(r'[^\w\s]','',row['amenities'])
    amenities_list = s.strip('][').split(' ')
    score = 0
    for amenity in amenities_list:
        if amenity in row['query']:
            score += 1
    return score

In [51]:
def get_ratings(row):
    total = 0
    for i in ['review_scores_rating','review_scores_cleanliness','review_scores_value','review_scores_communication']:
        try: 
            total += float(row[i])
        except:
            total += 0
    if str(total).isnumeric() == False:
        total = 0
#     print(total)
    return total

In [85]:
# Features
ltr_feats = (custom_bm25_bo1) >> pt.text.get_text(index, ["description",'amenities','review_scores_rating','review_scores_cleanliness','review_scores_value','review_scores_communication']) >> (
    pt.transformer.IdentityTransformer()
    ** # Score of Description
    pt.text.scorer(body_attr="description", takes='docs', wmodel=bm25_custom_weighting) 
    ** # Abstract Coordinate Match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
    ** # Amenities
    pt.apply.doc_score(check_amenities)
    ** # Get ratings
    pt.apply.doc_score(get_ratings)
)
fnames=["BM25", 'description', "CoordinateMatch",'amenities','ratings']

# Features with Bo1 Query Expansion
ltr_feats_bo1 = (custom_bm25_bo1) >> pt.text.get_text(index, ["description",'amenities','review_scores_rating','review_scores_cleanliness','review_scores_value','review_scores_communication']) >> (
    pt.transformer.IdentityTransformer()
    ** # Score of Description
    pt.text.scorer(body_attr="description", takes='docs', wmodel=bm25_custom_weighting) 
    ** # Abstract Coordinate Match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
    ** # Amenities
    pt.apply.doc_score(check_amenities)
    ** # Get ratings
    pt.apply.doc_score(get_ratings)
)
fnames=["BM25", 'description', "CoordinateMatch",'amenities','ratings']

# Features with RM3 Query Expansion
ltr_feats_rm3 = (custom_bm25_rm3) >> pt.text.get_text(index, ["description",'amenities','review_scores_rating','review_scores_cleanliness','review_scores_value','review_scores_communication']) >> (
    pt.transformer.IdentityTransformer()
    ** # Score of Description
    pt.text.scorer(body_attr="description", takes='docs', wmodel=bm25_custom_weighting) 
    ** # Abstract Coordinate Match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
    ** # Amenities
    pt.apply.doc_score(check_amenities)
    ** # Get ratings
    pt.apply.doc_score(get_ratings)
)
fnames=["BM25", 'description', "CoordinateMatch",'amenities','ratings']


# Features with KL Query Expansion
ltr_feats_kl = (custom_bm25_kl) >> pt.text.get_text(index, ["description",'amenities','review_scores_rating','review_scores_cleanliness','review_scores_value','review_scores_communication']) >> (
    pt.transformer.IdentityTransformer()
    ** # Score of Description
    pt.text.scorer(body_attr="description", takes='docs', wmodel=bm25_custom_weighting) 
    ** # Abstract Coordinate Match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
    ** # Amenities
    pt.apply.doc_score(check_amenities)
    ** # Get ratings
    pt.apply.doc_score(get_ratings)
)
fnames=["BM25", 'description', "CoordinateMatch",'amenities','ratings']

In [65]:
qrels['label'] = qrels['label'].apply(lambda x: float(x))

In [66]:
### COORDINATE ASCENT ###
import fastrank
train_request = fastrank.TrainRequest.coordinate_ascent()

params = train_request.params
params.init_random = True
params.normalize = True
params.seed = 1234567

ca_pipe = ltr_feats >> pt.ltr.apply_learned_model(train_request, form='fastrank')
ca_pipe.fit(train_topics, qrels)

---------------------------
Training starts...
---------------------------
[+] Random restart #1/5...
[+] Random restart #3/5...
[+] Random restart #2/5...
[+] Random restart #4/5...
[+] Random restart #5/5...
Shuffle features and optimize!
----------------------------------------
   2|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   0|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   1|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   4|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   3|Feature         |   Weight|     NDCG
----------------------------------------
   3|0         

In [67]:
### RANDOM FOREST ###
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=400, verbose=1, random_state=42, n_jobs=2)

rf_pipe = ltr_feats >> pt.ltr.apply_learned_model(rf)
rf_pipe.fit(train_topics, qrels)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    1.0s finished


In [68]:
### SUPPORT VECTOR REGRESSOR ###
from sklearn.svm import SVR
svr = SVR()#C=1.0, epsilon=0.2

svr_pipe = ltr_feats >> pt.ltr.apply_learned_model(svr)
svr_pipe.fit(train_topics, qrels)

In [69]:
### LambdaMART (LightGBM) ###
import lightgbm as lgb

# this configures LightGBM as LambdaMART
lmart_l = lgb.LGBMRanker(
    task="train",
    silent=False,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=1,
    max_bin=255,
    num_leaves=31,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[10],
    ndcg_at=[10],
    eval_at=[10],
    learning_rate= .1,
    importance_type="gain",
    num_iterations=100,
    early_stopping_rounds=5
)

lmart_l_pipe = ltr_feats >> pt.ltr.apply_learned_model(lmart_l, form="ltr", fit_kwargs={'eval_at':[10]})
lmart_l_pipe.fit(train_topics, qrels, valid_topics, qrels)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 4
[1]	valid_0's ndcg@10: 0.196818
Training until validation scores don't improve for 5 rounds
[2]	valid_0's ndcg@10: 0.379753
[3]	valid_0's ndcg@10: 0.442778
[4]	valid_0's ndcg@10: 0.452685
[5]	valid_0's ndcg@10: 0.433009
[6]	valid_0's ndcg@10: 0.405119
[7]	valid_0's ndcg@10: 0.367576
[8]	valid_0's ndcg@10: 0.371023
[9]	valid_0's ndcg@10: 0.377803
Early stopping, best iteration is:
[4]	valid_0's ndcg@10: 0.452685


In [70]:
# LTR FEATUERES WITHOUT QUERY EXPANSION
qrels['label'] = qrels['label'].apply(lambda x: int(x))

ltr_results = pt.Experiment(
                [bm25,naive_model, ca_pipe, rf_pipe, svr_pipe, lmart_l_pipe],
                test_topics,
                qrels,
                eval_metrics=["ndcg",nDCG@5,nDCG@10,nDCG@50,nDCG@100],
                names=["BM25 (Baseline)",
                       "Naive (Baseline)",
                       "Coordinate Ascent",
                       "Random Forest",
                       "Support Vector Regressor",
                       "LambdaMART"])
ltr_results

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.1s finished


Unnamed: 0,name,ndcg,nDCG@5,nDCG@10,nDCG@50,nDCG@100
0,BM25 (Baseline),0.451794,0.621385,0.599905,0.660884,0.686614
1,Naive (Baseline),0.402772,0.297628,0.279068,0.32052,0.391667
2,Coordinate Ascent,0.460291,0.667694,0.655427,0.668675,0.711292
3,Random Forest,0.448403,0.53386,0.51193,0.546447,0.616561
4,Support Vector Regressor,0.447712,0.568808,0.577201,0.586681,0.619124
5,LambdaMART,0.451692,0.592583,0.593206,0.564627,0.628323


In [71]:
### Trainig LTR models with features with Bo1 query expansion
qrels['label'] = qrels['label'].apply(lambda x: float(x))

ca_pipe_bo1 = ltr_feats_bo1 >> pt.ltr.apply_learned_model(train_request, form='fastrank')
ca_pipe_bo1.fit(train_topics, qrels)

rf_pipe_bo1 = ltr_feats_bo1 >> pt.ltr.apply_learned_model(rf)
rf_pipe_bo1.fit(train_topics, qrels)

svr_pipe_bo1 = ltr_feats_bo1 >> pt.ltr.apply_learned_model(svr)
svr_pipe_bo1.fit(train_topics, qrels)

lmart_l_pipe_bo1 = ltr_feats_bo1 >> pt.ltr.apply_learned_model(lmart_l, form="ltr", fit_kwargs={'eval_at':[10]})
lmart_l_pipe_bo1.fit(train_topics, qrels, valid_topics, qrels)

---------------------------
Training starts...
---------------------------
[+] Random restart #1/5...
[+] Random restart #3/5...
[+] Random restart #2/5...
[+] Random restart #4/5...
[+] Random restart #5/5...
Shuffle features and optimize!
----------------------------------------
   0|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   2|Feature         |   Weight|     NDCG
Shuffle features and optimize!
----------------------------------------
   1|Feature         |   Weight|     NDCG
----------------------------------------
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   4|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   3|Feature         |   Weight|     NDCG
----------------------------------------
   2|4         

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.9s finished


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 4
[1]	valid_0's ndcg@10: 0.196818
Training until validation scores don't improve for 5 rounds
[2]	valid_0's ndcg@10: 0.379753
[3]	valid_0's ndcg@10: 0.442778
[4]	valid_0's ndcg@10: 0.452685
[5]	valid_0's ndcg@10: 0.433009
[6]	valid_0's ndcg@10: 0.405119
[7]	valid_0's ndcg@10: 0.367576
[8]	valid_0's ndcg@10: 0.371023
[9]	valid_0's ndcg@10: 0.377803
Early stopping, best iteration is:
[4]	valid_0's ndcg@10: 0.452685


In [72]:
qrels['label'] = qrels['label'].apply(lambda x: int(x))

ltr_results_bo1 = pt.Experiment(
                [bm25,naive_model, ca_pipe_bo1, rf_pipe_bo1, svr_pipe_bo1, lmart_l_pipe_bo1],
                test_topics,
                qrels,
                eval_metrics=["ndcg",nDCG@5,nDCG@10,nDCG@50,nDCG@100],
                names=["BM25 (Baseline)",
                       "Naive (Baseline)",
                       "Coordinate Ascent(Bo1)",
                       "Random Forest(Bo1)",
                       "Support Vector Regressor(Bo1)",
                       "LambdaMART(Bo1)"])
ltr_results_bo1

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.1s finished


Unnamed: 0,name,ndcg,nDCG@5,nDCG@10,nDCG@50,nDCG@100
0,BM25 (Baseline),0.451794,0.621385,0.599905,0.660884,0.686614
1,Naive (Baseline),0.402772,0.297628,0.279068,0.32052,0.391667
2,Coordinate Ascent(Bo1),0.459859,0.664883,0.668034,0.663036,0.70366
3,Random Forest(Bo1),0.448403,0.53386,0.51193,0.546447,0.616561
4,Support Vector Regressor(Bo1),0.447712,0.568808,0.577201,0.586681,0.619124
5,LambdaMART(Bo1),0.451692,0.592583,0.593206,0.564627,0.628323


In [73]:
### Trainig LTR models with features with KL query expansion
qrels['label'] = qrels['label'].apply(lambda x: float(x))

ca_pipe_kl = ltr_feats_kl >> pt.ltr.apply_learned_model(train_request, form='fastrank')
ca_pipe_kl.fit(train_topics, qrels)

rf_pipe_kl = ltr_feats_kl >> pt.ltr.apply_learned_model(rf)
rf_pipe_kl.fit(train_topics, qrels)

svr_pipe_kl = ltr_feats_kl >> pt.ltr.apply_learned_model(svr)
svr_pipe_kl.fit(train_topics, qrels)

lmart_l_pipe_kl = ltr_feats_kl >> pt.ltr.apply_learned_model(lmart_l, form="ltr", fit_kwargs={'eval_at':[10]})
lmart_l_pipe_kl.fit(train_topics, qrels, valid_topics, qrels)

---------------------------
Training starts...
---------------------------
[+] Random restart #1/5...
[+] Random restart #3/5...
[+] Random restart #2/5...
[+] Random restart #4/5...
[+] Random restart #5/5...
Shuffle features and optimize!
----------------------------------------
   2|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   0|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   1|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   3|Feature         |   Weight|     NDCG
----------------------------------------
   3|0               |    0.000|    0.673
Shuffle features and optimize!
----------------------------------------
   4|Feature         |   Weight|     NDCG
--------------

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    1.3s finished


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 773
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 4
[1]	valid_0's ndcg@10: 0.52522
Training until validation scores don't improve for 5 rounds
[2]	valid_0's ndcg@10: 0.49801
[3]	valid_0's ndcg@10: 0.504583
[4]	valid_0's ndcg@10: 0.545222
[5]	valid_0's ndcg@10: 0.608157
[6]	valid_0's ndcg@10: 0.576901
[7]	valid_0's ndcg@10: 0.594988
[8]	valid_0's ndcg@10: 0.601787
[9]	valid_0's ndcg@10: 0.601212
[10]	valid_0's ndcg@10: 0.573139
Early stopping, best iteration is:
[5]	valid_0's ndcg@10: 0.608157


In [74]:
# LTR FEATUERES WITH KL QUERY EXPANSION
qrels['label'] = qrels['label'].apply(lambda x: int(x))

ltr_results_kl = pt.Experiment(
                [bm25,naive_model, ca_pipe_kl, rf_pipe_kl, svr_pipe_kl, lmart_l_pipe_kl],
                test_topics,
                qrels,
                eval_metrics=["ndcg",nDCG@5,nDCG@10,nDCG@50,nDCG@100],
                names=["BM25 (Baseline)",
                       "Naive (Baseline)",
                       "Coordinate Ascent(KL)",
                       "Random Forest(KL)",
                       "Support Vector Regressor(KL)",
                       "LambdaMART(KL)"])
ltr_results_kl

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.1s finished


Unnamed: 0,name,ndcg,nDCG@5,nDCG@10,nDCG@50,nDCG@100
0,BM25 (Baseline),0.451794,0.621385,0.599905,0.660884,0.686614
1,Naive (Baseline),0.402772,0.297628,0.279068,0.32052,0.391667
2,Coordinate Ascent(KL),0.459852,0.685672,0.653443,0.668138,0.712931
3,Random Forest(KL),0.450122,0.568094,0.568575,0.577722,0.627474
4,Support Vector Regressor(KL),0.447849,0.64848,0.614642,0.593034,0.615083
5,LambdaMART(KL),0.455489,0.604952,0.613657,0.63828,0.681271


In [86]:
### Trainig LTR models with features with RM3 query expansion
qrels['label'] = qrels['label'].apply(lambda x: float(x))

ca_pipe_rm3 = ltr_feats_rm3 >> pt.ltr.apply_learned_model(train_request, form='fastrank')
ca_pipe_rm3.fit(train_topics, qrels)

rf_pipe_rm3 = ltr_feats_rm3 >> pt.ltr.apply_learned_model(rf)
rf_pipe_rm3.fit(train_topics, qrels)

svr_pipe_rm3 = ltr_feats_rm3 >> pt.ltr.apply_learned_model(svr)
svr_pipe_rm3.fit(train_topics, qrels)

lmart_l_pipe_rm3 = ltr_feats_rm3 >> pt.ltr.apply_learned_model(lmart_l, form="ltr", fit_kwargs={'eval_at':[10]})
lmart_l_pipe_rm3.fit(train_topics, qrels, valid_topics, qrels)

---------------------------
Training starts...
---------------------------
[+] Random restart #1/5...
[+] Random restart #3/5...
[+] Random restart #4/5...
[+] Random restart #2/5...
[+] Random restart #5/5...
Shuffle features and optimize!
----------------------------------------
   0|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   2|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   1|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   4|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   3|Feature         |   Weight|     NDCG
----------------------------------------
   2|4         

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    1.1s finished


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 776
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 4
[1]	valid_0's ndcg@10: 0.356615
Training until validation scores don't improve for 5 rounds
[2]	valid_0's ndcg@10: 0.441935
[3]	valid_0's ndcg@10: 0.431846
[4]	valid_0's ndcg@10: 0.388179
[5]	valid_0's ndcg@10: 0.389181
[6]	valid_0's ndcg@10: 0.304658
[7]	valid_0's ndcg@10: 0.366243
Early stopping, best iteration is:
[2]	valid_0's ndcg@10: 0.441935


In [87]:
# LTR FEATUERES WITH KL QUERY EXPANSION
qrels['label'] = qrels['label'].apply(lambda x: int(x))

ltr_results_rm3 = pt.Experiment(
                [bm25,naive_model, ca_pipe_rm3, rf_pipe_rm3, svr_pipe_rm3, lmart_l_pipe_rm3],
                test_topics,
                qrels,
                eval_metrics=["ndcg",nDCG@5,nDCG@10,nDCG@50,nDCG@100],
                names=["BM25 (Baseline)",
                       "Naive (Baseline)",
                       "Coordinate Ascent(KL)",
                       "Random Forest(KL)",
                       "Support Vector Regressor(KL)",
                       "LambdaMART(KL)"])
ltr_results_rm3

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.1s finished


Unnamed: 0,name,ndcg,nDCG@5,nDCG@10,nDCG@50,nDCG@100
0,BM25 (Baseline),0.451794,0.621385,0.599905,0.660884,0.686614
1,Naive (Baseline),0.402772,0.297628,0.279068,0.32052,0.391667
2,Coordinate Ascent(KL),0.454817,0.639567,0.595739,0.608012,0.653479
3,Random Forest(KL),0.445419,0.513698,0.49122,0.50492,0.576379
4,Support Vector Regressor(KL),0.443738,0.527253,0.51133,0.522378,0.576528
5,LambdaMART(KL),0.449977,0.542704,0.521653,0.557887,0.619366


## Ablation Study

In [75]:
from tqdm import tqdm

qrels['label'] = qrels['label'].apply(lambda x: float(x))
# learn a model for 5 features, removing one each time
numf=5
rankers = []
names = []
# learn a model for all four features
full = ltr_feats_bo1 >> pt.ltr.apply_learned_model(train_request, form='fastrank')
full.fit(train_topics, qrels)
rankers.append(full)

for fid in tqdm(range(numf)):
    ablated = ltr_feats_bo1 >> pt.ltr.ablate_features(fid) >> pt.ltr.apply_learned_model(RandomForestRegressor(n_estimators=400))
    ablated.fit(train_topics, qrels)
    rankers.append(ablated)

---------------------------
Training starts...
---------------------------
[+] Random restart #1/5...
[+] Random restart #3/5...
[+] Random restart #2/5...
[+] Random restart #5/5...
[+] Random restart #4/5...
Shuffle features and optimize!
----------------------------------------
   0|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   1|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   2|Feature         |   Weight|     NDCG
----------------------------------------
Shuffle features and optimize!
----------------------------------------
   4|Feature         |   Weight|     NDCG
----------------------------------------
   0|4               |   -0.217|    0.640
Shuffle features and optimize!
----------------------------------------
   3|Feature         |   Weight|     NDCG
--------------

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:07<00:00, 13.44s/it]


In [76]:
qrels['label'] = qrels['label'].apply(lambda x: int(x))
ablation_df = pt.Experiment(
                rankers,
                test_topics,
                qrels,
                eval_metrics=["ndcg",nDCG@5,nDCG@10,nDCG@50,nDCG@100],
                names=["Full Model"]  + ["Full Minus %d" % fid for fid in range(numf)]
)
ablation_df

Unnamed: 0,name,ndcg,nDCG@5,nDCG@10,nDCG@50,nDCG@100
0,Full Model,0.460291,0.667694,0.655427,0.668675,0.711292
1,Full Minus 0,0.437916,0.425841,0.412649,0.434777,0.515128
2,Full Minus 1,0.44702,0.538346,0.525087,0.537624,0.603518
3,Full Minus 2,0.442801,0.500579,0.469543,0.495015,0.560595
4,Full Minus 3,0.448422,0.50812,0.524293,0.545434,0.614218
5,Full Minus 4,0.448949,0.544505,0.519502,0.55451,0.620928


In [544]:
pt.Experiment(
    [ltr_feats1 >> pt.ltr.feature_to_score(i) for i in range(len(fnames))],
    test_topics,
    qrels, 
    names=fnames,
    eval_metrics=["map", "ndcg", "ndcg_cut_10", "num_rel_ret"])



Unnamed: 0,name,map,ndcg,ndcg_cut_10,num_rel_ret
0,BM25,0.349895,0.455743,0.60343,10000.0
1,description,0.349895,0.444015,0.52863,10000.0
2,CoordinateMatch,0.349895,0.444966,0.505448,10000.0
3,ratings,0.349895,0.419912,0.249459,10000.0


## Interaction

In [592]:
retrieved_docnos = ca_pipe('comfy hotel for families')[['docno']]
retrieved_docnos.merge(df_filtered, on = 'docno', how = 'left')[['name','description','neighbourhood','host_name','property_type','listing_url','picture_url']]



Unnamed: 0,name,description,neighbourhood,host_name,property_type,listing_url,picture_url
0,Presence Guest Room - Chelsea NYC,Private 350 Sq foot room with working fireplace and private bathroom in a privately owned single family townhouse in Chelsea. You'll be living wit...,,Tony,Private room in townhouse,https://www.airbnb.com/rooms/191610,https://a0.muscache.com/pictures/9954051/dfd5b29d_original.jpg
1,Large sunny room queen bed &balcony,"Private bedroom is located within a spacious three bedroom/ three floor apartment, right in the heart of the coolest neighborhood in Manhattan. Re...","New York, United States",Amikole,Private room in condo,https://www.airbnb.com/rooms/1716441,https://a0.muscache.com/pictures/24422502/9060e3d0_original.jpg
2,Suite with private bathroom,"We are offering a private bedroom with a full sized bed that will sleep two adults. The room is sunny, located on the second floor of our townhous...","New York, United States",Gregory,Private room in townhouse,https://www.airbnb.com/rooms/10086307,https://a0.muscache.com/pictures/0d99a8e4-cd83-4456-aed4-b34e656be8e1.jpg
3,Calm & private bedroom near Manhattan!,"This peaceful and centrally-located place is where I live with my husband, 10-year-old son, and our little cat, Plissé. We are a loving, busy, fun...","North Bergen, New Jersey, United States",Debora V,Private room in home,https://www.airbnb.com/rooms/618224818091381240,https://a0.muscache.com/pictures/miso/Hosting-618224818091381240/original/070052dc-cce3-4afe-88de-1a5a7ae028e0.jpeg
4,Sunny Bedroom for two,We are offering a private bedroom with a full sized bed that will sleep two adults. The room is sunny and located on the second floor of our town...,"New York, United States",Gregory,Private room in townhouse,https://www.airbnb.com/rooms/2700296,https://a0.muscache.com/pictures/35276364/3eec1819_original.jpg
...,...,...,...,...,...,...,...
995,Exclusive Queen Stay at SQ Boutique Hotel,"This Boutique Hotel is at an amazing location in the center of TIMES SQUARE, it has many things to offer other than just location like amazing stu...","New York, United States",Blake,Room in boutique hotel,https://www.airbnb.com/rooms/49958899,https://a0.muscache.com/pictures/dafc93f9-6912-4516-bad0-1ca16a7cc7b8.jpg
996,NYC Central Park Lux Hotel w/ Breakfast & Dinner,"West 57th St. by Hilton Club comes w/complimentary Breakfast & Dinner. It is opposite fr. Central Park. Lincoln Center, Carnegie Hall and 5th Aven...",,Allison,Room in hotel,https://www.airbnb.com/rooms/29295665,https://a0.muscache.com/pictures/ceef2eb8-59a2-46b7-b616-41b3d558137c.jpg
997,"Cozy, walk to Central Park, Columbia & Morningside",A cozy place to rest after a day touring the city. We will provide all of the essentials as well as coffee & tea.<br /><br /><b>The space</b><br /...,"New York, United States",Paul A,Private room in rental unit,https://www.airbnb.com/rooms/27580964,https://a0.muscache.com/pictures/0939e492-a048-48f0-88a4-5fc9c8308701.jpg
998,Spacious Private Room at the Heart of Manhattan,Welcome to my COVID-19 SANITIZED private room right off CENTER PARK. <br /><br />This private room is located in Midtown West Manhattan within wal...,"New York, United States",Helen,Private room in rental unit,https://www.airbnb.com/rooms/51300917,https://a0.muscache.com/pictures/54b4baba-3177-453c-bdcc-0b4a378cf44a.jpg


In [42]:
def retrieve_airbnb():
    print('Please enter your Airbnb search:')
    x = input()
    print('Retrieving Results...\n')
    retrieved_docnos = ca_pipe(x)[['docno']].head(10)
    output = retrieved_docnos.merge(df_filtered, on = 'docno', how = 'left')[['name','description','neighbourhood_cleansed','host_name','property_type','listing_url','picture_url']]
    for i in range(10):
        print(f'{output.iloc[i,:]["name"]}\n')
        print(f'{output.iloc[i,:]["neighbourhood_cleansed"]}\n')
        print(f'{output.iloc[i,:]["description"]}\n')
        print(f'{output.iloc[i,:]["listing_url"]}\n')
        print('\n')

In [43]:
if __name__ == '__main__':
    retrieve_airbnb()

Please enter your Airbnb search:
comfy
Retrieving Results...

Staypineapple New York, Fashionista King

Hell's Kitchen

It’s no secret. Manhattan has the best, if not the most options for dining, shopping, and entertainment on the planet. Booking a room in the hip and cozy Staypineapple in Midtown puts you right in the heart of the action so you can get the most out of your NYC vacay. <br /><br />An amenity fee with tax ($28.69 per day) will be charged upon arrival.<br /><br />Daily pet fees apply.<br /><br /><b>The space</b><br />The rooms feature plush ambience with modern furniture to provide a peaceful night’s rest in the city that never sleeps. Enjoy the signature Naked Experience that includes duvet beds with luxury towels and robes to provide maximum comfort. Free high-speed Wi-Fi, high-def televisions with premium channels, and complimentary Kuerig coffee and tea are all standard.<br /><br />Midtown Manhattan is the home for some of New York’s top attractions. The Theater Distr