# Similarity search using PGVector extension and Airbnb listing data

## Define constants

In [7]:
import os

# Sentence-transformers model used for both indexing and querying.
EMBEDDINGS_MODEL = "sentence-transformers/all-mpnet-base-v2"
EMBEDDINGS_INDEX_DIMENSIONS = 768  # 768 is specific for all-mpnet-base-v2

# trino - our source data
# Overridable through the environment so the notebook can target another
# cluster without editing the code; the default is the local toolkit setup.
TRINO_URI = os.environ.get(
    "TRINO_URI",
    'trino://trino@localhost:8082/lakehouse/kaggle_airbnb',
)

# vector db to store embeddings
# The default credentials are only suitable for the local demo database;
# set PGVECTOR_CONNECTION_STRING in the environment for anything else.
PGVECTOR_CONNECTION_STRING = os.environ.get(
    "PGVECTOR_CONNECTION_STRING",
    'postgresql://admin:admin@localhost:5432/test',
)
PGVECTOR_COLLECTION_NAME = "listings_collection"

# Maximum cosine distance for a match to be considered relevant.
SIMILARITY_SEARCH_LIMIT = 0.4
# Number of nearest neighbours requested from the vector db.
TOP_K = 5

## Data Sources

### Define Trino Connection
This notebook uses the Airbnb dataset from the [toolkit use case](https://github.com/msantana09/data-engineering-toolkit/blob/main/UseCase.md).

In [8]:
from sqlalchemy import create_engine
import pandas as pd

# Open one SQLAlchemy connection to Trino at TRINO_URI; get_listings() reads through it.
trino_conn = create_engine(TRINO_URI).connect()

def get_listings() -> pd.DataFrame:
    """Read every listing's id, name and description from Trino.

    Returns:
        A DataFrame with the columns ``id``, ``name`` and ``description``,
        fetched over the module-level ``trino_conn`` connection.
    """
    listings_sql = """
        SELECT id, name, description 
        FROM kaggle_airbnb.listings
        """
    listings_df = pd.read_sql_query(listings_sql, trino_conn)
    return listings_df

## Set up Postgres container with PGVector extension

In [9]:
import psycopg2
from pgvector.psycopg2 import register_vector


# Connect to the PostgreSQL DB that stores the embeddings.
db_connection = psycopg2.connect(PGVECTOR_CONNECTION_STRING)
# Autocommit: every statement below (and every later upsert) is committed
# immediately, so a failed statement never leaves an aborted transaction open.
db_connection.autocommit = True
db_cursor = db_connection.cursor()

# Enable the pgvector extension. This must happen before register_vector(),
# which looks up the "vector" type, and before creating a VECTOR column.
# IF NOT EXISTS makes the cell safe to re-run.
db_cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")

# Register the vector type with psycopg2
register_vector(db_connection)

# One row per listing: its id, the embedded text, and the embedding itself.
table_create_command = f"""
CREATE TABLE IF NOT EXISTS {PGVECTOR_COLLECTION_NAME} (
          id TEXT PRIMARY KEY,
          text TEXT,
          embedding VECTOR({EMBEDDINGS_INDEX_DIMENSIONS})
            );
            """
db_cursor.execute(table_create_command)

def save_vector_plus_meta(db_cursor, row):
    """Insert or update one listing embedding in the vector table.

    Args:
        db_cursor: Open database cursor used to execute the upsert.
        row: Mapping-like object with the keys ``id``, ``text`` and
            ``embeddings``; ``embeddings`` must expose ``.tolist()``.

    Returns:
        The row's ``id`` on success, or ``None`` if anything failed. Failures
        are reported with ``print`` and deliberately not re-raised, so one bad
        row does not abort a bulk load.
    """
    try:
        vector_id = row['id']
        text = row['text']
        embeddings = row['embeddings'].tolist()
        # EXCLUDED refers to the row proposed for insertion, so each value is
        # bound once instead of being passed a second time for the UPDATE.
        query = """
          INSERT INTO {} (id, text, embedding)
          VALUES (%s, %s, %s)
          ON CONFLICT (id)
          DO UPDATE SET text = EXCLUDED.text, embedding = EXCLUDED.embedding
        """.format(PGVECTOR_COLLECTION_NAME)

        db_cursor.execute(query, (vector_id, text, embeddings))
        return vector_id
    except Exception as e:
        print(f"[save_vector_plus_meta] exception of type {type(e).__name__}: {e}")
        return None

### Functions to query the vector db and filter results by similarity

In [10]:
# Search for top N relevant messages
def get_top_relevant_messages(db_cursor, search_embeddings, k=TOP_K):
    """Return the stored listings closest to ``search_embeddings``.

    The ``k`` nearest rows by cosine distance are fetched from the vector
    table, then rows whose distance (rounded to two decimals) exceeds
    ``SIMILARITY_SEARCH_LIMIT`` are dropped.

    Args:
        db_cursor: Open database cursor used to run the search.
        search_embeddings: Query vector, as a list of floats or any object
            exposing ``.tolist()`` (for example a numpy array).
        k: Maximum number of nearest neighbours to fetch.

    Returns:
        A DataFrame with the columns ``score`` (cosine distance, lower is
        more similar), ``id`` and ``description``. An empty DataFrame is
        returned when nothing is relevant or when the query fails; failures
        are reported with ``print`` and not re-raised.
    """

    def _get_possible_matches():
        # <=> is for cosine distance
        # pgvector also supports inner product (<#>),
        # Euclidean distance (L2 distance) (<->) etc
        #
        # The distance threshold could be applied in SQL with a WHERE clause,
        # but it is applied explicitly in Python below.
        query_vector = search_embeddings
        if hasattr(query_vector, "tolist"):
            query_vector = query_vector.tolist()

        # The vector and the limit are bound as parameters instead of being
        # interpolated into the SQL text. The vector is sent in pgvector's
        # text form ("[x, y, ...]") and cast explicitly. The table name comes
        # from a module constant and cannot be bound as a parameter.
        query = f"""
          SELECT id, text, embedding <=> %s::vector AS distance
          FROM {PGVECTOR_COLLECTION_NAME}
          ORDER BY distance
          LIMIT %s;
        """

        db_cursor.execute(query, (str(query_vector), int(k)))
        return db_cursor.fetchall()

    def _filter_matches_by_distance(matches, similarity_limit: float = SIMILARITY_SEARCH_LIMIT):
        limit = float(similarity_limit)
        # The lower the score value, the more similar vectors are.
        # Rows with a NULL distance (no stored embedding) cannot be compared
        # and are skipped.
        return [
            {"document": row, "score": row[2]}
            for row in matches
            if row[2] is not None and round(row[2], 2) <= limit
        ]

    def _convert_to_dataframe(matches):
        # Each "document" is the raw (id, text, distance) tuple from the query.
        records = [
            {
                "score": match["score"],
                "id": match["document"][0],
                "description": match["document"][1],
            }
            for match in matches
        ]
        return pd.DataFrame(records, columns=["score", "id", "description"])

    try:
        all_matches = _get_possible_matches()
        relevant_matches = _filter_matches_by_distance(all_matches)
        if len(relevant_matches) == 0:
            print("No relevant matches found")
            return pd.DataFrame()

        # Creating a dataframe with the results for easier processing
        return _convert_to_dataframe(relevant_matches)
    except Exception as e:
        print(f"[get_top_relevant_messages] {type(e).__name__} exception: {e}")
        return pd.DataFrame()

## Define Models 

In [11]:
from sentence_transformers import SentenceTransformer

# Load the model named by EMBEDDINGS_MODEL once; the same instance encodes
# both the listings and the search queries further down.
embedding_model = SentenceTransformer(EMBEDDINGS_MODEL)

def generate_embeddings(listings_df:pd.DataFrame):
    """Embed each listing's name and description and store them in the vector db.

    The text that is embedded is ``"<name>. <description>"``. A missing name
    or description is treated as an empty string so that the model never
    receives a NaN value.

    Args:
        listings_df: DataFrame with ``name`` and ``description`` columns;
            every remaining column (for example ``id``) is carried over.
            The input frame is not modified.

    Returns:
        A new DataFrame holding the remaining input columns followed by
        ``text`` and ``embeddings`` (one vector per row). Every row is also
        upserted through ``save_vector_plus_meta`` as a side effect.
    """
    names = listings_df['name'].fillna('')
    descriptions = listings_df['description'].fillna('')
    listings_clean_df = (
        listings_df
        .drop(['name', 'description'], axis=1)
        .assign(text=names + ". " + descriptions)
    )

    # Encode all texts in one batched call instead of one model call per row;
    # list() splits the resulting 2-D array into one vector per row.
    texts = listings_clean_df['text'].tolist()
    listings_clean_df['embeddings'] = list(embedding_model.encode(texts))

    # Nothing to persist for an empty frame.
    if not listings_clean_df.empty:
        listings_clean_df.apply(lambda row: save_vector_plus_meta(db_cursor, row), axis=1)
    return listings_clean_df

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Generate embeddings for listing names and descriptions (combined)

In [12]:
# Fetch the listings from Trino, embed and store them, then preview the first row.
df_with_embeddings = generate_embeddings(
    get_listings()
)
df_with_embeddings.head(1)

Unnamed: 0,id,text,embeddings
0,241032,Stylish Queen Anne Apartment. Make your self a...,"[-0.043464553, 0.0056106653, -0.003022629, 0.0..."


In [24]:
# Show full cell contents so the listing descriptions are not truncated.
pd.set_option('display.max_colwidth', None)

tiny_message = "A cute secluded place with a laid back vibe. A small place, preferably a closet"

# Encode the query with the same model used for the listings, then search.
tiny_matches_df = get_top_relevant_messages(
    db_cursor,
    embedding_model.encode(tiny_message).tolist(),
)
tiny_matches_df

Unnamed: 0,score,id,description
0,0.314489,9716348,"Private room in quite area. quite, nice size private room"
1,0.340446,6512378,"Quiet Place in a Busy City. Private room in a 2 bedroom townhouse in nice Seattle neighborhood. Comfortable room with a full size bed. Perfect space for one person. Bedroom, Shared Bath, Deck, Living room We live here, so interact as much as you want, or not at all if you prefer some alone time. Quiet residential area. 3 City bus routes will get you to/from our place to the city center or surrounding suburbs"


In [22]:
vibrant_message = "a vibrant, action-packed neighborhood with a lot to offer"

# Encode the query with the same model used for the listings, then search.
vibrant_matches_df = get_top_relevant_messages(
    db_cursor,
    embedding_model.encode(vibrant_message).tolist(),
)
vibrant_matches_df

Unnamed: 0,score,id,description
0,0.370509,9636238,"Nice,clean apt,vibrant area. Well equipped and maintained apt in an older building. Great neighborhood, lots of cafes, Pubs and a bakery next door. Close to Greenlake Park and the Zoo."
1,0.382488,8968925,Modern large home close to downtown. Great living space close to downtown. Walking distance from great restaurants and coffee shops.
2,0.386212,3177005,"In the thick of things..... Hip, modern, bright and comfortable 4th floor apartment in the Capitol Hill district. My favorite restaurant is steps from the front door as are coffee shops, bars, cafes, grocery stores, and some of the most interesting people in town. The apartment is a two bedroom, two bathroom modern, well built home. There are floor to ceiling windows in the main living/dining and kitchen room. Think treehouse on one side and on the other, think urban rooftop views. Cool if you ask me. The master bedroom has a queen sized (latex mattress) and an en suite bathroom. The second bedroom is a kid's room (for kids of all ages) with one twin bed and a bathroom adjacent to it. This comfortable and clean home is a fantastic gateway to a very urban experience outside the front doors. On a serious note, in the Summer the neighborhood is noisy with urban life all around. There are restaurants in the courtyard, sirens, and street noise. If you want a quiet, out of the way experience, this is probably not t"
