# RAG in Pandas PoC - Airbnb example

In [2]:
from langchain.agents import create_pandas_dataframe_agent
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType

In [3]:
import os
from getpass import getpass

# Keep a non-empty key that is already exported; otherwise ask for one
# interactively (getpass does not echo the input) and store it in the
# process environment.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API Key:")

In [4]:
from langchain.llms import OpenAI
import pandas as pd

# Read the listings CSV, using its first column as the DataFrame index.
airbnb_df = pd.read_csv(filepath_or_buffer='amsterdam_airbnb.csv', index_col=0)

In [5]:
# Chat model backing the agent: GPT-4 at temperature 0.
chat_model = ChatOpenAI(model="gpt-4", temperature=0)

# Build a pandas DataFrame agent over `airbnb_df` using the OPENAI_FUNCTIONS
# agent type, with verbose output enabled.
agent = create_pandas_dataframe_agent(
    chat_model,
    airbnb_df,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)

Let's say we want to know if a passenger survived. If we ask for exactly the passenger's name, we get the answer:

In [9]:
# Preview the first five rows of the listings table.
airbnb_df.head(n=5)

Unnamed: 0,listing_url,name,description,host_identity_verified,host_is_superhost,beds,amenities,review_scores_rating,price
0,https://www.airbnb.com/rooms/761411,Condo in Amsterdam · ★4.74 · 1 bedroom · 1 bed...,"Really quiet, spacious and safe, a nice place ...",t,t,1.0,"[""Extra pillows and blankets"", ""Drying rack fo...",4.74,$61.00
1,https://www.airbnb.com/rooms/768274,Rental unit in Amsterdam · ★4.83 · 1 bedroom ·...,Our cool and spacious loft is perfect for a st...,t,f,1.0,"[""Extra pillows and blankets"", ""Drying rack fo...",4.83,$327.00
2,https://www.airbnb.com/rooms/768737,Boat in Amsterdam · ★4.82 · 1 bedroom · 1 bed ...,Room to rent in my houseboat. The room has a p...,t,t,1.0,"[""Extra pillows and blankets"", ""Drying rack fo...",4.82,$109.00
3,https://www.airbnb.com/rooms/771217,Houseboat in Amsterdam · ★5.0 · 3 bedrooms · 3...,"Spacious houseboat in Amsterdam, suitable for ...",t,f,3.0,"[""Kitchen"", ""Fire extinguisher"", ""Heating"", ""S...",5.0,$290.00
4,https://www.airbnb.com/rooms/771343,Rental unit in Amsterdam · ★4.89 · 1 bedroom ·...,Royal Bed & Coffee Room with a very comfortabl...,t,t,1.0,"[""Extra pillows and blankets"", ""Drying rack fo...",4.89,$150.00


Let's try to filter using semantic search over the name column:

In [16]:
# Collect the distinct values of the `name` column and display them.
names = airbnb_df.loc[:, 'name'].unique()
names

array(['Condo in Amsterdam · ★4.74 · 1 bedroom · 1 bed · 1 shared bath',
       'Rental unit in Amsterdam · ★4.83 · 1 bedroom · 1 bed · 2 baths',
       'Boat in Amsterdam · ★4.82 · 1 bedroom · 1 bed · 1.5 baths', ...,
       'Houseboat in Amsterdam · ★New · 1 bedroom · 2 beds · 1 bath',
       'Rental unit in Amsterdam · ★New · 4 bedrooms · 4 beds · 3 baths',
       'Hotel in Amsterdam · ★New · 1 bedroom · 1 bed · 1 shared bath'],
      dtype=object)

In [17]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client, constructed with its default settings.
embeddings_model = OpenAIEmbeddings()

# Embed each distinct listing name only once, so repeated names do not
# trigger repeated embedding requests.
unique_names = airbnb_df['name'].unique()
name_embeddings = embeddings_model.embed_documents(unique_names)

# Lookup table: listing name -> its embedding vector.
name_embedding_dict = {
    listing_name: vector
    for listing_name, vector in zip(unique_names, name_embeddings)
}

# Attach the matching vector to every row via the lookup table.
airbnb_df['name_embedding'] = airbnb_df['name'].map(name_embedding_dict)


Now we will find Airbnb listings whose names are semantically similar to a free-text query:

In [30]:
from scipy.spatial.distance import cosine

# Embed the free-text query with the same model used for the listing names.
embed_query = embeddings_model.embed_query("romantic place")

# scipy's `cosine` returns a distance, so subtracting it from 1 gives the
# cosine similarity between each name vector and the query vector.
airbnb_df['similarity'] = airbnb_df['name_embedding'].map(
    lambda name_vector: 1 - cosine(name_vector, embed_query)
)

# Keep the five rows with the highest similarity and show their names.
top_5_rows = airbnb_df.nlargest(n=5, columns='similarity')
top_5_rows.loc[:, 'name']

2501    Nature lodge in Amsterdam · ★4.91 · 1 bedroom ...
2362    Place to stay in Amsterdam · 1 bedroom · 2 bed...
2459    Nature lodge in Amsterdam · ★4.89 · 1 bedroom ...
4961    casa particular in Amsterdam · ★4.13 · 1 bedro...
5315       Yurt in Amsterdam · 1 bedroom · 1 bed · 1 bath
Name: name, dtype: object