In [None]:
!pip install weaviate-client==4.6.3 streamlit==1.35.0 streamlit_aggrid==1.0.5 ipywidgets git+https://github.com/mpgreg/funda-scraper.git@main

In [None]:
# Start the services defined in the local Docker Compose file in detached mode (-d).
# NOTE(review): presumably this brings up the local Weaviate instance that the
# cells below reach via `weaviate.connect_to_local()` — confirm against the compose file.
!docker compose up -d

In [None]:
from funda_scraper import FundaScraper
import pandas as pd
import base64
import requests
import weaviate
from weaviate.util import generate_uuid5
import json

In [None]:
# Notebook-wide settings: the JSON file holding the collection definition and
# the city used for the smoke-test scrape.
COLLECTION_DEF_FILE = 'collection_def.json'
TEST_CITY_NAME = 'amsterdam'

client = weaviate.connect_to_local()

try:
    # Collections that already exist on the server, keyed by name.
    current_collections = client.collections.list_all()

    with open(COLLECTION_DEF_FILE) as def_file:
        collection_def = json.load(def_file)

    # Create the collection only when it is not present yet, so the cell can be re-run.
    collection_exists = collection_def['class'] in current_collections
    if not collection_exists:
        client.collections.create_from_dict(config=collection_def)

finally:
    # Always release the connection, even if loading or creation fails.
    client.close()

Data processing and import will be done interactively.  Here we run a quick test to make sure that everything is working during startup.

In [None]:
# Scrape one page of for-sale listings for the test city and index the
# resulting frame by listing id.
scraper = FundaScraper(
    area=TEST_CITY_NAME,
    want_to="buy",
    find_past=False,
    page_start=1,
    n_pages=1,
)
df = scraper.run(raw_data=False, save=False).set_index('house_id')

For multi-modal search we will use the first (main) photo of each listing.  (Future work: process all images into a separate collection for refined searching.)

In [None]:
# `requests` applies no timeout by default; without one a stalled download
# would block this cell indefinitely.
IMAGE_REQUEST_TIMEOUT = 30  # seconds


def fetch_image_b64(url):
    """Download ``url`` and return the response body as a base64-encoded string.

    Raises ``requests.HTTPError`` on a 4xx/5xx response so that an error page
    is never encoded and stored in place of the image, and
    ``requests.Timeout`` when the server does not answer within
    ``IMAGE_REQUEST_TIMEOUT`` seconds.
    """
    response = requests.get(url, timeout=IMAGE_REQUEST_TIMEOUT)
    response.raise_for_status()
    return base64.b64encode(response.content).decode("utf-8")


# Each `photo` value is split on commas into entries, one row per entry, and
# every entry is split on whitespace into column 0 and column 1.
photos_df = df['photo'].apply(lambda x: x.split(',')).explode()
photos_df = photos_df.apply(lambda x: x.split()).apply(pd.Series)
# Keep only the entries tagged "180w" in column 1, then drop that tag column.
photos_df = photos_df[photos_df[1] == "180w"].drop(1, axis=1)

# The first remaining entry per listing becomes its cover photo URL.
cover_photos = photos_df.groupby('house_id').agg(
    image_url = (0, lambda x: str(x.tolist()[0]))
    )
cover_photos['image_enc'] = cover_photos['image_url'].apply(fetch_image_b64)

In [None]:
# Attach the cover photo columns to the listings, drop the raw `photo` column
# and turn the `house_id` index back into a regular column.
ingest_df = (
    df.join(cover_photos)
    .drop(columns='photo')
    .reset_index()
)

# Derive the object UUID from the listing id first, then store the id as a string.
ingest_df['uuid'] = ingest_df['house_id'].map(generate_uuid5)
ingest_df['house_id'] = ingest_df['house_id'].map(str)

In [None]:
client = weaviate.connect_to_local()

try:
    collection = client.collections.get(collection_def['class'])

    # `results` collects the value returned by `batch.add_object` for every
    # row; it is compared with the stored objects further down.
    results = []
    with collection.batch.dynamic() as batch:
        for data_row in ingest_df.to_dict('records'):
            results.append(batch.add_object(
                uuid=data_row['uuid'],
                properties=data_row,
            ))

    # Batch imports do not raise on per-object errors; the client records them
    # on `failed_objects`, so surface them here instead of failing silently.
    failed_objects = collection.batch.failed_objects
    if failed_objects:
        print(f"{len(failed_objects)} of {len(results)} objects failed to import")
        for failed_object in failed_objects[:5]:
            print(failed_object)

finally:
    # Always release the connection, even if the import raises.
    client.close()

In [None]:
# Display the values returned by `batch.add_object` for the rows submitted above.
results

This is a very crude test to check the imported items.  For anything beyond a proof of concept (POC), much more sophisticated ingestion and validation are needed.

In [None]:
client = weaviate.connect_to_local()

try:
    collection = client.collections.get(collection_def['class'])

    # Walk every object in the collection, echoing it and recording its UUID
    # as a string for the comparison in the next cell.
    imported_rows = []
    for stored_object in collection.iterator():
        object_id = stored_object.uuid
        print(object_id, stored_object.properties)
        imported_rows.append(str(object_id))

finally:
    # Always release the connection, even if iteration fails.
    client.close()

In [None]:
# Compare as strings: `batch.add_object` is typed to return either a `str` or a
# `uuid.UUID`, while `imported_rows` holds `str` values, and a `uuid.UUID`
# never compares equal to its string form.
expected_ids = {str(object_id) for object_id in results}
imported_ids = set(imported_rows)

# Report both directions of the difference so a failure is actionable.
assert expected_ids == imported_ids, (
    f"missing from collection: {sorted(expected_ids - imported_ids)}; "
    f"unexpected in collection: {sorted(imported_ids - expected_ids)}"
)

In [None]:
# Launch the Streamlit app from streamlit/fundalytics_app.py.
# NOTE(review): `streamlit run` normally keeps serving until it is interrupted, so
# this cell is expected to block until the kernel is interrupted — confirm.
!streamlit run streamlit/fundalytics_app.py

In [None]:
# Stop and remove the containers started by `docker compose up -d` at the top of the notebook.
!docker compose down