In [19]:
#!/usr/bin/env python3

import pandas as pd
import numpy as np

from pprint import pprint
from lib import (
    HowCloseIsItService,
    get_items_from_page,
    get_pages,
    preprocess_items_df,
    find_closest_coordinates,
)


def show_best_offers(
    df: pd.DataFrame,
    min_price: float = 0,
    max_price: float = 500_000,
    limit: int = 30,
) -> pd.DataFrame:
    """Return the cheapest offers per square metre within a price range.

    Rows without an address are dropped, the remaining rows are restricted to
    ``min_price <= price <= max_price`` (both bounds inclusive) and ordered by
    ascending ``price_per_m2``.

    Args:
        df: Offers with at least ``address``, ``price`` and ``price_per_m2``
            columns.
        min_price: Lowest accepted price (inclusive).
        max_price: Highest accepted price (inclusive).
        limit: Maximum number of rows to return.

    Returns:
        A new DataFrame with at most ``limit`` rows; ``df`` is not modified.
        Make the call the last expression of a notebook cell to render it.
    """
    # Global display option: widen columns so long addresses are not truncated.
    pd.set_option("display.max_colwidth", 100)

    # Build one mask on the original frame *before* sorting. Applying a mask
    # computed on `df` to an already filtered/sorted frame forces pandas to
    # re-align the mask by index and emits a UserWarning.
    wanted = df["address"].notna() & df["price"].between(min_price, max_price)

    # The result used to be computed and silently discarded; return it.
    return df[wanted].sort_values(by="price_per_m2").head(limit)


def get_ammenities_df(
    how_close_service: "HowCloseIsItService",
    use_cache=False,
    ammenities=(
        "zabka",
        "biedronka",
        "lidl",
        "stacja paliw",
        "restauracja",
        "skm",
        "pociag",
    ),
):
    """Collect the results of several amenity lookups into one DataFrame.

    For every name in ``ammenities`` the service's ``get_all_of_ammenity`` is
    called, and the values of each returned mapping are appended, in lookup
    order, as rows of the resulting frame.

    Args:
        how_close_service: Object exposing
            ``get_all_of_ammenity(name, use_cache=...)`` that returns a
            mapping (only its ``.values()`` are used).
        use_cache: Forwarded unchanged to every ``get_all_of_ammenity`` call.
        ammenities: Names to look up. Defaults to the seven names this
            function has always queried, in the same order.

    Returns:
        A ``pd.DataFrame`` with one row per value returned by the lookups.
    """
    rows = []
    for ammenity in ammenities:
        found = how_close_service.get_all_of_ammenity(
            ammenity,
            use_cache=use_cache,
        )
        rows.extend(found.values())
    return pd.DataFrame(rows)


if __name__ == "__main__":
    # Load the listing pages (the run log below reports them being read from
    # a pages.pkl file when load=True).
    pages = get_pages(load=True)

    # Sanity check on one sample page before parsing everything.
    assert get_items_from_page(pages[2]) is not None

    # Flatten the items of every page into a single list; pages that yield
    # nothing (None or empty) are skipped.
    all_items = []
    for page in pages:
        items = get_items_from_page(page)
        if items is not None and len(items):
            all_items.extend(items)
    all_items_df = pd.DataFrame(all_items)

    df = preprocess_items_df(all_items_df)
    print("got: ", df.shape)

    how_close_service = HowCloseIsItService()

    # Geocode every listing address and keep the coordinates next to the row.
    coords = [
        how_close_service.response_to_coords(how_close_service.geocode(address))
        for address in df["address"].values
    ]
    df["coords"] = coords

    ammenities_df = get_ammenities_df(
        how_close_service,
        use_cache=True,
    )

    # Candidate coordinates do not depend on the apartment, so extract them
    # once instead of once per row.
    ammenity_coords = ammenities_df["coords"].to_numpy()

    for ammenity_type in ["biedronka"]:
        # NOTE(review): the candidates are NOT filtered by `ammenity_type`, so
        # "closest_<type>" is currently the closest amenity of *any* type in
        # ammenities_df. Presumably this should be restricted via an
        # "ammenity" column of ammenities_df - confirm its schema first.
        closest_column = "closest_" + ammenity_type
        distance_column = "distance_to_closest_" + ammenity_type

        # Both column names are derived from `ammenity_type`; the closest
        # column used to be hard-coded to "closest_biedronka" while being read
        # back as "closest_" + ammenity_type, which breaks as soon as another
        # type is added to the list above.
        df[closest_column] = [
            find_closest_coordinates(
                np.array(apartment_coord),
                ammenity_coords,
            )
            for apartment_coord in df["coords"].values
        ]

        apartment_coords = df["coords"]
        closest_ammenity_coords = df[closest_column]
        distance_to_closest = [
            how_close_service.get_distance(
                [apartment_coord],
                [closest_ammenity_coord],
            )
            for apartment_coord, closest_ammenity_coord in zip(
                apartment_coords, closest_ammenity_coords
            )
        ]

        df[distance_column] = distance_to_closest

# I think the way to go is to perform this for all of the amenities and then, if there are outliers, repeat the request with the 'long' distance as the location instead of "Gdańsk".

getting pages from pages.pkl file
got:  (5081, 9)
Loaded Geocode cache of length: 1013
Loaded Ammenities cache of length: 7
Loaded Distance cache of length: 774


In [20]:
# Show each listing's address next to the coordinates picked as its closest
# match and the distance computed for that pair.
# NOTE(review): the rendered output contains very large distances (e.g.
# 380661 and 540286) that share the same closest coordinates - these look
# like outliers worth checking.
df[["distance_to_closest_biedronka", "address", "closest_biedronka"]]

Unnamed: 0,distance_to_closest_biedronka,address,closest_biedronka
0,479,"Gdańsk, Piecki-Migowo, Morena, KRÓLEWSKIE WZGÓRZE","[54.34903560000001, 18.5666317]"
1,23123,"Gdańsk, Wyspa Sobieszewska, Świbno, Klimatyczn...","[54.37358649999999, 18.7294516]"
2,399,"Gdańsk, Ujeścisko-Łostowice, ul. Wielkopolska","[54.3102493, 18.5879726]"
3,86,"Gdańsk, Kowale","[54.308752, 18.561378]"
4,380661,"Gdańsk, Siedlce, ul. Leona Wyczółkowskiego","[54.26417, 18.66556]"
...,...,...,...
6955,1120,"Gdańsk, Jasień, Potęgowska","[54.3256425, 18.5488506]"
6980,509,"Gdańsk, Śródmieście, Siennicka","[54.3493226, 18.6727758]"
6981,540286,"Gdańsk, Chełm, Niepołomicka","[54.26417, 18.66556]"
6982,130,"Gdańsk, Przymorze, Kołobrzeska","[54.4039316, 18.5884121]"
