## Importing the libraries

In [None]:
import numpy as np
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from datetime import datetime
from urllib.request import urlopen
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import geopandas
import folium

## Scraping the initial data with BeautifulSoup

In [None]:
# Fetch the first page of search results to inspect the HTML structure before
# scraping every page.
URL = "https://www.etuovi.com/myytavat-asunnot/helsinki?haku=M1608807886&sivu=1"
# Request the URL above; the timeout keeps the cell from hanging forever if the
# server never answers.
page = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
# Parse the response with the HTML parser so the individual components of the
# page can be queried, rather than treating it as one long string.
soup = bs(page.text, "html.parser")
# Print the parsed document in a prettier, indented form.
print(soup.prettify())

In [None]:
# Loop through the result pages and collect the stripped text of every listing
# info area into initial_data.
# Edit the search criteria at etuovi.com (the "haku" value) for a more specific search.
base_url = "https://www.etuovi.com/myytavat-asunnot/helsinki?haku=M1608933110&sivu="
# Exact class attribute of the <div> elements whose text is collected.
listing_class = "flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-sm-7__1EzZq flexboxgrid__col-md-9__2kjy7 flexboxgrid__col-lg-9__M7bfm styles__infoArea__2yhEL"
initial_data = []
# Edit the upper bound of the range below to scrape a larger number of pages.
for i in range(1, 7):
    url = f"{base_url}{i}"
    # The timeout prevents an unresponsive server from blocking the loop forever.
    page = requests.get(url, timeout=30)
    # Stop on HTTP errors rather than scraping an error page and getting no rows.
    page.raise_for_status()
    soup = bs(page.text, "html.parser")
    initial_data.extend(
        div.text.strip()
        for div in soup.find_all(name="div", attrs={"class": listing_class})
    )
    # Show everything collected so far after each page.
    print(initial_data)
print("done")

## Data cleansing & analysis

In [None]:
# Wrap the scraped strings in a one-column frame and give that column a
# descriptive name instead of the default label 0.
initial = pd.DataFrame(initial_data)
df_initial = initial.rename({0: 'Raw_Data'}, axis="columns")

In [None]:
# Clean the initial soup into separate columns for easier readability and management.
# The labels embedded in the scraped text ("Vuosi", "Hinta", "Koko", "check")
# are used as delimiters. n=1 limits every split to two parts, so the
# two-column assignments do not fail when a delimiter occurs more than once.
df_initial[['Type', 'Raw_2']] = df_initial.Raw_Data.str.split("|", n=1, expand=True)
df_initial[["Raw_3", "Year"]] = df_initial.Raw_2.str.split("Vuosi", n=1, expand=True)
# Keep only the first four characters of the text that follows "Vuosi".
df_initial['Year_Built'] = df_initial['Year'].str[:4]
df_initial[["Raw_4", "Price_Iteration"]] = df_initial.Raw_3.str.split("Hinta", n=1, expand=True)
df_initial[["Price", "Size"]] = df_initial.Price_Iteration.str.split("Koko", n=1, expand=True)
df_initial[["Rooms", "Address"]] = df_initial.Raw_4.str.split("check", n=1, expand=True)
# '*' and '€' are literal characters: regex=False makes that explicit, because
# '*' on its own is not a valid regular expression.
df_initial['Price_Iteration_2'] = df_initial['Price'].str.replace('*', "", regex=False)
df_initial['Price_Iteration_3'] = df_initial['Price_Iteration_2'].str.replace('€', "", regex=False)
# NOTE(review): keeping only the first 7 characters presumably truncates prices
# with more than six digits (e.g. "1 250 000") - confirm against real data.
df_initial['Price_Iteration_4'] = df_initial['Price_Iteration_3'].str[:7]
# Removing whitespace needs a real regular expression, so regex=True is required;
# without it the pattern is matched literally and nothing is removed.
df_initial['Price_euro'] = df_initial['Price_Iteration_4'].str.replace(r"\s+", "", regex=True)
df_initial["Size_m²"] = df_initial["Size"].str.replace(" m²", "", regex=False)
print(df_initial.head())

In [None]:
# Keep only the processed columns, in presentation order; the raw and
# intermediate columns are left behind in df_initial.
final_columns = ["Price_euro", "Year_Built", "Size_m²", "Type", "Rooms", "Address"]
data_cleansed = df_initial[final_columns]

## Saving the dataframe to Excel

In [None]:
# Build a YYYYMMDD stamp from the current local time; it is used as the name
# of the Excel sheet.
dt = datetime.now()
dt_str = dt.strftime("%Y%m%d")

In [None]:
# Save the data into Excel on a sheet named after today's date (change the
# file path if needed). The file is written to the working directory under the
# same name that the plotting section reads back with pd.read_excel, so the
# export and the later import refer to the same file.
data_cleansed.to_excel("NeighborhoodWatch.xlsx", index=False, sheet_name=dt_str)

## Plotting the results on map

In [None]:
# Load the "NeighborhoodWatch.xlsx" export and append the country to every
# address to form the string that will be geocoded.
df = pd.read_excel("NeighborhoodWatch.xlsx")
df['Address_Geo'] = df["Address"] + ",Finland"
# geopy's Nominatim geocoder, identified by a custom user agent.
locator = Nominatim(user_agent="myGeocoder")
df.head()

In [None]:
# 1 - convenience wrapper that enforces a delay of at least one second between geocoding calls
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
# 2 - create the location column (a falsy result is treated below as "address not found")
df['location'] = df['Address_Geo'].apply(geocode)
# 3 - create a (latitude, longitude, altitude) tuple from the location column; None when not found
df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# 4 - split the point column into latitude, longitude and altitude columns.
# Missing points are replaced by an explicit NaN triple and the column names are
# fixed up front, so the frame always has exactly three columns - even when the
# first lookup failed, every lookup failed, or there are no rows at all.
missing_point = (np.nan, np.nan, np.nan)
coordinates = pd.DataFrame(
    [point if isinstance(point, tuple) else missing_point for point in df['point']],
    index=df.index,
    columns=['latitude', 'longitude', 'altitude'],
)
df[['latitude', 'longitude', 'altitude']] = coordinates

In [None]:
# Remove the listing details and the intermediate geocoding columns that are
# no longer needed for plotting.
columns_to_drop = [
    "Price_euro",
    "Year_Built",
    "Size_m²",
    "Type",
    "Rooms",
    "Address",
    "location",
    "point",
]
df = df.drop(columns=columns_to_drop)
df.head()

In [None]:
# A faulty input from the scrape came back with no latitude/longitude; keep
# only the rows that have a latitude (this could be improved).
has_latitude = df["latitude"].notna()
df_real = df[has_latitude]
print(df_real)

In [None]:
# The initial map: a light base map centred on the fixed coordinates below,
# with one circle marker per geocoded row.
map1 = folium.Map(
    tiles='cartodbpositron',
    zoom_start=12,
    location=[60.1692898243506, 24.94011732881071],
)
for latitude, longitude in zip(df_real["latitude"], df_real["longitude"]):
    circle = folium.CircleMarker(location=[latitude, longitude])
    circle.add_to(map1)
map1

In [None]:
# Now we can create the final map in one of two ways:
    #1. A clustered dark map using FastMarkerCluster
    #2. A map with a popup on each plotted address

## Option 1: clustered dark map using FastMarkerCluster

In [None]:
# setting up the FastMarkerCluster
from folium.plugins.marker_cluster import MarkerCluster
from folium.utilities import if_pandas_df_convert_to_numpy, validate_location
from jinja2 import Template


class FastMarkerCluster(MarkerCluster):
    """
    Marker cluster layer whose markers are created in the browser.

    The rows in ``data`` are serialised into the page and turned into
    markers by a Javascript callback, which makes it possible to render
    thousands of points far quicker than with the MarkerCluster class.

    Note that no markers are handed to the parent class' __init__ method.
    The add_child method is therefore never called and no reference to
    any marker data is retained, so methods such as get_bounds() are not
    available when using this class.

    Parameters
    ----------
    data: list of list with values
        Rows of shape [[lat, lon], [lat, lon], etc.]. With a custom
        callback, extra values may follow lat and lon,
        e.g. [[lat, lon, 'red'], [lat, lon, 'blue']].
    callback: string, optional
        String representation of a valid Javascript function that is
        passed each row in data. When omitted, a default callback that
        creates a plain marker with an AwesomeMarkers icon is used.
    options : dict, optional
        Legacy argument; its entries are merged into **kwargs.
    name : string, optional
        The name of the Layer, as it will appear in LayerControls.
    overlay : bool, default True
        Adds the layer as an optional overlay (True) or the base layer (False).
    control : bool, default True
        Whether the Layer will be included in LayerControls.
    show: bool, default True
        Whether the layer will be shown on opening (only for overlays).
    icon_create_function : string, default None
        Override the default behaviour, making it possible to customize
        marker colors and sizes.
    **kwargs
        Additional arguments are passed to Leaflet.markercluster options. See
        https://github.com/Leaflet/Leaflet.markercluster
    """

    # Javascript emitted for this layer: defines the callback, builds the
    # cluster group, adds one marker per data row and attaches the cluster
    # to the parent element.
    _template = Template("""
        {% macro script(this, kwargs) %}
            var {{ this.get_name() }} = (function(){
                {{ this.callback }}
                var data = {{ this.data|tojson }};
                var cluster = L.markerClusterGroup({{ this.options|tojson }});
                {%- if this.icon_create_function is not none %}
                cluster.options.iconCreateFunction =
                    {{ this.icon_create_function.strip() }};
                {%- endif %}
                for (var i = 0; i < data.length; i++) {
                    var row = data[i];
                    var marker = callback(row);
                    marker.addTo(cluster);
                }
                cluster.addTo({{ this._parent.get_name() }});
                return cluster;
            })();
        {% endmacro %}""")

    def __init__(
        self,
        data,
        callback=None,
        options=None,
        name=None,
        overlay=True,
        control=True,
        show=True,
        icon_create_function=None,
        **kwargs
    ):
        # The options argument is legacy: fold it into the keyword options.
        if options is not None:
            kwargs.update(options)
        super().__init__(
            name=name,
            overlay=overlay,
            control=control,
            show=show,
            icon_create_function=icon_create_function,
            **kwargs
        )
        self._name = 'FastMarkerCluster'

        # Validate the first two values of each row as a location and keep
        # any extra values untouched behind them.
        validated_rows = []
        for row in if_pandas_df_convert_to_numpy(data):
            lat_lon = validate_location(row[:2])
            validated_rows.append([*lat_lon, *row[2:]])
        self.data = validated_rows

        if callback is not None:
            # Wrap the caller's function so the template can call `callback`.
            self.callback = f'var callback = {callback};'
        else:
            self.callback = """
                var callback = function (row) {
                    var icon = L.AwesomeMarkers.icon();
                    var marker = L.marker(new L.LatLng(row[0], row[1]));
                    marker.setIcon(icon);
                    return marker;
                };"""

In [None]:
# Final map, option 1: a dark base map with every geocoded row added through
# a FastMarkerCluster layer, plus a layer control.
map_option_1 = folium.Map(
    location=[60.1692898243506, 24.94011732881071],
    tiles='CartoDB dark_matter',
    zoom_start=12,
)

lat_lon_pairs = list(zip(df_real['latitude'].values, df_real['longitude'].values))
marker_cluster = FastMarkerCluster(data=lat_lon_pairs)
marker_cluster.add_to(map_option_1)

layer_control = folium.LayerControl()
layer_control.add_to(map_option_1)
map_option_1

## Option 2: creating popups for each address

In [None]:
# Set up the base map for option 2: light tiles with a fixed centre and zoom.
option_2_settings = {
    "location": [60.1692898243506, 24.94011732881071],
    "tiles": 'cartodbpositron',
    "zoom_start": 12,
}
map_option_2 = folium.Map(**option_2_settings)

In [None]:
# Add one marker per geocoded row; its popup shows the address string that
# was geocoded.
for _, listing in df_real.iterrows():
    folium.Marker(
        location=[listing['latitude'], listing['longitude']],
        popup=str(listing['Address_Geo']),
    ).add_to(map_option_2)

# Display the map.
display(map_option_2)