# Madrid Housing Prices Analysis

Data on flats from https://www.kaggle.com/datasets/mirbektoktogaraev/madrid-real-estate-market/data

TODO: Update sources

Location data from API (for neighborhoods and streets): https://here.com (geocoding)

## Initialization (acquiring data)

In [None]:
import pandas as pd
import geopandas as gpd
import json
import folium
import numpy as np

In [None]:
# Load the flats dataset from the local CSV file into a DataFrame.
df_houses = pd.read_csv('houses_Madrid.csv')

# Print a summary of the frame (columns, dtypes, non-null counts) as a quick
# sanity check of what was loaded.
df_houses.info()

In [None]:
# Build a GroupBy over neighborhood_id. The result is not assigned, so this
# cell only shows the GroupBy object as its output.
df_houses.groupby('neighborhood_id')

In [None]:
# Load the locations that were previously fetched from the geocoding API and
# cached on disk as JSON. The encoding is passed explicitly so that non-ASCII
# names decode the same way regardless of the platform's default encoding.
with open("saved_locations.json", "r", encoding="utf-8") as f:
    neighborhood_locations = json.load(f)

with open("street_locations_UT.json", "r", encoding="utf-8") as f:
    street_locations = json.load(f)

## Data analysis and visualization

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Mean buy price for each number of rooms, drawn as a line plot.
nroom_group = df_houses.groupby('n_rooms')

(
    nroom_group['buy_price']
    .mean()
    .plot(legend=True)
)

In [None]:
# Price VS Area
# Group the flats by their built area so a mean price per distinct area value
# can be computed.
group_areas = df_houses.groupby('sq_mt_built')

# The plotted Series is indexed by sq_mt_built, with the mean buy_price as
# its values.
plt.plot(group_areas['buy_price'].mean())

In [None]:
def clean_neighborhood_code(neighborhood_id):
    """Extract the code token from a raw ``neighborhood_id`` string.

    Every occurrence of ``'Neighborhood '`` is removed, the remainder is cut
    at the first space, and the last character of that first token is
    dropped (presumably a trailing ``:`` separator -- confirm against the
    data).

    Returns the code as a string; the caller converts it to ``int``.
    """
    without_label = neighborhood_id.replace('Neighborhood ', '')
    first_token, _, _ = without_label.partition(' ')
    return first_token[:-1]

In [None]:
# Number of rows VS neighborhood ids
neighborhood_group = df_houses.groupby('neighborhood_id')

# Numeric codes that actually occur among the group keys.
neighborhood_codes = {
    int(clean_neighborhood_code(group_key))
    for group_key in neighborhood_group.groups
}

# Report every code in 0..135 that has no rows in the dataset.
for code in range(136):
    if code in neighborhood_codes:
        continue
    print(f"{code} code is not present")

# Bar chart of how many rows each neighborhood contributes.
nhg_sizes = neighborhood_group.size()
nhg_sizes.plot(kind='bar')

### Visualizing street data

In [None]:
# Initialize the folium map that the street dots are drawn on.
# The center is given as [latitude, longitude].
m = folium.Map(location=[40.37654, -3.60837], zoom_start=9.5)

In [None]:
import colorsys

In [None]:
def plot_dot(map: folium.Map, lat: float, lng: float, **kwargs):
    """Add a circle marker at (lat, lng) to a folium map.

    Args:
        map: Map the marker is added to. (The name shadows the builtin but is
            kept so existing calls keep working.)
        lat: Latitude of the marker.
        lng: Longitude of the marker.
        **kwargs: Optional settings, read by name:
            radius: Marker radius; defaults to 1.
            weight: Stroke weight; defaults to 3.
            color: Stroke colour; defaults to "#3388FF".
            popup: Text for a popup attached to the marker. When it is
                missing or None, no popup is attached.

    Passing None for radius, weight or color selects the default, exactly as
    if the key had been left out.
    """
    radius = kwargs.get('radius')
    weight = kwargs.get('weight')
    color = kwargs.get('color')
    popup_text = kwargs.get('popup')

    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=1 if radius is None else radius,
        weight=3 if weight is None else weight,
        color="#3388FF" if color is None else color,
    )
    # Only bind a popup when there is something to show; otherwise clicking
    # the marker would open an empty popup.
    if popup_text is not None:
        marker.add_child(folium.Popup(popup_text))
    marker.add_to(map)

def rgb_to_hexstr(r: int, g: int, b: int) -> str:
    """Format an RGB triple as a ``#rrggbb`` hex colour string.

    Each channel is clamped to the range 0..255 before it is formatted as
    two lowercase hex digits.
    """
    clamped = [min(255, max(0, channel)) for channel in (r, g, b)]
    return '#' + ''.join(f'{channel:02x}' for channel in clamped)

def mapf(n: float, min1: float, max1: float, min2: float, max2: float) -> float:
    """Linearly map ``n`` from the range [min1, max1] onto [min2, max2].

    Values of ``n`` outside the input range are extrapolated, not clamped.

    If the input range is empty (``min1 == max1``) there is no slope to map
    with; ``min2`` is returned instead of dividing by zero.
    """
    input_span = max1 - min1
    if input_span == 0:
        return min2
    return ((n - min1) / input_span) * (max2 - min2) + min2

In [None]:
# Keep only rows that have a buy price, a built area and a price per area,
# and whose built area lies strictly between 50 and 90 sqmt (rows at exactly
# 50 or 90 are cut as well).
df_houses_clean = df_houses.loc[
    df_houses['buy_price'].notna()
    & df_houses['sq_mt_built'].notna()
    & df_houses['buy_price_by_area'].notna()
    & (df_houses['sq_mt_built'] > 50.0)
    & (df_houses['sq_mt_built'] < 90.0)
]

In [None]:
# Value ranges of the cleaned houses data; the price-per-area range is what
# the street dots are colour-scaled against.
min_price, max_price = (
    df_houses_clean['buy_price'].min(),
    df_houses_clean['buy_price'].max(),
)

min_price_by_area, max_price_by_area = (
    df_houses_clean['buy_price_by_area'].min(),
    df_houses_clean['buy_price_by_area'].max(),
)

In [None]:
# Plot one dot per geocoded street entry, coloured by the price per area of
# the house that the entry refers to.
for street in street_locations.values():
    # street['idx'] is assumed to be the row label of the house in df_houses
    # (TODO confirm against the script that produced the JSON).
    # df_houses_clean keeps the original row labels, so the row is looked up
    # by label: a positional .iloc lookup on the filtered frame would silently
    # return a different house.
    try:
        data = df_houses_clean.loc[street['idx']]
    except KeyError:
        continue  # the house was filtered out of df_houses_clean

    # Normalise to 0..1; the square root spreads out the low end of the scale.
    mapped_price = mapf(data['buy_price_by_area'], min_price_by_area, max_price_by_area, 0.0, 1.0) ** 0.5
    # Fixed hue; higher prices get a more saturated, darker colour.
    color = [int(c * 255) for c in colorsys.hsv_to_rgb(0.3, 0.5 + mapped_price * 0.5, 1.0 - mapped_price)]
    # Double-quoted f-string: reusing the same quote type inside an f-string
    # expression is only accepted from Python 3.12 on.
    plot_dot(m, street['loc']['lat'], street['loc']['lng'],
             radius=2, weight=4, color=rgb_to_hexstr(color[0], color[1], color[2]),
             popup=f"Price per SqMt: {data['buy_price_by_area']}")

In [None]:
# Read the district shapefile into a GeoDataFrame.
gdf_districts = gpd.read_file("discrict_info/Barrios.shp")

In [None]:
# Optional choropleth layer built from the district shapes; switched off by
# default.
ENABLE_DISTRICTS = False

if ENABLE_DISTRICTS:
    # NOTE(review): no data is bound yet (data=None) and no 'prices' values
    # are computed anywhere in the visible notebook, so `columns` is a
    # placeholder -- TODO supply the per-neighbourhood prices before enabling.
    choropleth = folium.Choropleth(
        geo_data=gdf_districts,
        data=None,
        columns=['COD_BAR', 'prices'],
        # The key path addresses a single GeoJSON feature, so it starts with
        # 'feature' (singular), as in the Airbnb choropleth further down.
        key_on='feature.properties.COD_BAR',
        fill_opacity=0.7,
        fill_color='Greens',
        name='Prices',
        line_opacity=1.0,
        line_weight=2.0,
        legend_name="Neighborhood average prices",
        smooth_factor=0.0,
    ).add_to(m)

In [None]:
# Last expression of the cell: displays the map with the street dots.
m

## Second dataset

In [None]:
# Load the second dataset (Airbnb listings) from the local CSV file.
df_abnb_listings = pd.read_csv("AirbnbListingsNew.csv")

# Print a summary of the frame (columns, dtypes, non-null counts).
df_abnb_listings.info()

In [None]:
# Neighbourhood polygons, used for the spatial join and the choropleth.
gdf_neighborhoods: gpd.GeoDataFrame = gpd.read_file('neighbourhoods.geojson')

In [None]:
# Keep listings with a known price strictly between 15.0 and 500.0.
df_abnb_price_clean = df_abnb_listings.loc[
    df_abnb_listings['price'].notna()
    & (df_abnb_listings['price'] > 15.0)
    & (df_abnb_listings['price'] < 500.0)
]

In [None]:
import re

def extract_bedrooms_from_title(title: str) -> int | None:
    rgx = re.compile(r"(\d+)\sbedroom")
    rgx_studio = re.compile(r"[Ss]tudio")
    
    rgx_match = rgx.search(title)
    
    if rgx_match is None:
        if rgx_studio.search(title) is None:
            return None
        return 1
    
    rgx_num = rgx_match.group(1)

    return int(rgx_num)

In [None]:
# Per listing: the bedroom count parsed from its title and the price divided
# by that count. Listings whose title yields no count get NaN in both lists
# so the lists stay aligned with the rows of df_abnb_price_clean.
bedroom_counts = []
prices_per_bedroom = []
for _, listing in df_abnb_price_clean.iterrows():
    n_bedrooms = extract_bedrooms_from_title(listing['name'])
    if n_bedrooms is not None:
        bedroom_counts.append(float(n_bedrooms))
        prices_per_bedroom.append(listing['price'] / n_bedrooms)
    else:
        bedroom_counts.append(np.nan)
        prices_per_bedroom.append(np.nan)

In [None]:
# Attach the parsed bedroom counts and per-bedroom prices, then drop the
# listings for which either value is missing.
df_abnb_price_clean = df_abnb_price_clean.assign(
    bedroom_count=bedroom_counts,
    price_per_bedroom=prices_per_bedroom,
).dropna(subset=['bedroom_count', 'price_per_bedroom'])

# String label per listing, used as the category in the bar charts; counts of
# 4 and above share a single bucket.
# NOTE(review): that bucket is labelled '>4' although it also contains
# listings with exactly 4 bedrooms -- confirm the intended label/threshold.
df_abnb_price_clean['bedroom_count_str'] = (
    df_abnb_price_clean['bedroom_count']
    .astype(str)
    .where(df_abnb_price_clean['bedroom_count'] < 4, '>4')
)

# No further row filtering happens here; the later cells use this alias.
df_abnb_room_constraint = df_abnb_price_clean

In [None]:
# Value ranges of the Airbnb data. min_price/max_price overwrite the values
# computed earlier for the houses dataset.
min_price, max_price = (
    df_abnb_room_constraint['price'].min(),
    df_abnb_room_constraint['price'].max(),
)

min_price_per_bedroom, max_price_per_bedroom = (
    df_abnb_room_constraint['price_per_bedroom'].min(),
    df_abnb_room_constraint['price_per_bedroom'].max(),
)

In [None]:
from shapely.geometry import Point, Polygon

In [None]:
# Calculate and visualize points on a map
# Separate folium map for the Airbnb data, centered at [latitude, longitude].
new_map = folium.Map(location=[40.37654, -3.60837], zoom_start=9.5)

In [None]:
# One shapely point per listing, built as (x=longitude, y=latitude), in the
# same order as the rows of df_abnb_room_constraint.
points = [
    Point(lon, lat)
    for lon, lat in zip(
        df_abnb_room_constraint['longitude'],
        df_abnb_room_constraint['latitude'],
    )
]

In [None]:
# Turn the listings into a GeoDataFrame with the points as geometry
# (CRS EPSG:4326).
gdf_points = gpd.GeoDataFrame(df_abnb_room_constraint, geometry=points, crs=4326)

# Spatial join: each listing is matched with the neighbourhood polygon it
# lies within.
gdf_points_in = gdf_points.sjoin(gdf_neighborhoods, predicate='within')

In [None]:
# Group the joined listings by the 'neighbourhood_right' column (judging by
# the suffix, the neighbourhood name that came from the right-hand frame of
# the spatial join).
gdf_neigborhood_group = gdf_points_in.groupby('neighbourhood_right')

# Despite the df_ prefix these are both Series indexed by neighbourhood:
# the mean price per bedroom and the number of listings.
df_neighborhood_mean_price = gdf_neigborhood_group['price_per_bedroom'].mean()
df_neighborhood_flats_count = gdf_neigborhood_group.size()

In [None]:
# Share of listings per bedroom-count bucket within each neighbourhood group.
gdf_neigborhood_groups_group = gdf_points_in.groupby('neighbourhood_group_right')

# One row per (neighbourhood group, bucket) holding the number of listings.
df_bedroom_count_groups = (
    gdf_neigborhood_groups_group['bedroom_count_str']
    .value_counts()
    .to_frame()
)

# Divide each bucket count by the total number of listings in its group.
df_bedroom_count_groups['percentage'] = (
    100.0
    * df_bedroom_count_groups['count']
    / gdf_neigborhood_groups_group['bedroom_count'].size()
)

# Wide table (groups as rows, buckets as columns) for the stacked bar chart.
df_bedroom_pivot_table = df_bedroom_count_groups.pivot_table(
    values='percentage',
    index='neighbourhood_group_right',
    columns='bedroom_count_str',
)
df_bedroom_pivot_table.plot(
    kind='bar',
    stacked=True,
    title='Percentage of flats with specific number of rooms in each neighbourhood',
)

In [None]:
# Same table with absolute counts, ordered by the number of listings in the
# '1.0' bucket, largest first.
df_bedroom_pivot_table = (
    df_bedroom_count_groups
    .pivot_table(
        values='count',
        index='neighbourhood_group_right',
        columns='bedroom_count_str',
    )
    .sort_values('1.0', ascending=False)
)
df_bedroom_pivot_table.plot(
    kind='bar',
    stacked=True,
    title='Number of flats with specific number of rooms in each neighbourhood',
)

In [None]:
# Bar chart of the mean price per bedroom for every neighbourhood group.
(
    gdf_neigborhood_groups_group['price_per_bedroom']
    .mean()
    .plot(kind='bar', title='Average price per bedroom in each neighborhood')
)

In [None]:
# Colour the neighbourhood polygons by mean price per bedroom and add the
# layer to new_map.
# NOTE(review): `data` is the Series produced by grouping on
# 'neighbourhood_right', whereas `columns` names 'neighbourhood_group_right'.
# Confirm that folium ignores `columns` for Series input; otherwise the key
# column named here does not match the data.
folium.Choropleth(
    geo_data=gdf_neighborhoods,
    name='choropleth',
    data=df_neighborhood_mean_price,
    columns=['neighbourhood_group_right', 'price_per_bedroom'],
    key_on='feature.properties.neighbourhood',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=1.0,
    line_weight=2.0,
    legend_name="Neighborhood mean price per bedroom"
).add_to(new_map)

In [None]:
# Toggle for drawing one dot per Airbnb listing on new_map.
PLOT_DOTS = True

if PLOT_DOTS:
    for _, listing in df_abnb_room_constraint.iterrows():
        price_per_bedroom = listing['price_per_bedroom']
        # Normalise to 0..1; the square root spreads out the low end of the
        # scale.
        mapped_price = mapf(price_per_bedroom, min_price_per_bedroom, max_price_per_bedroom, 0.0, 1.0) ** 0.5
        # Fixed hue; higher prices get a more saturated, darker colour.
        color = [int(c * 255) for c in colorsys.hsv_to_rgb(0.3, 0.5 + mapped_price * 0.5, 1.0 - mapped_price)]
        # Double-quoted f-strings: reusing the same quote type inside an
        # f-string expression is only accepted from Python 3.12 on. The popup
        # body is rendered as HTML, so the line break is written as <br>
        # (a plain newline would collapse into a space).
        popup_text = (
            f"Price per bedroom: {price_per_bedroom}"
            f"<br>Total price: {listing['price']}"
        )
        plot_dot(new_map, listing['latitude'], listing['longitude'],
                 color=rgb_to_hexstr(color[0], color[1], color[2]),
                 radius=1, weight=2, popup=popup_text)

In [None]:
# Last expression of the cell: displays the Airbnb map.
new_map