# Title: ...

## 0. Imports

In [15]:
# Notebook similar to: https://www.kaggle.com/code/gracehephzibahm/prediction-of-rent-prices-in-barcelona
# Web scraper with good insights and EDA: https://github.com/agonzalezramos/Idealista-Price-Prediction/tree/main

# Dataset pisos.csv https://www.kaggle.com/datasets/thedevastator/analysis-of-spanish-apartment-pricing-and-size-p (cita: https://zenodo.org/records/4263693)

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load Data

In [17]:
# Load Data
og_df = pd.read_csv('../data/pisos_clean.csv', sep=',')

  og_df = pd.read_csv('../data/pisos_clean.csv', sep=',')


## 2. Data Preparation

In [18]:
df = og_df.copy()

df.head(5)

Unnamed: 0,summary,location,price,size,rooms,price/m2,bathrooms,Num Photos,type,region
0,Casa en calle Urb. Las Mimosas,Ames (San Tome),250.000 €,315 m²,4,793 €/m²,3,31,pisos,a_coruna
1,Piso en Milladoiro,Ames (San Tome),90.000 €,72 m²,2,1.250 €/m²,1,11,pisos,a_coruna
2,Casa en calle La Piedra,A Ortigueira (Cariño),50.000 €,92 m²,2,543 €/m²,1,40,pisos,a_coruna
3,Casa en Perillo,Perillo (Oleiros),359.000 €,244 m²,4,1.471 €/m²,3,39,pisos,a_coruna
4,Chalet en Urbanización Las Mimosas,O Milladoiro (Ames),200.430 €,315 m²,4,636 €/m²,3,24,pisos,a_coruna


In [19]:
# Drop NaN values
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 719459 entries, 0 to 777178
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   summary     719459 non-null  object
 1   location    719459 non-null  object
 2   price       719459 non-null  object
 3   size        719459 non-null  object
 4   rooms       719459 non-null  object
 5   price/m2    719459 non-null  object
 6   bathrooms   719459 non-null  object
 7   Num Photos  719459 non-null  object
 8   type        719459 non-null  object
 9   region      719459 non-null  object
dtypes: object(10)
memory usage: 60.4+ MB


## 3. Exploratory Data Analysis

In [20]:
df.head(5)

Unnamed: 0,summary,location,price,size,rooms,price/m2,bathrooms,Num Photos,type,region
0,Casa en calle Urb. Las Mimosas,Ames (San Tome),250.000 €,315 m²,4,793 €/m²,3,31,pisos,a_coruna
1,Piso en Milladoiro,Ames (San Tome),90.000 €,72 m²,2,1.250 €/m²,1,11,pisos,a_coruna
2,Casa en calle La Piedra,A Ortigueira (Cariño),50.000 €,92 m²,2,543 €/m²,1,40,pisos,a_coruna
3,Casa en Perillo,Perillo (Oleiros),359.000 €,244 m²,4,1.471 €/m²,3,39,pisos,a_coruna
4,Chalet en Urbanización Las Mimosas,O Milladoiro (Ames),200.430 €,315 m²,4,636 €/m²,3,24,pisos,a_coruna


In [21]:
# Filtrar per BARCELONA
df = df[df['region'] == 'barcelona']

## 4. Model

In [22]:
# Data Cleaning
df['price'] = df['price'].str.replace(' €', '')
df['price'] = df['price'].str.replace('.', '')
# Convertir a numérico manejando errores
df['price'] = pd.to_numeric(df['price'], errors='coerce')
# Opcionalmente, eliminar filas con valores NaN si se generaron
df = df.dropna(subset=['price'])
# Convertir a entero
df['price'] = df['price'].astype(int)

df['size'] = df['size'].str.replace(' m²', '')
df['size'] = pd.to_numeric(df['size'], errors='coerce')
df = df.dropna(subset=['size'])
df['size'] = df['size'].astype(int)

df['price/m2'] = df['price'] / df['size']
df['price/m2'] = df['price/m2'].astype(float)

df.drop(columns=['summary', 'Num Photos'], inplace=True)

df['location'] = df['location'].astype(str)
df['rooms'] = df['rooms'].astype(int)
df['bathrooms'] = df['bathrooms'].astype(int)
df['type'] = df['type'].astype(str)
df['region'] = df['region'].astype(str)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9883 entries, 20983 to 269331
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   location   9883 non-null   object 
 1   price      9883 non-null   int32  
 2   size       9883 non-null   int32  
 3   rooms      9883 non-null   int32  
 4   price/m2   9883 non-null   float64
 5   bathrooms  9883 non-null   int32  
 6   type       9883 non-null   object 
 7   region     9883 non-null   object 
dtypes: float64(1), int32(4), object(3)
memory usage: 540.5+ KB


In [24]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

#df.drop(columns=['location'], inplace=True)
le_location = LabelEncoder()
le_type = LabelEncoder()

df.drop(columns=['region'], inplace=True)
#le_region = LabelEncoder()

df['location'] = le_location.fit_transform(df['location'])
df['type'] = le_type.fit_transform(df['type'])
#df['region'] = le_region.fit_transform(df['region'])

In [25]:
#df['location'].value_counts()

# Drop locations with less than 10 ocurrences
df = df.groupby('location').filter(lambda x: len(x) > 10)

#df['location'].value_counts()

#df

In [26]:
# Model
from sklearn.model_selection import train_test_split

X = df.drop(columns=['price'])
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

(6468, 2157, 6468, 2157)

In [27]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

## 5. Evaluation

In [28]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))

Mean Squared Error: 12647054210.110846
Mean Absolute Error: 12240.745164580436
R2 Score: 0.9415559663186119


## 6. Explainability