## Read the data

In [60]:
import pandas as pd
from matplotlib import pyplot as plt


df = pd.read_excel('madrid_rent_data.xlsx')

## Explore the data
df.head()

Unnamed: 0,Id,District,Address,Number,Area,Rent,Bedrooms,Sq.Mt,Floor,Outer,Elevator,Penthouse,Cottage,Duplex,Semidetached
0,1,Ciudad Lineal,Piso en Quintana,,Quintana,1300,2.0,72,3.0,1.0,1.0,0,0,0,0
1,2,Ciudad Lineal,Piso en calle de Arturo Soria,,Costillares,3000,5.0,260,2.0,1.0,1.0,0,0,0,0
2,3,Ciudad Lineal,Piso en calle de Vicente Muzas,4.0,Colina,1300,2.0,100,3.0,1.0,1.0,0,0,0,0
3,4,Ciudad Lineal,Piso en calle Badajoz,,San Pascual,1600,3.0,120,4.0,1.0,1.0,0,0,0,0
4,5,Ciudad Lineal,Piso en calle de Nuestra Señora del Villar,9.0,Ventas,800,2.0,60,3.0,1.0,0.0,0,0,0,0


## Get the data types

In [61]:
df.shape

(2089, 15)

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2089 entries, 0 to 2088
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            2089 non-null   int64  
 1   District      2089 non-null   object 
 2   Address       2089 non-null   object 
 3   Number        747 non-null    object 
 4   Area          2085 non-null   object 
 5   Rent          2089 non-null   int64  
 6   Bedrooms      2000 non-null   float64
 7   Sq.Mt         2089 non-null   int64  
 8   Floor         1948 non-null   float64
 9   Outer         1927 non-null   float64
 10  Elevator      1956 non-null   float64
 11  Penthouse     2089 non-null   int64  
 12  Cottage       2089 non-null   int64  
 13  Duplex        2089 non-null   int64  
 14  Semidetached  2089 non-null   int64  
dtypes: float64(4), int64(7), object(4)
memory usage: 244.9+ KB


In [63]:
df.isna().sum()

Id                 0
District           0
Address            0
Number          1342
Area               4
Rent               0
Bedrooms          89
Sq.Mt              0
Floor            141
Outer            162
Elevator         133
Penthouse          0
Cottage            0
Duplex             0
Semidetached       0
dtype: int64

In [64]:
df = df.fillna(0)
df.sample(5)

Unnamed: 0,Id,District,Address,Number,Area,Rent,Bedrooms,Sq.Mt,Floor,Outer,Elevator,Penthouse,Cottage,Duplex,Semidetached
444,469,Moncloa,Piso en calle Valdeverdeja,33,Valdezarza,1600,4.0,120,2.0,1.0,1.0,0,0,0,0
939,985,Salamanca,Piso en calle de Núñez de Balboa,0,Recoletos,1250,1.0,70,5.0,0.0,1.0,0,0,0,0
199,208,Fuencarral,Piso en Montecarmelo,0,Montecarmelo,1175,2.0,83,0.0,0.0,1.0,0,0,0,0
1278,1341,Centro,Piso en Lavapiés-Embajadores,0,Lavapiés-Embajadores,1300,2.0,70,0.0,0.0,1.0,0,0,0,0
421,445,Moncloa,Piso en calle Velayos,0,Ciudad Universitaria,2100,5.0,190,2.0,1.0,1.0,0,0,0,0


## Data Preprocessing

In [65]:
df = pd.get_dummies(df, columns=['District'], drop_first=True)
df = pd.get_dummies(df, columns=['Area'], drop_first=True)

In [66]:
df.sample(5)

Unnamed: 0,Id,Address,Number,Rent,Bedrooms,Sq.Mt,Floor,Outer,Elevator,Penthouse,...,Area_ZofÁ­o,Area_chalet independiente en Nueva España,Area_en Almagro,Area_en Ciudad Jardín,Area_en El Viso,Area_en Nueva España,Area_en Nuevos Ministerios-Ríos Rosas,Area_en Trafalgar,Area_plaza de España,Area_Águilas
410,432,Casa o chalet independiente en Ciudad Universi...,0,12000,7.0,1230,0.0,0.0,0.0,0,...,False,False,False,False,False,False,False,False,False,False
784,827,Piso en calle hermosilla,0,1400,2.0,78,3.0,1.0,1.0,0,...,False,False,False,False,False,False,False,False,False,False
1869,1957,Piso en Cuatro Caminos,0,1750,1.0,74,15.0,1.0,1.0,0,...,False,False,False,False,False,False,False,False,False,False
1923,2016,Chalet adosado en calle de la vÁ­a lÁ­mite,85,1800,4.0,230,0.0,0.0,0.0,0,...,False,False,False,False,False,False,False,False,False,False
796,839,Piso en Doctor Esquerdo,47,1200,2.0,70,6.0,1.0,1.0,0,...,False,False,False,False,False,False,False,False,False,False


In [67]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

X = df.drop(['Id', 'Address', 'Number', 'Rent'], axis=1)
y = df['Rent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

num_cols = ['Bedrooms', 'Sq.Mt', 'Floor', 'Outer', 'Elevator', 'Penthouse']
ct = ColumnTransformer(
    transformers=[('num', StandardScaler(), num_cols)],
    remainder='passthrough'
)
X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.transform(X_test)

In [74]:
threshold = 0.6

# Get pairs
corr_pairs = (
    corr_matrix
    .abs()
    .where(lambda x: x > threshold) # Filter by threshold
    .stack()                        # Convert to Series with MultiIndex
    .reset_index()
)

corr_pairs.columns = ['Variable_1', 'Variable_2', 'Correlation']

# Remove self-correlations and duplicate pairs
corr_pairs = corr_pairs[corr_pairs['Variable_1'] != corr_pairs['Variable_2']]
corr_pairs = corr_pairs[corr_pairs['Variable_1'] < corr_pairs['Variable_2']]  # Keep only one of each pair

# Sort by absolute correlation
corr_pairs = corr_pairs.sort_values('Correlation', ascending=False).reset_index(drop=True)
corr_pairs

Unnamed: 0,Variable_1,Variable_2,Correlation
0,Area_Barajas,District_Barajas,1.0
1,Area_Ensanche de Vallecas - La Gavia,District_Villa de Vallecas,0.80747
2,Bedrooms,Sq.Mt,0.743397
3,Area_Rejas,District_San Blás,0.713713
4,Area_Ibiza,District_Retiro,0.691195
5,Area_San Diego,District_Puente Vallecas,0.62481
6,Cottage,Sq.Mt,0.605316
7,Area_Argüelles,District_Moncloa,0.600166
