## Libraries

In [1]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse

## Extract

In [2]:
montenero = pd.read_csv('porta-romana-cadore-montenero_2023-12-29.csv')
medaglie = pd.read_csv('porta-romana-medaglie-d-oro_2023-12-29.csv')

# Concatenar os DataFrames
concatenated_df = pd.concat([montenero, medaglie], ignore_index=True)

# Salvar o DataFrame resultante em um novo arquivo CSV, se necessário
concatenated_df.to_csv('concatenated_data.csv', index=False)

In [3]:
df = pd.read_csv('concatenated_data.csv')

In [4]:
# Função para extrair o último número da URL
def extract_house_id(url):
    parsed_url = urlparse(url)
    path_segments = parsed_url.path.split('/')
    for segment in reversed(path_segments):
        if segment.isdigit():
            return int(segment)
    return None

# Aplicar a função à coluna 'links' para criar a coluna 'house_id'
df['house_id'] = df['links'].apply(extract_house_id)

# Exibir o DataFrame resultante

In [5]:
display(df)

Unnamed: 0,day,district,description,rooms,m2,bathrooms,price,links,house_id
0,2023-12-29,porta-romana-cadore-montenero,"2-room flat viale Umbria 64, Martini - Insubri...",2,50m²,1,"€ 325,000",https://www.immobiliare.it/en/annunci/108349979/,108349979
1,2023-12-29,porta-romana-cadore-montenero,"2-room flat via Carlo Botta 39, Porta Romana -...",2,68m²,1,"€ 399,000",https://www.immobiliare.it/en/annunci/108346881/,108346881
2,2023-12-29,porta-romana-cadore-montenero,"Penthouse viale Monte Nero, Montenero, Milan",4,140m²,2,"€ 1,100,000",https://www.immobiliare.it/en/annunci/108337689/,108337689
3,2023-12-29,porta-romana-cadore-montenero,"Loft viale Umbria, Martini - Insubria, Milan",1,60m²,1,"€ 290,000",https://www.immobiliare.it/en/annunci/108340651/,108340651
4,2023-12-29,porta-romana-cadore-montenero,"Apartment via Adige, Porta Romana - Medaglie d...",5,283m²,3+,"€ 1,690,000",https://www.immobiliare.it/en/annunci/108317131/,108317131
...,...,...,...,...,...,...,...,...,...
621,2023-12-29,porta-romana-medaglie-d-oro,"2-room flat viale Isonzo, Porta Romana - Medag...",2,45m²,1,"€ 300,000",https://www.immobiliare.it/en/annunci/97564788/,97564788
622,2023-12-29,porta-romana-medaglie-d-oro,"3-room flat viale Isonzo, Porta Romana - Medag...",3,85m²,2,"€ 570,000",https://www.immobiliare.it/en/annunci/97564864/,97564864
623,2023-12-29,porta-romana-medaglie-d-oro,"2-room flat Strada della Carità 2, Porta Roman...",2,70m²,1,"€ 399,000",https://www.immobiliare.it/en/annunci/97151200/,97151200
624,2023-12-29,porta-romana-medaglie-d-oro,"2-room flat via Adige 20, Porta Romana - Medag...",2,50m²,1,"€ 340,000",https://www.immobiliare.it/en/annunci/96513822/,96513822


## Treatment

### Rooms

In [6]:
df.dtypes

day            object
district       object
description    object
rooms          object
m2             object
bathrooms      object
price          object
links          object
house_id        int64
dtype: object

In [7]:
df['rooms'].value_counts()

3        220
2        208
4         93
1         33
5+        32
5         31
1 - 3      1
1 - 2      1
3 - 4      1
Name: rooms, dtype: int64

In [8]:
def transform_rooms(value):
    if isinstance(value, str):
        if '+' in value:
            return int(value[:-1])  # Remove the '+' and convert to integer
        elif '-' in value:
            # Extract the last number in the range
            return int(value.split('-')[-1])
        else:
            return int(value)
    else:
        # If the value is not a string, return it as is
        return value

# Applying the function to the 'rooms' column
df['rooms'] = df['rooms'].apply(transform_rooms)

### m²

In [9]:
df['m2'].value_counts()

110m²    27
100m²    26
70m²     26
50m²     21
90m²     21
         ..
176m²     1
141m²     1
196m²     1
44m²      1
98m²      1
Name: m2, Length: 128, dtype: int64

In [10]:
def remove_square_meter(value):
    # Check if the value is a string
    if isinstance(value, str):
        # Remove "m²" and return the cleaned string
        return value.replace('m²', '').strip()
    else:
        # If the value is not a string (e.g., it's a float), return it as is
        return value

# Applying the function to the 'm2' column
df['m2'] = df['m2'].apply(remove_square_meter)

# Checking the result
print(df['m2'].value_counts())


110    27
100    26
70     26
50     21
90     21
       ..
176     1
141     1
196     1
44      1
98      1
Name: m2, Length: 128, dtype: int64


### Bathrooms

In [11]:
df['bathrooms'].value_counts()

1     360
2     199
3      35
3+     20
4       2
G       2
8       1
6       1
Name: bathrooms, dtype: int64

In [12]:
df[df['bathrooms']=='G']

Unnamed: 0,day,district,description,rooms,m2,bathrooms,price,links,house_id
377,2023-12-29,porta-romana-cadore-montenero,"Loft via Giuseppe Ripamonti 2, Bocconi, Milan",2.0,50,G,"€ 259,000",https://www.immobiliare.it/en/annunci/100790007/,100790007
607,2023-12-29,porta-romana-medaglie-d-oro,"Loft via Giuseppe Ripamonti 2, Bocconi, Milan",2.0,50,G,"€ 259,000",https://www.immobiliare.it/en/annunci/100790007/,100790007


In [13]:
def transform_bathrooms(value):
    # Check if the value is a string
    if isinstance(value, str):
        # Remove "+" and convert to integer
        if '+' in value:
            return int(value.replace('+', '').strip())
        # Remove "G" and convert to integer
        elif 'G' in value:
            return None  # Returning None will effectively remove the line
        else:
            return int(value)
    else:
        # If the value is not a string (e.g., it's a float), return it as is
        return value

# Applying the function to the 'bathrooms' column
df['bathrooms'] = df['bathrooms'].apply(transform_bathrooms)

# Checking the result
print(df['bathrooms'].value_counts())


1.0    360
2.0    199
3.0     55
4.0      2
8.0      1
6.0      1
Name: bathrooms, dtype: int64


### Price

In [14]:
df['price'].value_counts()

€ 399,000               12
€ 450,000               12
€ 990,000               11
Price on application    11
€ 750,000               10
                        ..
€ 629,000                1
€ 558,000                1
€ 445,000                1
€ 797,000                1
€ 245,000                1
Name: price, Length: 214, dtype: int64

In [15]:
import re

def transform_price(value):
    # Check if the value is a string
    if isinstance(value, str):
        # Check if the value is 'Price on application'
        if value.lower() == 'price on application':
            return None  # Returning None will effectively remove the line
       
    return None

# Applying the function to the 'price' column
df['price'] = df['price'].apply(transform_price)

# Remove rows where 'price' is None (i.e., 'Price on application' or other non-convertible cases)
df = df.dropna(subset=['price'])

# Checking the result
print(df['price'].value_counts())


Series([], Name: price, dtype: int64)


In [16]:
df[df['price']=='270000 298000(-94%)']

Unnamed: 0,day,district,description,rooms,m2,bathrooms,price,links,house_id
