In [22]:
import pandas as pd
pd.set_option('display.max_columns', None)


# Load the preprocessed listings.
data = pd.read_csv("./preprocessed_data.csv")
# drop_duplicates returns a new frame (it does not modify in place), so the
# result must be assigned for the de-duplication on "link" to take effect.
# The index is reset so that frames built positionally from `data` later
# (e.g. via .tolist()) stay row-aligned with it.
data = data.drop_duplicates(subset="link").reset_index(drop=True)
display(data["description"])

0       Apartamento T0, com vista de Mar.\nNão perca a...
1       Não perca a oportunidade de viver num apartame...
2       Excelente apartamento t1 na Trafaria, a 2 minu...
3       Estúdio com muita luz, casa de banho com duche...
4       Disponível a partir de 01 de Junho de 2024.\nE...
                              ...                        
2449    Apartamento no centro de Viseu no último andar...
2450    Apartamento com 3 quartos com moveis\nCozinha ...
2451                                                  NaN
2452                                                  NaN
2453                                                  NaN
Name: description, Length: 2454, dtype: object

In [25]:
import spacy

# Load the spaCy Portuguese pipeline "pt_core_news_sm" once at module level so
# every call to extract_features reuses the same loaded model.
# NOTE(review): the model package must be installed in the kernel's
# environment beforehand (e.g. `python -m spacy download pt_core_news_sm`).
nlp = spacy.load("pt_core_news_sm")

# Binary amenity flags reported by extract_features, in output-column order.
_FEATURE_NAMES = (
    "piscina",
    "terraço",
    "jardim",
    "luxo",
    "porteiro",
    "mobilado",
    "equipado",
    "reabilitado",
    "arrecadação",
    "recem-construido",
    "novo",
    "suite",
    "sauna",
    "banho turco",
    "ginasio",
    "ar condicionado",
    "concierge",
)

# Single-token words naming a bedroom / a bathroom.
_ROOM_WORDS = frozenset({"quarto", "quartos"})
_BATH_WORDS = frozenset({"wc", "banheiro"})
# First token of the multi-token phrases "casa de banho" / "casas de banho".
_BATH_PHRASE_HEADS = frozenset({"casa", "casas"})

# Spelled-out numerals (accent-folded) accepted as counts.
_NUMBER_WORDS = {
    "um": 1, "uma": 1, "dois": 2, "duas": 2, "tres": 3, "quatro": 4,
    "cinco": 5, "seis": 6, "sete": 7, "oito": 8, "nove": 9, "dez": 10,
}


def _fold_accents(text):
    """Return `text` with diacritics removed (e.g. 'ginásio' -> 'ginasio')."""
    import unicodedata  # local import keeps this block self-contained

    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def _parse_count(text):
    """Convert a number-like token text to an int, or None if it cannot be parsed."""
    try:
        return int(text)
    except ValueError:
        # like_num also accepts spelled-out numbers, decimals, fractions...
        # Only simple spelled-out numerals are mapped; the rest is ignored.
        return _NUMBER_WORDS.get(_fold_accents(text))


def _counted_field(token):
    """Return the count field a token names ('quartos' / 'casas de banho'), else None."""
    word = token.text
    if word in _ROOM_WORDS:
        return "quartos"
    if word in _BATH_WORDS:
        return "casas de banho"
    if word in _BATH_PHRASE_HEADS:
        # "casa(s) de banho" spans three tokens, so it can never equal a single
        # token's text; recognise it by looking at the two following tokens.
        following = [t.text for t in token.doc[token.i + 1:token.i + 3]]
        if following == ["de", "banho"]:
            return "casas de banho"
    return None


def extract_features(text):
    """Extract room/bathroom counts and amenity flags from a listing description.

    Parameters
    ----------
    text : str or missing value
        Free-text description. None, NaN/pd.NA and empty or whitespace-only
        strings are treated as "no description".

    Returns
    -------
    dict
        Keys "quartos" and "casas de banho" hold integer counts (0 when no
        count was found), followed by one 0/1 flag per name in
        ``_FEATURE_NAMES``. Key order is fixed so that a list of results can
        be turned directly into a DataFrame.

    Notes
    -----
    Counts are found through the dependency parse produced by the module-level
    ``nlp`` pipeline: a number-like token is used when it is the head or a
    child of a bedroom/bathroom word. Amenity flags are matched on the
    lower-cased, accent-folded text with word boundaries, so multi-word
    keywords ("ar condicionado") and accented spellings ("ginásio") are
    detected as well.
    """
    import re  # local import keeps this block self-contained

    features = {"quartos": 0, "casas de banho": 0}
    features.update(dict.fromkeys(_FEATURE_NAMES, 0))

    # NaN is truthy, so a plain `not text` check would let missing values
    # through to the parser as the string "nan".
    if pd.isna(text) or not str(text).strip():
        return features

    lowered = str(text).lower()
    doc = nlp(lowered)

    for token in doc:
        if token.like_num:
            count = _parse_count(token.text)
            if count is None:
                continue
            # The counted noun may be attached as a child or as the head.
            for related in [*token.children, token.head]:
                field = _counted_field(related)
                if field is not None:
                    features[field] = count
        else:
            field = _counted_field(token)
            if field is None:
                continue
            for child in token.children:
                if child.like_num:
                    count = _parse_count(child.text)
                    if count is not None:
                        features[field] = count

    # Amenity flags: match on accent-folded text so the unaccented keywords
    # also catch accented spellings, and allow any whitespace between the
    # words of a multi-word keyword.
    folded = _fold_accents(lowered)
    for name in _FEATURE_NAMES:
        words = _fold_accents(name).split()
        pattern = r"(?<!\w)" + r"\s+".join(re.escape(w) for w in words) + r"(?!\w)"
        if re.search(pattern, folded):
            features[name] = 1

    return features

In [26]:
# Run the extractor on every description; each result is a dict of features.
data['features'] = data['description'].apply(extract_features)

# Expand the per-row dicts into one column per feature. The index of `data`
# is passed explicitly: building a frame from a plain list would otherwise
# create a fresh 0..n-1 index, which silently misaligns with `data` whenever
# its index is not a default RangeIndex (e.g. after filtering rows).
features_df = pd.DataFrame(data['features'].tolist(), index=data.index)

In [27]:
# Display the extracted feature table (one row per entry of data['description']).
features_df

Unnamed: 0,quartos,casas de banho,piscina,terraço,jardim,luxo,porteiro,mobilado,equipado,reabilitado,arrecadação,recem-construido,novo,suite,sauna,banho turco,ginasio,ar condicionado,concierge
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2449,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2450,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2451,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
