In [1]:
import random

import pandas as pd
from IPython.display import clear_output
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Read the previously collected listing URLs; the CSV keeps them in a column labelled '0'.
df_urls = pd.read_csv(
    'urls_propiedades_en_venta_san_martin_argenprop',
    index_col=False,
)
urls = df_urls.loc[:, '0'].to_list()

['https://www.argenprop.com/departamento-en-venta-en-general-san-martin-3-ambientes--11586948',
 'https://www.argenprop.com/departamento-en-venta-en-villa-maipu-1-ambiente--7956308',
 'https://www.argenprop.com/departamento-en-venta-en-centro-2-ambientes--7957043',
 'https://www.argenprop.com/ph-en-venta-en-villa-ballester-2-ambientes--4867379',
 'https://www.argenprop.com/casa-en-venta-en-san-andres-4-ambientes--9967599',
 'https://www.argenprop.com/casa-en-venta-en-villa-ballester-3-ambientes--8905243',
 'https://www.argenprop.com/departamento-en-venta-en-san-andres-2-ambientes--11524258',
 'https://www.argenprop.com/departamento-en-venta-en-villa-ballester-1-ambiente--11334852',
 'https://www.argenprop.com/departamento-en-venta-en-general-san-martin-2-ambientes--8940151',
 'https://www.argenprop.com/departamento-en-venta-en-centro-2-ambientes--8427924',
 'https://www.argenprop.com/casa-en-venta-en-villa-ballester-3-ambientes--9065863',
 'https://www.argenprop.com/departamento-en-ven

In [35]:
%%time

# --- Scraper configuration ---------------------------------------------------
MAX_URLS = 1000          # only the first MAX_URLS listings are scraped in this run
CHECKPOINT_EVERY = 25    # rewrite the CSV every N listings so a crash loses little work
OUTPUT_CSV = 'features_propiedades.csv'
MAX_FEATURE_PAIRS = 5    # the main-features box is read as up to 5 (label, value) pairs

# Property categories, recognised as the first word of the listing slug.
CATEGORIAS = ('casa', 'ph', 'departamento')

# Substrings looked up in the URL to decide the locality of a listing.
localidades = ['centro', 'ballester', 'suarez', 'martin', 'maipu', 'lynch', 'andres', 'billinghurst', 'malaver',
               'bonich', 'hermosa', 'hidalgo', 'progreso', 'golf', 'chilavert', 'parque']

# Final column names used in the exported table.
COLUMN_RENAMES = {'Superficie cubierta': 'superficie_cubierta_m2',
                  'Dormitorios': 'dormitorios',
                  'Baños': 'banos',
                  'Antiguedad': 'antiguedad'}


def _categoria_from_url(url):
    """Return the category ('casa', 'ph' or 'departamento') that starts the listing slug.

    Only the first word of the last path segment is inspected, so a locality or
    street name that merely contains 'casa' or 'ph' cannot misclassify a listing.
    Returns None when the slug starts with none of the known categories.
    """
    slug = url.rstrip('/').rsplit('/', 1)[-1]
    for categoria in CATEGORIAS:
        if slug == categoria or slug.startswith(categoria + '-'):
            return categoria
    return None


def _localidad_from_url(url):
    """Return the last entry of `localidades` found in `url`, or None if none matches."""
    found = None
    for loc in localidades:
        if loc in url:
            found = loc  # later entries deliberately override earlier ones
    return found


def _rename_key(features, old, new):
    """Move `features[old]` to `features[new]` when `old` is present."""
    if old in features:
        features[new] = features.pop(old)


def _to_int_in_place(features, key, replacements=()):
    """Clean the text stored under `key` and convert it to an int.

    `replacements` is a sequence of (old, new) substring substitutions applied in
    order before the conversion. When the cleaned text is not numeric, the cleaned
    text is kept so that no scraped information is lost.
    """
    if key not in features:
        return
    text = features[key]
    for old, new in replacements:
        text = text.replace(old, new)
    features[key] = text
    try:
        features[key] = int(float(text))
    except (ValueError, OverflowError):
        pass  # not a number: keep the cleaned text for later inspection


def _scrape_listing(driver, url):
    """Load one listing page and return a dict with the features found on it.

    Features that are missing from the page or cannot be parsed are simply left
    out of the dict. Errors raised while loading the page itself propagate to the
    caller.
    """
    features = {}
    driver.get(url)

    # Price in USD. Listings without a price element, or priced in another
    # currency / as free text, do not get a 'precio_USD' entry.
    try:
        price_text = driver.find_element(By.CLASS_NAME, 'titlebar__price').text
        for token in ('USD', '.', ' '):
            price_text = price_text.replace(token, '')
        features['precio_USD'] = int(price_text)
    except (WebDriverException, ValueError):
        pass

    # Property category (casa, departamento or ph).
    categoria = _categoria_from_url(url)
    if categoria is not None:
        features['categoria'] = categoria

    # Locality.
    localidad = _localidad_from_url(url)
    if localidad is not None:
        features['localidad'] = localidad

    # Main features: consecutive <p> tags are read as (label, value) pairs.
    # The list is rebuilt for every listing so that a page without the box never
    # reuses elements left over from the previous page.
    try:
        box = driver.find_element(By.CLASS_NAME, 'property-main-features')
        paragraphs = box.find_elements(By.TAG_NAME, 'p')
    except WebDriverException:
        paragraphs = []
    last = 2 * MAX_FEATURE_PAIRS
    for label_el, value_el in zip(paragraphs[0:last:2], paragraphs[1:last:2]):
        try:
            features[label_el.text] = value_el.text
        except WebDriverException:
            continue  # this pair became unreadable; still try the remaining ones

    # Merge equivalent labels, drop unneeded ones and convert values to numbers.
    _rename_key(features, 'Cantidad de baños', 'Baños')
    _to_int_in_place(features, 'Baños')

    _rename_key(features, 'Cantidad de dormitorios', 'Dormitorios')
    _to_int_in_place(features, 'Dormitorios', [('Monoambiente', '0')])

    _rename_key(features, 'Antigüedad', 'Antiguedad')
    _to_int_in_place(features, 'Antiguedad', [('A Estrenar', '0'),
                                              ('A estrenar', '0'),
                                              (' años', ''),
                                              (' año', ''),
                                              (',', '.')])

    _to_int_in_place(features, 'Superficie cubierta', [('m²', ''),
                                                       (' ', ''),
                                                       (',', '.')])

    features.pop('Disposición', None)

    # Latitude and longitude, read from the first <div> inside the map container.
    # NOTE(review): the 'logitud' key is misspelled but kept unchanged because
    # existing exports already use this column name.
    try:
        marker = driver.find_element(By.CLASS_NAME, 'map-container').find_element(By.TAG_NAME, 'div')
    except WebDriverException:
        marker = None
    if marker is not None:
        for key, attribute in (('latitud', 'data-latitude'), ('logitud', 'data-longitude')):
            try:
                raw = marker.get_attribute(attribute)
                if raw is not None:
                    features[key] = float(raw.replace(',', '.'))
            except (WebDriverException, ValueError):
                pass  # coordinate missing or malformed: leave it out

    return features


def _save_features(records):
    """Build the features table from `records`, write it to OUTPUT_CSV and return it."""
    table = pd.DataFrame(records).rename(columns=COLUMN_RENAMES)
    table.to_csv(OUTPUT_CSV)
    return table


features_global = []
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
batch = urls[:MAX_URLS]

try:
    for position, url in enumerate(batch, start=1):
        features_global.append(_scrape_listing(driver, url))

        clear_output(wait=True)
        print(f'Propiedad número {position} extraída ({round((position/len(batch))*100 , 2)}%)')

        if position % CHECKPOINT_EVERY == 0:
            df = _save_features(features_global)
finally:
    # Always persist what was scraped and release the browser, even when the
    # loop is interrupted or the driver connection is lost half-way through.
    if features_global:
        df = _save_features(features_global)
    try:
        driver.quit()
    except Exception as exc:
        # The driver may already be gone; report it without masking the
        # exception (if any) that ended the loop.
        print(f'Could not shut down the browser cleanly: {exc!r}')

Propiedad número 8 extraída (0.1%)


MaxRetryError: HTTPConnectionPool(host='localhost', port=45505): Max retries exceeded with url: /session/6f001659974e166d49a7be5a82374f0b/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc867570790>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [29]:
# Print a summary of the features table: entry count, column dtypes and non-null counts.
# NOTE(review): the saved output for this cell lists a 'url' column and 102400 entries,
# which the scraping cell in this notebook does not produce — presumably `df` came from
# other kernel state; confirm by re-running from a fresh kernel.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102400 entries, 0 to 199
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   url                     102400 non-null  object 
 1   precio_USD              100352 non-null  float64
 2   categoria               102400 non-null  object 
 3   localidad               100864 non-null  object 
 4   dormitorios             94720 non-null   float64
 5   banos                   91648 non-null   float64
 6   antiguedad              70144 non-null   float64
 7   latitud                 96256 non-null   float64
 8   logitud                 96256 non-null   float64
 9   superficie_cubierta_m2  90624 non-null   float64
dtypes: float64(7), object(3)
memory usage: 8.6+ MB
