### Web Scraping - BeautifulSoup
*24 de Noviembre de 2021*

*Nicolás Tibatá*

In [1]:
import json
from time import sleep
import requests
from bs4 import BeautifulSoup as bs # AKA 'bs'
import pandas as pd
import numpy as np 

In [2]:
# Load the saved list of listing links back from its JSON file.
with open('base_links.txt', 'r') as links_file:
    base_links = json.load(links_file)

In [3]:
# Remove duplicate links, keeping the first occurrence of each one in its
# original position. dict.fromkeys preserves insertion order and replaces the
# per-element scan of a growing list (quadratic) with hash lookups.
# Assumes every entry is hashable (e.g. a URL string) — they are later passed
# to requests.get, so that should hold.
unique_links = list(dict.fromkeys(base_links))

In [4]:
len(unique_links) # Number of unique property links left after de-duplication

3525

In [None]:
# Scrape every unique listing: price, location and the details block.
# The three lists must stay index-aligned (entry i of each list belongs to the
# same listing) because they are later combined column-wise into a DataFrame.
info_precio = []
info_ubi = []
info = []
for url in unique_links:
    try:
        # A timeout keeps one unresponsive request from stalling the whole run.
        response = requests.get(url, timeout=30)
        # Surface HTTP errors (4xx/5xx) explicitly instead of parsing an error page.
        response.raise_for_status()
        soup = bs(response.text, 'lxml')
        # Read all three fields BEFORE appending anything: if a tag is missing,
        # find() returns None and .text raises, so no list gets a partial record.
        precio_txt = soup.find('span', {'class': '_2xKfz'}).text
        ubicacion_txt = soup.find('span', {'class': '_2FRXm'}).text
        informacion_txt = soup.find('div', {'class': '_3JPEe'}).text
    except Exception as e:
        # Best-effort scraping: report which link failed and move on.
        print(url, e, sep=': ')
    else:
        info_precio.append(precio_txt)
        info_ubi.append(ubicacion_txt)
        info.append(informacion_txt)
        print(precio_txt)
        print(ubicacion_txt)
        print(informacion_txt)
    sleep(2)  # Pause between consecutive requests

`The three lists must always have the same length in order to build the dataframe`

In [6]:
len(info) # Number of detail strings collected by the scraping loop

3505

In [7]:
len(info_precio) # Number of price strings collected by the scraping loop

3505

In [8]:
len(info_ubi) # Number of location strings collected by the scraping loop

3505

`Save info, price and location of the property, in case the kernel dies`

In [9]:
# Persist the scraped detail strings as JSON.
with open('info.txt', 'w') as out_file:
    json.dump(info, out_file)

In [10]:
# Persist the scraped price strings as JSON.
with open('info_precio.txt', 'w') as out_file:
    json.dump(info_precio, out_file)

In [11]:
# Persist the scraped location strings as JSON.
with open('info_ubi.txt', 'w') as out_file:
    json.dump(info_ubi, out_file)

--------

#### Get the variables

Open the files and check that there are no empty elements

In [5]:
# Reload the saved price strings from their JSON file.
with open('info_precio.txt', 'r') as in_file:
    precio = json.load(in_file)

In [6]:
# Positions of empty price entries; an empty result means every price is present.
[idx for idx, value in enumerate(precio) if not value]

[]

In [7]:
# Remove the "$ " currency prefix, which is not useful in the dataframe.
precio_n = [price_text.replace("$ ", "") for price_text in precio]

In [8]:
len(precio_n) # Price variable: number of cleaned price strings

3505

In [9]:
# Reload the saved location strings from their JSON file.
with open('info_ubi.txt', 'r') as in_file:
    ubicacion = json.load(in_file)

In [10]:
# Positions of empty location entries; an empty result means every location is present.
[idx for idx, value in enumerate(ubicacion) if not value]

[]

In [11]:
# Drop the ", Bogotá" suffix: only the neighborhood and locality are needed.
ubicacion_n = [location_text.replace(", Bogotá", "") for location_text in ubicacion]

In [12]:
len(ubicacion_n) # Location variable: number of cleaned location strings

3505

--------

In [13]:
import re # To get all the variables in a list of the info.txt

In [14]:
# Reload the saved detail strings from their JSON file.
with open('info.txt', 'r') as in_file:
    informacion = json.load(in_file)

In [15]:
# Extract the property type: the word found between the "Tipo" and
# "Habitaciones" labels of each detail string.
# The pattern is a raw string so that \w reaches the regex engine untouched;
# in a plain string it is an invalid escape sequence that newer Python
# versions warn about.
# Each element is the list of matches for one record ([] when nothing matched).
lista_tipo = [re.findall(r'Tipo(\w+)Habitaciones', text) for text in informacion]

In [16]:
# Positions of records whose type was not found (element 3033 is empty).
[idx for idx, matches in enumerate(lista_tipo) if not matches]

[3033]

In [17]:
# Manual patch for the one record without a match: its details describe it as a house.
# NOTE(review): the index is tied to the current contents of info.txt — re-check after re-scraping.
lista_tipo[3033] = ['Casa']

In [18]:
# Collapse the per-record match lists into one value per record.
# Taking the first match (NaN when there is none) always yields exactly one
# entry per record, so the column stays aligned with the others. A plain
# flatten would drop records without a match and duplicate records with several.
tipo = [matches[0] if matches else np.nan for matches in lista_tipo]

In [19]:
len(tipo) # Type variable. The length must match the other columns to build the dataframe

3505

In [20]:
# Capture the digits that follow the "Habitaciones" label in each detail string.
# Note: [0-9]* also matches zero digits, so a label with no number yields ['']
# rather than an empty list.
lista_habitaciones = [re.findall('Habitaciones([0-9]*)', text) for text in informacion]

In [21]:
# Positions of records with no match list at all; none of the elements are empty.
[idx for idx, matches in enumerate(lista_habitaciones) if not matches]

[]

In [22]:
# One rooms value per record: the first match, or NaN when there is none.
# Unlike a plain flatten, this cannot drop or duplicate records, so the column
# stays aligned with the others.
habitaciones = [matches[0] if matches else np.nan for matches in lista_habitaciones]

In [23]:
len(habitaciones) # Rooms variable: must have one entry per record

3505

In [24]:
# Capture the digits that follow the "Baños" label in each detail string.
# Note: [0-9]* also matches zero digits, so a label with no number yields [''].
lista_baños = [re.findall('Baños([0-9]*)', text) for text in informacion]

In [25]:
# Positions of records where the "Baños" label was not found at all.
[idx for idx, matches in enumerate(lista_baños) if not matches]

[]

In [26]:
# One bathrooms value per record: the first match, or NaN when there is none.
# Unlike a plain flatten, this cannot drop or duplicate records, so the column
# stays aligned with the others.
baños = [matches[0] if matches else np.nan for matches in lista_baños]

In [27]:
len(baños) # Bathrooms variable: must have one entry per record

3505

In [28]:
# Capture the digits that follow the "Totales" label (total square meters).
# Note: [0-9]* also matches zero digits, so a label with no number yields [''].
lista_metros = [re.findall('Totales([0-9]*)', text) for text in informacion]

In [29]:
# Positions of records where the "Totales" label was not found at all.
[idx for idx, matches in enumerate(lista_metros) if not matches]

[]

In [30]:
# One square-meters value per record: the first match, or NaN when there is none.
# Unlike a plain flatten, this cannot drop or duplicate records, so the column
# stays aligned with the others.
metros = [matches[0] if matches else np.nan for matches in lista_metros]

In [31]:
len(metros) # Square meters variable: must have one entry per record

3505

In [33]:
# Capture the digits that follow the "Estrato" label in each detail string.
# Records without that label produce an empty match list.
lista_estrato = [re.findall('Estrato([0-9]*)', text) for text in informacion]

In [34]:
# Positions of records without an "Estrato" value; there are many of them.
[idx for idx, matches in enumerate(lista_estrato) if not matches]

[162,
 198,
 299,
 313,
 348,
 357,
 405,
 407,
 408,
 417,
 418,
 489,
 502,
 516,
 520,
 523,
 524,
 537,
 538,
 539,
 540,
 542,
 543,
 548,
 552,
 553,
 574,
 575,
 583,
 670,
 678,
 733,
 862,
 949,
 1035,
 1044,
 1055,
 1298,
 1301,
 1406,
 1419,
 1732,
 1815,
 1877,
 2029,
 2186,
 2262,
 2331,
 2345,
 2346,
 2364,
 2389,
 2390,
 2531,
 2626,
 2629,
 2666,
 2667,
 2706,
 2711,
 2776,
 2840,
 2914,
 2919,
 2952,
 3050,
 3085,
 3097,
 3119,
 3198,
 3292,
 3293,
 3301,
 3315,
 3318,
 3323,
 3324,
 3325,
 3327,
 3355,
 3356,
 3358,
 3360,
 3374,
 3375,
 3376,
 3429,
 3430,
 3443,
 3445,
 3474,
 3479,
 3485]

In [35]:
informacion[198] # Records like this one do not include the "Estrato" field, so the
# missing values are replaced with 3, used as a middle value.

'TipoCasaHabitaciones10Baños5Metros Cuadrados Totales300 m2AntigüedadEntre 5 y 10 añosParqueaderoNoPiso4Tipo de vendedorDueño DirectoNúmero de contacto (+57)3125904484'

In [36]:
# One "Estrato" value per record: the first match, or the default '3' when the
# record has no match. Indexing the first match, instead of flattening, also
# keeps exactly one entry per record if a detail string matches more than once,
# so the column cannot drift out of alignment with the others.
estrato = [matches[0] if matches else '3' for matches in lista_estrato]

In [37]:
len(estrato) # Social class ("Estrato") variable: must have one entry per record

3505

In [38]:
# Capture the digits that follow a closing parenthesis, e.g. the number after
# the "(+57)" prefix of the contact field.
# The pattern is a raw string so that \) reaches the regex engine untouched;
# in a plain string it is an invalid escape sequence that newer Python
# versions warn about.
# Note: every ")" in the text produces a match (possibly an empty string).
lista_contacto = [re.findall(r'\)([0-9]*)', text) for text in informacion]

In [39]:
# Uncomment the next line to list the records without a contact number:
# [i for i,x in enumerate(lista_contacto) if not x]
# Many elements are empty: not every property includes a contact number.

In [40]:
# One contact number per record: the first match, or the placeholder '0' when
# the record has none (NaN would work as well).
# Indexing the first match, instead of flattening, keeps exactly one entry per
# record even when the text contains several ")" characters, so the column
# cannot drift out of alignment with the others.
contacto = [matches[0] if matches else '0' for matches in lista_contacto]

In [41]:
len(contacto) # Contact variable: must have one entry per record

3505

In [42]:
# Capture the two letters that follow the "Parqueadero" label (e.g. "Si"/"No").
# Records without that label produce an empty match list.
lista_parqueadero = [re.findall('Parqueadero([A-Za-z][A-Za-z])', text) for text in informacion]

In [43]:
# One parking value per record: the first match, or NaN when the record has none.
# Indexing the first match, instead of flattening, keeps exactly one entry per
# record even if a detail string matches more than once, so the column cannot
# drift out of alignment with the others.
parqueadero = [matches[0] if matches else np.nan for matches in lista_parqueadero]

In [44]:
len(parqueadero) # Parking variable: must have one entry per record

3505

In [45]:
# Another way to see which elements of a list are empty:

#for sublist in lista_tipo:
#    if sublist:
#        print('not empty')
#    else:
#        print('empty')

-------

#### Build the Dataframe

Let's split the `Location Variable` into neighborhood and locality

In [46]:
# Split each "neighborhood, locality" string into its comma-separated parts.
# The parts are stripped because the separator is a comma followed by a space,
# which would otherwise leave a leading blank on the locality.
barrio = [[part.strip() for part in location.split(',')] for location in ubicacion_n]

In [47]:
# Uncomment the next line to inspect the split result:
# barrio

In [48]:
# Locality: the second part of every split location.
localidad = [parts[1] for parts in barrio]

In [50]:
len(localidad) # Locality variable: must have one entry per record

3505

In [52]:
# Neighborhood: the first part of every split location.
barrios = [parts[0] for parts in barrio]

In [53]:
len(barrios) # Neighborhood variable: must have one entry per record

3505

Splitting on the comma is an easier way than RegEx, but it `doesn't work all the time` (for example, when a location has no comma, or has more than one).

In [54]:
# Assemble the final table: one column per extracted variable, one row per listing.
# All the lists must have the same length for the DataFrame constructor to accept them.
# NOTE(review): `tipo` is extracted earlier in the notebook but is not used here —
# confirm whether leaving it out is intentional.
columns = {
    'Precio': precio_n,
    'Metros Cuadrados': metros,
    'Habitaciones': habitaciones,
    'Baños': baños,
    'Parqueadero': parqueadero,
    'Estrato': estrato,
    'Localidad': localidad,
    'Barrio': barrios,
    'Contacto': contacto,
}
df = pd.DataFrame(columns)
df

Unnamed: 0,Precio,Metros Cuadrados,Habitaciones,Baños,Parqueadero,Estrato,Localidad,Barrio,Contacto
0,1.650.000.000,250,3,3,,6,Chapinero,Chicó Alto,0
1,410.000.000,53,1,2,,5,Usaquén,Rincón Del Chicó,0
2,583.300.000,106,3,3,,6,Suba,Niza Suba - Torres Del Monte,0
3,520.000.000,130,3,3,,6,Usaquén,Rincón De Santa Paula,3165279629
4,350.000.000,118,3,3,Si,3,Suba,Cantalejo,0
...,...,...,...,...,...,...,...,...,...
3500,602.370.000,88,3,2,,4,Fontibón,Urbanización Industrial Montevideo,0
3501,420.000.000,60,1,1,,4,Teusaquillo,San Luís,0
3502,560.000.000,360,4,2,,3,Antonio Nariño,La Fragua,0
3503,898.000.000,292,4,4,,4,Teusaquillo,Pablo VI Etapa II,0


In [212]:
# Export the table to CSV. With pandas' default settings the row index is
# written as an extra, unnamed first column.
df.to_csv('df_olx.csv')

-------