In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 

In [2]:
# Search-results URL for apartments and rooms in Copenhagen (path is percent-encoded)
base_url = "https://www.boligportal.dk/lejligheder,v%C3%A6relser/k%C3%B8benhavn/?include_units=1"

# Pagination settings: result pages are addressed through an `offset` query parameter
NUM_PAGES = 109        # pages requested, numbered 1..109
RESULTS_PER_PAGE = 18  # offset step between two consecutive pages
REQUEST_TIMEOUT = 30   # seconds; requests never times out unless told to

# Raw HTML of every successfully fetched results page, keyed by 1-based page number
html_dict = {}

# One Session reuses the underlying connection instead of reconnecting for every page
with requests.Session() as session:
    for i in range(NUM_PAGES):
        page_number = i + 1
        url = f"{base_url}&offset={RESULTS_PER_PAGE * i}"

        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A single network error must not abort the whole crawl
            print(f"Failed to retrieve page {page_number}: {exc}")
            continue

        # Only keep pages the server answered with HTTP 200
        if response.status_code == 200:
            html_dict[page_number] = response.text
        else:
            print(f"Failed to retrieve page {page_number}")

# Summary of how many pages ended up in html_dict
print(f"Scraped {len(html_dict)} pages successfully.")

# To access the HTML content of a specific page
# page_1_html = html_dict[1]


Scraped 109 pages successfully.


In [5]:
# Relative listing URLs collected from every results page, in page order
links = []

for page_number, html_content in html_dict.items():
    soup = BeautifulSoup(html_content, 'html.parser')

    # Look at every div with class "css-krvsu4" and take the href of its first <a>
    for card in soup.find_all('div', class_='css-krvsu4'):
        anchor = card.find('a')
        # Skip divs without an anchor, and anchors without a usable href,
        # instead of raising KeyError on anchor['href']
        if anchor is not None and anchor.get('href'):
            links.append(anchor['href'])

# Preview the first few collected links
links[:5]

['/lejligheder/k%C3%B8benhavn/86m2-3-vaer-id-5080588',
 '/lejligheder/k%C3%B8benhavn/113m2-4-vaer-id-5455654',
 '/v%C3%A6relser/k%C3%B8benhavn/9m2-1-vaer-id-5455638',
 '/v%C3%A6relser/k%C3%B8benhavn/14m2-1-vaer-id-5144039',
 '/v%C3%A6relser/k%C3%B8benhavn/7m2-1-vaer-id-5455643']

In [6]:
# Total number of listing links gathered from all results pages
len(links)

1897

In [8]:
REQUEST_TIMEOUT = 30  # seconds; requests never times out unless told to

# One record per successfully downloaded listing page: its URL and raw HTML
data = []

# One Session reuses the underlying connection across the many sequential requests
with requests.Session() as session:
    for link in links:
        # `link` is a site-relative path, so prefix the host to get an absolute URL
        full_url = f"https://www.boligportal.dk{link}"

        try:
            response = session.get(full_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A single network error must not discard the pages already downloaded
            print(f"Failed to retrieve content from {full_url}: {exc}")
            continue

        if response.status_code == 200:
            data.append({'url': full_url, 'html_code': response.text})
        else:
            print(f"Failed to retrieve content from {full_url}")

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(data, columns=['url', 'html_code'])

# Persist the raw pages so parsing can be redone without downloading again
df.to_csv('boligportal_pages.csv', index=False)

# Show the first rows as a sanity check
print(df.head())

                                                 url  \
0  https://www.boligportal.dk/lejligheder/k%C3%B8...   
1  https://www.boligportal.dk/lejligheder/k%C3%B8...   
2  https://www.boligportal.dk/v%C3%A6relser/k%C3%...   
3  https://www.boligportal.dk/v%C3%A6relser/k%C3%...   
4  https://www.boligportal.dk/v%C3%A6relser/k%C3%...   

                                           html_code  
0   \n<!DOCTYPE html>\n<html>\n  <head>\n    \n  ...  
1   \n<!DOCTYPE html>\n<html>\n  <head>\n    \n  ...  
2   \n<!DOCTYPE html>\n<html>\n  <head>\n    \n  ...  
3   \n<!DOCTYPE html>\n<html>\n  <head>\n    \n  ...  
4   \n<!DOCTYPE html>\n<html>\n  <head>\n    \n  ...  


In [9]:
# Number of listing pages that were downloaded successfully
len(df)

1897

In [49]:
def _text_of(node):
    """Return the stripped text of a parsed node, or None when the node is missing."""
    return node.get_text(strip=True) if node is not None else None


def _nth(nodes, index):
    """Return nodes[index], or None when the sequence has too few elements."""
    return nodes[index] if len(nodes) > index else None


def extract_apartment_info(html_content):
    """Parse one listing page into a flat dictionary of fields.

    Parameters
    ----------
    html_content : str
        Raw HTML of a single listing page.

    Returns
    -------
    dict
        Always contains the keys 'breadcrumb', 'title', 'description',
        'address', 'monthly_rent', 'monthly_aconto', 'move_in_price',
        'available_from', 'rental_period' and 'energy_mark_src', plus one
        key per labelled row found in the details section (the key is the
        label text as shown on the page). A field whose element is absent
        from the page is None rather than raising, so one unusual page does
        not abort a whole batch. 'energy_mark_src' keeps the string 'none'
        when there is no energy-label image.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    apartment_info = {}

    # Breadcrumb trail: the text of every link in the breadcrumb container
    apartment_info['breadcrumb'] = " > ".join(
        item.get_text(strip=True) for item in soup.select('.css-7kp13n a')
    )

    # Title and main description
    apartment_info['title'] = _text_of(soup.select_one('h3.css-1o5zkyw'))
    apartment_info['description'] = _text_of(soup.select_one('div.css-1f7mpex'))

    # Address: the first two matching divs joined with ", ".
    # NOTE(review): in the recorded notebook output this field starts with the
    # posting age (e.g. "3 timer siden, ..."), so the first div is probably not
    # part of the address - confirm against the page markup.
    address_parts = [
        div.get_text(strip=True)
        for div in soup.find_all('div', class_='css-o9y6d5')[:2]
    ]
    apartment_info['address'] = ', '.join(address_parts) if address_parts else None

    # Rent details; the rent element holds only the number, so the unit is appended
    rent_text = _text_of(soup.select_one('.css-woykcw .css-1fhvb05'))
    apartment_info['monthly_rent'] = (rent_text + ' kr.') if rent_text is not None else None

    # The first and second elements of this class are used as aconto and move-in price
    price_nodes = soup.select('.css-30nv8k')
    apartment_info['monthly_aconto'] = _text_of(_nth(price_nodes, 0))
    apartment_info['move_in_price'] = _text_of(_nth(price_nodes, 1))

    # Availability
    apartment_info['available_from'] = _text_of(soup.select_one('.css-2kngtw'))

    # NOTE(review): in the recorded notebook output this column equals
    # move_in_price on every visible row, so index 1 of this selector likely
    # targets the wrong element - confirm against the page markup.
    apartment_info['rental_period'] = _text_of(_nth(soup.select('.css-14bctuo'), 1))

    # Detailed characteristics: label/value pairs, kept only when both parts exist
    for item in soup.select('.css-1n6wxiw'):
        label = item.select_one('.css-1td16zm')
        value = item.select_one('.css-1f8murc')
        if label is not None and value is not None:
            apartment_info[label.get_text(strip=True)] = value.get_text(strip=True)

    # Energy label image source, or the sentinel string 'none' when there is no image
    energy_img = soup.select_one('img.css-rdsunt')
    apartment_info['energy_mark_src'] = energy_img.get('src') if energy_img is not None else 'none'

    return apartment_info


In [53]:
# Parse every downloaded page into a dict of listing fields, preserving row order
df_list = list(map(extract_apartment_info, df['html_code']))

In [54]:
# One row per listing; columns are the union of the keys found across all parsed dicts
new_df = pd.DataFrame(df_list)

In [55]:
# Display the parsed listings table
new_df

Unnamed: 0,breadcrumb,title,description,address,monthly_rent,monthly_aconto,move_in_price,available_from,rental_period,Boligtype,...,Ledig fra,Månedlig leje,Aconto,Depositum,Forudbetalt husleje,Indflytningspris,Oprettelsesdato,Sagsnr.,energy_mark_src,Energimærke
0,Hjem > Lejligheder > København > 3 værelses > ...,"Furnished apartment, 2 bedroom",Fully furnished charming apartment for rent in...,"3 timer siden, Erik Menveds Vej, 1965 Københav...",12.850 kr.,1.350 kr.,52.750 kr.,Snarest muligt,52.750 kr.,Lejlighed,...,Snarest muligt,12.850 kr.,1.350 kr.,38.550 kr.,0 kr.,52.750 kr.,7.9.2024,5080588,/static/images/energy_labels/D_str2.png,
1,Hjem > Lejligheder > København > 4 værelses > ...,Velindrettet 4-værelses lejlighed på 113 m2 me...,Indretning:\n\nVelkommen indefor i denne store...,"9 timer siden, Laurentsvej, 2880 København, Ba...",15.700 kr.,1.224 kr.,79.724 kr.,15. november 2024,79.724 kr.,Lejlighed,...,15. november 2024,15.700 kr.,1.224 kr.,47.100 kr.,15.700 kr.,79.724 kr.,7.9.2024,5455654,/static/images/energy_labels/A15_str2.png,
2,Hjem > Værelser > København > 1 værelses > Kas...,Brand new room for rent in newly build house,9m2 værelse i nybygget hus til leje \n(kan lej...,"11 timer siden, Pilegård Alle, 2770 København,...",5.000 kr.,1.000 kr.,21.000 kr.,1. december 2024,21.000 kr.,Værelse,...,1. december 2024,5.000 kr.,1.000 kr.,15.000 kr.,0 kr.,21.000 kr.,7.9.2024,5455638,/static/images/energy_labels/A20_str2.png,
3,Hjem > Værelser > København > 1 værelses > Køb...,Værelse i Villa ved strand og Metro,Et værelse udlejes i en skøn villa i et dejlig...,"11 timer siden, Borneovej, 2300 København, Køb...",3.500 kr.,500 kr.,11.500 kr.,1. november 2024,11.500 kr.,Værelse,...,1. november 2024,3.500 kr.,500 kr.,4.000 kr.,3.500 kr.,11.500 kr.,7.9.2024,5144039,/static/images/energy_labels/C_str2.png,
4,Hjem > Værelser > København > 1 værelses > Køb...,Small Single Room,Hi there!\n\nAre you going to study in Copenha...,"13 timer siden, Vesterbrogade, 1620 København,...",4.650 kr.,0 kr.,13.950 kr.,Snarest muligt,13.950 kr.,Værelse,...,Snarest muligt,4.650 kr.,0 kr.,9.300 kr.,0 kr.,13.950 kr.,7.9.2024,5455643,none,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1892,Hjem > Lejligheder > København > 1 værelses > ...,"Nyt lejlighedshotel i Nærum, Rundforbivej - St...",Bo inkl. alt forbrug (minimum 2 måneder) - f.e...,"21. april, Rundforbivej, 2850 København, Nærum",16.900 kr.,0 kr.,33.800 kr.,Snarest muligt,33.800 kr.,Lejlighed,...,Snarest muligt,16.900 kr.,0 kr.,16.900 kr.,0 kr.,33.800 kr.,21.4.2022,4631928,none,-
1893,Hjem > Lejligheder > København > 4 værelses > ...,4 værelses lejlighed på Hyrdindestien i Herlev...,Området består af tre bebyggelser i forskellig...,"16. februar, Hyrdindestien, 2730 København, He...",14.595 kr.,950 kr.,73.925 kr.,Snarest muligt,73.925 kr.,Lejlighed,...,Snarest muligt,14.595 kr.,950 kr.,43.785 kr.,14.595 kr.,73.925 kr.,16.2.2022,5143389,none,-
1894,Hjem > Værelser > København > 1 værelses > Køb...,Stille værelse tæt på station og UN city,Jeg udlejer et møbleret værelse.\nDu får adgan...,"17. juli, Marstalsgade, 2100 København, Københ...",8.000 kr.,0 kr.,18.000 kr.,Snarest muligt,18.000 kr.,Værelse,...,Snarest muligt,8.000 kr.,0 kr.,10.000 kr.,0 kr.,18.000 kr.,17.7.2021,5091127,none,-
1895,Hjem > Lejligheder > København > 3 værelses > ...,High end penthouse apartment in the heart of C...,Exclusive luxury penthouse apartment in the he...,"7. juli, Arkonagade, 1726 København, København...",28.000 kr.,0 kr.,38.000 kr.,1. oktober 2024,38.000 kr.,Lejlighed,...,1. oktober 2024,28.000 kr.,0 kr.,10.000 kr.,0 kr.,38.000 kr.,7.7.2021,5204260,/static/images/energy_labels/A20_str2.png,


In [56]:
# Write the parsed listings to disk as UTF-8 CSV with a header row and without the index column
new_df.to_csv('bolig_data.csv',index=False, header=True, encoding='utf-8')