# Scraping madlan.co.il

This notebook shows how to scrape the madlan.co.il website for apartment listings in the Tel Aviv district.
You can adjust the code to scrape the pages of a different city.
In our project, we scraped the pages for Tel Aviv, Haifa, and Jerusalem.

## Using BeautifulSoup 

Helper functions that fetch a specific page's HTML, parse it with BeautifulSoup, and append the result to a JSON file.

In [20]:
from bs4 import BeautifulSoup
import requests
import json
import time
import random
from datetime import datetime


def append_to_json(data, filename):
    """Append one record to a JSON file that holds a top-level list.

    The whole file is read, the record is appended to the list, and the
    file is rewritten (UTF-8, indented, non-ASCII characters kept as-is).

    Args:
        data: JSON-serializable record to append.
        filename: Path of the JSON file. It is created if it does not exist.

    Raises:
        json.JSONDecodeError: If the file exists, is not empty, and does
            not contain valid JSON.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            raw_text = file.read()
    except FileNotFoundError:
        raw_text = ''

    # A missing file and an empty (or whitespace-only) file both mean
    # "no records yet". An empty file can be left behind by an interrupted
    # run and would otherwise make json parsing fail.
    file_data = json.loads(raw_text) if raw_text.strip() else []

    file_data.append(data)

    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(file_data, file, indent=4, ensure_ascii=False)
        

def scrape_page(url,page_number,headers):
    """Download one listing page and wrap its HTML with metadata.

    Args:
        url: Full URL of the page to fetch.
        page_number: Page index; stored in the result and used in the
            error message.
        headers: Dict of HTTP headers passed to ``requests.get``.

    Returns:
        A dict with the keys ``'page'`` (the given page number),
        ``'source'`` (the parsed document serialized back to one HTML
        string) and ``'timestamp'`` (``datetime.now().isoformat()``).

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: On connection errors or
            when the request times out.
    """
    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page {page_number}, status code: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    # Serialize the document itself. Calling the soup object (``soup()``)
    # is a shortcut for ``find_all()``; stringifying that result yields a
    # list repr in which every nested tag is repeated, not a single HTML
    # document.
    return {
        'page': page_number,
        'source': str(soup),
        'timestamp': datetime.now().isoformat(),
    }

## Getting data - switching to a different VPN IP address every 15-30 pages

### Extracting apartments data - Buy

In order to scrape the madlan.co.il pages, you need to vary the User-Agent header, pause between requests with `time.sleep()`, and switch to a different IP address every 15-30 pages, because the website uses bot detection. Here, we used Proton VPN.

In [35]:
if __name__ == "__main__":
    # Scraped "for sale" pages are accumulated in this JSON file.
    output_file = "madlan_scraped_pages1.json"

    # Pool of browser User-Agent headers; one entry is picked at random
    # and reused for every request made by this cell.
    headers_lst = [
        {"User-Agent": user_agent}
        for user_agent in (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        )
    ]
    headers = random.choice(headers_lst)

    # URL parts: site root, listing type, and city slug.
    domain = "https://www.madlan.co.il"
    sub_domain = "for-sale"
    city = "תל-אביב-יפו-ישראל"

    # Query-string filters applied to the madlan search results
    # (the same for every page).
    filters = "filters=_0-10000000___agent%2Cprivate_______0-100000_____&sort=date-desc&tracking_search_source=filter_apply&marketplace=residential"

    # Adjust the range between runs: 1-15, 16-30 and so on.
    for page_number in range(136, 160):
        start_page = page_number

        # The first page carries no explicit "page" parameter.
        if start_page > 1:
            page = f"page={start_page}"
        else:
            page = ""

        url = f"{domain}/{sub_domain}/{city}?{page}&{filters}"
        try:
            # Fetch the page, then persist it to the JSON file.
            page_data = scrape_page(url, page_number, headers)
            append_to_json(page_data, output_file)
            print(f"Page {page_number} scraped successfully.")
            time.sleep(3.43)  # Delay to avoid being blocked
        except Exception as err:
            # Report the failure and stop at the first page that fails.
            print(err)
            break

Page 136 scraped successfully.
Page 137 scraped successfully.
Page 138 scraped successfully.
Page 139 scraped successfully.
Page 140 scraped successfully.
Page 141 scraped successfully.
Page 142 scraped successfully.
Page 143 scraped successfully.
Page 144 scraped successfully.
Page 145 scraped successfully.
Page 146 scraped successfully.
Page 147 scraped successfully.
Page 148 scraped successfully.
Page 149 scraped successfully.
Page 150 scraped successfully.
Page 151 scraped successfully.
Page 152 scraped successfully.
Page 153 scraped successfully.
Page 154 scraped successfully.
Page 155 scraped successfully.
Page 156 scraped successfully.
Page 157 scraped successfully.
Page 158 scraped successfully.
Page 159 scraped successfully.


### Extracting apartment data - Rent

In [None]:
            
if __name__ == "__main__":
    # Scraped "for rent" pages are accumulated in this JSON file.
    output_file = "madlan_scraped_pages2.json"

    # Pool of browser User-Agent headers; one entry is picked at random
    # and reused for every request made by this cell.
    headers_lst = [
        {"User-Agent": user_agent}
        for user_agent in (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        )
    ]
    headers = random.choice(headers_lst)

    # URL parts: site root, listing type, and city slug.
    domain = "https://www.madlan.co.il"
    sub_domain = "for-rent"
    city = "תל-אביב-יפו-ישראל"

    # Query-string filters applied to the madlan search results
    # (the same for every page).
    filters = "filters=_0-15000__________0-10000_____&sort=date-desc&tracking_search_source=filter_apply&tracking_event_source=list_regular_card&tracking_list_index=1&marketplace=residential"

    # As with the "for sale" run, change the range between runs:
    # 1-15, 16-30 and so on.
    for page_number in range(1, 70):
        start_page = page_number

        # The first page carries no explicit "page" parameter.
        if start_page > 1:
            page = f"page={start_page}"
        else:
            page = ""

        url = f"{domain}/{sub_domain}/{city}?{page}&{filters}"
        try:
            # Fetch the page, then persist it to the JSON file.
            page_data = scrape_page(url, page_number, headers)
            append_to_json(page_data, output_file)
            print(f"Page {page_number} scraped successfully.")
            time.sleep(3.43)  # Delay to avoid being blocked
        except Exception as err:
            # Report the failure and stop at the first page that fails.
            print(err)
            break

### Combine to one JSON file: 

Create a combined JSON file that contains the HTML page sources of all apartments for sale and all apartments for rent.

In [36]:
# Merge the "for sale" and "for rent" scrape results into one JSON file
# shaped as {"data": [{"deal_type": ..., "info": [...]}, ...]}.

# Pages of apartments for sale. 'utf-8-sig' on read accepts files with or
# without a leading byte-order mark.
path1 = r"madlan_scraped_pages1.json"
with open(path1, 'r', encoding='utf-8-sig') as file:
    buys = json.load(file)

buys = {
    "deal_type": "buys",
    "info": buys,
}

# Pages of apartments for rent.
path2 = r"madlan_scraped_pages2.json"
with open(path2, 'r', encoding='utf-8-sig') as file:
    rents = json.load(file)

rents = {
    "deal_type": "rents",
    "info": rents,
}

data = {
    "data": [buys, rents],
}

# Write plain UTF-8 without a byte-order mark: a BOM at the start of a
# JSON text makes json.load fail for readers that open the file as
# 'utf-8'. Readers that use 'utf-8-sig' still work on a BOM-less file.
path3 = r"madlan_scraped_pages_all.json"
with open(path3, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4, ensure_ascii=False)