In [335]:
import requests
import pandas as pd
#!pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [336]:
# How to get `url` and `last_post_date`: follow these steps:
# go to divar.ir / find the page you want / open the developer tools with Ctrl+Shift+I (Inspect) /
# / open the Network tab / select the Fetch/XHR sub-tab / scroll down the page until a new set of posts loads /
# / click the new request that appears in the Network list / for example, here it is "apartment-sell" /
# / you can then find the url under "Headers" / find the request JSON under "Payload" /
# / find the details of the API response, including "last_post_date", under "Preview"

In [340]:
# Collect post tokens from the Divar search API.
# The API is paged by a cursor: every response carries a "last_post_date" value
# that is sent back in the next request to obtain the following batch of posts.

# Api address / should be updated!
url= "https://api.divar.ir/v8/web-search/1/apartment-sell"

# last date of posts / should be updated!
last_post_date= 1698081423460550

headers={"Content-Type": "application/json"}

# each page includes 24 posts
# so, for example to get 240 posts we have to repeat the process 10 times
N_PAGES= 10

# seconds to wait for the API before giving up instead of hanging the cell
REQUEST_TIMEOUT= 30

list_of_tokens= []

for _ in range(N_PAGES):

    # named `payload` (not `json`) so the stdlib module name is not shadowed
    payload= {"page": 1, "json_schema": {"category": {"value": "apartment-sell"}, "districts": {"vacancies": ["158"]},
                                         "sort": {"value": "sort_date"}, "cities": ["1"]},"last-post-date": last_post_date}
    # Other alternative
    #payload= {"page": 1, "json_schema": {"category": {"value": "apartment-sell"}, "query": "سازمان برنامه",
    #                                     "sort": {"value": "sort_date"}, "cities": ["1"]}, "last-post-date": last_post_date}

    response= requests.post(url, json=payload, headers= headers, timeout= REQUEST_TIMEOUT)
    # fail loudly on HTTP errors rather than trying to parse an error page
    response.raise_for_status()
    data= response.json()

    posts= (data.get('web_widgets') or {}).get('post_list') or []
    if not posts:
        # nothing more to read: stop instead of re-requesting the same cursor
        break

    for item in posts:
        # walk the nested dicts defensively; not every widget carries a token
        action_log= item.get('action_log') or {}
        server_side_info= action_log.get('server_side_info') or {}
        info= server_side_info.get('info') or {}
        if 'post_token' in info:
            list_of_tokens.append(info['post_token'])

    next_post_date= data.get("last_post_date")
    if next_post_date is None:
        # no cursor for a following page was returned
        break
    last_post_date= next_post_date

len(list_of_tokens)

240

In [341]:
# Empty frame that fixes the column order of the scraped post records.
df = pd.DataFrame(
    columns=[
        'title',
        'date',
        'location',
        'area',
        'year',
        'rooms',
        'total_price',
        'unit_price',
        'story',
        'total_stories',
        'elevator',
        'parking',
        'store',
    ]
)

In [None]:
# getting info based on list of tokens

def _value_after(lines, label):
    """Return the line that directly follows `label` in `lines`, or None.

    None is returned both when `label` is absent and when it is the last line.
    """
    if label in lines:
        position = lines.index(label)
        if position + 1 < len(lines):
            return lines[position + 1]
    return None


def _leading_number(text):
    """Return the first space-separated token of `text` without '٬' separators.

    Returns None when `text` is None.
    """
    if text is None:
        return None
    return text.split(' ')[0].replace('٬', '')


def _parse_post(lines):
    """Build one record dict from the text lines of a post's details column.

    Parameters
    ----------
    lines : list of str
        The element text split on newlines.

    Returns
    -------
    dict
        Keys match the columns of `df`; a field that cannot be found is None.
    """
    title = lines[0] if lines else None

    # The second line is read as "<date>،<space><location>".
    date = location = None
    if len(lines) > 1:
        header_parts = lines[1].split('،')
        date = header_parts[0]
        if len(header_parts) > 1:
            # drop the single leading character that follows the comma
            location = header_parts[1][1:]

    # The floor line is either one token or several space-separated tokens,
    # in which case the third token is taken as the total number of stories.
    story = total_stories = None
    floor_text = _value_after(lines, 'طبقه')
    if floor_text is not None:
        floor_parts = floor_text.split(' ')
        story = floor_parts[0]
        if len(floor_parts) == 1:
            total_stories = floor_parts[0]
        elif len(floor_parts) > 2:
            try:
                total_stories = int(floor_parts[2])
            except ValueError:
                # keep the raw text when it is not a number
                total_stories = floor_parts[2]

    # The three lines after the features heading are taken, in order, as
    # elevator, parking and store; missing trailing lines become None.
    elevator = parking = store = None
    features_label = 'ویژگی\u200cها و امکانات'
    if features_label in lines:
        start = lines.index(features_label) + 1
        features = lines[start:start + 3]
        features += [None] * (3 - len(features))
        elevator, parking, store = features

    return {'title': title, 'date': date, 'location': location,
            'area': _value_after(lines, 'متراژ'),
            'year': _value_after(lines, 'ساخت'),
            'rooms': _value_after(lines, 'اتاق'),
            'total_price': _leading_number(_value_after(lines, 'قیمت کل')),
            'unit_price': _leading_number(_value_after(lines, 'قیمت هر متر')),
            'story': story, 'total_stories': total_stories,
            'elevator': elevator, 'parking': parking, 'store': store}


# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
records = []

# Set up the Selenium WebDriver once and reuse it for every post
# (you'll need to specify the path to your driver executable)
driver = webdriver.Chrome()
try:
    for token in list_of_tokens:
        # separate name so the API `url` defined earlier is not overwritten
        post_url = 'https://divar.ir/v/-/' + token
        print(post_url)

        # Navigate to the URL
        driver.get(post_url)

        # Wait for dynamic content to load (you may need to adjust the wait time)
        time.sleep(1)

        # The page-title element is only found on a live post page;
        # an empty result means the post is gone, so it is skipped.
        page_title = driver.find_elements(By.CLASS_NAME, "kt-page-title__title--responsive-sized")
        if not page_title:
            continue

        # find_elements returns [] instead of raising when the column is missing
        details = driver.find_elements(By.CSS_SELECTOR, '.kt-col-5')
        if not details:
            continue

        records.append(_parse_post(details[0].text.split(sep='\n')))
finally:
    # Close the WebDriver even if a page fails, so no browser is left running
    driver.quit()

# saving data in the data frame (column order comes from the frame defined above)
df = pd.DataFrame(records, columns=df.columns)

# export the data frame
df.to_csv('dataframe_1.csv')

In [None]:
# Show the first rows of `df`.
df.head()