# Сбор детальной информации о предложениях по продаже квартир

Этот ноутбук содержит код сбора детальной информации о предложениях по продаже квартир в новостройках Санкт-Петербурга.

Для сбора информации используются наборы ссылок на страницы с предложениями от агентов и предложениями от застройщика, которые были подготовлены в ноутбуке `links_generator`.

### Какая информация о предложениях собирается
* Заголовок предложения
* Количество комнат в квартире
* Общая площадь квартиры
* Этаж, на котором находится квартира
* Общее количество этажей в здании
* Ссылка на предложение на сайте Циан
* Общая стоимость квартиры
* Стоимость за квадратный метр
* Название ЖК, в котором расположена квартира
* Адрес ЖК
* Дата завершения строительства ЖК
* Ближайшая станция метро
* Время в пути до ближайшей станции метро
* Находится ли метро в пешей доступности (0, если время в предыдущем пункте указано на транспорте, и 1, если пешком)

### Важная информация, которую стоит узнать перед запуском

Так как на сайте Циан установлена защита от роботов, у вас не получится собрать всю информацию за один раз. Обычно за раз с использованием этого кода удается загрузить информацию с 20-250 страниц. После этого возникает капча, обход которой у нас не предусмотрен, поэтому при ее возникновении рекомендуется подождать 20-60 минут и продолжить сбор со страницы, на которой он прервался на прошлой итерации.

In [1]:
from bs4 import BeautifulSoup
import requests
import socket
import socks
import time
import re

import pandas as pd
import numpy as np
import math

In [3]:
# Proxy setup: send every newly created socket through the local SOCKS5 proxy on port 9150
socks.set_default_proxy(socks.SOCKS5, "localhost", 9150)
# Patch the standard library socket class (not an attribute of the socks module),
# otherwise libraries such as requests keep opening direct connections.
socket.socket = socks.socksocket

## Функции, которые используются для получения информации о предложениях

In [70]:
def get_portions_num(portion_size, pages_links_pool):
    """Return how many portions of ``portion_size`` pages cover the links pool.

    The last portion may be incomplete, so the quotient is rounded up.
    Used by offers_info_grabber.
    """
    pages_total = len(pages_links_pool)
    portions_exact = pages_total / portion_size
    return math.ceil(portions_exact)

In [71]:
# Private helpers used by get_flat_offers_page_info

def _first_match(element, pattern=r">(.*)<"):
    """Return the first capture of ``pattern`` in the element's HTML, or ''.

    ``element`` may be None (a nested tag that was not found): its string
    form then produces no match and '' is returned instead of raising.
    """
    matches = re.findall(pattern, str(element))
    return matches[0] if matches else ''


def _attr_or_empty(element, attr_name):
    """Return an attribute of a tag, or '' when the tag or the attribute is missing."""
    if element is None:
        return ''
    return element.attrs.get(attr_name, '')


def _pad(values, size, filler=''):
    """Extend ``values`` in place with ``filler`` until it holds at least ``size`` items."""
    values.extend([filler] * (size - len(values)))
    return values


def _collect_prices(price_blocks, header_class, total_prices, meter_prices):
    """Append total and per-meter prices (spaces removed) of every price block."""
    for block in price_blocks:
        total = _first_match(block.find('div', attrs={'class': header_class}))
        # Drop the trailing 2 characters (currency suffix) as the original markup requires
        total_prices.append(total[:-2].replace(' ', ''))

        per_meter = _first_match(block.find('div', attrs={'class': 'c6e8ba5398--term--3kvtJ'}))
        # Drop the trailing 5 characters (per-meter unit suffix)
        meter_prices.append(per_meter[:-5].replace(' ', ''))


def get_flat_offers_page_info(offers_part, flats_info):
    """Extract flat offers from one offers page and write them to a file.

    Parameters:
    - offers_part - parsed HTML element with the offers list; it is queried with
      ``findAll`` by ``data-name`` attributes and CSS class names
    - flats_info - writable text file object; one line per offer is written

    Each line holds 14 ';'-separated fields: title, rooms, total square, floor,
    total floors, offer link, total price, price per meter, complex name,
    address, deadline, subway station, time to subway, walking flag (1/0).

    Missing nested tags, short titles and unexpected subway time strings
    produce empty fields instead of raising.

    NOTE: values are matched to offers by their position on the page and
    shorter lists are padded with '' at the end, so if an offer in the middle
    of the page lacks some field, the following values of that field shift up.
    """
    def by_data_name(name):
        return offers_part.findAll('div', attrs={'data-name': name})

    # Promoted ("top") offers come first, then the regular ones
    title_tags = list(by_data_name('TopTitle')) + list(by_data_name('Title'))
    titles = [_first_match(tag) for tag in title_tags]
    offers_num = len(titles)

    addresses = [
        _attr_or_empty(item.find('span', attrs={'itemprop': 'name'}), 'content')
        for item in by_data_name('AddressItem')
    ]

    total_prices = []
    meter_prices = []
    _collect_prices(by_data_name('TopPrice'), 'c6e8ba5398--header--1dF9r',
                    total_prices, meter_prices)
    _collect_prices(by_data_name('Price'), 'c6e8ba5398--header--1df-X',
                    total_prices, meter_prices)

    links = [
        _attr_or_empty(card.find('a', attrs={'class': 'c6e8ba5398--header--1fV2A'}), 'href')
        for card in by_data_name('TopOfferCard')
    ]
    links.extend(_attr_or_empty(card.find('a'), 'href') for card in by_data_name('OfferCard'))

    deadlines = [_first_match(tag) for tag in by_data_name('Deadline')]

    complex_links = offers_part.findAll('a', attrs={'class': 'c6e8ba5398--building-link--1dQyE'})
    complex_names = [_first_match(tag, r">(.*)</a>") for tag in complex_links]

    subway_names = []
    subway_times = []
    walking_flags = []
    for station in by_data_name('Underground'):
        name_tag = station.find('div', attrs={'class': 'c6e8ba5398--underground-name--1efZ3'})
        subway_names.append(_first_match(name_tag))

        time_tag = station.find('div', attrs={'class': 'c6e8ba5398--remoteness--3bptF'})
        # The markup separates value, unit and transport kind with '<!-- -->' comments:
        # parts[0] - value, parts[2] - unit, parts[4] - how the time is measured
        parts = _first_match(time_tag, r">(.*)</div>").split('<!-- -->')
        subway_times.append(parts[0] + ' ' + parts[2] if len(parts) > 2 else '')
        walking_flags.append(1 if len(parts) > 4 and parts[4] == 'пешком' else 0)

    # Title format: "<rooms>, <square> <unit>, <floor>/<total floors> <word>"
    rooms = []
    squares = []
    floors = []
    total_floors = []
    for title in titles:
        fields = title.split(', ')
        rooms.append(fields[0])
        squares.append(fields[1][:-3] if len(fields) > 1 else '')
        floor_info = (fields[2][:-5] if len(fields) > 2 else '').split('/')
        floors.append(floor_info[0])
        total_floors.append(floor_info[1] if len(floor_info) > 1 else '')

    # Every column must have a value for each offer before rows are assembled
    for column in (addresses, total_prices, meter_prices, links,
                   deadlines, complex_names, subway_names, subway_times):
        _pad(column, offers_num)
    _pad(walking_flags, offers_num, 0)

    # Write to the file: zip stops at the shortest column, i.e. at offers_num rows
    columns = (titles, rooms, squares, floors, total_floors, links,
               total_prices, meter_prices, complex_names, addresses,
               deadlines, subway_names, subway_times,
               [str(flag) for flag in walking_flags])
    for row in zip(*columns):
        flats_info.write(';'.join(row) + '\n')

In [77]:
def offers_info_grabber(pages_links_pool, portion_len, page_id_to_start, file_to_write_result):
    """Download offers pages one by one and write the extracted offers to a file.

    Parameters:
    - pages_links_pool - list with offers pages links
    - portion_len - number of pages in one portion; a 40 second delay is made
      between portions (40 recommended)
    - page_id_to_start - index of the page to start scraping from. On the first
      run it is 0, on the next runs it is the page on which the grabber stopped
      last time
    - file_to_write_result - writable text file object that is passed to
      get_flat_offers_page_info for every downloaded page

    The function returns early (after printing the page index) when a page has
    no offers table, which is treated as a captcha.
    """
    total_pages_num = len(pages_links_pool)
    portions_num = get_portions_num(portion_len, pages_links_pool)
    portion_num_to_start = math.floor(page_id_to_start / portion_len)

    # Iterating by the portions of the given size
    for portion_id in range(portion_num_to_start, portions_num):
        # The first portion starts in the middle when we resume; every portion
        # is clamped to the pool size so the last (partial) one never overruns it
        start_id = max(page_id_to_start, portion_id * portion_len)
        end_id = min((portion_id + 1) * portion_len, total_pages_num)

        # Iterating by pages in the portion
        for i in range(start_id, end_id):
            # Get offers page link from the pool
            # (format(i) leaves links without '{}' placeholders unchanged)
            offers_page_link = pages_links_pool[i]

            # Get page content parsed with lxml
            response = requests.get(offers_page_link.format(i))
            search_page = BeautifulSoup(response.text, 'lxml')

            # Get table with offers on the given page. Searching from the document
            # root avoids AttributeError when the response has no <html>/<body>
            offers_part = search_page.findAll('div', attrs={'data-name': 'Offers'})

            # If we can't find table with offers, it means that captcha appears.
            # We print number of first page, information from which can't be read, and return
            if not offers_part:
                print('Captcha appears! Cannot find info about offers on page', i,
                      'Start scraping process again from this page in 20-60 minutes')
                return

            # If it is no captcha, we extract information from the page and write it to the given file
            get_flat_offers_page_info(offers_part[0], file_to_write_result)

            # After extracting information from each 50-th page, we'll get this message
            if i % 50 == 0:
                print('Offers information from page', i, 'just loaded')

        # Delay between portions (in seconds); nothing to wait for after the last one
        if portion_id < portions_num - 1:
            time.sleep(40)

## Загрузка информации о страницах с предложениями

Данные о страницах с предложениями были подготовлены в ноутбуке `links_generator`.

In [73]:
# Read the offers pages links (one link per line); the file is closed even if reading fails
with open('dev_links.txt', 'r', encoding='utf-8') as links_file:
    # Strip only the line terminator, so a last line without a trailing newline is kept as well
    agents_links_pool = [line.rstrip('\n') for line in links_file]

# Show the first links to check that the pool was loaded
agents_links_pool[:10]

['https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&from_developer=1&newobject=96&offer_type=flat&p=1',
 'https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&from_developer=1&newobject=44193&offer_type=flat&p=1',
 'https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&from_developer=1&newobject=44193&offer_type=flat&p=2',
 'https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&from_developer=1&newobject=44193&offer_type=flat&p=3',
 'https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&from_developer=1&newobject=44193&offer_type=flat&p=4',
 'https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&from_developer=1&newobject=44193&offer_type=flat&p=5',
 'https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&from_developer=1&newobject=44193&offer_type=flat&p=6',
 'https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&from_developer=1&newobject=44193&offer_type=flat&p=7',
 'https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&from_

## Запуск сбора данных

In [79]:
# Page index to start from: 0 on the first run, afterwards the page printed in the captcha message
page_id_to_start = 1100
# Append when resuming, so that rows collected on the previous runs are not overwritten
file_mode = 'a' if page_id_to_start > 0 else 'w'
with open('flat_offers_from_devs_info.txt', file_mode, encoding='utf-8') as flats_info:
    # Run the grabber with: links pool, portion size, start page, output file
    offers_info_grabber(agents_links_pool, 100, page_id_to_start, flats_info)

Captcha appears! Cannot find info about offers on page 1100 Start scraping process again from this page in 20-60 minutes
