In [1]:
import requests      # Библиотека для отправки запросов
import numpy as np   # Библиотека для матриц, векторов и линала
import pandas as pd  # Библиотека для табличек 
import time          # Библиотека для времени
import re
import os
import pickle
import cloudscraper
import random

from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from tqdm.notebook import tqdm

In [2]:
# Получение кода страницы в формате beautful soup
def get_page_soup(url_link):    
    try:
        session = cloudscraper.create_scraper()
        session.headers = {'Accept-Language': 'en'}
        res = session.get(url=url_link)
        res.raise_for_status()
        page = res.text
        try:
            soup = BeautifulSoup(page, 'lxml')
        except:
            soup = BeautifulSoup(page, 'html.parser')
        return soup
    except:
        print ("Was not able to recieve HTML")

In [3]:
districts = ['district%5B0%5D=1',
'district%5B10%5D=325',
'district%5B1%5D=3',
'district%5B2%5D=4',
'district%5B3%5D=5',
'district%5B4%5D=6',
'district%5B5%5D=7',
'district%5B6%5D=8',
'district%5B7%5D=9',
'district%5B8%5D=10',
'district%5B9%5D=11']

In [3]:
# Получение и запись в файл базовых данных по квартирам
def get_flats(rooms, min_page=1, max_page=2, district='district%5B1%5D=3'):
    for i in range(min_page, max_page):
        url = f'https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&{district}&p={i}&room{rooms}=1'
        soup = get_page_soup(url)
       
        new_flats = get_info_from_search_page(soup, rooms)
        df = pd.DataFrame(columns=['rooms', 'title', 'subtitle', 'subway', 'dist_to_subway', 
                                   'city', 'admin_okrug', 'district', 'street', 'home_number', 'price', 
                                   'link'])
        df = pd.concat([df, new_flats], ignore_index=True)
        df.to_csv(f'../data/raw/{rooms}_rooms.csv', index=False, mode='a', header=False)
        
        # Время ожидания для обмана сервера
        time.sleep(random.randint(5, 13))
        

In [4]:
# Парсинг данных по квартирам со страницы
def get_info_from_search_page(page_soup, rooms):
    try:
        cards = page_soup.find_all('article', attrs = {'data-name':'CardComponent'})
        flats = []
        for card in cards:
            flat = {}
            flat['rooms'] = rooms
            flat['title'] = get_flat_title(card)
            flat['subtitle'] = get_flat_subtitle(card)
            flat['subway'], flat['dist_to_subway'] = get_flat_subway(card) 
            flat['city'], flat['admin_okrug'], flat['district'], flat['street'], flat['home_number'] = get_flat_geo(card)
            flat['price'] = get_flat_price(card)
            flat['link'] = get_flat_link(card)
            flats.append(flat)
        return pd.DataFrame(flats)
    except:
        return None


def get_flat_title(flat):
    try:
        flat_title = flat.find('span', attrs = {'data-mark':'OfferTitle'}).text
    except:
        flat_title = ''
    return flat_title

def get_flat_subtitle (flat):
    try:
        offer_subtitle = flat.find('span', attrs = {'data-mark':'OfferSubtitle'}).text
    except:
        offer_subtitle = ''
    return offer_subtitle

def get_flat_subway(flat):
    try:
        geo = flat.find('div', attrs = {'data-name':'SpecialGeo'})
        
        # Название станции метро
        try:
            subway_station = geo.find('div', attrs= {'class':''}).text
        except:
            subway_station = ''
        
        # Расстояние до метро
        try:
            dist_regex = re.compile('.*remoteness.*')
            distance_to_subway = geo.find('div', attrs={'class': dist_regex}).text    
        except:
            distance_to_subway = ''
    except:
        subway_station = ''
        distance_to_subway = ''
    return subway_station, distance_to_subway

def get_flat_geo(flat):
    try:
        geo_list = flat.find_all('a', attrs={'data-name': 'GeoLabel'})
        
        try:
            city = geo_list[0].text
        except:
            city = ''
        try:
            admin_okrug = geo_list[1].text
        except:
            admin_okrug = ''
        try:
            district = geo_list[2].text
        except:
            district = ''
        try:
            street = geo_list[4].text
        except:
            street = ''
        try:
            home_number = geo_list[5].text
        except:
            home_number = ''
    except:
        city = ''
        admin_okrug = ''
        district = ''
        street = ''
        home_number = ''
    return city, admin_okrug, district, street, home_number

def get_flat_price(flat):
    try:
        price = flat.find('span', attrs={'data-mark':'MainPrice'})
        price = price.find('span').text
    except:
        price = ''
    return price

def get_flat_link(flat):
    try:
        link = flat.find('div', attrs={'data-name':'LinkArea'})
        link_regex = re.compile('.*link.*')
        link = link.find('a', attrs={'class': link_regex}).get('href')
    except:
        link = ''
    return link    

## Запуск сбора данных по аренде
Необходимо определить блок квартир для сбора и набор страниц (если ранее собирались данные в файл)

In [8]:
#Создаем файлы для записи и сохранения промежуточных результатов

df = pd.DataFrame(columns=['rooms', 'title', 'subtitle', 'subway', 'dist_to_subway', 
                           'city', 'admin_okrug', 'district', 'street', 'home_number', 
                           'price', 'link'])
for i in [1, 2, 3, 4, 5, 6, 9]:
    df.to_csv(f'../data/raw/{i}_rooms.csv', index=False, mode='w', header=True)

In [9]:
min_page = 1
max_page = 15
#rooms = [1]
rooms = [1, 2, 3, 4, 5, 6, 9]
# Для 1-6 комнатных квартир выбрать от 1 до 6
# Для студий выбрать 9
# NB! 7 и 8 - не используются для анализа


for room in rooms:
    if room != 9:
        print(f'Собираем информацию по {room}-комнатным квартирам')
    else:
        print(f'Собираем информацию по студиям')
    for page_number in tqdm(range(min_page, max_page+1)):
        for district in districts:
            get_flats(room, page_number, page_number + 1, district)

Собираем информацию по 1-комнатным квартирам


  0%|          | 0/15 [00:00<?, ?it/s]

Собираем информацию по 2-комнатным квартирам


  0%|          | 0/15 [00:00<?, ?it/s]

Собираем информацию по 3-комнатным квартирам


  0%|          | 0/15 [00:00<?, ?it/s]

Собираем информацию по 4-комнатным квартирам


  0%|          | 0/15 [00:00<?, ?it/s]

Собираем информацию по 5-комнатным квартирам


  0%|          | 0/15 [00:00<?, ?it/s]

Was not able to recieve HTML
Собираем информацию по 6-комнатным квартирам


  0%|          | 0/15 [00:00<?, ?it/s]

Собираем информацию по студиям


  0%|          | 0/15 [00:00<?, ?it/s]

## Дополнение данными с детальных страниц

In [7]:
flats_df = pd.read_csv('../data/raw/1_rooms.csv')

In [8]:
FLOATS_NUMBERS_REG_EXPRESSION = r"[+-]? *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?"

def parse_page_offer(soup_offer_page):
        
        page_data = {
            "year_of_construction": -1,
            "total_meters": -1,
            "living_meters": -1,
            "kitchen_meters": -1,
            "floor": -1,
            "floors_count": -1,
            "flat_type": -1,
            "house_type": -1,
            "wc": -1,
            "ceiling": -1,
            "class": -1
        }

        spans = soup_offer_page.select("span")
        p_tags = soup_offer_page.select("p")

        for index, span in enumerate(spans):
            if "Год постройки" in span:
                page_data["year_of_construction"] = spans[index + 1].text

        if page_data["year_of_construction"] == -1:
            p_tags = soup_offer_page.select("p")

            for index, p_tag in enumerate(p_tags):
                if "Год постройки" in p_tag:
                    page_data["year_of_construction"] = p_tags[index + 1].text

        if page_data["year_of_construction"] == -1:
            for index, span in enumerate(spans):
                if "Год сдачи" in span:
                    page_data["year_of_construction"] = spans[index + 1].text

        for index, span in enumerate(spans):
            if "Общая площадь" in span:
                page_data["total_meters"] = spans[index + 1].text
                floats = re.findall(FLOATS_NUMBERS_REG_EXPRESSION, page_data["total_meters"])
                if len(floats) == 0:
                    page_data["total_meters"] = -1
                else:
                    page_data["total_meters"] = float(floats[0])
                    
        for index, span in enumerate(spans):
            if "Площадь кухни" in span:
                page_data["kitchen_meters"] = spans[index + 1].text
                floats = re.findall(FLOATS_NUMBERS_REG_EXPRESSION, page_data["kitchen_meters"])
                if len(floats) == 0:
                    page_data["kitchen_meters"] = -1
                else:
                    page_data["kitchen_meters"] = float(floats[0])

        for index, span in enumerate(spans):
            if "Тип жилья" in span:
                page_data["flat_type"] = spans[index + 1].text
                
        if page_data["flat_type"] == -1:
            for index, p_tag in enumerate(p_tags):
                if "Тип жилья" in p_tag:
                    page_data["flat_type"] = p_tags[index + 1].text        
        
        for index, span in enumerate(spans):
            if "Тип дома" in span:
                page_data["house_type"] = spans[index + 1].text
        
        if page_data["house_type"] == -1:
            for index, p_tag in enumerate(p_tags):
                if "Тип дома" in p_tag:
                    page_data["house_type"] = p_tags[index + 1].text
                
        for index, span in enumerate(spans):
            if "Класс" in span:
                page_data["class"] = spans[index + 1].text
                
        if page_data["class"] == -1:
            for index, p_tag in enumerate(p_tags):
                if "Класс" in p_tag:
                    page_data["class"] = p_tags[index + 1].text
                
        for index, span in enumerate(spans):
            if "Санузел" in span:
                page_data["wc"] = spans[index + 1].text
                
        if page_data["wc"] == -1:
            for index, p_tag in enumerate(p_tags):
                if "Санузел" in p_tag:
                    page_data["wc"] = p_tags[index + 1].text
                
        for index, span in enumerate(spans):
            if "Высота потолков" in span:
                page_data["ceiling"] = spans[index + 1].text
                
        if page_data["ceiling"] == -1:
            for index, p_tag in enumerate(p_tags):
                if "Высота потолков" in p_tag:
                    page_data["ceiling"] = p_tags[index + 1].text
                 
        for index, span in enumerate(spans):
            if "Жилая площадь" in span:
                page_data["living_meters"] = spans[index + 1].text
                floats = re.findall(FLOATS_NUMBERS_REG_EXPRESSION, page_data["living_meters"])
                if len(floats) == 0:
                    page_data["living_meters"] = -1
                else:
                    page_data["living_meters"] = float(floats[0])

        for index, span in enumerate(spans):
            if "Этаж" in span:
                text_value = spans[index + 1].text
                ints = re.findall(r'\d+', text_value)
                if len(ints) != 2:
                    page_data["floor"] = -1
                    page_data["floors_count"] = -1
                else:
                    page_data["floor"] = int(ints[0])
                    page_data["floors_count"] = int(ints[1])
        return page_data

In [9]:
url_list = list(flats_df.link)

In [10]:
url_list[0]
html_soup = get_page_soup(url_list[0])

In [11]:
flat_summary_dict = parse_page_offer(html_soup)

In [12]:
flat_summary_dict

{'year_of_construction': '2023',
 'total_meters': 37.0,
 'living_meters': -1,
 'kitchen_meters': -1,
 'floor': 6,
 'floors_count': 18,
 'flat_type': 'Новостройка',
 'house_type': 'Монолитно-кирпичный',
 'wc': -1,
 'ceiling': '3,02\xa0м',
 'class': 'Бизнес'}

In [130]:
url_list = list(flats_temp.link)

flats_infos = []
flat_summary_dict = {}
for url in tqdm(url_list, total=len(url_list)):
    try:
        html_soup = get_page_soup(url)
        
        flat_summary_dict = parse_page_offer(html_soup)
        flat_summary_dict['processed'] = 'y'
    except:
        flat_summary_dict['processed'] = 'n'
        
    flats_infos.append(flat_summary_dict)
    
    time.sleep(3)

info_df = pd.DataFrame(flats_infos)
info_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,year_of_construction,total_meters,living_meters,kitchen_meters,floor,floors_count,flat_type,house_type,wc,ceiling,class,processed
0,2023,37.0,-1.0,-1.0,6,18,Новостройка,Монолитно-кирпичный,-1,"3,02 м",Бизнес,y
1,2024,42.0,18.0,12.0,13,43,Новостройка,Монолитный,1 совмещенный,-1,-1,y
2,2023,47.0,16.0,16.0,7,21,Новостройка,"Монолитно-кирпичный, монолитный",2 совмещенных,"3,1 м",Бизнес,y
3,2020,32.0,-1.0,-1.0,24,24,Вторичка,Монолитный,1 совмещенный,-1,-1,y
4,2024,52.0,14.0,11.0,3,22,Новостройка,Монолитный,1 раздельный,"3,08 м",Премиум,y


In [131]:
flats_full = flats_temp.join(info_df)

In [132]:
flats_full.tail(1)

Unnamed: 0,rooms,title,subtitle,subway,dist_to_subway,city,admin_okrug,district,street,home_number,...,living_meters,kitchen_meters,floor,floors_count,flat_type,house_type,wc,ceiling,class,processed
4,1,"1-комн. кв., 52,4 м², 3/22 этаж",Секция 1 • Сдача корпуса 3 кв. 2024,Спартак,14 минут пешком,Москва,СЗАО,р-н Покровское-Стрешнево,Северо-Западный ао,Клубный Город на Реке Примавера ЖК,...,14.0,11.0,3,22,Новостройка,Монолитный,1 раздельный,"3,08 м",Премиум,y


In [6]:
#Создаем файлы для записи и сохранения промежуточных результатов

df = pd.DataFrame(columns=['rooms', 'title', 'subtitle', 'subway', 'dist_to_subway', 'city',
       'admin_okrug', 'district', 'street', 'home_number', 'price', 'link',
       'year_of_construction', 'total_meters', 'living_meters',
       'kitchen_meters', 'floor', 'floors_count', 'flat_type', 'house_type',
       'wc', 'ceiling', 'class', 'processed'])
# for i in [4, 5, 6, 9]:
    #df.to_csv(f'../data/raw/{i}_rooms_full.csv', index=False, mode='w', header=True)

In [13]:
flats_df = pd.read_csv(f'../data/raw/1_rooms.csv')

In [None]:
# for index, row in flats_df.iterrows():
#     if index>280:
#         url = row['link']
#         try:
#             html_soup = get_page_soup(url)
#             flat_summary_dict = parse_page_offer(html_soup)
#             for key, value in flat_summary_dict.items():
#                 row[key] = value
#             row['processed'] = 'y'
#             processed = pd.DataFrame([row])

#         except:
#             page_data = {
#                 "year_of_construction": -1,
#                 "total_meters": -1,
#                 "living_meters": -1,
#                 "kitchen_meters": -1,
#                 "floor": -1,
#                 "floors_count": -1,
#                 "flat_type": -1,
#                 "house_type": -1,
#                 "wc": -1,
#                 "ceiling": -1,
#                 "class": -1
#             }

#             for key, value in page_data.items():
#                 row[key] = value
#             row['processed'] = 'n'
#             processed = pd.DataFrame([row])

#         processed.to_csv('../data/raw/1_rooms_full.csv', mode='a', header=False, index=False)
#         time.sleep(random.randint(2,5))

In [15]:
    flats_df = pd.read_csv(f'../data/raw/6_rooms.csv')
    for index, row in tqdm(flats_df.iterrows(), total=flats_df.shape[0]):
        url = row['link']
        try:
            html_soup = get_page_soup(url)
            flat_summary_dict = parse_page_offer(html_soup)
            for key, value in flat_summary_dict.items():
                row[key] = value
            row['processed'] = 'y'
            processed = pd.DataFrame([row])

        except:
            page_data = {
                "year_of_construction": -1,
                "total_meters": -1,
                "living_meters": -1,
                "kitchen_meters": -1,
                "floor": -1,
                "floors_count": -1,
                "flat_type": -1,
                "house_type": -1,
                "wc": -1,
                "ceiling": -1,
                "class": -1
            }

            for key, value in page_data.items():
                row[key] = value
            row['processed'] = 'n'
            processed = pd.DataFrame([row])

        processed.to_csv(f'../data/raw/6_rooms_full.csv', mode='a', header=False, index=False)
        time.sleep(random.randint(2,5))

  0%|          | 0/3938 [00:00<?, ?it/s]

Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able to recieve HTML
Was not able t

KeyboardInterrupt: 