# Подключаем необходимые библиотеки

# In [34]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import time
import sys
import urllib.request
import math

# Вводим необходимые функции

# In [35]:
# Strip HTML markup
def html_stripper(text):
    """Return ``str(text)`` with every ``<...>`` tag removed.

    The argument is passed through ``str()`` first, so any object is
    accepted; ``None`` therefore comes back as the string ``'None'``.
    """
    tag_pattern = re.compile('<[^<]+?>')
    as_text = str(text)
    return tag_pattern.sub('', as_text)

# Flat price
def getPrice(flat_page):
    """Return the price found in the ``object_descr_price`` block as an int.

    The block is converted to text and split on ``<div>``, ``руб`` and any
    non-word character; the purely numeric pieces are the digit groups of
    the price.  Only the last three groups are concatenated.
    NOTE(review): this presumably targets prices written as "NN NNN NNN";
    a price with more than three groups would be truncated - confirm
    against the page markup.

    Raises:
        ValueError: if no digit group is found (``int('')``).
    """
    price_block = flat_page.find('div', attrs={'class':'object_descr_price'})
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    pieces = re.split(r'<div>|руб|\W', str(price_block))
    digit_groups = [piece for piece in pieces if piece.isdigit()]
    return int("".join(digit_groups[-3:]))

# Distance from the city centre, km
def getDist(flat_page, center_lat=55.755817, center_lon=37.617633):
    """Return the great-circle distance in km from the flat to the centre.

    The flat's coordinates are read from the second child of the
    ``map_info_button_extend`` block: its text is split on ``&amp``,
    ``center=`` and ``%2C`` and the first two fragments starting with a
    digit are taken as latitude and longitude.

    Args:
        flat_page: parsed listing page exposing ``find``.
        center_lat: latitude of the reference point in degrees; the
            default is the "zero kilometre" constant from the original
            code.
        center_lon: longitude of the reference point in degrees.

    Returns:
        Distance in kilometres (haversine formula, mean Earth radius
        6371 km).  The previous implementation returned a Euclidean
        distance measured in degrees, which did not match the documented
        unit.

    Raises:
        IndexError: if fewer than two coordinate fragments are found.
    """
    map_link = flat_page.find('div', attrs={'class':'map_info_button_extend'}).contents[1]
    fragments = re.split('&amp|center=|%2C', str(map_link))
    # fragment[:1] avoids an IndexError on the empty strings re.split
    # yields when the text starts or ends with a delimiter.
    coords_list = [fragment for fragment in fragments if fragment[:1].isdigit()]
    # coordinates of the flat
    lat = float(coords_list[0])
    lon = float(coords_list[1])

    earth_radius_km = 6371.0
    phi_flat = math.radians(lat)
    phi_center = math.radians(center_lat)
    d_phi = phi_flat - phi_center
    d_lambda = math.radians(lon - center_lon)
    a = (math.sin(d_phi / 2) ** 2
         + math.cos(phi_flat) * math.cos(phi_center) * math.sin(d_lambda / 2) ** 2)
    # min() protects asin from rounding slightly above 1.
    return 2 * earth_radius_km * math.asin(min(1.0, math.sqrt(a)))

# Number of rooms
def getRoom(flat_page):
    """Return the text that precedes the "комн" part of the listing title.

    The ``object_descr_title`` block is stripped of tags and split on
    ``-`` and newlines; fragments are collected until the first one that
    contains ``комн``.  The collected text is returned as a string with
    all whitespace removed.
    """
    title = html_stripper(flat_page.find('div', attrs={'class':'object_descr_title'}))
    leading = []
    for fragment in re.split('-|\n', title):
        if 'комн' in fragment:
            break
        leading.append(fragment)
    collected = "".join(leading)
    return "".join(collected.split())

# Travel time to the metro, in minutes
def getMetrdist(flat_page):
    """Return the second line of the ``object_item_metro_comment`` text.

    The span is stripped of tags and split on ``<span>`` and newlines;
    the fragment at index 1 is returned unchanged (a string).
    NOTE(review): presumably this is the line holding the travel time -
    confirm against the page markup.

    Returns:
        The fragment, or ``None`` when the text has fewer than two
        fragments (for example when the span is missing).
    """
    comment = flat_page.find('span', attrs={'class': 'object_item_metro_comment'})
    # The pattern used to end with a stray '|' (an empty alternative).
    # On Python 3.7+ re.split also splits on empty matches, so the text was
    # cut between every character and index 1 was a single character.
    fragments = re.split('<span>|\n', html_stripper(comment))
    try:
        return fragments[1]
    except IndexError:
        return None

# 1 if the metro is reached on foot, 0 if by transport
def getWalk(flat_page):
    """Return 1 when the metro comment contains a line equal to "пешком".

    The ``object_item_metro_comment`` span is converted to text and split
    on ``<span>`` and newlines; each fragment is compared, after
    ``strip()``, with ``'пешком'``.

    Returns:
        1 if any fragment matches, otherwise 0 (also when the span is
        missing).
    """
    comment = flat_page.find('span', attrs={'class':'object_item_metro_comment'})
    # No trailing '|' in the pattern: an empty alternative makes re.split
    # (Python 3.7+) cut between every character, so nothing could ever
    # equal 'пешком' and the function always returned 0.
    fragments = re.split('<span>|\n', str(comment))
    return 1 if any(fragment.strip() == 'пешком' for fragment in fragments) else 0

# Floor the flat is located on
def getFloor(flat_page):
    """Return the first number of the "Этаж" entry of the properties table.

    The ``object_descr_props`` table is stripped of tags and split on
    ``Этаж`` / ``Тип дома``; the first run of digits in the fragment at
    index 1 is returned.

    Returns:
        The floor as a string of digits, or ``None`` when the entry or a
        number in it is missing.
    """
    table = flat_page.find('table', attrs = {'class':'object_descr_props'})
    table = html_stripper(table)
    try:
        floor_info = re.split('Этаж|Тип дома', table)[1]
        # Whole digit runs, not single characters: the old code kept only
        # the first digit, so floor 12 was reported as '1'.
        return re.findall(r'\d+', floor_info)[0]
    except IndexError:
        return None

# Total number of floors in the building
def getNFloors(flat_page):
    """Return the second number of the "Этаж" entry of the properties table.

    The ``object_descr_props`` table is stripped of tags and split on
    ``Этаж`` / ``Тип дома``; the second run of digits in the fragment at
    index 1 is returned.

    Returns:
        The floor count as a string of digits, or ``None`` when the entry
        is missing or holds fewer than two numbers.
    """
    table = flat_page.find('table', attrs = {'class':'object_descr_props'})
    table = html_stripper(table)
    try:
        floor_info = re.split('Этаж|Тип дома', table)[1]
        # Second digit run, not second digit character: for "12 / 17" the
        # old code returned '2' instead of '17'.
        return re.findall(r'\d+', floor_info)[1]
    except IndexError:
        return None
    
# 1 - brick / monolithic / reinforced concrete, 0 - other
def getBrick(flat_page):
    """Classify the building material from the "Тип дома" entry.

    The ``object_descr_props`` table is stripped of tags, the text between
    ``Тип дома:`` and ``Высота потолков:`` is split on newlines and commas,
    and the fragment at index 4 is compared with the known type names.

    Returns:
        1 if the fragment is one of the listed types, 0 otherwise, or
        ``None`` when the entry or the fragment is missing.
    """
    table = flat_page.find('table', attrs = {'class':'object_descr_props'})
    table = html_stripper(table)
    try:
        # Both lookups are guarded so a missing "Тип дома" entry yields
        # None just like a missing fragment does.
        house_type = re.split('Тип дома:|Высота потолков:', table)[1]
        house_type = re.split('\n|,', house_type)[4]
    except IndexError:
        return None
    # Collapse whitespace runs to single spaces.  Removing whitespace
    # altogether (as before) made 'панельный дом' impossible to match.
    house_type = " ".join(house_type.split())
    brick_types = ('кирпичный', 'кирпично-монолитный', 'монолитный', 'панельный дом', 'панельный')
    return 1 if house_type in brick_types else 0

# Total area of the flat, sq. m.
def getTotsp(flat_page):
    """Return the total-area value from the properties table as a string.

    The stripped table text between ``Общая площадь:`` and
    ``Площадь комнат`` is split on newlines and non-breaking spaces; the
    fragment at index 2 is returned with commas replaced by dots.

    Raises:
        IndexError: if the entry or the fragment is missing.
    """
    props = flat_page.find('table', attrs = {'class':'object_descr_props'})
    text = html_stripper(props)
    after_label = re.split('Общая площадь:|Площадь комнат', text)[1]
    pieces = re.split('\n|\xa0', after_label)
    return pieces[2].replace(',', '.')

# Living area of the flat, sq. m.
def getLivesp(flat_page):
    """Return the living-area value from the properties table as a string.

    Takes the stripped table text between ``Жилая площадь:`` and
    ``Площадь кухни``, keeps the part before the first non-breaking
    space, and from it only digits, commas and dots; commas are then
    replaced by dots.

    Raises:
        IndexError: if the ``Жилая площадь:`` entry is missing.
    """
    text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    section = re.split('Жилая площадь:|Площадь кухни', text)[1]
    value = re.split('\xa0', section)[0]
    kept = []
    for ch in value:
        if ch.isdigit() or ch in (',', '.'):
            kept.append(ch)
    return ''.join(kept).replace(',', '.')

# Kitchen area, sq. m.
def getKitsp(flat_page):
    """Return the kitchen-area value from the properties table as a string.

    Takes the stripped table text between ``Площадь кухни:`` and the next
    bathroom label, keeps the part before the first non-breaking space and
    extracts the first number from it, with a decimal comma turned into a
    dot.

    Returns:
        The number as a string (e.g. ``'10.5'``), or ``''`` when the
        fragment contains no number.

    Raises:
        IndexError: if the ``Площадь кухни:`` entry is missing.
    """
    table = flat_page.find('table', attrs = {'class':'object_descr_props'})
    table = html_stripper(table)
    kitsp = re.split('Площадь кухни:|Санузел|Раздельных санузлов|Совмещенных санузлов', table)[1]
    kitsp = re.split('\xa0', kitsp)[0]
    # The old filter tested whole whitespace-separated tokens with
    # isdigit(), so a decimal value such as "10,5" was dropped entirely.
    number = re.search(r'\d+(?:[.,]\d+)?', kitsp)
    if number is None:
        return ''
    return number.group(0).replace(',', '.')

# 1 - primary market (new building), 0 - secondary market
def getNew(flat_page):
    """Classify the listing as new-build or resale.

    The stripped table text between ``Тип дома:`` and ``Высота потолков:``
    is split on newlines and commas; the fragment at index 2, with all
    whitespace removed, is looked up.

    Returns:
        1 for ``'новостройка'``, 0 for ``'вторичка'``, ``None`` for any
        other value.

    Raises:
        IndexError: if the entry or the fragment is missing.
    """
    text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    section = re.split('Тип дома:|Высота потолков:', text)[1]
    fields = re.split('\n|,', section)
    market = "".join(fields[2].split())
    return {'новостройка': 1, 'вторичка': 0}.get(market)

# 1 - phone present, 0 - absent
def getTel(flat_page):
    """Return 1 if the page has a ``realtor-card__phone`` block, else 0.

    Absence is detected through the stripped text being the string
    ``'None'``, which is what a missing block turns into.
    """
    phone_block = flat_page.find('div', attrs = {'class': 'realtor-card__phone'})
    return 0 if html_stripper(phone_block) == 'None' else 1

# 1 - balcony/loggia present, 0 - absent
def getBal(flat_page):
    """Return 1 if the "Балкон" entry starts with a number, else 0.

    The stripped table text between ``Балкон:`` and ``Лифт`` is split on
    newlines; the first whitespace-separated token of the line at index 1
    is tested with ``isdigit()``.

    Raises:
        IndexError: if the entry, the line or the token is missing.
    """
    text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    section = re.split('Балкон:|Лифт', text)[1]
    value_line = re.split('\n', section)[1]
    first_token = value_line.split()[0]
    if first_token.isdigit():
        return 1
    return 0

# In [36]:
# Search URL templates, one per administrative district; the `{}`
# placeholder in each template receives the result page number.
district = [
    # Central district (ЦАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=13&district%5B1%5D=14&district%5B2%5D=15&district%5B3%5D=16&district%5B4%5D=17&district%5B5%5D=18&district%5B6%5D=19&district%5B7%5D=20&district%5B8%5D=21&district%5B9%5D=22&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # Northern district (САО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=23&district%5B10%5D=33&district%5B11%5D=34&district%5B12%5D=35&district%5B13%5D=36&district%5B14%5D=37&district%5B15%5D=38&district%5B1%5D=24&district%5B2%5D=25&district%5B3%5D=26&district%5B4%5D=27&district%5B5%5D=28&district%5B6%5D=29&district%5B7%5D=30&district%5B8%5D=31&district%5B9%5D=32&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # Southern district (ЮАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=84&district%5B10%5D=94&district%5B11%5D=95&district%5B12%5D=96&district%5B13%5D=97&district%5B14%5D=98&district%5B15%5D=99&district%5B1%5D=85&district%5B2%5D=86&district%5B3%5D=87&district%5B4%5D=88&district%5B5%5D=89&district%5B6%5D=90&district%5B7%5D=91&district%5B8%5D=92&district%5B9%5D=93&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # Eastern district (ВАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=56&district%5B10%5D=66&district%5B11%5D=67&district%5B12%5D=68&district%5B13%5D=69&district%5B14%5D=70&district%5B15%5D=71&district%5B1%5D=57&district%5B2%5D=58&district%5B3%5D=59&district%5B4%5D=60&district%5B5%5D=61&district%5B6%5D=62&district%5B7%5D=63&district%5B8%5D=64&district%5B9%5D=65&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # Western district (ЗАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=112&district%5B10%5D=122&district%5B11%5D=123&district%5B12%5D=124&district%5B13%5D=348&district%5B14%5D=349&district%5B15%5D=350&district%5B1%5D=113&district%5B2%5D=114&district%5B3%5D=115&district%5B4%5D=116&district%5B5%5D=117&district%5B6%5D=118&district%5B7%5D=119&district%5B8%5D=120&district%5B9%5D=121&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # North-Western district (СЗАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=125&district%5B1%5D=126&district%5B2%5D=127&district%5B3%5D=128&district%5B4%5D=129&district%5B5%5D=130&district%5B6%5D=131&district%5B7%5D=132&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # North-Eastern district (СВАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=39&district%5B10%5D=49&district%5B11%5D=50&district%5B12%5D=51&district%5B13%5D=52&district%5B14%5D=53&district%5B15%5D=54&district%5B16%5D=55&district%5B1%5D=40&district%5B2%5D=41&district%5B3%5D=42&district%5B4%5D=43&district%5B5%5D=44&district%5B6%5D=45&district%5B7%5D=46&district%5B8%5D=47&district%5B9%5D=48&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # South-Western district (ЮЗАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=100&district%5B10%5D=110&district%5B11%5D=111&district%5B1%5D=101&district%5B2%5D=102&district%5B3%5D=103&district%5B4%5D=104&district%5B5%5D=105&district%5B6%5D=106&district%5B7%5D=107&district%5B8%5D=108&district%5B9%5D=109&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # South-Eastern district (ЮВАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=72&district%5B10%5D=82&district%5B11%5D=83&district%5B1%5D=73&district%5B2%5D=74&district%5B3%5D=75&district%5B4%5D=76&district%5B5%5D=77&district%5B6%5D=78&district%5B7%5D=79&district%5B8%5D=80&district%5B9%5D=81&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # Zelenograd district (ЗелАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=152&district%5B1%5D=153&district%5B2%5D=154&district%5B3%5D=355&district%5B4%5D=356&district%5B5%5D=357&district%5B6%5D=358&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # Troitsky district (ТроАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=338&district%5B1%5D=339&district%5B2%5D=340&district%5B3%5D=341&district%5B4%5D=342&district%5B5%5D=343&district%5B6%5D=344&district%5B7%5D=345&district%5B8%5D=346&district%5B9%5D=347&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
    # Novomoskovsky district (НовомосАО)
    'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=327&district%5B10%5D=337&district%5B1%5D=328&district%5B2%5D=329&district%5B3%5D=330&district%5B4%5D=331&district%5B5%5D=332&district%5B6%5D=333&district%5B7%5D=334&district%5B8%5D=335&district%5B9%5D=336&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1',
]

# Парсинг

# In [33]:
# Column order of the resulting table.
FLAT_COLUMNS = ['N','Rooms', 'Price', 'Totsp', 'Livesp', 'Kitsp', 'Dist', 'Metrdist', 'Walk', 'Brick', 'Tel', 'Bal', 'Floor', 'Nfloors', 'New']
# Seconds to wait for a response; without a timeout requests.get can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Rows are gathered in a plain list and converted to a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
rows = []
count = 0

try:
    # over districts
    for j, district_url in enumerate(district):
        links = []
        # search result pages 1..29 of the district
        for page in range(1, 30):
            page_url = district_url.format(page)

            search_page = requests.get(page_url, timeout=REQUEST_TIMEOUT)
            search_page = BeautifulSoup(search_page.content, 'lxml')

            flat_urls = search_page.find_all('div', attrs = {'ng-class':"{'serp-item_removed': offer.remove.state, 'serp-item_popup-opened': isPopupOpen}"})
            # Splitting the markup around the listing URL prefix leaves the
            # numeric listing ids as standalone fragments.
            flat_urls = re.split('http://www.cian.ru/sale/flat/|/" ng-class="', str(flat_urls))

            links.extend(link for link in flat_urls if link.isdigit())

        # over flats
        for i, flat_id in enumerate(links):
            flat_url = 'http://www.cian.ru/sale/flat/' + str(flat_id) + '/'
            flat_page = requests.get(flat_url, timeout=REQUEST_TIMEOUT)
            flat_page = BeautifulSoup(flat_page.content, 'lxml')

            # 'N' is the index of the flat within its district.
            to_append = {'N': i, 'Rooms':getRoom(flat_page), 'Price':getPrice(flat_page),
                        'Totsp':getTotsp(flat_page), 'Livesp':getLivesp(flat_page),
                        'Kitsp':getKitsp(flat_page), 'Dist':getDist(flat_page),
                        'Metrdist':getMetrdist(flat_page), 'Walk':getWalk(flat_page),
                        'Brick':getBrick(flat_page), 'Tel':getTel(flat_page),
                        'Bal':getBal(flat_page), 'Floor':getFloor(flat_page),
                        'Nfloors':getNFloors(flat_page), 'New':getNew(flat_page)}
            rows.append(to_append)
            count += 1
            print('The link is {} {}'.format(j+1,i+1))
finally:
    # Built in `finally` so the rows collected so far are still available
    # in `flatstats` if the crawl is interrupted or a page fails to parse.
    flatstats = pd.DataFrame(rows, columns=FLAT_COLUMNS)

# Output of an earlier run: TypeError: Can't convert 'int' object to str implicitly

# Записываем результаты в файл

# In [174]:
# Save the collected table as CSV; the destination is a hard-coded path on
# drive D:, so this line only works where that location is writable.
flatstats.to_csv('D://Parsing_cian_flatstats.csv')