In [None]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import numpy as np
import re

import time
import sys


## Data parsing

In [None]:
# all buildings url list
#
# Walks the paginated list of Kazan buildings and collects the link of every
# building page into url_list. The elapsed wall-clock time in seconds is
# stored in url_parsing_time.

start_time = time.time()

url_list = []
prefix = 'https://dom.mingkh.ru/'
pages_amount = 234    # site pages amount = 234
table_class = 'table table-condensed table-hover table-striped'

# NOTE(review): the loop requests page=0..233; if the site numbers its pages
# from 1, page 234 is never requested - confirm against the site.
for i in range(pages_amount):
  url = f'https://dom.mingkh.ru/tatarstan/kazan/?page={i}'

  # a timeout keeps one stalled connection from hanging the cell forever
  q = requests.get(url, timeout=30)
  # fail loudly on an HTTP error: a silently skipped page loses buildings
  q.raise_for_status()

  soup = BS(q.content, 'lxml')
  table = soup.find(class_ = table_class)
  if table is None:
    raise RuntimeError(f'buildings table not found on {url}')

  for b in table.find_all('a'):
    # NOTE(review): if the href values start with '/', prefix + href contains
    # a double slash after the host - confirm the href format.
    url_list.append(prefix + b['href'])

# delete same urls, keeping the first-seen order so that repeated runs give
# the same list order (a set would reorder the urls on every kernel start)
url_list = list(dict.fromkeys(url_list))

# time of urls parsing
url_parsing_time = time.time() - start_time

In [None]:
# save url_list
# The urls are written one after another, each followed by a comma (so the
# file ends with a trailing comma). The with-block closes the file even when
# a write fails.
with open('data/kazan_url_list.txt', 'w') as f:
  for index in url_list:
    f.write(index + ',')

In [None]:
# number of building urls collected after removing duplicates
len(url_list)

5834

In [None]:
# seconds spent collecting the url list
url_parsing_time

270.0915241241455

In [None]:
def num_from_str(str_text):
  ''' find the int number in str and return the number from string as str

  The text is split on single spaces and only tokens made entirely of digits
  count as numbers. When several tokens qualify, the last one is returned.

  input: text string with some words
  return: number like str object
  raises: ValueError when no token of the text consists only of digits

  '''
  digit_tokens = [item for item in str_text.split(' ') if item.isdigit()]
  if not digit_tokens:
    # without this guard the function would fail with an UnboundLocalError
    # that says nothing about the offending text
    raise ValueError(f'no integer number found in {str_text!r}')
  return digit_tokens[-1]

In [None]:
# data finding in url_list
#
# Downloads every building page and extracts its attributes. Each *_list
# below receives exactly one value per accepted building, and all of them are
# appended to together at the end of a loop iteration, so index i always
# describes the same building in every list.
address_list = []
year_list = []
flats_list = []
entrance_list = []
height_list = []
square_list = []
latitude_list = []
longitude_list = []
dd_list = []
# (url, reason) for every building page that was left out of the lists above
skipped_list = []

# text that marks the table row holding each integer attribute
int_row_labels = {
  'year': 'Год ввода в эксплуатацию',
  'flats': 'Количество жилых помещений',
  'entrance': 'Количество подъездов',
  'height': 'Наибольшее количество этажей',
}
square_row_label = 'Площадь жилых помещений'
city_suffix = ', казань, татарстан'

start_time = time.time()

for u in url_list:
  try:
    # a timeout keeps one stalled connection from blocking the whole run
    q = requests.get(u, timeout=30)
  except requests.RequestException as err:
    skipped_list.append((u, f'request failed: {err}'))
    continue
  soup = BS(q.content, 'lxml')

  # text of every tr tag on the page
  tr_text_list = [t.text for t in soup.find_all('tr')]

  # first row mentioning each attribute; None when the page has no such row
  int_rows = {}
  for key, label in int_row_labels.items():
    int_rows[key] = next((t for t in tr_text_list if label in t), None)
  square_row = next((t for t in tr_text_list if square_row_label in t), None)

  # check if all attributes are present for this building
  if square_row is None or None in int_rows.values():
    skipped_list.append((u, 'attribute row missing'))
    continue

  # address comes from the first dd tag, geo coordinates from tags found by id
  dd = soup.find('dd')
  latitude_tag = soup.find(id="mapcenterlat")
  longitude_tag = soup.find(id="mapcenterlng")
  if dd is None or latitude_tag is None or longitude_tag is None:
    skipped_list.append((u, 'address or coordinates missing'))
    continue

  # remove city name from address string; a building whose address does not
  # mention the city is skipped as a whole, otherwise address_list would end
  # up shorter than (and shifted against) the other lists
  dd_lower = dd.text.lower()
  if 'казань' not in dd_lower:
    skipped_list.append((u, 'address without city name'))
    continue
  address = dd_lower.rpartition(city_suffix)[0]

  # convert every value before appending anything, so that a conversion
  # error (which still propagates) cannot leave the lists partially updated
  year = int(num_from_str(int_rows['year']))
  flats = int(num_from_str(int_rows['flats']))
  entrance = int(num_from_str(int_rows['entrance']))
  height = int(num_from_str(int_rows['height']))
  # the square value is taken as the second-to-last space-separated token
  square = float(square_row.split(' ')[-2])
  latitude = float(latitude_tag['value'])
  longitude = float(longitude_tag['value'])

  dd_list.append(dd.text)
  address_list.append(address)
  year_list.append(year)
  flats_list.append(flats)
  entrance_list.append(entrance)
  height_list.append(height)
  square_list.append(square)
  latitude_list.append(latitude)
  longitude_list.append(longitude)

# time of data parsing
data_parsing_time = time.time() - start_time

In [None]:
# seconds spent downloading and parsing the building pages
data_parsing_time

3596.2859869003296

In [None]:
# sanity check: all lists that become dataframe columns must have equal length
len(address_list) == len(year_list) == len(flats_list) == len(entrance_list) == len(height_list) == len(square_list) == len(latitude_list) == len(longitude_list) 

True

In [None]:
# number of addresses collected
print(len(address_list))


5507


In [None]:
# creating dataframe
# one dataframe column per parsed attribute, in keyword order
df_base = pd.DataFrame(dict(
  Address=address_list,
  Year=year_list,
  Flats=flats_list,
  Entrance=entrance_list,
  Height=height_list,
  Square=square_list,
  Latitude=latitude_list,
  Longitude=longitude_list,
))

# save dataframe as csv (with to_csv defaults the row index is written too)
# NOTE(review): this path starts with '../data/' while the url list is saved
# under 'data/' - confirm which working directory is intended.
df_base.to_csv('../data/kazan_buildings_df.csv')