In [131]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup as soup
from tqdm import tqdm_notebook as tqdm
from time import sleep

# Set up Page

In [132]:
# First listing page of the category to scrape. The text after the last '/'
# is later split off and used as the category number (`cat_num`).
PAGE_URL = 'https://www.rusprofile.ru/codes/610000'

In [133]:
def get_soup(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would hang the scraping loop.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx answers instead of parsing an error page and
    # silently finding no companies on it.
    response.raise_for_status()
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the same page
    # could be parsed differently on another machine.
    return soup(response.text, 'html.parser')

# Get Number of Pages

In [134]:
# Read the number of listing pages from the pagination widget of the first page.
base_page_soup = get_soup(PAGE_URL)
pg_list = base_page_soup.find('ul', class_="paging-list")  # pagination element, or None if absent
if pg_list is None:
    # No pagination widget on the page. This is assumed to mean the whole
    # category fits on a single page — TODO confirm against a small category.
    N_PAGES = 1
else:
    # The second-to-last <li> is read as the highest page number
    # (presumably the last <li> is a "next page" control — verify on the site).
    N_PAGES = int(pg_list.find_all('li')[-2].text)
iterator_arg = range(1, N_PAGES + 1)  # page numbers to visit, 1..N_PAGES inclusive
print(f'Total pages to scrape from: {N_PAGES}')

Total pages to scrape from: 274


# Scraping functions

In [50]:
def get_companies(page_soup):
    """Collect every company card found in a parsed listing page.

    The result is whatever ``find_all`` yields for ``div`` elements that
    carry the ``company-item`` class.
    """
    company_cards = page_soup.find_all('div', class_='company-item')
    return company_cards

In [51]:
# Lookup from a Russian month name, in the form it takes inside a written
# date, to its month number (1-12). Names are listed in calendar order and
# paired with 1..12.
month_dict = dict(zip(
    (
        'января', 'февраля', 'марта', 'апреля', 'мая', 'июня',
        'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря',
    ),
    range(1, 13),
))

In [84]:
def scrape_company(company):
    """Extract the listing fields of one company card.

    Parameters
    ----------
    company : bs4.element.Tag
        One ``company-item`` element, as returned by ``get_companies``.

    Returns
    -------
    pandas.Series
        With the entries ``name``, ``status``, ``address``, ``est_date``
        (``pd.Timestamp`` or ``None``), ``cap`` (number or ``None``) and
        ``activity``.

    Raises
    ------
    AttributeError, IndexError
        If the card lacks its title, address or ``company-item-info`` blocks.
    """
    company_name = company.find('div', class_="company-item__title").find('a').text.strip()

    # Cards without a warning label get the default status text.
    status_tag = company.find('span', class_="warning-text")
    if status_tag is not None:
        company_status = status_tag.text.strip()
    else:
        company_status = 'Организация работает'

    # Keep the 2nd to 4th comma-separated parts of the address.
    address_parts = company.find('address', class_="company-item__text").text.strip().split(', ')
    company_address = ', '.join(address_parts[1:4])

    # Date and capital are looked up in the second info block when the card
    # has one, otherwise in its only info block.
    info_blocks = company.find_all('div', class_='company-item-info')
    details_block = info_blocks[1] if len(info_blocks) > 1 else info_blocks[0]
    detail_texts = [dd.text for dd in details_block.find_all('dd')]

    date_texts = [text for text in detail_texts if 'г.' in text]
    cap_texts = [text for text in detail_texts if 'руб.' in text]

    if date_texts:
        # "<day> <month name> <year> г." -> Timestamp. Splitting on any
        # whitespace also copes with non-breaking spaces between the tokens.
        day, month_name, year = date_texts[0].strip().strip(' г.').split()[:3]
        company_estdate = pd.Timestamp(year=int(year), month=month_dict[month_name], day=int(day))
    else:
        company_estdate = None

    if cap_texts:
        # Drop the currency suffix, then every kind of whitespace used as a
        # thousands separator (regular or non-breaking), and switch the
        # decimal comma to a dot before converting.
        amount_text = ''.join(cap_texts[0].strip().strip(' руб.').split())
        company_cap = pd.to_numeric(amount_text.replace(',', '.'))
    else:
        company_cap = None

    # The activity description is the first <dd> of the last info block.
    company_activity = info_blocks[-1].find('dd').text.strip()

    return pd.Series({'name': company_name, 'status': company_status,
                      'address': company_address, 'est_date': company_estdate,
                      'cap': company_cap, 'activity': company_activity})

# Scraping

In [53]:
# URL template for a numbered listing page; fill the slot with `.format(page_number)`.
page_url_iterable = f'{PAGE_URL}/{{}}/'

In [54]:
# Category number (kept as a string): the text after the final '/' of the listing URL.
cat_num = PAGE_URL.rsplit('/', 1)[-1]

In [55]:
# Pause, in seconds, passed to `sleep` after each page request in the scraping loop.
SLEEP_TIME = 10 # Essential! Can be increased manually if needed.

In [72]:
# Scrape every page in `iterator_arg`, collecting one DataFrame per page.
# `df_list` and `page_n` stay at notebook level on purpose: if the loop is
# interrupted, the emergency cell saves `df_list` and resumes from `page_n`.
df_list = []
for page_n in tqdm(iterator_arg):
    current_page_soup = get_soup(page_url_iterable.format(page_n))
    sleep(SLEEP_TIME)  # throttle: wait before the next request
    companies = get_companies(current_page_soup)
    current_page_df = pd.DataFrame([scrape_company(company) for company in companies])
    df_list.append(current_page_df)

# ignore_index=True gives the combined table one running index instead of
# repeating each page's own 0..k-1 labels.
full_cat_df = pd.concat(df_list, ignore_index=True)
full_cat_df = full_cat_df.assign(base_category=int(cat_num))


HBox(children=(IntProgress(value=0, max=917), HTML(value='')))

KeyboardInterrupt: 

___

## Emergency cell

Use in case of error

In [73]:
# Save whatever was scraped before the interruption, then set up the resume.
if df_list:  # pd.concat raises on an empty list, e.g. when the very first page failed
    os.makedirs('data', exist_ok=True)  # make sure the output folder exists before writing
    pd.concat(df_list).assign(base_category=int(cat_num)).to_csv(
        f'data/database_{cat_num}_part_2.csv', sep=';', index=False, encoding='utf-8'
    )
# `page_n` is the page that was in progress when the loop stopped, so the
# resume starts from it. The end must be N_PAGES + 1 (as in the initial
# iterator) because `range` excludes its end; with N_PAGES the last page
# would never be scraped.
iterator_arg = range(page_n, N_PAGES + 1)

Run the next cell once the resumed scraping has finished: it concatenates the new results with the data that was scraped and saved before the error occurred.

In [None]:
# Combine the resumed scrape with the partial result saved before the error.
# The part file is written with sep=';', so it has to be read with the same
# separator; with the default ',' the columns would not be split.
# NOTE(review): the emergency cell currently writes `..._part_2.csv` — make
# sure the filename below matches the file that was actually saved.
full_cat_df = pd.concat([
    full_cat_df,
    pd.read_csv(f'data/database_{cat_num}_part.csv', sep=';').assign(base_category=int(cat_num))
], ignore_index=True)  # one running index across both pieces

___

In [None]:
# Preview the first rows of the combined table.
full_cat_df.head()

In [None]:
# Keep only rows whose activity text contains the first two characters of the
# category number.
# regex=False: match the prefix literally rather than as a pattern.
# na=False: rows with a missing activity (possible after reloading a CSV) are
# dropped instead of producing a NaN mask that pandas refuses to index with.
full_cat_df = full_cat_df[full_cat_df.activity.str.contains(cat_num[:2], regex=False, na=False)]

In [211]:
# Write the final table for this category as a ';'-separated UTF-8 file.
os.makedirs('data', exist_ok=True)  # create the output folder if it is missing, so the save cannot fail on it
full_cat_df.to_csv(f'data/database_{cat_num}.csv', index=False, encoding='utf-8', sep=';')