In [82]:
import requests
import bs4
from bs4 import BeautifulSoup
import string
import re
from typing import Dict, List, Tuple
from collections import OrderedDict

#### Request data from the Website

In [83]:
soup = BeautifulSoup(requests.get("https://www.bashneft-azs.ru/network_azs/").text)
scripts_list = soup.find_all("script")

#### Filtering scripts which contain any data (text length more then 100 symbols)

In [87]:
def filter_blank_scripts(scripts_list: List[bs4.element.Tag]) -> List[Tuple[int, str]]:
    '''
    Function generates new list of scripts, 
    which contain any content which length higher than 100 symbols.
    '''
    return [(index, script.text) for index, script in enumerate(scripts_list) if len(script.text) > 100]

In [88]:
filtered_scripts: List[Tuple[int, str]] = filter_blank_scripts(scripts_list)

In [89]:
script_index, script_content = filtered_scripts[9]

#### Coordinates Analysis

In [133]:
def check_coordinates(content: str) -> int:
    '''
    Functions searches for float spatial coordinates.
    Success seach for: 45.34234, 123.12414, -34.342, -1.231
    '''
    matched_coordinates: List[str] = re.findall(r"(?<!\d)\d{,3}(?<=\d)\.\d{,20}", content)
    points_amount: float = len(matched_coordinates) / 2

    return int(points_amount)

check_coordinates("45.234")

0

In [131]:
def get_scripts_with_coords(scripts_list: List[Tuple[int, str]] ):
    '''
    Function generates new list of scripts, 
    which contain any content which length higher than 100 symbols.
    '''
    for script_index, script_content in scripts_list:
        amount_of_coords: int = check_coordinates(script_content)

        if amount_of_coords:
            yield (script_index, amount_of_coords)

In [132]:
list(get_scripts_with_coords(filtered_scripts))

[(21, 264)]

#### Data cleaning

In [85]:
def symbol_clean(content: str) -> List[str]:
    removed_tabs = re.sub(r'\t+', ' ', content)
    removed_new_lines = re.sub(r'\n+', ' ', removed_tabs)
    removed_return = re.sub(r'\r+', ' ', removed_new_lines)
    removed_spaces = re.sub(r'\s+', ' ', removed_new_lines)
    removed_punctuation = "".join([symbol for symbol in removed_spaces if symbol not in string.punctuation])

    tokenized_words = removed_punctuation.split(' ')
    return tokenized_words

#### Words calculation

In [86]:
def calculate_words(word_cloud: List[str]) -> Dict[str, int]:
    word_dict = {}

    for word in word_cloud:
        if word not in word_dict.keys():
            word_dict[word] = 1
        else:
            word_dict[word] += 1
    
    return word_dict

In [14]:
word_cloud = symbol_clean(scripts_list[21].text)
word_stats: Dict[str, int] = calculate_words(word_cloud)
result = sorted(word_stats.items(), key=lambda kv: kv[1], reverse=True)
result

[('', 2930),
 ('titleкарта', 789),
 ('картинки', 530),
 ('p', 526),
 ('dl', 526),
 ('азс', 282),
 ('new', 265),
 ('размеры', 265),
 ('смещение', 265),
 ('mygeoobjectspush', 263),
 ('clustercaption', 263),
 ('ballooncontentheader', 263),
 ('h3азс', 263),
 ('stylecolor', 263),
 ('ballooncontentbody', 263),
 ('dtтопливоdtddul', 263),
 ('classfuellistli', 263),
 ('classicon2span92spanlili', 263),
 ('dtуслугиdtddul', 263),
 ('classserviceslistli', 263),
 ('classiconpresent', 263),
 ('titleпрограмма', 263),
 ('лояльностиlili', 263),
 ('classiconhours', 263),
 ('titleкруглосуточная', 263),
 ('dt', 263),
 ('stylelineheight15pxспособы', 263),
 ('оплатыdtddul', 263),
 ('classcardslistli', 263),
 ('classiconrn', 263),
 ('titleтопливная', 263),
 ('картаlili', 263),
 ('classiconvisa', 263),
 ('visalili', 263),
 ('classiconmc', 263),
 ('mastercardlili', 263),
 ('classiconmir', 263),
 ('iconimagehref', 263),
 ('imagesmapmarkerpng', 263),
 ('iconimagesize', 263),
 ('35', 263),
 ('44', 263),
 ('iconima