<a href="https://colab.research.google.com/github/mukaseevru/ds-school/blob/main/final_project/views.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Подключение библиотек

In [1]:
# Загрузка необходимых модулей
import numpy as np
import pandas as pd
import datetime as dt
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from time import sleep
from bs4 import BeautifulSoup

# Настройки

In [2]:
# Таймаут, если ошибка соединения
timeout = 1
# Debug mode
debug = True
# Настройки по сайтам
sites = {}
# banki.ru
sites['banki'] = {}
sites['banki']['on'] = True
sites['banki']['count'] = 10
sites['banki']['send'] = True

In [3]:
sites

{'banki': {'count': 10, 'on': True, 'send': True}}

# Определение функций

In [4]:
# Определение функции загрузки страницы
def load_page(url):
  count = 0
  while True:
    try:
      with requests.Session() as session:
        session.headers['Connection'] = 'keep-alive'
        session.headers['Cache-Control'] = 'max-age=0'
        session.headers['Upgrade-Insecure-Requests'] = '1'
        session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 OPR/40.0.2308.81'
        session.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        session.headers['DNT'] = '1'
        session.headers['Accept-Encoding'] = 'gzip, deflate, lzma, sdch'
        session.headers['Accept-Language'] = 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4'
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        session.proxies = {
            # 'http': '185.198.189.21:8080',
            # 'https': '185.198.189.21:8080'
            }
        response = session.get(url)
      if response.status_code != 200:
        # log.info("Ошибка, код ответа: %s", response.status)
        print('Ошибка, код ответа: %s', response.status_code)
        sleep(timeout)
        count += 1
        if count > 5:
          return (-1)
        # Попробуем снова на следующей итерации цикла
        continue
      # Если дошли до сюда, значит ошибок не было
      return response
    except ConnectionError:
      # Выводим свое сообщение плюс стек трассы
      # log.exception("Ошибка ConnectionError")
      print('Ошибка ConnectionError')
      sleep(timeout)

# Определение функции очистки текста от спецсимволов
def clean_text(text):
  if pd.isnull(text):
    return np.nan
  text = text.replace('\n',' ')\
  .replace('\r',' ')\
  .replace('\t',' ')\
  .replace('\u200b',' ')\
  .replace('  ',' ')\
  .replace('✅',' ')\
  .replace('•',' ')\
  .replace('📰',' ')\
  .replace('🔸',' ')\
  .replace('🔹',' ')\
  .replace('🔸',' ')\
  .replace('🔸',' ')\
  .replace('💰',' ')\
  .replace('👍',' ')\
  .replace('📅',' ')\
  .replace('🍂',' ')\
  .replace('🔥',' ')\
  .replace('🎥',' ')\
  .replace('🎬',' ')\
  .replace('<',' ')\
  .replace('>',' ')\
  .strip()
  while '  ' in text:
    text = text.replace('  ', ' ')
  return text

# banki.ru

## Загрузка имеющихся данных

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import os
os.getcwd()

'/content'

In [7]:
df_views = pd.DataFrame(columns=[np.arange(sites['banki']['count'])])
df_views.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   (0,)    0 non-null      object
 1   (1,)    0 non-null      object
 2   (2,)    0 non-null      object
 3   (3,)    0 non-null      object
 4   (4,)    0 non-null      object
 5   (5,)    0 non-null      object
 6   (6,)    0 non-null      object
 7   (7,)    0 non-null      object
 8   (8,)    0 non-null      object
 9   (9,)    0 non-null      object
dtypes: object(10)
memory usage: 0.0+ bytes


## Цикл парсинга

In [8]:
if sites['banki']['on']:
  url = 'https://www.banki.ru/news/lenta/page1'
  response = load_page(url)
  soup = BeautifulSoup(response.text, 'lxml')
  articles = soup.find_all('ul', class_='text-list text-list--date text-list--date-inline')[0].find_all('li')
  article_ids = []
  article_views = []
  for article in articles[:sites['banki']['count']]:
    # Кол-во просмотров
    views = 0
    article_url = article.find('a', class_='text-list-link')
    if article_url.get('href')[0] == 'h':
        # print('Error url - {}'.format(article_url.get('href')))
        continue
    article_id = str(article_url.get('href').split('=')[1])
    article_info = article.find_all('span', class_='news__info')
    if len(article_info) > 1:
      if article_info[1].text != '':
        views = int(clean_text(article_info[1].text.split('\n\t')[1]))
    else:
      if article_info[0].text != '':
        views = int(clean_text(article_info[0].text.split('\n\t')[1]))
    article_ids.append(article_id)
    article_views.append(views)
  df_views.columns = article_ids
  df_views = df_views.append(pd.DataFrame([article_views], columns=df_views.columns), ignore_index=True)

In [9]:
df_views.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   10941428  1 non-null      object
 1   10941427  1 non-null      object
 2   10941426  1 non-null      object
 3   10941425  1 non-null      object
 4   10941422  1 non-null      object
 5   10941421  1 non-null      object
 6   10941419  1 non-null      object
 7   10941417  1 non-null      object
 8   10941416  1 non-null      object
 9   10941415  1 non-null      object
dtypes: object(10)
memory usage: 208.0+ bytes


In [10]:
if sites['banki']['on']:
  for i in range(120):
    url = 'https://www.banki.ru/news/lenta/page1'
    response = load_page(url)
    soup = BeautifulSoup(response.text, 'lxml')
    articles = soup.find_all('ul', class_='text-list text-list--date text-list--date-inline')[0].find_all('li')
    # article_ids = []
    article_views = []
    for article in articles:
      # Кол-во просмотров
      views = 0
      article_url = article.find('a', class_='text-list-link')
      if article_url.get('href')[0] == 'h':
          # print('Error url - {}'.format(article_url.get('href')))
          continue
      article_id = str(article_url.get('href').split('=')[1])
      if article_id not in df_views.columns:
        continue
      article_info = article.find_all('span', class_='news__info')
      if len(article_info) > 1:
        source = clean_text(article_info[0].text)
        if article_info[1].text != '':
          views = int(clean_text(article_info[1].text.split('\n\t')[1]))
        if article_info[1].find('span', class_='link-with-icon__icon icon-font icon-bubble-16 icon-font--size_small'):
          comments = int(clean_text(article_info[1].text.split('\n\t')[3]))
      else:
        if article_info[0].text != '':
          views = int(clean_text(article_info[0].text.split('\n\t')[1]))
        if article_info[0].find('span', class_='link-with-icon__icon icon-font icon-bubble-16 icon-font--size_small'):
          comments = int(clean_text(article_info[0].text.split('\n\t')[3]))
      article_views.append(views)
    df_views = df_views.append(pd.DataFrame([article_views], columns=df_views.columns), ignore_index=True)
    sleep(30)

In [11]:
df_views.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   10941428  121 non-null    object
 1   10941427  121 non-null    object
 2   10941426  121 non-null    object
 3   10941425  121 non-null    object
 4   10941422  121 non-null    object
 5   10941421  121 non-null    object
 6   10941419  121 non-null    object
 7   10941417  121 non-null    object
 8   10941416  121 non-null    object
 9   10941415  121 non-null    object
dtypes: object(10)
memory usage: 9.6+ KB


In [12]:
df_views.head()

Unnamed: 0,10941428,10941427,10941426,10941425,10941422,10941421,10941419,10941417,10941416,10941415
0,0,223,588,901,750,907,1169,1194,1331,748
1,0,223,588,901,750,907,1169,1194,1331,748
2,0,223,588,901,750,907,1169,1194,1331,748
3,0,223,588,901,750,907,1169,1194,1331,748
4,37,278,618,933,768,921,1182,1203,1345,757


In [13]:
df_views.to_csv('drive/MyDrive/Colab Notebooks/sber/news/data/views.csv', index=False)