🕺 This ipynb file is a draft of an aggregation model for free events about IT and programming: webinars, hackathons, broadcasts and courses.

# TODO:
- ! For better testing it is useful to save HTML files to a temporary folder and then delete them at the end of the session
- Write parser for pages of yandex events: div class:events-info
    - Registration url: 
    - Time - div class="events-info__date-and-calendar-link"
    - Location - div class="events-info__location"
    - Registration link class="events-info__registration-button"
    - Яндекс.Облако: EventHeader__address: time, location, 

# Strategy
In short: first we check the main pages, then the event pages

- Load `pages.json` - information about the start webpages: `name`, `url`, `func`
- Load `events.json` - information about current events:
    - `title`,
    - `url` (description),
    - `registration url` (optional, often the same as description url),
    - `date`
    - `time`
    - `location`
    - `online status`
    - `price`
    - `[tags]`
    - `language`
    - `speakers and themes (optional)`
    - `type (optional)`
- Check that events on the main pages are displayed in `events.json`
- If the event is not in the file, we need to add basic information about it on the second stage
- Past events are transferred to `past_events.json`
- On the next stage, if any information is missing, we need to supplement it by going to the specific URL and parsing the page
- For registration url we will add utm information

In [1]:
import re
import json
import requests
from datetime import datetime, date
from bs4 import BeautifulSoup

In [27]:
# HTTP headers passed to every requests.get call in this notebook.
headers = requests.utils.default_headers()
# Reference date for inferring the year of scraped dates; fixed when this cell runs.
current_date = date.today()
required_event_keys = ('url', 'title', 'date', 'time', 'location', 'online status')
optional_event_keys = ('registration_url', 'price', 'tags', 'language', 'speakers', 'themes', 'type')

# Read both files explicitly as UTF-8: events.json is written as UTF-8 with
# ensure_ascii=False, so relying on the platform default encoding here would
# break non-ASCII (Cyrillic) text on systems whose default is not UTF-8.
with open('pages.json', encoding='utf8') as pages_file:
    pages = json.load(pages_file)

with open('events.json', encoding='utf8') as events_file:
    events = json.load(events_file)

# Russian date vocabulary, indexed by position: weekdays start at Monday (0),
# months at January (0). The 'gentive' key spelling is kept as is because
# yandex_date_converter looks it up under exactly this name.
date_alias = {'rus':
                 {'weekdays':
                    {'short': ('пн', 'вт', 'ср', 'чт', 'пт', 'сб', 'вс')},
                  'months':
                    {'gentive': ('января', 'февраля', 'марта',
                                 'апреля', 'мая', 'июня',
                                 'июля', 'августа', 'сентября',
                                 'октября', 'ноября', 'декабря')}
                 }
              }

def utm_cleaner(url):
    '''Return url with every utm_* query parameter removed.

    Only tracking parameters are dropped: other query parameters and the
    fragment are preserved, wherever the utm_* parameters appear in the
    query string. A url without utm_* parameters is returned unchanged.
    '''
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    params = parts.query.split('&') if parts.query else []
    kept = [param for param in params if not param.lower().startswith('utm_')]
    if len(kept) == len(params):
        # Nothing to strip: hand back the input untouched (no normalization).
        return url
    return urlunsplit(parts._replace(query='&'.join(kept)))

   
class Event:
    '''A scraped event that registers itself in the module-level events dict.

    data is a dict of event fields and must contain 'url'. The url is passed
    through utm_cleaner and used as the key in events; the remaining fields
    are stored as the value.
    '''

    def __init__(self, data):
        '''Normalize the event url and store the event in events.'''
        # Work on a shallow copy so the caller's dict is never modified.
        self.data = dict(data)
        self.data['url'] = utm_cleaner(self.data['url'])
        self.update()

    def update(self):
        '''Write this event's fields to events[url], replacing any record
        previously stored under the same url.'''
        url = self.data['url']
        # The url is the key, so it is left out of the stored record.
        events[url] = {key: value
                       for key, value in self.data.items()
                       if key != 'url'}
    

def main_parser():
    '''Run the configured parser function for every start page in pages.

    Each entry of pages supplies 'name', 'url' and 'func'; 'func' is the
    name of a module-level parser function, which is called as
    func(name, url).

    Raises NameError if 'func' does not name a callable defined at module
    level.
    '''
    for page in pages.values():
        func = page['func']
        # Look the parser up by name instead of eval()-ing a string built
        # from the config: quotes or backslashes in name/url can no longer
        # break the call, and config text is never executed as code.
        parser = globals().get(func)
        if not callable(parser):
            raise NameError(f"name '{func}' is not defined")
        parser(page['name'], page['url'])
        
def yandex_date_converter(date_string, today=None):
    '''Convert a date such as "вт, 9 июня" to an ISO "YYYY-MM-DD" string.

    date_string holds three whitespace-separated parts: short weekday
    (punctuation around it is ignored), day of month and month name in the
    genitive case. It carries no year, so the year is inferred from today.

    today is the reference date; it defaults to the module-level
    current_date.

    Returns the ISO date string, or None when the weekday in date_string
    does not match the weekday of the inferred date.

    Raises ValueError when the string does not have three parts, the
    weekday or month name is unknown, or the day is invalid for the month.
    '''
    lang = 'rus'
    weekdays = date_alias[lang]['weekdays']['short']
    months = date_alias[lang]['months']['gentive']
    if today is None:
        today = current_date

    # Split the string into its parts.
    weekday, event_day, month = date_string.split()
    # Keep only the letters so "вт,", "вт." and "вт" are all accepted.
    weekday = ''.join(char for char in weekday if char.isalpha()).lower()
    event_day = int(event_day)
    event_month = months.index(month.lower()) + 1
    event_weekday = weekdays.index(weekday)
    event_year = today.year

    # End-of-year case: very old records are not kept, but in the second
    # half of the year there may be records for the next year, so a month
    # more than three months behind the current one is taken as next year.
    if event_month - today.month < -3:
        event_year += 1

    event_date = date(event_year, event_month, event_day)
    if event_date.weekday() != event_weekday:
        return None
    return event_date.isoformat()
    

def yandex(name='Мероприятия Яндекса', url='https://events.yandex.ru/'):
    '''Parse the events listing page at url and store every event card.

    For each div with class "event-card" an Event is created from the card's
    link, title and date; creating the Event stores it in events.

    name is only used in the progress message.

    Returns the list of event-card elements found on the page.
    '''
    from urllib.parse import urljoin

    print(f'Parsing page {name} at {url}')
    # A timeout keeps an unresponsive server from hanging the run forever.
    page = requests.get(url, headers=headers, timeout=30).text
    soup = BeautifulSoup(page, 'html.parser')
    event_cards = soup.body.find_all('div', {'class':'event-card'})
    for event_card in event_cards:
        date_string = event_card.find('div', {'class':'event-card__date'}).text
        link = event_card.a
        # Links may be relative; urljoin resolves them against the page url
        # and leaves absolute links as they are.
        data = {'url':urljoin(url, link['href']),
                'title':link['title'],
                'date':yandex_date_converter(date_string),
                'price':0,
                'language':'ru'}
        # Instantiating Event registers the record in events.
        Event(data)
    return event_cards


# Scrape the Yandex listing page; every card found is also stored in `events`.
events_list = yandex()

# Bare expression: shows the collected events as the notebook cell output.
events

Parsing page Мероприятия Яндекса at https://events.yandex.ru/


{'https://events.yandex.ru/events/vstrecha-rossijskoj-r-09-06-2020': {'title': 'Встреча Российской рабочей группы по стандартизации С++',
  'date': '2020-06-09',
  'price': 0,
  'language': 'ru'},
 'https://events.yandex.ru/events/tutorial-catboost-04-06-2020': {'title': 'Туториал: новые фичи CatBoost',
  'date': '2020-06-04',
  'price': 0},
 'https://events.yandex.ru/events/toloka-04-06-2020': {'title': 'Вебинар по разметке больших массивов данных от Яндекс.Толоки и Dbrain',
  'date': '2020-06-04',
  'price': 0},
 'https://cloud.yandex.ru/events/137': {'title': 'Облако в сфере образования',
  'date': '2020-06-04',
  'price': 0},
 'http://yandex.ru/promo/events/online/kazakhstan': {'title': 'Яндекс для Казахстана: как продолжить бизнес в новой реальности',
  'date': '2020-06-04',
  'price': 0},
 'https://events.yandex.ru/events/hardware/29-may-2020': {'title': 'Я.Железо: разбираем Яндекс.Станцию Мини в прямом эфире',
  'date': '2020-05-29',
  'price': 0},
 'https://cloud.yandex.ru/even

In [16]:
# Serialize before opening the file: opening with 'w' truncates events.json,
# so a serialization error must surface while the old file is still intact.
serialized_events = json.dumps(events, ensure_ascii=False)
with open('events.json', 'w', encoding='utf8') as events_file:
    events_file.write(serialized_events)

In [213]:
def yandex_event_time(soup):
    '''Return the first HH:MM time found on an event page, or None.

    The known time containers are checked in order of priority; the first
    one whose text contains a time in HH:MM form wins. None is returned when
    no container is present or none of them contains a time.
    '''
    classes = ('events-program-item__time',
               'events-info__registration-date',
               'EventHeader__address')
    pattern = re.compile(r'\d{2}:\d{2}')
    for selector in classes:
        div = soup.find('div', {'class':selector})
        if div is None:
            continue
        match = pattern.search(div.text)
        if match:
            return match.group()
    return None


def yandex_event_speakers(soup):
    '''Return the speakers of an event page as "name, company" strings.

    Names come from div.speaker__name and companies from
    div.speaker__company; the two lists are paired by position, stopping at
    the shorter one.
    '''
    name_nodes = soup.find_all('div', {'class':'speaker__name'})
    company_nodes = soup.find_all('div', {'class':'speaker__company'})
    described = []
    for name_node, company_node in zip(name_nodes, company_nodes):
        described.append(f'{name_node.text}, {company_node.text}')
    return described


def yandex_event_online_status(soup):
    '''Return the online status shown on an event page.

    The status elements are looked up in a fixed order and the first one
    present decides the result: one of the known broadcast labels gives
    'Online', any other text is returned stripped but otherwise as is.
    When none of the elements is present the result is 'Offline'.
    '''
    lookups = (('span', 'EventHeader__place'),
               ('div', 'events-info__with-translation'),
               ('div', 'events-info__online'))
    found = [soup.find(tag, {'class':css_class}) for tag, css_class in lookups]
    first = next((node for node in found if node is not None), None)
    if first is None:
        return 'Offline'
    label = first.text.strip()
    online_labels = ('Online', 'Будет трансляция', 'Прямая трансляция')
    return 'Online' if label in online_labels else label

def yandex_event_page(url):
    '''Download a single event page and extract its details.

    Returns a tuple (speakers, time, online_status) built from the results
    of yandex_event_speakers, yandex_event_time and
    yandex_event_online_status for the downloaded page.
    '''
    # A timeout keeps one unresponsive event page from hanging the whole run.
    page = requests.get(url, headers=headers, timeout=30).text
    soup = BeautifulSoup(page, 'html.parser')
    event_time = yandex_event_time(soup)
    speakers = yandex_event_speakers(soup)
    online_status = yandex_event_online_status(soup)

    return speakers, event_time, online_status

# Fetch the page of every stored event and print the tuple parsed from it:
# (speakers, time, online status).
for url in events:
    print(url)
    print(yandex_event_page(url))

https://events.yandex.ru/events/vstrecha-rossijskoj-r-09-06-2020
(['Антон Полухин, Яндекс.Такси', 'Александр Зайцев, Solarwinds'], '18:00', 'Online')
https://events.yandex.ru/events/tutorial-catboost-04-06-2020
([], '18:00', 'Offline')
https://events.yandex.ru/events/toloka-04-06-2020
(['Алексей Хахунов, Co-owner/ СTO Dbrain'], '18:00', 'Offline')
https://cloud.yandex.ru/events/137
([], '12:00', 'Online')
http://yandex.ru/promo/events/online/kazakhstan
([], '12:00', 'Online')
https://events.yandex.ru/events/hardware/29-may-2020
([], '19:00', 'Online')
https://cloud.yandex.ru/events/131
([], '18:00', 'Online')
https://events.yandex.ru/events/pytup-27-05-2020
(['Эмиль Шарифуллин, СКБ Контур', 'Семен Ханин, Яндекс'], '18:00', 'Online')
https://cloud.yandex.ru/events/143
([], '12:00', 'Online')
https://cloud.yandex.ru/events/138
([], '12:00', 'Online')
https://cloud.yandex.ru/events/140
([], '12:00', 'Online')
https://cloud.yandex.ru/events/141
([], '12:00', 'Online')
https://cloud.yandex.