🕺 This ipynb file is a draft of aggregation models for free IT and programming events: webinars, hackathons, broadcasts and courses.

# TODO:
- We need common classes for StartPage (parent) -> Event (child); all selectors can be described in pages.json and used as strings
- It would be useful to write a common method (in a common handler class) that does simple text extraction from a soup object with the given selectors
- Make a separate module for Yandex-specific features and the common handler class
- For hand-written content: updating events may not be safe
- Write parser for pages of yandex events: div class:events-info
    - Handlers for required_event_keys:
        + 'url'
        + 'title'
        + 'date'
        + 'time'
        + 'location': div class="events-info__location"
        + 'online status'
    - Handlers for optional_event_keys
        + 'registration_url': Registration link 
        + 'price'
        + 'tags'
        + 'language'
        + 'speakers'
        + 'themes'
        - 'type'
    - Check if all required values for the event are found

# Strategy
In short: first we check the main pages, then the event pages

- Load `pages.json` - information about start webpages: `name`, `url`, `func`
- Load `events.json` - information about current events:
    - `title`,
    - `url` (description),
    - `registration url` (optional, often the same as description url),
    - `date`
    - `time`
    - `location`
    - `online status`
    - `price`
    - `[tags]`
    - `language` (can be checked by lang of theme)
    - `speakers and themes (optional)`
    - `type (optional)`
- Check that events on the main pages are present in `events.json`
- If an event is not in the file, we need to add basic information about it at the second stage
- Past events are transferred to `past_events.json`
- At the next stage, if any information is missing, we need to supplement it by going to the specific URL and parsing the page
- For the registration URL we will add UTM information

In [1]:
import re
import json
import requests
from datetime import datetime, date
from bs4 import BeautifulSoup

In [2]:
# Default HTTP headers; passed to requests.get in get_soup.
headers = requests.utils.default_headers()
# Reference date used to guess the year of scraped dates that have none.
current_date = date.today()

# Keys an event record must have / may optionally have.
required_event_keys = ('url', 'title', 'date', 'time', 'location', 'online status')
optional_event_keys = ('registration_url', 'price', 'tags', 'language', 'speakers', 'themes', 'type')


# events.json is written by this notebook as UTF-8 with non-ASCII text and
# pages.json is indexed by Cyrillic keys, so read the data files as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open('pages.json', encoding='utf-8') as pages_file:
    pages = json.load(pages_file)

with open('events.json', encoding='utf-8') as events_file:
    events = json.load(events_file)

with open('tags.json', encoding='utf-8') as tags_file:
    tags = json.load(tags_file)

# One city per line; a trailing newline in the file yields a final '' entry.
with open('cities.txt', encoding='utf-8') as cities_file:
    cities = cities_file.read().split('\n')

    
# Vocabulary for parsing scraped date strings, grouped by language.
# NOTE: the key 'gentive' (sic) is looked up verbatim by yandex_date_converter.
date_alias = {
    'rus': {
        'weekdays': {
            # Monday first, matching the numbering of date.weekday().
            'short': ('пн', 'вт', 'ср', 'чт', 'пт', 'сб', 'вс'),
        },
        'months': {
            # Genitive month names, January first.
            'gentive': (
                'января', 'февраля', 'марта', 'апреля',
                'мая', 'июня', 'июля', 'августа',
                'сентября', 'октября', 'ноября', 'декабря',
            ),
        },
    },
}
# Cache of parsed pages keyed by URL (temporary, for testing).
soups = dict()

def get_soup(url, timeout=30):
    '''Return the parsed page for url, downloading it at most once.

    Parsed pages are kept in the module-level soups dict, so repeated calls
    with the same url do not hit the network again.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before requests gives up.

    Returns:
        The BeautifulSoup object built from the page text.
    '''
    if url not in soups:
        # Without a timeout requests can block forever on a stalled server.
        page = requests.get(url, headers=headers, timeout=timeout).text
        soups[url] = BeautifulSoup(page, 'html.parser')
    return soups[url]


def get_tags(url, title, themes):
    '''Return the tag names whose aliases occur in the event's texts.

    The module-level tags dict maps a tag name to an iterable of alias
    strings. A tag is selected when any of its aliases is a case-insensitive
    substring of one of the themes, the title or the url.

    Args:
        url: event url.
        title: event title.
        themes: iterable of theme strings; it is not modified.

    Returns:
        Sorted list of matching tag names, without duplicates.
    '''
    # Join with a separator so an alias cannot match across the boundary
    # between two unrelated texts; lower-case the haystack only once.
    haystack = '\n'.join([*themes, title, url]).lower()
    matched = {
        key
        for key, aliases in tags.items()
        if any(alias.lower() in haystack for alias in aliases)
    }
    return sorted(matched)


def utm_cleaner(url):
    '''Return url with its utm_* query parameters removed.

    UTM parameters are dropped wherever they appear in the query string;
    all other parameters and the fragment are kept as written. A url
    without UTM parameters is returned unchanged.
    '''
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    if not parts.query:
        return url
    params = parts.query.split('&')
    kept = [param for param in params
            if not param.lower().startswith('utm_')]
    if len(kept) == len(params):
        # Nothing to strip: avoid re-assembling (and normalising) the url.
        return url
    return urlunsplit(parts._replace(query='&'.join(kept)))

   
class Event:
    '''An event record that registers itself in the module-level events dict.

    data is a dict of event fields and must contain 'url'. UTM parameters
    are stripped from the url before the event is stored.
    '''

    def __init__(self, data):
        # Work on a copy so the caller's dict is left untouched.
        self.data = dict(data)
        self.data['url'] = utm_cleaner(self.data['url'])
        self.update()

    def update(self):
        '''Store the event in events under its url.

        The stored record holds every field except 'url' and replaces any
        record previously stored under the same url.
        '''
        url = self.data['url']
        events[url] = {key: value
                       for key, value in self.data.items()
                       if key != 'url'}
    

def main_parser():
    '''Run the configured parser function for every start page in pages.

    Each entry of pages provides 'name', 'url' and 'func'. 'func' is the
    name of a module-level parser function, called as func(name, url).

    Raises:
        NameError: if 'func' does not name a callable defined in this module.
    '''
    for page in pages.values():
        # Look the parser up by name instead of eval-ing source text built
        # from JSON values: a quote or backslash in name/url would break the
        # generated code, and arbitrary expressions could be executed.
        func = page['func']
        handler = globals().get(func)
        if not callable(handler):
            raise NameError(f"parser function '{func}' is not defined")
        handler(page['name'], page['url'])
        
def yandex_date_converter(date_string):
    '''Convert a date such as "вт, 9 июня" into an ISO date string.

    The string holds a short weekday, a day number and a genitive month
    name, but no year. The year is first guessed from current_date; if the
    weekday does not fit that year, the neighbouring years are tried. The
    same calendar date never falls on the same weekday in two adjacent
    years, so at most one candidate can match.

    Returns:
        The date as 'YYYY-MM-DD', or None when no candidate year gives the
        stated weekday.

    Raises:
        ValueError: if the string does not have three parts or contains an
            unknown weekday or month name.
    '''
    lang = 'rus'
    weekdays = date_alias[lang]['weekdays']['short']
    months = date_alias[lang]['months']['gentive']

    # Split the string into its parts and turn them into numbers.
    weekday, event_day, month = date_string.split()
    event_day = int(event_day)
    event_month = months.index(month.lower()) + 1
    # The weekday is followed by a comma in the source format.
    event_weekday = weekdays.index(weekday.rstrip(',').lower())

    # Year-end case: very old records are not kept, but in the second half
    # of the year there may be records for the next year.
    guessed_year = current_date.year
    if (event_month - current_date.month) < -3:
        guessed_year += 1

    fallback_years = (current_date.year, current_date.year + 1,
                      current_date.year - 1)
    candidates = [guessed_year]
    candidates += [year for year in fallback_years if year != guessed_year]

    for event_year in candidates:
        try:
            event_date = date(event_year, event_month, event_day)
        except ValueError:
            # E.g. 29 February in a non-leap year: not a valid candidate.
            continue
        if event_date.weekday() == event_weekday:
            return event_date.isoformat()
    return None
    

def yandex(name='Мероприятия Яндекса', url='https://events.yandex.ru/'):
    '''Parse the Yandex events start page and register every event card.

    For each 'event-card' div an Event is created (which stores it in the
    module-level events dict) with the card's url, title and date. Price 0
    and language 'ru' are hard-coded for this source.

    Args:
        name: group name stored with every event.
        url: address of the start page.

    Returns:
        The list of event-card elements found on the page.
    '''
    from urllib.parse import urljoin

    print(f'Parsing page {name} at {url}')
    soup = get_soup(url)
    event_cards = soup.body.find_all('div', {'class':'event-card'})
    for event_card in event_cards:
        date_string = event_card.find('div', {'class':'event-card__date'}).text
        event_date = yandex_date_converter(date_string)
        # Links may be relative; urljoin resolves them against the page url
        # and leaves absolute links unchanged.
        event_url = urljoin(url, event_card.a['href'])
        data = {'group':name,
                'url':event_url,
                'title':event_card.a['title'],
                'date':event_date,
                'price':0,
                'language':'ru'}
        # Creating the Event stores it in events as a side effect.
        Event(data)
    return event_cards


# Parse the Yandex start page; each event found is also stored in events.
events_list = yandex()

Parsing page Мероприятия Яндекса at https://events.yandex.ru/


In [None]:
# Persist the collected events; ensure_ascii=False writes non-ASCII text
# as-is instead of \u escapes.
with open('events.json', 'w', encoding='utf8') as out_file:
    out_file.write(json.dumps(events, ensure_ascii=False))

In [3]:
def get_selectors_content(soup, selectors, elements = 'all', content_type='text'):
    '''Collect text or links from the elements matched by selectors.

    Args:
        soup: object with a find_all(tag, attrs) method.
        selectors: dict mapping an attribute value to a (tag, attribute)
            pair; each entry is searched as find_all(tag, {attribute: value}).
        elements: 'all' or the maximum number of results to keep.
        content_type: 'text' for stripped element text, 'href' for the
            element's href attribute (elements without one are skipped).

    Returns:
        A list of strings. When elements == 1, a single string instead:
        the first result, or '' if nothing was found.

    Raises:
        ValueError: if content_type is neither 'text' nor 'href'.
    '''
    if content_type not in ('text', 'href'):
        raise ValueError(f"unknown content_type: {content_type!r}")

    overlaps = []
    for key, (tag, attr) in selectors.items():
        overlaps.extend(soup.find_all(tag, {attr: key}))

    if content_type == 'text':
        content = [overlap.text.strip() for overlap in overlaps]
    else:
        hrefs = (overlap.get('href') for overlap in overlaps)
        content = [href for href in hrefs if href is not None]

    if elements != 'all':
        content = content[:elements]

    if elements == 1:
        return content[0] if content else ''
    return content


def yandex_event_time(soup):
    '''Return the first H:MM / HH:MM time on the event page, or ''.

    The text is taken from the first element matched by the 'time'
    selectors configured for 'Яндекс' in pages. An empty string is returned
    when that text contains no time.
    '''
    selectors = pages['Яндекс']['time']['selectors']
    raw_time = get_selectors_content(soup, selectors, elements = 1)
    # search() returns None when the text holds no time.
    match = re.search(r'\d{1,2}:\d{2}', raw_time)
    return match.group() if match else ''


def yandex_event_location(soup):
    '''Return the text of the first location element on the page, or ''.'''
    return get_selectors_content(
        soup,
        pages['Яндекс']['location']['selectors'],
        elements=1,
    )


def yandex_event_speakers(soup):
    '''Return the texts of all speaker elements on the page.'''
    speaker_selectors = pages['Яндекс']['speakers']['selectors']
    speakers = get_selectors_content(soup, speaker_selectors)
    return speakers


def yandex_event_speakers_companies(soup):
    '''Return the texts of all speaker-company elements on the page.'''
    company_selectors = pages['Яндекс']['speakers_companies']['selectors']
    companies = get_selectors_content(soup, company_selectors)
    return companies


def yandex_event_themes(soup):
    '''Return the texts of all theme elements on the page.'''
    return get_selectors_content(
        soup,
        pages['Яндекс']['themes']['selectors'],
    )


def yandex_event_online_status(soup):
    '''Return 'Online' or 'Offline' for the event page.

    The page counts as online only when the text of its first status
    element is exactly one of the known broadcast labels.
    '''
    online_labels = ('Online', 'Будет трансляция', 'Прямая трансляция')
    status_selectors = pages['Яндекс']['online_status']['selectors']
    label = get_selectors_content(soup, status_selectors, elements=1)
    return 'Online' if label in online_labels else 'Offline'
        

def yandex_event_reg_url(soup):
    '''Return the href of the first registration link on the page, or ''.'''
    link_selectors = pages['Яндекс']['reg_url']['selectors']
    return get_selectors_content(
        soup,
        link_selectors,
        elements=1,
        content_type='href',
    )


def yandex_event_page(url):
    '''Fill events[url] with details scraped from the event's own page.

    Stores 'time', 'location', 'speakers', 'speakers_companies', 'themes',
    'online_status' and 'reg_url' from the matching yandex_event_* function,
    then 'tags' computed from the url, title and themes.

    Raises:
        KeyError: if url has no record in events.
    '''
    soup = get_soup(url)
    event = events[url]
    # Explicit dispatch table instead of eval-ing generated function names.
    handlers = {
        'time': yandex_event_time,
        'location': yandex_event_location,
        'speakers': yandex_event_speakers,
        'speakers_companies': yandex_event_speakers_companies,
        'themes': yandex_event_themes,
        'online_status': yandex_event_online_status,
        'reg_url': yandex_event_reg_url,
    }
    for key, handler in handlers.items():
        event[key] = handler(soup)
    # Stored directly: a local named tags would shadow the module-level dict.
    event['tags'] = get_tags(url, event['title'], event['themes'])


# Enrich every stored event with details scraped from its own page.
# NOTE(review): the 'Яндекс' selectors are applied to every url in events,
# whatever its host - confirm this is intended.
for url in events:
    print(url)
    yandex_event_page(url)

https://events.yandex.ru/events/vstrecha-rossijskoj-r-09-06-2020
https://events.yandex.ru/events/tutorial-catboost-04-06-2020
https://events.yandex.ru/events/toloka-04-06-2020
https://cloud.yandex.ru/events/137
http://yandex.ru/promo/events/online/kazakhstan
https://events.yandex.ru/events/hardware/29-may-2020
https://cloud.yandex.ru/events/131
https://events.yandex.ru/events/pytup-27-05-2020
https://cloud.yandex.ru/events/143
https://cloud.yandex.ru/events/138
https://cloud.yandex.ru/events/140
https://cloud.yandex.ru/events/141
https://cloud.yandex.ru/events/146
https://cloud.yandex.ru/events/142
https://cloud.yandex.ru/events/139
https://yandex.ru/promo/events/online/belarus
https://events.yandex.ru/events/hardware/19-june-2020


In [4]:
# Display the collected events dict as the cell output.
events

{'https://events.yandex.ru/events/vstrecha-rossijskoj-r-09-06-2020': {'group': 'Мероприятия Яндекса',
  'title': 'Встреча Российской рабочей группы по стандартизации С++',
  'date': '2020-06-09',
  'price': 0,
  'language': 'ru',
  'time': '18:00',
  'location': '',
  'speakers': ['Антон Полухин', 'Александр Зайцев'],
  'speakers_companies': ['Яндекс.Такси', 'Solarwinds'],
  'themes': ['Итоги встречи комитета C++ в Праге',
   'SG6 Numerics: зачем и почему'],
  'online_status': 'Offline',
  'reg_url': 'https://forms.yandex.ru/surveys/10020198.30a2db645e98ed727eb30244eea2f378109b3302',
  'tags': ['C++']},
 'https://events.yandex.ru/events/tutorial-catboost-04-06-2020': {'group': 'Мероприятия Яндекса',
  'title': 'Туториал: новые фичи CatBoost',
  'date': '2020-06-04',
  'price': 0,
  'time': '18:00',
  'location': '',
  'speakers': [],
  'speakers_companies': [],
  'themes': [],
  'online_status': 'Offline',
  'reg_url': 'https://forms.yandex.ru/surveys/10020098.6a3206e40ab2fbd175aad29cd