🕺 This ipynb file is a draft of aggregation models for free events about IT and programming: webinars, hackathons, broadcasts and courses.

# TODO:
- Make events a common file for all events ("Мероприятия Яндекса" is only one key)
- For hand-written content: updating events may not be safe
- Write parser for pages of yandex events: div class:events-info
    - Handlers for required_event_keys:
        + 'url'
        + 'title'
        + 'date'
        + 'time'
        - 'location': div class="events-info__location"
        + 'online status'
    - Handlers for optional_event_keys
        - 'registration_url': Registration link class="events-info__registration-button"
        + 'price'
        - 'tags'
        + 'language'
        + 'speakers'
        + 'themes'
        - 'type'
    - Check if all required values for the event are found

# Strategy
In short: first we check the main pages, then the event pages

- Load `pages.json` - information about the start web pages: `name`, `url`, `func`
- Load `events.json` - information about current events:
    - `title`,
    - `url` (description),
    - `registration url` (optional, often the same as description url),
    - `date`
    - `time`
    - `location`
    - `online status`
    - `price`
    - `[tags]`
    - `language` (can be checked by lang of theme)
    - `speakers and themes (optional)`
    - `type (optional)`
- Check that events on the main pages are present in `events.json`
- If the event is not in the file, we need to add basic information about it on the second stage
- Past events are transferred to `past_events.json`
- On the next stage, if any information is missing, we need to supplement it by going to the specific URL and parsing that page
- For the registration URL we will add UTM information

In [1]:
import re
import json
import requests
from datetime import datetime, date
from bs4 import BeautifulSoup

In [2]:
# Default HTTP headers, sent with every page request made by get_soup.
headers = requests.utils.default_headers()
# Reference "today", used to infer the year of listing dates that omit it.
current_date = date.today()
# Keys an event record is expected to have / may additionally have.
required_event_keys = ('url', 'title', 'date', 'time', 'location', 'online status')
optional_event_keys = ('registration_url', 'price', 'tags', 'language', 'speakers', 'themes', 'type')

# Read the JSON files as UTF-8 explicitly: events.json is written as UTF-8
# by this notebook, and the platform default encoding is not UTF-8 everywhere.
# NOTE(review): pages.json and tags.json are assumed to be UTF-8 too - confirm.
with open('pages.json', encoding='utf8') as pages_file:
    pages = json.load(pages_file)

with open('events.json', encoding='utf8') as events_file:
    events = json.load(events_file)

with open('tags.json', encoding='utf8') as tags_file:
    tags = json.load(tags_file)

# Russian date vocabulary for parsing listing dates such as "вт, 9 июня".
# Tuple positions are significant: the weekday position equals
# date.weekday() (Monday is 0) and the month position + 1 is the month number.
# The names were stored as mis-decoded UTF-8 and could never match real
# Cyrillic page text; they are restored here.
# NOTE: the key 'gentive' (sic, "genitive") is kept because the date
# converter looks the table up under that exact name.
date_alias = {
    'rus': {
        'weekdays': {
            'short': ('пн', 'вт', 'ср', 'чт', 'пт', 'сб', 'вс'),
        },
        'months': {
            'gentive': ('января', 'февраля', 'марта',
                        'апреля', 'мая', 'июня',
                        'июля', 'августа', 'сентября',
                        'октября', 'ноября', 'декабря'),
        },
    },
}
soups = {}  # temp dict for storage of soup objects for testing


def get_soup(url, timeout=30):
    '''Return the parsed page for url, downloading it at most once.

    Parsed pages are kept in the module-level ``soups`` dict, keyed by url,
    so repeated calls for the same url do not hit the network again.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up; without a
            timeout a stalled connection would block forever.

    Returns:
        The BeautifulSoup object built from the page with 'html.parser'.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status. Failed pages are not cached.
    '''
    try:
        return soups[url]
    except KeyError:
        pass

    response = requests.get(url, headers=headers, timeout=timeout)
    # Refuse to parse (and cache) an error page as if it were the event page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    soups[url] = soup
    return soup


def get_tags(url, title, themes):
    '''Return the tag names whose keywords occur in the event texts.

    The module-level ``tags`` mapping (tag name -> iterable of keywords) is
    matched case-insensitively, as substrings, against the themes, the title
    and the url of the event.

    Args:
        url: event page url.
        title: event title.
        themes: iterable of theme strings; it is not modified.

    Returns:
        List of matching tag names, without duplicates, in the order in
        which the names appear in ``tags``.
    '''
    # Join with a newline so that a keyword can never match across the
    # boundary between two unrelated texts ("...ja" + "va..." is not "java").
    haystack = '\n'.join([*themes, title, url]).lower()
    return [
        key
        for key, keywords in tags.items()
        if any(keyword.lower() in haystack for keyword in keywords)
    ]


def utm_cleaner(url):
    '''Return url with its ``utm_*`` query parameters removed.

    UTM parameters are removed wherever they appear in the query string.
    Other query parameters and the fragment are kept, and a url without UTM
    parameters is returned unchanged.

    Args:
        url: url string, possibly carrying tracking parameters.

    Returns:
        The url without ``utm_*`` parameters; the '?' is dropped as well
        when no parameter is left.
    '''
    head, fragment_sep, fragment = url.partition('#')
    base, query_sep, query = head.partition('?')
    if not query_sep:
        return url

    pairs = query.split('&')
    kept = [pair for pair in pairs if not pair.lower().startswith('utm_')]
    if len(kept) == len(pairs):
        # Nothing to strip: hand the url back untouched.
        return url

    kept = [pair for pair in kept if pair]
    cleaned = base + ('?' + '&'.join(kept) if kept else '')
    return cleaned + fragment_sep + fragment

   
class Event:
    '''One parsed event that registers itself in the module-level ``events``.

    ``events`` is keyed by the event url (with UTM parameters removed); the
    stored record holds every other field of the event.
    '''

    def __init__(self, data):
        '''Store a cleaned copy of data and publish it to ``events``.

        Args:
            data: dict of event fields; must contain the key 'url'.
                The dict passed in is left unmodified.
        '''
        # Copy first: cleaning the url must not alter the caller's dict.
        self.data = dict(data)
        self.data['url'] = utm_cleaner(self.data['url'])
        self.update()

    def update(self):
        '''Merge this event's fields into ``events`` under its url.

        Fields carried by this event overwrite the stored ones; fields that
        only the stored record has (for example values added by hand or by
        a later parsing stage) are kept instead of being wiped.
        '''
        url = self.data['url']
        fields = {key: value for key, value in self.data.items() if key != 'url'}
        events.setdefault(url, {}).update(fields)
    

def main_parser():
    '''Run the registered parser function for every start page in ``pages``.

    Each entry of ``pages`` (loaded from pages.json) provides 'name', 'url'
    and 'func'; 'func' is the name of a module-level parser function, which
    is called as ``func(name, url)``.

    Raises:
        ValueError: if 'func' does not name a callable defined in this module.
    '''
    for page in pages.values():
        # Resolve the parser by name instead of eval()-ing a string built
        # from file contents: a quote in a name or url would break the
        # generated code, and eval would execute whatever the file contains.
        parser = globals().get(page['func'])
        if not callable(parser):
            raise ValueError(f"Unknown parser function: {page['func']!r}")
        parser(page['name'], page['url'])
        
def yandex_date_converter(date_string, today=None):
    '''Parse a listing date such as "вт, 9 июня" into an ISO date string.

    The listing shows no year, so the year is inferred: the most likely year
    is tried first and the weekday printed in the listing is used to confirm
    it. If the weekday does not fit, the neighbouring years are tried, which
    covers e.g. a December event still listed in January.

    Args:
        date_string: text of the form "<weekday>, <day> <month>", with the
            short weekday and genitive month names from ``date_alias``.
        today: date the inference is relative to; defaults to the
            module-level ``current_date``.

    Returns:
        The date as 'YYYY-MM-DD', or None when the weekday matches none of
        the candidate years.

    Raises:
        ValueError: if the string does not have three whitespace-separated
            parts, or uses an unknown weekday or month name.
    '''
    if today is None:
        today = current_date

    lang = 'rus'
    weekdays = date_alias[lang]['weekdays']['short']
    months = date_alias[lang]['months']['gentive']

    # Split the string into its parts and convert them to numbers.
    weekday, event_day, month = date_string.split()
    event_day = int(event_day)
    event_month = months.index(month) + 1
    event_weekday = weekdays.index(weekday[:-1])  # drop the trailing comma

    # End-of-year case: very old entries are not kept on the listing, but in
    # the second half of the year it may already show next year's events.
    likely_year = today.year
    if (event_month - today.month) < -3:
        likely_year += 1

    nearby_years = (today.year - 1, today.year, today.year + 1)
    candidate_years = [likely_year]
    candidate_years += [year for year in nearby_years if year != likely_year]

    # The same day and month falls on different weekdays in neighbouring
    # years, so at most one candidate can match.
    for event_year in candidate_years:
        try:
            event_date = date(event_year, event_month, event_day)
        except ValueError:
            # e.g. 29 February in a year that is not a leap year
            continue
        if event_date.weekday() == event_weekday:
            return event_date.isoformat()
    return None
    

def yandex(name='–ú–µ—Ä–æ–ø—Ä–∏—è—Ç–∏—è –Ø–Ω–¥–µ–∫—Å–∞', url='https://events.yandex.ru/'):
    '''Parse the Yandex events listing and register every event card found.

    For each 'event-card' block an Event is created, which stores the basic
    fields (group, title, date, price, language) in ``events`` under the
    event url.

    Args:
        name: group label stored with every event of this listing.
        url: address of the listing page; relative event links are resolved
            against it.

    Returns:
        The list of event-card elements found on the page.
    '''
    # Local import: keeps this block self-contained.
    from urllib.parse import urljoin

    print(f'Parsing page {name} at {url}')
    soup = get_soup(url)
    event_cards = soup.body.find_all('div', {'class': 'event-card'})
    for event_card in event_cards:
        date_string = event_card.find('div', {'class': 'event-card__date'}).text
        link = event_card.a
        # Links are sometimes relative. urljoin resolves them against the
        # listing url whether or not it ends with a slash, and returns
        # absolute links unchanged.
        Event({'group': name,
               'url': urljoin(url, link['href']),
               'title': link['title'],
               'date': yandex_date_converter(date_string),
               'price': 0,
               'language': 'ru'})
    return event_cards


# Parse the Yandex listing page with the default name and url; the call
# registers the events in `events` and returns the event-card elements.
events_list = yandex()

# Notebook display of the events collected so far.
events

Parsing page –ú–µ—Ä–æ–ø—Ä–∏—è—Ç–∏—è –Ø–Ω–¥–µ–∫—Å–∞ at https://events.yandex.ru/


{'https://events.yandex.ru/events/vstrecha-rossijskoj-r-09-06-2020': {'group': '–ú–µ—Ä–æ–ø—Ä–∏—è—Ç–∏—è –Ø–Ω–¥–µ–∫—Å–∞',
  'title': '–í—Å—Ç—Ä–µ—á–∞ –†–æ—Å—Å–∏–π—Å–∫–æ–π —Ä–∞–±–æ—á–µ–π –≥—Ä—É–ø–ø—ã –ø–æ —Å—Ç–∞–Ω–¥–∞—Ä—Ç–∏–∑–∞—Ü–∏–∏ –°++',
  'date': '2020-06-09',
  'price': 0,
  'language': 'ru'},
 'https://events.yandex.ru/events/tutorial-catboost-04-06-2020': {'group': '–ú–µ—Ä–æ–ø—Ä–∏—è—Ç–∏—è –Ø–Ω–¥–µ–∫—Å–∞',
  'title': '–¢—É—Ç–æ—Ä–∏–∞–ª: –Ω–æ–≤—ã–µ —Ñ–∏—á–∏ CatBoost',
  'date': '2020-06-04',
  'price': 0},
 'https://events.yandex.ru/events/toloka-04-06-2020': {'group': '–ú–µ—Ä–æ–ø—Ä–∏—è—Ç–∏—è –Ø–Ω–¥–µ–∫—Å–∞',
  'title': '–í–µ–±–∏–Ω–∞—Ä –ø–æ —Ä–∞–∑–º–µ—Ç–∫–µ –±–æ–ª—å—à–∏—Ö –º–∞—Å—Å–∏–≤–æ–≤ –¥–∞–Ω–Ω—ã—Ö –æ—Ç –Ø–Ω–¥–µ–∫—Å.–¢–æ–ª–æ–∫–∏ –∏ Dbrain',
  'date': '2020-06-04',
  'price': 0},
 'https://cloud.yandex.ru/events/137': {'group': '–ú–µ—Ä–æ–ø—Ä–∏—è—Ç–∏—è –Ø–Ω–¥–µ–∫—Å–∞',
  'title': '–û–±–ª–∞–∫–æ –≤ —Å—Ñ–µ—Ä–µ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è',
  'date': '2020-06-04',
  'price': 0},
 'http://yande

In [3]:
# Serialize before opening the file: opening with 'w' truncates events.json
# at once, so a serialization error after that point would destroy the
# stored events, including hand-written content.
serialized_events = json.dumps(events, ensure_ascii=False)
with open('events.json', 'w', encoding='utf8') as events_file:
    events_file.write(serialized_events)

In [4]:
def yandex_event_time(soup):
    '''Return the start time of the event as 'HH:MM', or None if not found.

    The page layouts differ, so several known blocks are inspected in order
    of preference and the first time found in one of them is returned.

    Args:
        soup: parsed event page.

    Returns:
        The first 'HH:MM' match from the first block that contains one, or
        None when no known block is present or none of them holds a time.
    '''
    classes = ('events-program-item__time',
               'events-info__registration-date',
               'EventHeader__address')
    pattern = re.compile(r'\d{2}:\d{2}')
    for selector in classes:
        div = soup.find('div', {'class': selector})
        if div is None:
            continue
        match = pattern.search(div.text)
        if match:
            return match.group()
    # A page without a recognisable time must not abort parsing altogether.
    return None


def yandex_event_speakers(soup):
    '''Return the speakers of the event as "name, company" strings.

    Names ('speaker__name' blocks) and companies ('speaker__company' blocks)
    are paired by position; pairing stops at the shorter of the two lists.
    '''
    name_nodes = soup.find_all('div', {'class': 'speaker__name'})
    company_nodes = soup.find_all('div', {'class': 'speaker__company'})
    described = []
    for name_node, company_node in zip(name_nodes, company_nodes):
        described.append('{}, {}'.format(name_node.text, company_node.text))
    return described

def yandex_event_themes(soup):
    '''Return the texts of all 'events-program-item__talk' blocks of the page.'''
    talk_titles = []
    for talk_node in soup.find_all('div', {'class': 'events-program-item__talk'}):
        talk_titles.append(format(talk_node.text))
    return talk_titles


def yandex_event_online_status(soup):
    '''Return the online status of the event.

    The first status block present on the page decides, in this order:
    span 'EventHeader__place', div 'events-info__with-translation',
    div 'events-info__online'.

    Args:
        soup: parsed event page.

    Returns:
        'Online' when the block text is one of the known broadcast labels,
        the stripped block text itself for any other text, and 'Offline'
        when the page has none of the blocks.
    '''
    selectors = (('span', 'EventHeader__place'),
                 ('div', 'events-info__with-translation'),
                 ('div', 'events-info__online'))
    # The labels were stored as mis-decoded UTF-8 and could never match real
    # Cyrillic page text; they are restored here
    # ("broadcast will be available", "live broadcast").
    online_labels = ('Online', 'Будет трансляция', 'Прямая трансляция')

    for tag_name, class_name in selectors:
        node = soup.find(tag_name, {'class': class_name})
        if node is None:
            continue
        status = node.text.strip()
        return 'Online' if status in online_labels else status
    return 'Offline'

def yandex_event_page(url):
    '''Parse one event page and add its details to the record in ``events``.

    Stores 'time', 'speakers', 'themes', 'online_status' and 'tags' in
    ``events[url]``.

    Args:
        url: event url; must already be a key of ``events``.

    Returns:
        Tuple ``(speakers, time, online_status)``.
    '''
    title = events[url]['title']
    soup = get_soup(url)
    event_time = yandex_event_time(soup)
    speakers = yandex_event_speakers(soup)
    themes = yandex_event_themes(soup)
    online_status = yandex_event_online_status(soup)
    # Local name differs from the module-level ``tags`` table on purpose.
    event_tags = get_tags(url, title, themes)

    # Explicit mapping instead of eval() on local variable names.
    # NOTE(review): the key stored here is 'online_status', while
    # required_event_keys lists 'online status' - confirm which one is meant.
    events[url].update({'time': event_time,
                        'speakers': speakers,
                        'themes': themes,
                        'online_status': online_status,
                        'tags': event_tags})
    return speakers, event_time, online_status

# Enrich every stored event with the details parsed from its own page.
# NOTE(review): every url in `events` goes through the Yandex page parser,
# whatever its group - confirm this is intended once other sources are added.
for url in events:
    yandex_event_page(url)

In [5]:
# Notebook display of `events` after the per-event pages have been parsed.
events

{'https://events.yandex.ru/events/vstrecha-rossijskoj-r-09-06-2020': {'group': '–ú–µ—Ä–æ–ø—Ä–∏—è—Ç–∏—è –Ø–Ω–¥–µ–∫—Å–∞',
  'title': '–í—Å—Ç—Ä–µ—á–∞ –†–æ—Å—Å–∏–π—Å–∫–æ–π —Ä–∞–±–æ—á–µ–π –≥—Ä—É–ø–ø—ã –ø–æ —Å—Ç–∞–Ω–¥–∞—Ä—Ç–∏–∑–∞—Ü–∏–∏ –°++',
  'date': '2020-06-09',
  'price': 0,
  'language': 'ru',
  'time': '18:00',
  'speakers': ['–ê–Ω—Ç–æ–Ω –ü–æ–ª—É—Ö–∏–Ω, –Ø–Ω–¥–µ–∫—Å.–¢–∞–∫—Å–∏', '–ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ó–∞–π—Ü–µ–≤, Solarwinds'],
  'themes': ['–ò—Ç–æ–≥–∏ –≤—Å—Ç—Ä–µ—á–∏ –∫–æ–º–∏—Ç–µ—Ç–∞ C++ –≤ –ü—Ä–∞–≥–µ',
   'SG6 Numerics: –∑–∞—á–µ–º –∏ –ø–æ—á–µ–º—É'],
  'online_status': 'Offline',
  'tags': ['C++']},
 'https://events.yandex.ru/events/tutorial-catboost-04-06-2020': {'group': '–ú–µ—Ä–æ–ø—Ä–∏—è—Ç–∏—è –Ø–Ω–¥–µ–∫—Å–∞',
  'title': '–¢—É—Ç–æ—Ä–∏–∞–ª: –Ω–æ–≤—ã–µ —Ñ–∏—á–∏ CatBoost',
  'date': '2020-06-04',
  'price': 0,
  'time': '18:00',
  'speakers': [],
  'themes': [],
  'online_status': 'Offline',
  'tags': ['Data Science']},
 'https://events.yandex.ru/events/toloka-04-06-2020': {'group': '–ú

In [6]:
# Drop all cached pages: get_soup downloads a url again when it is missing
# from `soups`.
soups = {}