In [102]:
from urllib.parse import urlparse
import requests
import scrapy

In [92]:
def postprocess_event_json(event_json):
    """Normalise every value of a scraped event dict to a clean string, in place.

    For each key the value is turned into a single string and tidied up:

    * ``None`` (a field the scraper could not find) becomes ``''`` instead of
      raising ``AttributeError``;
    * a list of text fragments is joined with newlines;
    * non-breaking spaces (``\\xa0``) become ordinary spaces, zero-width
      spaces (``\\u200b``) are removed, and surrounding whitespace is stripped.

    Args:
        event_json: dict whose values are ``str``, ``list`` of ``str`` or
            ``None``.  The dict is modified in place.

    Returns:
        The same ``event_json`` object, for call chaining.
    """
    # Iterate over a snapshot so reassigning values never interferes with
    # the iteration itself.
    for key, value in list(event_json.items()):
        if value is None:
            value = ''
        elif isinstance(value, list):
            value = '\n'.join(value)
        cleaned = value.replace('\xa0', ' ').replace('\u200b', '')
        event_json[key] = cleaned.strip()
    return event_json

# Browser-like User-Agent sent with every page request.
# NOTE(review): presumably some of the sites reject the default requests
# agent -- confirm before changing.
_USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0'


def _first_text(dom, css):
    """Return the first text node matched by ``css``, or '' if nothing matches."""
    return dom.css(css).extract_first() or ''


def _scrape_facebook(dom, url):
    """Scrape a Facebook event page; every field except the title is hard-coded."""
    return {
        'title': _first_text(dom, '#seo_h1_tag ::text'),
        'datetime': 'unheard',
        'location': 'na kudykinoy gore',
        'source': 'facebook',
    }


def _scrape_yandex(dom, url):
    """Scrape an events.yandex.ru event page."""
    place = _first_text(dom, '.event-header__place ::text') or 'Unknown City'
    return {
        'title': _first_text(dom, 'h2.title ::text'),
        'datetime': _first_text(dom, '.event-header__when ::text'),
        'location': place + ' Яндекс',
        'source': url,
        'description': dom.css('.b-static-text ::text').extract(),
    }


def _scrape_meetup(dom, url):
    """Scrape a meetup.com event page."""
    time_parts = dom.css('.eventTimeDisplay time ::text').extract()
    return {
        'title': _first_text(dom, '.pageHead-headline ::text'),
        # Single-space text nodes are dropped; the remaining fragments are
        # stripped and joined into one line.
        'datetime': ' '.join([s.strip() for s in time_parts if s != ' ']),
        'location': ' '.join(dom.css('.venueDisplay ::text').extract()),
        'source': url,
        'description': dom.css('.event-description ::text').extract(),
    }


def _scrape_timepad(dom, url):
    """Scrape a timepad.ru event page."""
    city = _first_text(dom, '.ep3-pagesummary__place-city ::text').strip()
    address = _first_text(dom, '.ep3-pagesummary__place-adress span ::text').strip()
    return {
        'title': _first_text(dom, '.ep-3-hero__subtitle ::text').strip(),
        'datetime': _first_text(dom, '.ep3-pagesummary__time-begin span ::text'),
        # Join only the parts that were found, so a missing city or address
        # does not leave a dangling comma.
        'location': ', '.join(part for part in (city, address) if part),
        'source': url,
        'description': dom.css('.ep3-content .clearfix p ::text').extract(),
    }


def _pick_scraper(host):
    """Return the scraper function for ``host``, or None if the site is unsupported."""
    if host == 'www.facebook.com':
        return _scrape_facebook
    if host == 'events.yandex.ru':
        return _scrape_yandex
    if host == 'www.meetup.com':
        return _scrape_meetup
    if host.endswith('timepad.ru'):
        return _scrape_timepad
    return None


def get_event_json(url, timeout=30):
    """Download an event page and scrape its main fields into a dict.

    The site is recognised by the host part of ``url``:
    ``www.facebook.com``, ``events.yandex.ru``, ``www.meetup.com`` or any
    host ending in ``timepad.ru``.  The scraped dict is passed through
    ``postprocess_event_json`` before it is returned.  Fields whose selector
    matches nothing are scraped as ``''``.

    Args:
        url: Address of the event page.
        timeout: Seconds to wait for the HTTP response; forwarded to
            ``requests.get``.

    Returns:
        dict with ``title``, ``datetime``, ``location`` and ``source`` keys,
        plus ``description`` for every site except Facebook.

    Raises:
        ValueError: If the host of ``url`` is not a supported site.  This is
            checked before any network access.
    """
    host = urlparse(url).netloc
    scraper = _pick_scraper(host)
    if scraper is None:
        raise ValueError('Unsupported event host: %r' % host)

    page = requests.get(url, headers={'User-Agent': _USER_AGENT}, timeout=timeout)
    dom = scrapy.Selector(text=page.content)
    return postprocess_event_json(scraper(dom, url))

In [103]:
def get_events_markup():
    """Return the expected field values for a set of event pages.

    The result maps each event URL to a dict of expected ``title``,
    ``datetime``, ``location``, ``source`` and ``description`` values.
    The Facebook entry has no expected fields.  A new dict is built on
    every call.
    """
    yandex_url = "https://events.yandex.ru/events/yac/29-may-2018/"
    meetup_url = "https://www.meetup.com/PyData-Moscow/events/240661336/"
    timepad_url = "https://sdsj.timepad.ru/event/603431/"
    facebook_url = "https://www.facebook.com/events/1727074767621344/"

    # Insertion order of both the URLs and the per-event fields is kept
    # as-is: it determines the order in which a consumer iterates them.
    expected = {}
    expected[yandex_url] = dict(
        datetime='29 мая, 08:30',
        description='',
        location='Unknown City Яндекс',
        source=yandex_url,
        title='Yet another Conference 2018',
    )
    expected[meetup_url] = dict(
        title='Третий PyData Meetup',
        datetime='Friday, June 23, 2017 6:30 PM to 9:30 PM',
        location='Yandex ул. Льва Толстого, 16  ·  Moscow',
        source=meetup_url,
        description='',
    )
    expected[timepad_url] = dict(
        title="Sberbank Data Science Day 2017",
        datetime="11 ноября 2017 c 9:30 до 22:00",
        location="Москва, ш. Энтузиастов, 5",
        source=timepad_url,
        description='',
    )
    expected[facebook_url] = dict()
    return expected

def test_get_event_json():
    events_markup = get_events_markup()
        
    for url, markup_dict in events_markup.items():
        print('url: %s' % url)
        event_dict = get_event_json(url)
        for k, markup_v in markup_dict.items():
            print('%s: ' % k, end='')
            event_v = event_dict.get(k, 'NONE')
            if event_v == markup_v:
                print('OK')
            elif k in ('description'):
                print('SKIP')
            else:
                print('ERROR:\n%s\n----- should be -----\n%s' % (event_v, markup_v))
        print()

# Run the scraper check and print the per-field report below.
# By default every URL is downloaded, so this cell needs network access.
test_get_event_json()

url: https://events.yandex.ru/events/yac/29-may-2018/
datetime: OK
description: SKIP
location: OK
source: OK
title: OK

url: https://www.meetup.com/PyData-Moscow/events/240661336/
title: OK
datetime: OK
location: OK
source: OK
description: SKIP

url: https://sdsj.timepad.ru/event/603431/
title: OK
datetime: OK
location: OK
source: OK
description: SKIP

url: https://www.facebook.com/events/1727074767621344/

