In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from math import ceil
from tqdm import tqdm_notebook
import tqdm
from collections import Counter
from operator import itemgetter
from datetime import date, datetime

import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import json

## TASS

In [None]:
# Collect TASS article links for January-February 2020 from the GDELT DOC API.
# `dates` maps 'YYYY.MM.DD' -> list of article URLs found for that day.
# 2020 is a leap year, so February has 29 days.
months = {1:31, 2:29, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}
# Each day is queried in four time windows (presumably to stay below the
# `maxrecords=250` cap of a single request — confirm).
times = [['000000', '060000'], ['060000', '120000'], ['120000', '180000'], ['180000', '235959']]
dates = dict()
for month in tqdm_notebook(list(months.keys())[:2]):
    for day in tqdm_notebook(list(range(months[month]))):
        date = f'2020{month:02d}{day+1:02d}'
        day_key = f'2020.{month:02d}.{day+1:02d}'
        dates[day_key] = []
        # Unpack the window instead of looping with a variable called `time`:
        # that name would rebind the imported `time` module for the whole kernel
        # and break every later `time.sleep(...)`.
        for window_start, window_end in times:
            url = f'https://api.gdeltproject.org/api/v2/doc/doc?query=%20(domain:.tass.ru%20OR%20domainis:tass.ru)%20sourcelang:rus&mode=ArtList&maxrecords=250&sort=DateDesc&format=html&startdatetime={date}{window_start}&enddatetime={date}{window_end}'
            r = requests.get(url)
            soup = BeautifulSoup(r.text)
            try:
                window_links = [a['href'] for a in soup.body.table.find_all('a')]
            except AttributeError:
                # The response has no <body>/<table>; report the day and move on
                # instead of aborting the whole crawl.
                print(date)
                continue
            dates[day_key].extend(window_links)
            print(day_key, len(window_links))

In [None]:
# Download every collected TASS article and store the corpus in `tass_2020`.
# NOTE(review): `get_article_tass` is not defined in the visible part of the
# notebook — make sure the cell defining it is run first.
tass_2020 = pd.DataFrame(columns = ['Media', 'Date', 'Link', 'Text', 'Dep', 'Tags'])

# Rows are accumulated in a plain list and the frame is built from it:
# `DataFrame.append` no longer exists in pandas >= 2.0, and the AttributeError it
# raises there would be silently swallowed by the `except` below.
tass_rows = []
tass_seen_links = set()   # O(1) duplicate check instead of `Link.unique()` per link

for date in tqdm_notebook(list(dates.keys())):
    for link in tqdm_notebook(dates[date]):
        if link in tass_seen_links:
            continue
        try:
            text = get_article_tass(link)
        except (AttributeError, IndexError):
            # Page layout did not match the parser; report the link and skip it.
            print(link)
            continue
        tass_seen_links.add(link)
        tass_rows.append({'Media':'tass.ru', 'Date':date, 'Link':link, 'Text':text})
    # Checkpoint on days ending in '1' (01, 11, 21, 31).
    if date.endswith('1'):
        tass_2020 = pd.DataFrame(tass_rows, columns=tass_2020.columns)
        tass_2020.to_csv('tass_2020.csv', encoding='utf-8')
tass_2020 = pd.DataFrame(tass_rows, columns=tass_2020.columns)
tass_2020.to_csv('tass_2020.csv', encoding='utf-8')

## RIA

In [None]:
def get_article_ria(link):
    """Fetch a ria.ru article and return '<heading>. <body text>'.

    The body is the space-joined text of every `div.article__text` inside the
    article container. If the page lacks the expected elements, an error line is
    printed and None is returned.
    """
    page = BeautifulSoup(requests.get(link).text)
    try:
        title = page.h1.text
        container = page.find('div', 'article__body js-mediator-article mia-analytics')
        body = ' '.join(block.text for block in container.find_all('div', 'article__text'))
    except AttributeError:
        # A missing <h1> or article container shows up as None -> AttributeError.
        print('Error ', link)
        return None
    return '. '.join((title, body))

In [None]:
# Build the ria.ru daily-archive URL for every day of January-February 2020.
# `dates` maps 'YYYY.MM.DD' -> 'https://ria.ru/YYYYMMDD/'.
# 2020 is a leap year, so February has 29 days.
months = {1:31, 2:29, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}
dates = dict()
for month in list(months.keys())[:2]:
    for day in range(months[month]):
        link = f'https://ria.ru/2020{month:0>2}{day+1:0>2}/'
        dates[f'2020.{month:0>2}.{day+1:0>2}'] = link
dates

In [None]:
# Collect RIA article links for every day in `dates` by scrolling the daily
# archive page in Chrome, then save them to 'ria_links_2020.json'.
# `dates` is expected to map 'YYYY.MM.DD' -> archive URL here.
CHROMEDRIVER_PATH = 'K:\\Programs\\chromedriver_win32\\chromedriver.exe'  # machine-specific path; adjust locally

links_ria_2020 = dict()
for date in tqdm_notebook(list(dates.keys())):

    browser = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        browser.get(dates[date])

        # Click the 'list-more' element once, then page down repeatedly so that
        # further items get loaded into the page before it is parsed.
        more_results = browser.find_element_by_class_name('list-more')
        more_results.click()

        element = browser.find_element_by_tag_name('body')
        for i in range(200):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)

        page_source = browser.page_source
    finally:
        # Always end the driver session, even when a step above raises; otherwise
        # a browser and chromedriver process is left behind for the failed day.
        browser.quit()

    soup = BeautifulSoup(page_source, 'lxml')

    links = [a['href'] for a in soup.find_all('a', 'list-item__title')]
    links_ria_2020[date] = links

    print(f'{date}: {len(links)}')

with open('ria_links_2020.json', 'w', encoding = 'utf-8') as f:
    json.dump(links_ria_2020, f)

In [None]:
# Download every collected RIA article into `ria_2020`, saving after each day.
ria_2020 = pd.DataFrame(columns = ['Media', 'Date', 'Link', 'Text', 'Dep', 'Tags'])

# Rows go into a list (`DataFrame.append` is gone in pandas >= 2.0 and its
# AttributeError would be swallowed below). Duplicates are tracked in a set:
# `link in frame.Link` tests the index labels, not the stored links, so it
# never detected a duplicate.
ria_rows = []
ria_seen_links = set()

for date in tqdm_notebook(list(links_ria_2020.keys())):
    for link in tqdm_notebook(links_ria_2020[date]):
        if link in ria_seen_links:
            continue
        try:
            text = get_article_ria(link)
        except (AttributeError, IndexError):
            print(link)
            continue
        ria_seen_links.add(link)
        ria_rows.append({'Media':'ria.ru', 'Date':date, 'Link':link, 'Text':text})
    ria_2020 = pd.DataFrame(ria_rows, columns=ria_2020.columns)
    ria_2020.to_csv('ria_2020.csv', encoding = 'utf-8')

ria_2020

## Interfax

In [None]:
def get_links_interfax(date): #format: '26.11.2019'
    """Collect article links published on interfax.ru on a single day.

    The site search is queried with the genitive month name of `date` and with
    `date` as both start and end of the date range; every result page (40 results
    per page) is scanned for article anchors.

    Args:
        date: day to search, formatted 'DD.MM.YYYY'.

    Returns:
        A tuple `(num_of_articles, len(links), links)`: the total reported by the
        search page, the number of distinct links kept, and the set of links.

    Raises:
        AttributeError: if the result counter or the 'leftside' container is
            missing from a page.
    """
    from urllib.parse import quote

    months = {'01':'января', '02':'февраля', '03':'марта', '04':'апреля',
              '05':'мая', '06':'июня', '07':'июля', '08':'августа',
              '09':'сентября', '10':'октября', '11':'ноября', '12':'декабря'}
    # The query word is percent-encoded as cp1251 bytes (e.g. 'я' -> '%FF').
    sw = quote(months[date.split('.')[1]], encoding='cp1251')

    def search_url(page):
        # URL of the `page`-th (1-based) result page for this day.
        return f'https://www.interfax.ru/search/?sw={sw}&df={date}&dt={date}&sec=0&p=page_{page}'

    soup = BeautifulSoup(requests.get(search_url(1)).text)

    num_of_articles = int(soup.find('div', 'sPageResult__total').text.split()[1])
    num_of_pages = ceil((num_of_articles)/40)

    links = set()

    for page in range(1, num_of_pages + 1):
        # Page 1 is already loaded for the counter; only fetch the later pages.
        if page > 1:
            soup = BeautifulSoup(requests.get(search_url(page)).text)

        for anchor in soup.find('div', 'leftside').find_all('a'):
            href = anchor.get('href')
            if href is None:
                # Anchors without href: the 'active' one is expected and ignored,
                # anything else is printed for inspection.
                if anchor.get('class') != ['active']:
                    print(anchor)
                continue
            # Skip section pages, pager/search links, story pages and photo pages.
            if (href.endswith('/') or
                    href.endswith('=0') or
                    'story' in href or
                    'sw' in href or
                    'photo' in href):
                continue
            links.add(href if 'http' in href else 'https://www.interfax.ru' + href)

    return num_of_articles, len(links), links

In [None]:
def get_article_interfax(link):
    """Fetch an interfax.ru article and return '<heading>. <text>'.

    Text taken from the parsed page is re-encoded (encode as cp1252, decode as
    cp1251) before use. If the heading or the paragraphs cannot be encoded as
    cp1252, the offending content is printed and that part is left out of the
    result.
    """
    def recode(raw):
        # Reinterpret the characters as cp1251 via their cp1252 bytes.
        return raw.encode(encoding='cp1252').decode('cp1251')

    page = BeautifulSoup(requests.get(link).text)
    assembled = ''

    try:
        assembled += recode(page.article.h1.text)
    except UnicodeEncodeError :
        print(link, page.article.h1.text)

    try:
        paragraphs = [recode(p.text) for p in page.article.find_all('p') if p.text]
        assembled = assembled + '. ' + ' '.join(paragraphs)
    except UnicodeEncodeError:
        print(link, page.article.find_all('p'))

    return assembled

## Kommersant

In [None]:
def get_links_kommers(date): #date in format '26.11.2019'
    """Collect article links published on kommersant.ru on a single day.

    Runs the site search restricted to `date`, reads the number of hits from the
    first result page and then walks all result pages (10 hits per page).

    Args:
        date: day to search, formatted 'DD.MM.YYYY' (converted with `str`).

    Returns:
        List of article URLs with any query string stripped.

    Raises:
        AttributeError / IndexError: if the first page does not have the expected
            layout (the hit counter cannot be located).
    """
    date = str(date)
    search_url = 'https://www.kommersant.ru/search/results'

    def fetch_page(page):
        # Query parameters are passed as a dict so the URL carries no stray
        # whitespace (a backslash-continued string literal would embed the
        # indentation of the continuation lines into the URL).
        params = {
            'places': '', 'categories': '', 'isbankrupt': '',
            'datestart': date, 'dateend': date,
            'sort_type': 1, 'sort_dir': '', 'regions': '', 'results_count': '',
            'page': page,
            'search_query': 'и',  # letter "и", can't be empty
            'sort_by': 1, 'search_full': 1, 'time_range': 2,
            'dateStart': date, 'dateEnd': date,
        }
        return BeautifulSoup(requests.get(search_url, params=params).text)

    #get first page of results and find the number of such pages (each contains 10 articles)
    soup = fetch_page(1)
    num_of_articles=(soup.body.find_all('div', {'class':'layout'})[4]
                     .find('form', {'id':'frmSearch'})
                     .find('h3').text.split()[4])

    #gather links from the pages
    results_day=[]
    for page in range(ceil(int(num_of_articles)/10)):

        soup = fetch_page(page+1)

        try:
            results=(soup.body
                 .find_all('div', {'class':'layout'})[4]
                 .find('div', {'class':'grid grid--main'})
                 .find('div', {'class':'grid_cell grid_cell_large'})
                 .find('div', {'class':'search_results_holder'})
                 .find_all('div', {'class':'search_results_item'}))
        except AttributeError:
            # Unexpected layout: report the page index and skip it, rather than
            # reusing the previous page's results (or hitting an undefined name).
            print(page)
            continue

        for i in results:
            results_day.append('https://www.kommersant.ru'+i.find('h2').find('a')['href'].split('?')[0])

    return results_day

In [None]:
def get_article_kommers(link): #like this: 'https://www.kommersant.ru/doc/4149269'
    """Fetch a kommersant.ru article and return '<header> <text>'.

    The header is '<h2> <h1>' when a subtitle exists, otherwise just the <h1>.
    Returns None (after printing a diagnostic) on HTTP 404 or when the page does
    not have the expected layout.
    """
    response = requests.get(link)
    page = BeautifulSoup(response.text)

    if response.status_code == 404:
        print(response, link)
        return None

    try:
        # The article lives in the fifth 'layout' div of the page body.
        article = page.body.find_all('div', {'class':'layout'})[4]
        title_box = article.header.find('div', 'text')
        try:
            header = ' '.join([title_box.h2.text, title_box.h1.text])
        except AttributeError:
            # some articles don't have subtitles
            header = title_box.h1.text

        paragraphs = article.find('div', 'article_text_wrapper').find_all('p')
        text = ' '.join(p.text for p in paragraphs)
        return ' '.join([header, text])
    except (AttributeError, IndexError) as problem:
        print("Something's wrong with", link, '\n', problem)
        return None

In [None]:
# Collect Kommersant article links for every day of 2019 (not a leap year) and
# rewrite 'links_commers_2019.json' after each completed month.
links_commers_2019 = {}
months = dict(zip(range(1, 13), (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)))
years = list(range(2008, 2021))

for month in tqdm_notebook(list(months)):
    for day in tqdm_notebook(list(range(months[month]))):
        search_day = f'{day+1:0>2}.{month:0>2}.2019'      # format expected by get_links_kommers
        storage_key = f'2019.{month:0>2}.{day+1:0>2}'
        links = get_links_kommers(search_day)
        links_commers_2019[storage_key] = links
    with open('links_commers_2019.json', 'w', encoding = 'utf-8') as f:
        json.dump(links_commers_2019, f)
links_commers_2019

In [None]:
# Download every collected Kommersant article into `commers_2019`.
commers_2019 = pd.DataFrame(columns = ['Media', 'Date', 'Link', 'Text', 'Dep', 'Tags'])
# Rows are gathered in a list and turned into a frame at each checkpoint:
# `DataFrame.append` does not exist in pandas >= 2.0 and copied the whole frame
# on every call.
commers_rows = []
for date in tqdm_notebook(list(links_commers_2019.keys())):
    for link in links_commers_2019[date]:
        commers_rows.append({'Media':'kommersant.ru', 'Date':date, 'Link':link,
                             'Text':get_article_kommers(link)})
    # Checkpoint on days ending in '1' (01, 11, 21, 31).
    if date.endswith('1'):
        commers_2019 = pd.DataFrame(commers_rows, columns=commers_2019.columns)
        commers_2019.to_csv('commers_2019.csv', encoding='utf-8')
commers_2019 = pd.DataFrame(commers_rows, columns=commers_2019.columns)
commers_2019.to_csv('commers_2019.csv', encoding='utf-8')
commers_2019

## Vedomosti

In [None]:
# Collect Vedomosti article links for January-February 2020 from the GDELT DOC
# API, one request per day. `dates` maps 'YYYY.MM.DD' -> list of URLs.
# 2020 is a leap year, so February has 29 days.
months = {1:31, 2:29, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}
dates = dict()
for month in tqdm_notebook(list(months.keys())[:2]):
    for day in tqdm_notebook(list(range(months[month]))):
        date = f'2020{month:02d}{day+1:02d}'
        day_key = f'2020.{month:02d}.{day+1:02d}'
        r = requests.get(f'https://api.gdeltproject.org/api/v2/doc/doc?query=%20(domain:.vedomosti.ru%20OR%20domainis:vedomosti.ru)%20sourcelang:rus&mode=ArtList&maxrecords=250&sort=DateDesc&format=html&startdatetime={date}000000&enddatetime={date}235959')
        soup = BeautifulSoup(r.text)
        try:
            day_links = [a['href'] for a in soup.body.table.find_all('a')]
        except AttributeError:
            # The response has no <body>/<table>; record an empty day and report
            # it instead of aborting the crawl.
            dates[day_key] = []
            print(date)
            continue
        dates[day_key] = day_links
        print(day_key, len(day_links))

In [None]:
# Download Vedomosti articles into `vedomosti_2020`.
# NOTE(review): `links_2020` is not defined in the visible part of the notebook;
# presumably it is the date -> links dict built for Vedomosti — confirm, otherwise
# this cell fails on a fresh kernel.
vedomosti_2020 = pd.DataFrame(columns = ['Media', 'Date', 'Link', 'Text', 'Dep', 'Tags'])
# Rows are gathered in a list: `DataFrame.append` is gone in pandas >= 2.0, and
# inside the `try` its AttributeError would have been reported as a bad link.
vedomosti_rows = []

for date in tqdm_notebook(links_2020.keys()):
    for link in tqdm_notebook(links_2020[date]):
        if 'vedomosti' in link:
            try:
                r = requests.get(link)
                soup = BeautifulSoup(r.text)
                heading = soup.h1.text.strip()
                text = ' '.join([p.text for p in soup.find("div", 'article__boxes').
                                 find_all('p', 'box-paragraph__text')])
            except (AttributeError, IndexError):
                # Page layout did not match; report the link and skip it.
                print(link)
                continue
            vedomosti_rows.append({'Media':'vedomosti.ru', 'Date':date, 'Link':link,
                                   'Text':'. '.join([heading, text])})
    # Checkpoint on days ending in '1' (01, 11, 21, 31).
    if date.endswith('1'):
        vedomosti_2020 = pd.DataFrame(vedomosti_rows, columns=vedomosti_2020.columns)
        vedomosti_2020.to_csv('vedomosti_2020.csv', encoding='utf-8')
vedomosti_2020 = pd.DataFrame(vedomosti_rows, columns=vedomosti_2020.columns)
vedomosti_2020.to_csv('vedomosti_2020.csv', encoding='utf-8')

## Rossiyskaya Gazeta

In [None]:
def get_links_rosgazeta(date): # like this: '01.03.2019'
    """Collect article links published on rg.ru on a single day.

    Opens the site search for `date` in Chrome, repeatedly scrolls down and
    clicks the 'b-link-btn' element to load more results, then extracts the
    links from the final page source.

    Args:
        date: day to search, formatted 'DD.MM.YYYY'.

    Returns:
        List of absolute article URLs ('https://rg.ru' + href).

    Raises:
        selenium.common.exceptions.NoSuchElementException: if the result counter
            or the load-more button is not found (callers catch this).
    """
    # NOTE(review): driver path is machine-specific.
    browser = webdriver.Chrome('C:\\Users\\manal\\chromedriver')
    try:
        browser.set_window_size(1080, 1024)

        # NOTE(review): the query string contains '&?keywords=' (with a '?');
        # left as is because changing it may alter the search results — confirm.
        browser.get(f'https://rg.ru/search/?from={date}&to={date}&?keywords=и')

        time.sleep(4)

        # First token of the 'b-search-info__meta' text, used below to decide how
        # many load-more rounds to run (one round per 15, plus one).
        num_of_pages = int(browser.find_element_by_class_name('b-search-info__meta').text.split()[0])

        body = browser.find_element_by_tag_name('body')

        for i in range(num_of_pages//15+1):
            for j in range(6):
                body.send_keys(Keys.PAGE_DOWN)
            try:
                browser.find_element_by_class_name('b-link-btn').click()
            except selenium.common.exceptions.ElementClickInterceptedException:
                # Button was covered by another element; scroll again and retry.
                continue
            time.sleep(2)

        page_source = browser.page_source
    finally:
        # End the driver session even when an element lookup raises; callers catch
        # NoSuchElementException and continue, which would otherwise leave one
        # browser open per failed day.
        browser.quit()

    soup = BeautifulSoup(page_source, 'lxml')

    return ['https://rg.ru'+h2.a['href'] for h2 in soup.find_all('h2', 'b-news-inner__list-item-title')]

In [None]:
def get_article_rosgazeta(link):
    """Fetch an rg.ru article and return '<heading>. <text>'.

    Two container layouts are tried in order; the first one that yields both a
    heading and paragraphs wins. If neither matches, the link is printed and
    None is returned.
    """
    page = BeautifulSoup(requests.get(link).text)
    container_classes = ('b-material-wrapper b-material-wrapper_art',
                         'b-material-wrapper__text')
    for css_class in container_classes:
        try:
            title = page.h1.text.strip()
            body = ' '.join(p.text for p in page.find('div', css_class).find_all('p'))
            return '. '.join([title, body])
        except AttributeError:
            # Missing <h1> or container for this layout; try the next one.
            continue
    print(link)

In [None]:
# Collect Rossiyskaya Gazeta links for 17 February 2020 through the end of the
# month and rewrite 'rosgazeta_links_2020_02.json' after the month is done.
links_rosgazeta_2020 = dict()
# 2020 is a leap year, so February has 29 days.
months = {1:31, 2:29, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}
years = [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

for month in tqdm_notebook(list(months.keys())[1:2]):
    for day in tqdm_notebook(list(range(months[month]))[16:]):
        try:
            links = get_links_rosgazeta(f'{day+1:0>2}.{month:0>2}.2020')
            links_rosgazeta_2020[f'2020.{month:0>2}.{day+1:0>2}'] = links
        except selenium.common.exceptions.NoSuchElementException:
            # The search page lacked an expected element; report the day and go on.
            print(f'{day+1:0>2}.{month:0>2}.2020')
    with open('rosgazeta_links_2020_02.json', 'w', encoding = 'utf-8') as f:
        json.dump(links_rosgazeta_2020, f)
links_rosgazeta_2020

In [None]:
# Download Rossiyskaya Gazeta articles for the last six collected days into
# `rosgazeta_2020`, skipping 'prikaz' / '-dok' links and duplicates.
rosgazeta_2020 = pd.DataFrame(columns = ['Media', 'Date', 'Link', 'Text', 'Dep', 'Tags'])
# Rows are gathered in a list (`DataFrame.append` is gone in pandas >= 2.0) and
# duplicates are tracked in a set instead of scanning the Link column per link.
rosgazeta_rows = []
rosgazeta_seen_links = set()
for date in tqdm_notebook(list(links_rosgazeta_2020.keys())[-6:]):
    for link in links_rosgazeta_2020[date]:
        if ('prikaz' in link) or ('-dok' in link) or (link in rosgazeta_seen_links):
            continue
        rosgazeta_rows.append({'Media':'rg.ru', 'Date':date, 'Link':link,
                               'Text':get_article_rosgazeta(link)})
        rosgazeta_seen_links.add(link)
    # Checkpoint on days ending in '1' (01, 11, 21, 31).
    if date.endswith('1'):
        rosgazeta_2020 = pd.DataFrame(rosgazeta_rows, columns=rosgazeta_2020.columns)
        rosgazeta_2020.to_csv('rosgazeta_2020.csv', encoding='utf-8')
rosgazeta_2020 = pd.DataFrame(rosgazeta_rows, columns=rosgazeta_2020.columns)
rosgazeta_2020.to_csv('rosgazeta_2020.csv', encoding='utf-8')
rosgazeta_2020

## Novaya Gazeta

In [None]:
def get_links_novaya(date): #like this: '23.07.19'
    """Collect Novaya Gazeta links for a single day via the site's search API.

    The search query is the genitive month name of `date`; up to 100 items are
    requested. 'news_entry' items become '/news/<code>' links; 'article',
    'photo' and 'video' items become '/articles/<code>' links; the code of any
    other item type is printed and the item is skipped.
    """
    months_names = {
        '01':'января', '02':'февраля', '03':'марта',
        '04':'апреля', '05':'мая', '06':'июня',
        '07':'июля', '08':'августа', '09':'сентября',
        '10':'октября', '11':'ноября', '12':'декабря',
    }
    month_code = date.split('.')[1]
    q = months_names[month_code]

    response = requests.get(
        f'https://content.novayagazeta.ru/search?q={q}&date_from={date}&date_to={date}&limit=100')

    collected = []
    for entry in response.json()['items']:
        kind = entry['type']
        if kind == 'news_entry':
            collected.append('https://novayagazeta.ru/news/'+entry['code'])
        elif kind in ('article', 'photo', 'video'):
            collected.append('https://novayagazeta.ru/articles/'+entry['code'])
        else:
            print(entry['code'])
    return collected

In [None]:
def get_article_novaya(link):
    """Fetch a Novaya Gazeta article and return '<heading>. <text>'.

    The text is built from every <p> that carries no attributes, with
    non-breaking spaces replaced by plain spaces. If the page has no <h1>, the
    link is printed and None is returned.
    """
    page = BeautifulSoup(requests.get(link).text)
    try:
        title = page.h1.text
        plain_paragraphs = [p.text for p in page.find_all('p') if not p.attrs]
        body = ' '.join(plain_paragraphs).replace('\xa0', ' ')
        return '. '.join([title, body])
    except AttributeError:
        print(link)

In [None]:
# Collect Novaya Gazeta links for January-February 2020 and rewrite
# 'novaya_links_2020.json' after every day.
novaya_links_2020 = dict()

# 2020 is a leap year, so February has 29 days.
months = {1:31, 2:29, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}

for month in tqdm_notebook(list(months.keys())[:2]):
    for day in tqdm_notebook(range(months[month])):
        try:
            # get_links_novaya takes the date as 'DD.MM.YY'.
            links = get_links_novaya(f'{day+1:0>2}.{month:0>2}.20')
            novaya_links_2020[f'2020.{month:0>2}.{day+1:0>2}'] = links
        except AttributeError:
            print(f'{day+1:0>2}.{month:0>2}.20')
        with open('novaya_links_2020.json', 'w', encoding = 'utf-8') as f:
            json.dump(novaya_links_2020, f)
novaya_links_2020

In [None]:
# Download every collected Novaya Gazeta article into the list `novaya_2020`
# (one dict per article) and save it as CSV on days ending in '1' and at the end.
novaya_2020 = []
for date in tqdm_notebook(list(novaya_links_2020.keys())):
    for link in novaya_links_2020[date]:
        if ('prikaz' in link) or ('-dok' in link):
            continue
        try:
            article_text = get_article_novaya(link)
        except requests.exceptions.ConnectionError:
            print(link)
        else:
            novaya_2020.append({'Media':'novayagazeta.ru', 'Date':date, 'Link':link,
                                'Text':article_text, 'Dep':'', 'Tags':[]})
    if date.endswith('1'):
        pd.DataFrame(novaya_2020).to_csv('novaya_2020.csv', encoding='utf-8')
pd.DataFrame(novaya_2020).to_csv('novaya_2020.csv', encoding='utf-8')
pd.DataFrame(novaya_2020)

## AiF

In [None]:
def get_links_aif(date): #'2020-02-01'
    """Collect aif.ru links for a single day via the site search.

    Requests result pages 1..20 for the given day and gathers the first link of
    every 'text_box' div, stopping at the first page that yields none.
    """
    url = f'https://aif.ru/search/index/index/from/{date}/to/{date}/text/%D1%87%D1%82%D0%BE'
    links = []
    for page_number in range(1, 21):
        # The page number is sent as a query parameter.
        r = requests.get(url, {'page':page_number})
        soup = BeautifulSoup(r.text)
        page_links = [div.find('a')['href'] for div in soup.find_all('div', 'text_box')]
        if not page_links:
            break
        links.extend(page_links)
    return links

In [None]:
def get_article_aif(link):
    """Fetch an aif.ru article and return '<heading>. <text>'.

    The text joins all non-empty paragraphs of the 'article_text' div that do
    not contain 'Подробнее', with non-breaking spaces replaced by spaces. If an
    expected element is missing, the link is printed and None is returned.
    """
    page = BeautifulSoup(requests.get(link).text)
    try:
        title = page.h1.text
        kept = [p.text
                for p in page.find('div', 'article_text').find_all('p')
                if p.text and 'Подробнее' not in p.text]
        body = ' '.join(kept).replace('\xa0', ' ')
        return '. '.join([title, body])
    except AttributeError:
        print(link)

In [None]:
# Collect AiF links for January-February 2020; 'aif_links_2020.json' is
# rewritten every fifth day and once more at the end.
aif_links_2020 = dict()
# 2020 is a leap year, so February has 29 days.
months = {1:31, 2:29, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}

for month in tqdm_notebook(list(months.keys())[:2]):
    for day in tqdm_notebook(list(range(months[month]))):
        try:
            links = get_links_aif(f'2020-{month:0>2}-{day+1:0>2}')
            aif_links_2020[f'2020.{month:0>2}.{day+1:0>2}'] = links
        except IndexError:
            print(f'2020-{month:0>2}-{day+1:0>2}')

        # Periodic checkpoint so a crash does not lose all collected links.
        if day%5 == 0:
            with open('aif_links_2020.json', 'w', encoding = 'utf-8') as f:
                json.dump(aif_links_2020, f)
with open('aif_links_2020.json', 'w', encoding = 'utf-8') as f:
    json.dump(aif_links_2020, f)
aif_links_2020

In [None]:
# Download every collected AiF article (gallery links excluded) into the list
# `aif_2020`; the CSV is written on days ending in '01' and at the end.
aif_2020 = []
for date in tqdm_notebook(list(aif_links_2020.keys())):
    for link in aif_links_2020[date]:
        if 'gallery' in link:
            continue
        record = {'Media':'aif.ru', 'Date':date, 'Link':link,
                  'Text':get_article_aif(link)}
        aif_2020.append(record)
    if date.endswith('01'):
        pd.DataFrame(aif_2020).to_csv('aif_2020.csv', encoding='utf-8')
pd.DataFrame(aif_2020).to_csv('aif_2020.csv', encoding='utf-8')
aif_2020 

## Izvestiya

In [None]:
def get_links_izvestiya(date): #like this: '2020-01-02'
    """Collect iz.ru links for a single day via the site search.

    The search is run once per keyword in a fixed list; for each keyword the hit
    count is read from the first result page and every page (10 hits each) is
    scanned. Links found across all keywords are de-duplicated.

    Returns:
        List of distinct hrefs taken from the 'view-search' divs.
    """
    keywords = ['заявил', 'говорит', 'считает', 'известия',
                'рассказал', 'написал', 'сообщил', 'сообщает',
                'объяснил', 'попал', 'вышла', 'передает', 'пришли']

    def load_results(text, offset):
        # Parsed result page for `text`, starting at result number `offset`.
        link = f'https://iz.ru/search?type=0&prd=3&from={offset}&text={text}&date_from={date}&date_to={date}&sort=0'
        return BeautifulSoup(requests.get(link).text)

    found = set()
    for text in keywords:
        first_page = load_results(text, 0)
        num_of_articles = int(first_page.find('span', 'Number-of-results__nomber').text)

        for page in range(ceil(num_of_articles/10)):
            listing = load_results(text, page*10)
            for div in listing.find_all('div', 'view-search'):
                found.add(div.a['href'])
    return list(found)

In [None]:
def get_article_izvestiya(link):
    """Fetch an iz.ru article and return '<heading>. <text>'.

    The text joins the <p> elements of the 'text-article' div; when that yields
    an empty string, the text of the first div inside 'text-article__inside' is
    used instead (newlines replaced by spaces). Missing elements raise
    AttributeError, which is left to the caller.
    """
    page = BeautifulSoup(requests.get(link).text)

    title = page.h1.span.text
    body = ' '.join(p.text for p in page.find('div', 'text-article').find_all('p'))
    if not body:
        body = page.find('div', 'text-article__inside').div.text.replace('\n', ' ')
    return '. '.join((title, body))

In [None]:
# Collect Izvestiya links for December 2019 (last entry of `months`), pausing one
# second before each day; 'izvestiya_links_2019.json' is rewritten every fifth
# day and once more at the end.
months = dict(zip(range(1, 13), (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)))

izvestiya_links_2019 = {}

for month in tqdm_notebook(list(months)[-1:]):
    for day in tqdm_notebook(list(range(months[month]))):
        time.sleep(1)
        search_day = f'2019-{month:0>2}-{day+1:0>2}'
        try:
            links = get_links_izvestiya(search_day)
            izvestiya_links_2019[f'2019.{month:0>2}.{day+1:0>2}'] = links
        except (AttributeError, IndexError, TimeoutError):
            print(search_day)

        if not day % 5:
            with open('izvestiya_links_2019.json', 'w', encoding = 'utf-8') as f:
                json.dump(izvestiya_links_2019, f)

with open('izvestiya_links_2019.json', 'w', encoding = 'utf-8') as f:
    json.dump(izvestiya_links_2019, f)
izvestiya_links_2019

In [None]:
# Download every collected Izvestiya article into the list `izvestiya_2019`
# (one dict per article); the CSV is written on days ending in '1' and at the end.
izvestiya_2019 = []
for date in tqdm_notebook(list(izvestiya_links_2019.keys())):
    for link in izvestiya_links_2019[date]:
        try:
            article_text = get_article_izvestiya(link)
        except (requests.exceptions.ConnectionError, AttributeError, IndexError, TimeoutError):
            # Unreachable page or unexpected layout: report the link and move on.
            print(link)
        else:
            izvestiya_2019.append({'Media':'izvestiya.ru', 'Date':date, 'Link':link,
                                   'Text':article_text, 'Dep':'', 'Tags':[]})
    if date.endswith('1'):
        pd.DataFrame(izvestiya_2019).to_csv('izvestiya_2019.csv', encoding='utf-8')
pd.DataFrame(izvestiya_2019).to_csv('izvestiya_2019.csv', encoding='utf-8')
pd.DataFrame(izvestiya_2019)

## Komsomol'skaya Pravda

In [None]:
def get_article_kp(link):
    """Fetch a kp.ru article and return '<heading> <description> <text>'.

    The text joins the non-empty paragraphs of the 'text' div that do not
    contain 'ИСТОЧНИК'. If an expected element is missing, the link is printed
    and None is returned.
    """
    try:
        soup = BeautifulSoup(requests.get(link).text)
        title = soup.h1.text
        description = soup.find('div', 'ArticleDescription').text
        paragraphs = [p.text
                      for p in soup.find('div', 'text').find_all('p')
                      if p.text and 'ИСТОЧНИК' not in p.text]
        return ' '.join([title, description, ' '.join(paragraphs)])
    except (AttributeError, IndexError):
        print(link)

In [None]:
# Collect Komsomolskaya Pravda links for January-February 2020 from the GDELT
# DOC API and save them to 'kp_links_2020.json'.
# `kom_20` maps 'YYYY.MM.DD' -> list of article URLs.
# 2020 is a leap year, so February has 29 days.
months = {1:31, 2:29, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}
times = [['000000', '060000'], ['060000', '120000'], ['120000', '180000'], ['180000', '235959']]
kom_20 = dict()
for month in tqdm_notebook(list(months.keys())[:2]):
    for day in tqdm_notebook(list(range(months[month]))):
        date = f'2020{month:02d}{day+1:02d}'
        day_key = f'2020.{month:02d}.{day+1:02d}'
        kom_20[day_key] = []
        # Unpack the window instead of looping with a variable called `time`:
        # that name would rebind the imported `time` module for the whole kernel
        # and break every later `time.sleep(...)`.
        for window_start, window_end in times:
            url = f'https://api.gdeltproject.org/api/v2/doc/doc?query=%20(domain:.kp.ru%20OR%20domainis:kp.ru)%20sourcelang:rus&mode=ArtList&maxrecords=250&sort=DateDesc&format=html&startdatetime={date}{window_start}&enddatetime={date}{window_end}'
            r = requests.get(url)
            soup = BeautifulSoup(r.text)
            try:
                window_links = [a['href'] for a in soup.body.table.find_all('a')]
            except AttributeError:
                # The response has no <body>/<table>; report the day and move on
                # instead of aborting the whole crawl.
                print(date)
                continue
            kom_20[day_key].extend(window_links)
            print(day_key, len(window_links))
with open('kp_links_2020.json', 'w', encoding = 'utf-8') as f:
    json.dump(kom_20, f)

In [None]:
# Download every collected Komsomolskaya Pravda article into `kp_2020`.
kp_2020 = pd.DataFrame(columns = ['Media', 'Date', 'Link', 'Text', 'Dep', 'Tags'])

# Rows are accumulated in a plain list and the frame is built from it:
# `DataFrame.append` no longer exists in pandas >= 2.0, and the AttributeError it
# raises there would be silently swallowed by the `except` below.
kp_rows = []
kp_seen_links = set()   # O(1) duplicate check instead of `Link.unique()` per link

for date in tqdm_notebook(list(kom_20.keys())):
    for link in tqdm_notebook(kom_20[date]):
        if link in kp_seen_links:
            continue
        try:
            text = get_article_kp(link)
        except (AttributeError, IndexError):
            print(link)
            continue
        kp_seen_links.add(link)
        kp_rows.append({'Media':'kp.ru', 'Date':date, 'Link':link, 'Text':text})
    # Checkpoint on days ending in '1' (01, 11, 21, 31).
    if date.endswith('1'):
        kp_2020 = pd.DataFrame(kp_rows, columns=kp_2020.columns)
        kp_2020.to_csv('kp_2020.csv', encoding='utf-8')
kp_2020 = pd.DataFrame(kp_rows, columns=kp_2020.columns)
kp_2020.to_csv('kp_2020.csv', encoding='utf-8')

## Moskovskiy Komsomolets

In [None]:
def get_article_mosc_comsomol(link):
    """Fetch an mk.ru article and return '<heading> <text>'.

    The text joins the <p> elements of the 'content' div, with non-breaking
    spaces and newlines replaced by plain spaces. Missing elements raise
    AttributeError, which is left to the caller.
    """
    page = BeautifulSoup(requests.get(link).text)
    title = page.h1.text
    joined = ' '.join(p.text for p in page.find('div', 'content').find_all('p'))
    body = joined.replace('\xa0', ' ').replace('\n', ' ')
    return ' '.join((title, body))

In [None]:
# Collect Moskovskiy Komsomolets links for January-February 2020 from the GDELT
# DOC API (two half-day windows per day) and save them to 'mk_links_2020.json'.
# `mk_20` maps 'YYYY.MM.DD' -> list of article URLs.
# 2020 is a leap year, so February has 29 days.
months = {1:31, 2:29, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}
times = [['000000', '120000'], ['120000', '235959']]
mk_20 = dict()
for month in tqdm_notebook(list(months.keys())[:2]):
    for day in tqdm_notebook(list(range(months[month]))):
        date = f'2020{month:02d}{day+1:02d}'
        day_key = f'2020.{month:02d}.{day+1:02d}'
        mk_20[day_key] = []
        # Unpack the window instead of looping with a variable called `time`:
        # that name would rebind the imported `time` module for the whole kernel
        # and break every later `time.sleep(...)`.
        for window_start, window_end in times:
            try:
                url = f'https://api.gdeltproject.org/api/v2/doc/doc?query=%20(domain:.mk.ru%20OR%20domainis:mk.ru)%20sourcelang:rus&mode=ArtList&maxrecords=250&sort=DateDesc&format=html&startdatetime={date}{window_start}&enddatetime={date}{window_end}'
                r = requests.get(url)
                soup = BeautifulSoup(r.text)
                window_links = [a['href'] for a in soup.body.table.find_all('a')]
                mk_20[day_key].extend(window_links)
                print(day_key, len(window_links))
            except (AttributeError, IndexError):
                # The response has no <body>/<table>; report the day and move on.
                print(date)
with open('mk_links_2020.json', 'w', encoding = 'utf-8') as f:
    json.dump(mk_20, f)

In [None]:
# Download every collected Moskovskiy Komsomolets article into the list
# `mosc_comsomol_2020` (one dict per article), skipping links already stored.
mosc_comsomol_2020 = []
# Duplicates are tracked in a set. Building a DataFrame from the (initially
# empty) list and reading `.Link` raises AttributeError on the very first link,
# and `link in Series` would test index labels rather than the stored links.
mk_seen_links = set()
for date in tqdm_notebook(list(mk_20.keys())):
    for link in tqdm_notebook(mk_20[date]):
        if link in mk_seen_links:
            continue
        try:
            mosc_comsomol_2020.append({'Media':'mk.ru', 'Date':date, 'Link':link,
                                       'Text':get_article_mosc_comsomol(link), 'Dep':'', 'Tags':[]})
            mk_seen_links.add(link)
        except (requests.exceptions.ConnectionError, AttributeError, IndexError):
            # Unreachable page or unexpected layout: report the link and move on.
            print(link)
    # Checkpoint on days ending in '1' (01, 11, 21, 31).
    if date.endswith('1'):
        pd.DataFrame(mosc_comsomol_2020).to_csv('mosc_comsomol_2020.csv', encoding='utf-8')
pd.DataFrame(mosc_comsomol_2020).to_csv('mosc_comsomol_2020.csv', encoding='utf-8')
pd.DataFrame(mosc_comsomol_2020)