`pip3 install -r requirements.txt`  

Don't forget to install **[Chrome Driver](https://sites.google.com/a/chromium.org/chromedriver/downloads)** — Selenium needs it to control Chrome.

In [3]:
import json
from time import sleep

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup
from html2text import html2text
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from sklearn.feature_extraction.text import CountVectorizer

from src import json2xml

## 1. Парсинг URL адресов с новостями (Selenium)

- Run Chrome with the AdBlock extension and custom browser preferences, then configure the extension on its options page.

In [5]:
# Launch Chrome with the AdBlock extension, configure the extension through its
# options page, then open the Rambler politics feed.
# NOTE(review): in Chrome's content settings the value 2 presumably means "block"
# (here: images and cookies) — confirm.
prefs = {"profile.managed_default_content_settings.images": 2,
         "profile.default_content_settings.cookies": 2}
chrome_options = Options()
chrome_options.add_extension('ext/adblock.crx')
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument('--disable-application-cache')
# `options=` replaces the `chrome_options=` keyword, which current Selenium
# releases no longer accept.
driver = Chrome(options=chrome_options)
# The implicit wait applies to every later element lookup of this session,
# so it only has to be set once.
driver.implicitly_wait(10)
# Close the focused window and continue in the first remaining one.
# NOTE(review): presumably the extension opens its own tab on start-up — confirm.
driver.close()
driver.switch_to.window(driver.window_handles[0])
# Adjust the extension settings on its options page.
driver.get('chrome-extension://gighmmpiobklfepjocnamgkkbiglidom/options.html')
driver.find_element(By.ID, 'acceptable_ads').click()
driver.find_elements(By.CLASS_NAME, 'ui-state-default')[1].click()
# NOTE(review): an XPath starting with `//` searches the whole document, not
# only the children of `language_select`; kept as in the original.
driver.find_element(By.ID, 'language_select').find_element(By.XPATH, "//option[@value='russian']").click()
driver.get('https://news.rambler.ru/politics/latest/?page=1')
driver.find_element(By.CLASS_NAME, 'j-footer__switch').click()

- Parse [Rambler](https://news.rambler.ru) news links.

In [None]:
# Walk the "latest" listing pages of one category and append one JSON line
# ({'link', 'category'}) per article card to the output file.
# NOTE(review): `args` (with `pages`, `category`, `output`) is not defined in any
# cell of this notebook — presumably a leftover from a command-line version of
# the script. It must be defined before this cell can run.
try:
    # Open the output once instead of re-opening it for every article card.
    with open(args.output, 'a') as file:
        for page in tqdm.tqdm(range(0, args.pages), position=1):
            driver.get('https://news.rambler.ru/' + args.category + '/latest/?page=' + str(page))
            cards = driver.find_elements(By.CLASS_NAME, 'article-card')
            for element in tqdm.tqdm(cards, position=0):
                record = {
                    'link': element.find_element(By.TAG_NAME, 'a').get_attribute('href'),
                    'category': args.category,
                }
                file.write(json.dumps(record, ensure_ascii=False) + '\n')
finally:
    # Always shut the browser down, even when a page fails to load or parse.
    driver.quit()

## 2. Парсинг Rambler новостей (BeautifulSoup + lxml)

In [4]:
# Load the collected links; every line of the file is parsed as one JSON document.
posts = []
with open('rambler-news-links.json') as links_file:
    posts.extend(json.loads(row) for row in links_file)

In [None]:
# Download every collected article, extract its title and paragraphs, and append
# one JSON line per article to the news file.
# The records go to 'ramler-news.json' because that is the file the data-overview
# section loads; the previous name 'args.output.json' was never read anywhere.
with open('ramler-news.json', 'a') as file:
    for element in tqdm.tqdm(posts):
        try:
            # A timeout keeps one unresponsive page from stalling the whole crawl.
            response = requests.get(element['link'], timeout=30)
            soup = BeautifulSoup(response.text, "lxml")
            title = html2text(soup.findAll('h1', attrs={'class': 'big-title__title'})[0].text).replace('\n', '')
            text = ''.join(
                html2text(paragraph.text).replace('\n', ' ') + ' '
                for paragraph in soup.findAll('div', attrs={'class': 'article__paragraph'})
            )
            file.write(json.dumps({
                'link': element['link'],
                'category': element['category'],
                'title': title,
                'text': text
            }, ensure_ascii=False) + '\n')
        except Exception as error:
            # Best effort: skip articles that cannot be fetched or parsed, but
            # report them instead of hiding the failure. `Exception` (unlike a
            # bare `except`) still lets Ctrl-C interrupt the crawl.
            tqdm.tqdm.write('skipped {}: {!r}'.format(element.get('link'), error))

  1%|          | 11/1503 [00:13<31:31,  1.27s/it]

## 3. Обзор полученных данных

In [321]:
# Load the scraped articles; every line of the file is parsed as one JSON document.
news = []
with open('ramler-news.json') as news_file:
    for raw_record in news_file:
        news += [json.loads(raw_record)]

In [322]:
# Build a DataFrame from the list of parsed article records loaded above.
data = pd.DataFrame(news)

In [323]:
# Preview the first rows of the scraped articles.
data.head()

Unnamed: 0,category,link,text,title
0,politics,https://news.rambler.ru/politics/39054690-push...,Член Совета Федерации РФ Алексей Пушков назвал...,Пушков назвал требования Украины лишить РФ пра...
1,politics,https://news.rambler.ru/other/39054497-erdogan...,Президент Турции Реджеп Тайип Эрдоган и презид...,Эрдоган и Макрон обсудили операцию в Африне
2,politics,https://news.rambler.ru/world/39054594-na-kipr...,На Кипре пройдет второй тур президентских выбо...,На Кипре пройдет второй тур выборов президента
3,politics,https://news.rambler.ru/politics/39054551-neme...,"Депутаты делегации из Северной Рейн-Вестфалии,...",Немецкий депутат поставил на место Климкина в ...
4,politics,https://news.rambler.ru/conflicts/39054513-pen...,РИА НОВОСТИ. США НЕ СНАБЖАЛИ СВОИХ СОЮЗНИКОВ П...,Пентагон: США не снабжали союзников по коалици...


## 4. Мешок слов

In [324]:
# Learn one shared vocabulary from the titles and the article texts.
vectorizer = CountVectorizer()
# pd.concat replaces Series.append, which was removed in pandas 2.0.
vectorizer.fit(pd.concat([data['title'], data['text']]))

# vocabulary_ maps term -> feature index; invert it so an index can be looked up.
# The index is stored as a string because it is used as a JSON object key.
inverse_vocabulary = {str(index): term for term, index in vectorizer.vocabulary_.items()}

with open('dictionary.json', 'w') as file:
    json.dump(inverse_vocabulary, file, ensure_ascii=False, indent=1)

In [325]:
def _bow_indices(text):
    """Return the vocabulary indices of the known terms that occur in `text`."""
    return vectorizer.transform([text]).indices.tolist()


# Replace the raw strings by vocabulary indices:
#   title -> a list holding one index list for the whole title
#   text  -> one index list per '.'-separated sentence
# Column-wise `assign` avoids the chained-indexing assignment `data['title'][i] = ...`,
# and renaming 'text' by name (instead of overwriting `data.columns` positionally)
# cannot mislabel columns when the frame's column order differs.
# Like the original loop, this cell replaces `data`, so it can only be run once
# per freshly loaded frame.
data = data.assign(
    title=data['title'].map(lambda title: [_bow_indices(title)]),
    text=data['text'].map(
        lambda text: [_bow_indices(sentence) for sentence in text.split('.')]
    ),
).rename(columns={'text': 'sentence'})

In [326]:
# Preview the first rows after the bag-of-words transformation.
data.head()

Unnamed: 0,category,link,sentence,title
0,politics,https://news.rambler.ru/politics/39054690-push...,"[[32, 37, 69, 112, 167, 235, 260, 316, 392, 43...","[[69, 112, 167, 235, 255, 260, 392, 435, 479, ..."
1,politics,https://news.rambler.ru/other/39054497-erdogan...,"[[36, 236, 304, 318, 366, 398, 456, 507, 560, ...","[[36, 236, 304, 318, 617]]"
2,politics,https://news.rambler.ru/world/39054594-na-kipr...,"[[31, 33, 92, 95, 131, 148, 198, 205, 230, 238...","[[92, 95, 205, 255, 399, 427, 572]]"
3,politics,https://news.rambler.ru/politics/39054551-neme...,"[[43, 50, 67, 76, 80, 134, 140, 176, 262, 271,...","[[83, 138, 208, 244, 255, 276, 388, 488]]"
4,politics,https://news.rambler.ru/conflicts/39054513-pen...,"[[292, 466], [133, 173, 209, 271, 325, 352, 36...","[[173, 209, 271, 325, 352, 366, 502, 521, 541,..."


## 5. Конвертирование в JSON и XML форматы

In [327]:
# Serialise the frame row by row to a JSON string, then parse that string back
# into plain Python objects.
records_json = data.to_json(orient='records', force_ascii=False)
js = json.loads(records_json)

In [328]:
# Write the records as JSON Lines: one serialised record per line.
with open('bow.json', 'w') as bow_file:
    bow_file.writelines(json.dumps(record) + '\n' for record in js)

In [320]:
# Convert the records to XML and write them to 'bow.xml'.
# Only the `json2xml` module is imported in this notebook, so the class is reached
# through it; the bare name `Json2xml` is undefined on a fresh kernel.
# NOTE(review): assumes `src/json2xml.py` defines the `Json2xml` class — confirm.
parsed = json2xml.Json2xml.fromstring(json.dumps(js, ensure_ascii=False)).data
xml = json2xml.Json2xml(parsed)
with open('bow.xml', 'w') as file:
    file.write(xml.json2xml())