# Обработка и дополнение данных

---

### Цели

1. Подготовить "чистые" датасеты для обучения моделей
2. 

In [1]:
import json
import os
from multiprocessing import Pool
import sys
import matplotlib.pyplot as plt
import pandas as pd
import pprint
import progressbar
import re
import requests
import seaborn as sns
import time
import urllib
import warnings

In [7]:
# Notebook-wide presentation settings: ggplot look for matplotlib figures,
# all warnings silenced, and pandas output capped at 50 columns / 20 rows.
plt.style.use('ggplot')
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 20)

In [3]:
# Project root, taken as the parent of the current working directory
# (presumably the notebook is launched from a sub-folder of the project --
# confirm before running it from elsewhere).
BASE_DIR = os.path.dirname(os.path.abspath(os.curdir))

## Get full data from api

#### Articles

In [None]:
# Listing URL for the mos.ru newsfeed "articles" endpoint. The query string
# already carries the `expand` and `fields` parameters, so get_api_data can
# append the page number as `&page=<n>`.
articles_api = 'https://www.mos.ru/api/newsfeed/v4/frontend/json/ru/articles?expand=spheres,kind,image&fields=id,title,importance,published_at,created_at,updated_at,is_deferred_publication,status,ya_rss,active_from,active_to,oiv_id,search,display_image,label,icon_id,canonical_url,canonical_updated_at,is_powered,has_image,date,has_district,date_timestamp,tags,theme_id,theme_ids,themes,spheres,sphere,kind,is_oiv_publication,organizations,updated_at_timestamp,created_at_timestamp,attach,active_from_timestamp,active_to_timestamp,image,counter,territory_area_id,territory_district_id,preview_text,full_text,url,preview,text,promo,images'

In [None]:
def get_api_data(base_api_url: str, filename: str = 'article_full.json') -> None:
    """Download every page of a paginated API listing and dump it to JSON.

    A first request is made only to read ``_meta.pageCount``. Each page is
    then requested with ``&page=<n>`` appended to ``base_api_url`` and the
    value of its ``items`` key is collected. The collected list (one entry
    per successfully decoded page, in page order) is written to
    ``<BASE_DIR>/data/<filename>``.

    Args:
        base_api_url: Listing URL that already contains a query string;
            the page number is appended to it as ``&page=<n>``.
        filename: Name of the output file inside the ``data`` directory.

    Raises:
        ValueError: If the first response has no ``_meta.pageCount``.
    """
    response = requests.get(base_api_url)
    meta = response.json().get("_meta") or {}
    page_count = meta.get("pageCount")
    if page_count is None:
        raise ValueError("API response does not contain '_meta.pageCount'")
    print("Total number of pages: ", page_count)

    full_data = []
    pbar = progressbar.ProgressBar(
        widgets=[
            progressbar.SimpleProgress(),
            progressbar.Bar(),
            progressbar.ETA(),
            progressbar.FileTransferSpeed(),
        ],
        max_value=page_count,
    ).start()

    # Pages are requested as 1..pageCount, the same numbering the other
    # (commented-out) loops in this notebook use. The previous
    # range(pageCount + 1) also requested page=0, i.e. one request too many.
    # NOTE(review): confirm how the API answers page=0 if old dumps are reused.
    for page_number in range(1, page_count + 1):
        page = requests.get(f"{base_api_url}&page={page_number}")
        try:
            # One list entry per page keeps the on-disk layout unchanged.
            full_data.append(page.json().get('items'))
        except (ValueError, AttributeError) as e:
            # ValueError: body is not valid JSON; AttributeError: the decoded
            # JSON is not an object. The page is skipped, not retried.
            print(f"Sorry, man! We got {e}")
        pbar.update(page_number)
    pbar.finish()

    with open(os.path.join(BASE_DIR, "data", filename), "w", encoding="utf-8") as file:
        json.dump(full_data, file)

In [None]:
# Download the complete articles listing into data/articles_full.json.
get_api_data(base_api_url=articles_api, filename='articles_full.json')

##### Тут была попытка собрать всё побыстрее, используя мультипроцессинг и map... пока безуспешно. Работает на несколько порядков быстрее, чем обычный перебор страниц выше, но в финальном датасете оказываются данные только по первой и последней страницам. Кто-нибудь сталкивался с подобным?

In [None]:
# full_articles = []

In [None]:
# for i in range(1, n_pages + 1):
#     r = requests.get(articles_api+f'&page={i}')
#     data = r.json()
#     full_articles.append(data.get('items'))

In [None]:
# def req(page):
#     url = articles_api+f'&page={page}'
#     r = requests.get(url)
#     data = r.json()
#     return data

In [None]:
# with Pool(processes = 5) as P:
#     results = P.map(req, list(range(1, n_pages + 1)))

In [None]:
# results[2]

In [None]:
# flat_results = [item for sublist in results for item in sublist]

In [None]:
# with open(os.path.join(BASE_DIR, 'data', 'articles_full.json'), 'w') as file:
#     json.dump(flat_results, file)

#### News

In [None]:
# Listing URL for the mos.ru newsfeed "news" endpoint. The query string
# already carries the `expand` and `fields` parameters, so get_api_data can
# append the page number as `&page=<n>`.
news_api = 'https://www.mos.ru/api/newsfeed/v4/frontend/json/ru/news?expand=spheres,kind,image,tags&fields=id,title,importance,published_at,created_at,updated_at,is_deferred_publication,status,ya_rss,active_from,active_to,oiv_id,search,display_image,label,icon_id,canonical_url,canonical_updated_at,is_powered,has_image,date,has_district,date_timestamp,tags,theme_id,theme_ids,themes,spheres,sphere,kind,is_oiv_publication,organizations,updated_at_timestamp,created_at_timestamp,attach,active_from_timestamp,active_to_timestamp,image,counter,territory_area_id,territory_district_id,preview,text,url,promo,images'

In [None]:
# Download the complete news listing into data/news_full.json.
get_api_data(base_api_url=news_api, filename='news_full.json')

In [None]:
# news_api = 'https://www.mos.ru/api/newsfeed/v4/frontend/json/ru/news?expand=spheres,kind,image,tags,attach&fields=id,title,importance,published_at,created_at,updated_at,is_deferred_publication,status,ya_rss,active_from,active_to,oiv_id,search,display_image,label,icon_id,canonical_url,canonical_updated_at,is_powered,has_image,date,has_district,date_timestamp,tags,theme_id,theme_ids,themes,spheres,sphere,kind,is_oiv_publication,organizations,updated_at_timestamp,created_at_timestamp,attach,active_from_timestamp,active_to_timestamp,image,counter,territory_area_id,territory_district_id,preview_text,full_text,url,preview,text,promo,images'

In [None]:
# r = requests.get(news_api)

In [None]:
# data = r.json()

In [None]:
# n_pages = data.get('_meta').get('pageCount')

In [None]:
# def get_urls(n_pages):
#     for i in range(1, n_pages + 1):
#         url = news_api+f'&page={i}'
#         yield url

In [None]:
# urls = [news_api+f'&page={i}' for i in range(1, n_pages + 1)]

In [None]:
# def req(url):
#     r = requests.get(url)
#     time.sleep(0.2)
#     data = r.json()
# #     print('getting url: ', url)
# #     print('received data: ', data.get('items', 'error'))
#     return data

In [None]:
# full_news = []

In [None]:
# with Pool(processes=5) as P:
#     results = P.map(req, get_urls(n_pages))
#     for _ in tqdm.tqdm(P.imap_unordered(req, urls), total=len(urls)):
#         full_news.append(_)
#     results = list(tqdm.tqdm(P.imap_unordered(req, urls), total=len(urls)))
#     full_news.append(results)

In [None]:
# full_news[-100:]
# results[-100:]

In [None]:
# full_news = []
# for i in range(1, n_pages + 1):
#     r = requests.get(news_api+f'&page={i}')
#     data = r.json()
#     full_news.append(data.get('items'))

In [None]:
# flat_results = [item for sublist in results for item in sublist]

In [None]:
# with open(os.path.join(BASE_DIR, 'data', 'news_full.json'), 'w') as file:
#     json.dump(flat_results, file)

##### На самом деле ошибка появлялась, поскольку для каждой новости есть определённый набор доступных полей. Если в URL указано отсутствующее поле, то получаем ошибку и None для всей новости. Можно переписать запрос к API, но организаторы сказали, что дополнительно ничего парсить не нужно. Работаем с тем, что есть.