# Explanation
This script looks for all news pieces in the `db["news"]` collection whose `text` is equal to their `_id_title`. This is probably caused by multiple updates to the same news piece, where the last update overwrote the previous text. This is further guaranteed because we only look at documents with `processed_entities=True`, meaning they must have had a valid text at some point in the past; otherwise they would not have any processed entities associated with them. They must also not have been marked as invalid (i.e. `valid=False`).

The parallel processing and the `explore_news_piece` function have been copied from *collect.py*.

Additionally, after the texts have been fixed, all the entities are iterated and added to the respective news to ensure that, even if they were deleted during the previously described news overwriting process, the news documents will have the final correct value for `entities`. 

In [None]:
from src.utils import *
from src.dbmongo import DbMongo, get_db
from loguru import logger
from tqdm.auto import tqdm

In [None]:
# Read the settings from config.json and build the database handle (`db`)
# that the following cells query and update.
# NOTE(review): parse_config is presumably provided by the `src.utils` star import - confirm.
config = parse_config("config.json")
db = get_db(config)

In [None]:

from newspaper import Article
from concurrent.futures import ThreadPoolExecutor

In [None]:
@logger.catch
def explore_news_piece(db, n):
    """Re-download one news document and store the refreshed result.

    The page at ``n["url"]`` is requested through ``try_request``.  A falsy
    reply ends the call early; only when that reply compares equal to
    ``False`` is the document flagged ``valid=False`` / ``processed=True``
    and upserted before returning.  Otherwise the HTML is handed to
    ``Article`` and parsed: on success ``text`` and ``image`` are replaced,
    on any exception the document is flagged ``valid=False``.  Either way it
    is then marked ``processed`` and upserted.

    Args:
        db: object exposing ``upsert_news_piece``.
        n: news document (reads ``_id`` and ``url``); mutated in place.
    """
    doc_id, url = n["_id"], n["url"]
    logger.info("fetching (%s) %s" % (doc_id, url))
    article = Article(url, _language="pt")
    response = try_request(url)

    if not response:
        # An explicit False marks a resource that will never be available;
        # any other falsy reply leaves the document untouched.
        permanently_gone = response == False  # noqa: E712 - equality kept on purpose
        if permanently_gone:
            n["valid"] = False
            n["processed"] = True
            logger.error("%s will never be available" % url)
            db.upsert_news_piece(n)
        return

    article.download(input_html=response.text)
    try:
        article.parse()
        body = assert_valid_article(article)
        n["text"] = body
        n["image"] = article.top_image
    except Exception as e:
        # Parsing/validation failed: keep the document but flag it as invalid.
        logger.error("[%s] while parsing %s" % (e, n))
        n["valid"] = False

    n["processed"] = True
    db.upsert_news_piece(n)
    logger.info("done %s" % n["url"])

In [None]:
def incomplete():
    """Fetch a single news document that still needs its text repaired.

    A document qualifies when its ``text`` equals its ``_id_title``, it has
    ``processed_entities`` set to True and it carries no ``valid`` field.
    The result of ``find_one`` is returned as-is, so callers can use it as a
    truthiness check for "is there anything left to fix".
    """
    problematic = {
        "$where": "this.text==this._id_title",
        "processed_entities": True,
        "valid": {"$exists": False},
    }
    news = db["news"]
    return news.find_one(problematic)

In [None]:
# Re-fetch the problematic news pieces in batches until none are left, or
# until a batch stops making progress.
print("Starting...")
batch_limit = 500
# Same filter that incomplete() uses to detect a problematic document.
problematic_query = {
    "$where": "this.text==this._id_title",
    "processed_entities": True,
    "valid": {"$exists": False},
}
previous_batch_ids = None
batch_ids = set()
stalled = False
# Query once per iteration: the $where filter is evaluated per document on
# the server, so it is not worth running twice just to print the result.
pending = incomplete()
while pending:
    print("Next loop, incomplete = %s" % pending)
    # Materialise the batch so its ids can be compared with the previous one
    # (pool.map would consume the whole cursor up front anyway).
    batch = list(db["news"].find(problematic_query).limit(batch_limit))
    batch_ids = {str(n["_id"]) for n in batch}
    if batch_ids == previous_batch_ids:
        # explore_news_piece leaves a document untouched when the download
        # fails temporarily or when it raises (logger.catch swallows it), so
        # the very same documents would be selected forever. Stop instead.
        stalled = True
        break
    previous_batch_ids = batch_ids
    # Leaving the `with` block waits for every submitted task to finish.
    with ThreadPoolExecutor() as pool:
        pool.map(lambda n: explore_news_piece(db, n), batch)
    pending = incomplete()
if stalled:
    print(
        "Stopped without progress: %d news pieces in the last batch are still problematic"
        % len(batch_ids)
    )
else:
    print("ALL Problematic have been fixed")

In [None]:
# iterate over all entities and $addToSet each one into the news documents it references
from pymongo import UpdateOne
import pymongo
# Walk every entity and make sure each news document it references lists the
# entity under entities.<label>.
# no_cursor_timeout stops the server from reaping the cursor during this long
# scan, so it must be closed explicitly - also when the loop fails midway.
entity_cursor = db["entities"].find({}, no_cursor_timeout=True)
try:
    for fe in entity_cursor:
        news_ids = fe["news"]
        if not news_ids:
            # bulk_write() raises InvalidOperation on an empty request list.
            continue
        entity_ref = {"_id": fe["_id"], "text": fe["text"]}
        update = {
            "$addToSet": {"entities.%s" % fe["label"]: {"$each": [entity_ref]}}
        }
        operations = [UpdateOne({"_id": n}, update, upsert=True) for n in news_ids]
        try:
            db["news"].bulk_write(operations, ordered=False)
        except pymongo.errors.BulkWriteError as err:
            # Duplicate-key errors (code 11000) are expected and ignored; any
            # other write error is reported, but the loop stays best-effort.
            unexpected = [
                e for e in err.details.get("writeErrors", [])
                if e.get("code") != 11000
            ]
            if unexpected:
                logger.warning(
                    "bulk update for entity %s had write errors: %s"
                    % (fe["_id"], unexpected)
                )
finally:
    entity_cursor.close()

In [None]:
# finally, dump the database and save it