In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
import pymongo
import time


In [11]:
# Connect to the MongoDB server listening on localhost:27017.
db_client = pymongo.MongoClient('mongodb://localhost:27017')
# NOTE: despite the plural name this is the handle of the 'db_migration'
# database, not a list of collections.
collections = db_client['db_migration']
# Collection of article documents; the code in this notebook looks them up by
# their 'url' and 'len_content' fields.
tb_article = collections['tb_article']

In [12]:
# Site root that is prepended to the 'url' value of each article document
# before the page is fetched.
DOMAIN_URL = 'https://thenewhumanitarian.org'

In [13]:
def scrape_text_data(url):
    """Load ``url`` in Chrome and return the rendered page as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Absolute URL of the page to load.

    Returns
    -------
    BeautifulSoup or str
        The parsed page on success, or the empty string ``''`` when anything
        goes wrong (callers test ``soup == ''``).

    Notes
    -----
    The browser is always shut down once it has been started, even when
    loading the page fails, so failed pages do not leave Chrome processes
    behind. ``KeyboardInterrupt`` / ``SystemExit`` are not swallowed, so a long
    scraping run can still be interrupted.
    """
    try:
        # Set up Chrome WebDriver
        service = Service('../chromedriver')  # or geckodriver for firefox
        options = webdriver.ChromeOptions()
        options.add_argument("disable-gpu")
        options.add_argument("--window-size=0,0")
        options.add_experimental_option("detach", True)

        driver = webdriver.Chrome(service=service, options=options)  # or webdriver.Firefox(service=service)
        try:
            # Navigate to the webpage
            driver.get(url)

            # NOTE: implicitly_wait only configures the timeout used by later
            # element look-ups; it does not pause here. No look-up follows, so
            # the page source is read as soon as get() returns.
            driver.implicitly_wait(10)

            # Get the rendered HTML
            html = driver.page_source
        finally:
            # Close the browser whether or not the page could be loaded.
            driver.quit()

        # Parse the HTML with BeautifulSoup
        return BeautifulSoup(html, "html.parser")
    except Exception as exc:
        print('Error when scraping webpage')
        # Say which page failed and why, so the problem can be diagnosed.
        print('  {!r}: {!r}'.format(url, exc))
        return ''

In [14]:
def upsert_article_meta(article_detail):
    """Insert or update the article document identified by its ``'url'``.

    Parameters
    ----------
    article_detail : dict
        Fields of the article. Must contain the key ``'url'``, which is used
        to look the document up in ``tb_article``.

    Raises
    ------
    KeyError
        If ``article_detail`` has no ``'url'`` key.

    Notes
    -----
    A single ``update_one(..., upsert=True)`` replaces the previous
    find-then-insert/update sequence: one round-trip instead of two, and no
    window between the look-up and the write. Unlike ``insert_one``, this does
    not add an ``'_id'`` key to the dict that was passed in.
    """
    tb_article.update_one(
        {'url': article_detail['url']},
        {'$set': article_detail},
        upsert=True,
    )

In [15]:
#Our ability to deliver compelling, field-based reporting on humanitarian crises rests on a few key principles: deep expertise, an unwavering commitment to amplifying affected voices, and a belief in the power of independent journalism to drive real change.\nWe need your help to sustain and expand our work. Your donation will support our unique approach to journalism, helping fund everything from field-based investigations to the innovative storytelling that ensures marginalised voices are heard.\nPlease consider joining our membership programme. Together, we can continue to make a meaningful impact on how the world responds to crises.
def not_contain_special_phrases(text):
    """Return True when ``text`` contains none of the donation-appeal phrases.

    The check is a case-sensitive substring test against three fixed phrases.
    """
    appeal_phrases = (
        'Our ability to deliver compelling',
        'Your donation will support our unique approach to journalism',
        'joining our membership programme',
    )
    return not any(phrase in text for phrase in appeal_phrases)

In [16]:
#get text in all tags <p>
def get_content_in_container(container):
    """Return the concatenated text of the paragraph-like elements in ``container``.

    Elements are looked up as ``<p>``; if there are none, as ``<div>``; if there
    are none either, as ``<span>``. The stripped text of each element is
    appended (with no separator) unless it contains one of the donation-appeal
    phrases. If the last element contains an ``<em>`` tag it is treated as a
    trailing note and left out.

    When nothing was collected, the stripped text of the container itself is
    returned, provided it is longer than 100 characters and free of the
    donation-appeal phrases; otherwise the result is ``''``.
    """
    # Fall back to more generic tags until one of them yields elements.
    elements = []
    for tag_name in ('p', 'div', 'span'):
        elements = container.find_all(tag_name)
        if len(elements) != 0:
            break

    last_position = len(elements) - 1
    pieces = []
    for position, element in enumerate(elements):
        # A final element holding <em> is a note
        # (https://www.thenewhumanitarian.org/opinion/2025/02/04/how-europe-can-escape-migration-deterrence-trap)
        if position == last_position and element.find('em') is not None:
            break   # do not include this paragraph in the final content
        paragraph = element.text.strip()
        if not_contain_special_phrases(paragraph):   # remove footnote
            pieces.append(paragraph)

    content = ''.join(pieces)
    if content == '':
        # Nothing collected: use the container's own text
        # (https://www.thenewhumanitarian.org/analysis/2015/11/27/new-border-regime-balkans-inequitable-and-illegal)
        own_text = container.text.strip()
        if len(own_text) > 100 and not_contain_special_phrases(own_text):
            return own_text
    return content

In [17]:
#There may be different content types
#1: normal article: (https://www.thenewhumanitarian.org/opinion/2025/02/04/how-europe-can-escape-migration-deterrence-trap)
#2: report article: (https://www.thenewhumanitarian.org/analysis/2025/01/07/trends-will-spur-humanitarian-needs-2025)
#3: stories: https://interactive.thenewhumanitarian.org/stories/2022/05/10/us-asylum-darien-gap-cuba-central-america-mexico/
#4: text in span: https://www.thenewhumanitarian.org/feature/2017/04/26/preying-disaster-how-human-trafficking-has-spiked-quake-shattered-nepal
#5: text in div: https://www.thenewhumanitarian.org/news/2017/04/13/pushed-out-pakistan-war-torn-afghanistan-refugees-are-told-be-patient
#6: section: https://thenewhumanitarian.org/2016/06/21/forgotten-conflicts-blue-nile
#7: https://thenewhumanitarian.org/news/2015/08/25/photo-feature-race-beat-hungary-s-border-fence
def extract_content(soup):
    """Collect the article text from every page layout this scraper handles.

    The layouts are probed in a fixed order and their text is concatenated
    (with no separator):

    * type 1, 4, 5 - ``div.field-name-body.flow`` inside ``div.article__body``
    * type 2       - ``<p>`` elements inside ``div.advanced-report-content.flow``
    * type 3       - any ``div`` carrying the class ``field-name-body``
    * type 6       - ``section`` elements with class ``copy left``
    * type 7       - ``div.sqs-html-content``

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page.

    Returns
    -------
    str
        The concatenated text, or ``''`` when no layout matched.

    Notes
    -----
    A single-class search such as ``field-name-body`` also matches elements
    whose class attribute is ``"field-name-body flow"``. Without a guard, the
    containers of a normal article would be picked up by both the type-1 and
    the type-3 search and their text would appear twice. Each container is
    therefore harvested at most once.
    """
    parts = []
    seen = set()   # id() of every container that was already harvested

    def harvest(containers, extractor):
        # Append extractor(node) for each container not harvested before.
        for node in containers:
            if id(node) in seen:
                continue
            seen.add(id(node))
            parts.append(extractor(node))

    def paragraphs_text(node):
        # Stripped text of all <p> descendants, joined without separator.
        return ''.join(p.text.strip() for p in node.find_all('p'))

    def stripped_text(node):
        # Whole text of the node, stripped.
        return node.text.strip()

    #type 1 & 4, 5
    article_body = soup.find('div', attrs={'class': 'article__body'})
    if article_body is not None:
        harvest(
            article_body.find_all('div', attrs={'class': 'field-name-body flow'}),
            get_content_in_container,
        )
    #type 2
    harvest(
        soup.find_all('div', attrs={'class': 'advanced-report-content flow'}),
        paragraphs_text,
    )
    #type 3 (find_all never returns None, so no None check is needed)
    harvest(
        soup.find_all('div', attrs={'class': 'field-name-body'}),
        get_content_in_container,
    )
    #type 6
    harvest(
        soup.find_all('section', attrs={'class': 'copy left'}),
        stripped_text,
    )
    #type 7
    harvest(
        soup.find_all('div', attrs={'class': 'sqs-html-content'}),
        get_content_in_container,
    )
    return ''.join(parts)

In [18]:
# Fetch the content of every article that has not been scraped yet
# (documents whose 'len_content' is 0). At most 11000 pages are processed per
# run, and the run stops at the first page that cannot be loaded or that
# yields no content, after recording the error on that document.
db_articles = tb_article.find({'len_content': 0})
index = 0
if db_articles is not None:
    for db_article in db_articles:
        if index >= 11000:
            break

        url = db_article['url']
        page_url = DOMAIN_URL + url
        soup = scrape_text_data(page_url)

        # The page could not be loaded at all.
        if soup == '':
            print('Error no soup: ' + page_url)
            db_article['error'] = 'no soup'
            tb_article.update_one({'url': url}, {'$set': db_article})
            break

        db_article['is_scraped'] = 1
        content = extract_content(soup)

        # The page loaded but none of the known layouts produced text.
        if content == '':
            db_article['error'] = 'no content'
            tb_article.update_one({'url': url}, {'$set': db_article})
            print('No content in page: ' + page_url)
            break

        # Correct data: store the text and its length, clear any old error.
        db_article['error'] = ''
        db_article['content'] = content
        db_article['len_content'] = len(content)
        tb_article.update_one({'url': url}, {'$set': db_article})

        index += 1
        time.sleep(1)   # delay for 1 second
        print('Finish page: ' + str(index) + ' ' + page_url)

Finish page: 1 https://thenewhumanitarian.org/news/2013/02/14/humanitarian-crisis-sudan-s-nuba-mountains
Finish page: 2 https://thenewhumanitarian.org/analysis/2013/02/14/becoming-refugees-once-more-palestinians-syria-return-gaza
Finish page: 3 https://thenewhumanitarian.org/feature/2013/02/13/flood-proofing-mozambique
Finish page: 4 https://thenewhumanitarian.org/news/2013/02/13/water-mismanagement-northern-sri-lanka
Finish page: 5 https://thenewhumanitarian.org/news/2013/02/11/healthcare-still-disrupted-central-african-republic
Finish page: 6 https://thenewhumanitarian.org/news/2013/02/11/utter-destitution-north-mali-displaced-icrc
Finish page: 7 https://thenewhumanitarian.org/news/2013/02/08/lack-funds-hits-refugee-health-care-lebanon
Finish page: 8 https://thenewhumanitarian.org/news/2013/02/08/call-humanitarian-access-after-clashes-north-darfur
Finish page: 9 https://thenewhumanitarian.org/news/2013/02/07/aid-workers-cautious-kachin-peace-talks
Finish page: 10 https://thenewhumani