In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import undetected_chromedriver as uc
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support import expected_conditions as EC


In [10]:
def scrape_pages(start_page, end_page):

    options = uc.ChromeOptions()
    options.headless = False
    driver = uc.Chrome(options=options)

    base_url = 'https://www.thejournal.ie/brexit/news/'
    all_article_urls = set()
    article_data = []
    comments_data = []

    for page_num in range(start_page, end_page + 1):
        url = base_url if page_num == 1 else f'{base_url}page/{page_num}/'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a', class_='link-overlay-redesign')
        for link in links:
            all_article_urls.add(link['href'])

    for article_url in all_article_urls:
        print(f"Scraping article: {article_url}")
        article_response = requests.get(article_url)
        article_soup = BeautifulSoup(article_response.text, 'html.parser')

        article_div = article_soup.find('div', class_='article-content-redesign')
        article_text = ''
        if article_div:
            paragraphs = article_div.find_all('p')
            article_text = '\n'.join(p.get_text(strip=True) for p in paragraphs)
        else:
            print(f"No article text found for {article_url}")

        creation_date_div = article_soup.find('div', class_='metadata-csp-date')
        article_date = creation_date_div.get_text(strip=True) if creation_date_div else None

        updated_span = article_soup.find('span', class_='article-updated-time-redesign')
        article_updated_date = updated_span.get_text(strip=True) if updated_span else None

        article_data.append({
            'article_url': article_url,
            'article_text': article_text,
            'article_date_created': article_date,
            'article_date_updated': article_updated_date
        })

        driver.get(article_url)
        time.sleep(2)
        try:
            comment_trigger = driver.find_element(By.CLASS_NAME, 'view-side-panel-comments-button')
            driver.execute_script("arguments[0].click();", comment_trigger)
            time.sleep(2)
        except Exception as e:
            print(f"Couldn't click comment section for {article_url}: {e}")
            continue

        comment_elements = driver.find_elements(By.CLASS_NAME, 'comment-body.text')
        date_elements = driver.find_elements(By.CLASS_NAME, 'date')

        for comment_el, date_el in zip(comment_elements, date_elements):
            comment_text = comment_el.text.strip()
            comment_date = date_el.text.strip()
            if comment_text:
                comments_data.append({
                    'article_url': article_url,
                    'comment_text': comment_text,
                    'comment_date': comment_date
                })

    driver.quit()

    articles_df = pd.DataFrame(article_data)
    comments_df = pd.DataFrame(comments_data)

    # Save using start_page and end_page in file name
    articles_df.to_csv(f'articles_{start_page}_{end_page}.csv', index=False)
    comments_df.to_csv(f'comments_{start_page}_{end_page}.csv', index=False)

In [3]:
scrape_pages(1, 10)

Scraping article: https://www.thejournal.ie/rishi-sunak-northern-ireland-protocol-deal-imminent-5997518-Feb2023/
Scraping article: https://www.thejournal.ie/migration-uk-eu-after-brexit-5561000-Oct2021/
Scraping article: https://www.thejournal.ie/irish-exports-highest-level-ever-5683731-Feb2022/
Scraping article: https://www.thejournal.ie/uk-bill-visa-non-eu-citizens-northern-ireland-border-5625136-Dec2021/
Scraping article: https://www.thejournal.ie/no-extension-brexit-residency-scheme-uk-5474654-Jun2021/
Scraping article: https://www.thejournal.ie/readme/northern-ireland-protocol-5579506-Oct2021/
Scraping article: https://www.thejournal.ie/poll-is-frsh-fish-part-of-your-diet-5448421-May2021/
Scraping article: https://www.thejournal.ie/ursula-vaccine-rollout-5350362-Feb2021/
Scraping article: https://www.thejournal.ie/northern-ireland-protocol-donaldson-taoiseach-5675244-Feb2022/
Scraping article: https://www.thejournal.ie/brexit-permanently-changed-northern-ireland-5376526-Mar2021/
S

In [6]:
articles_df = pd.read_csv('articles_1_10.csv')
comments_df = pd.read_csv('comments_1_10.csv')


📰 First 5 articles:
                                         article_url  \
0  https://www.thejournal.ie/government-seeking-e...   
1  https://www.thejournal.ie/rishi-sunak-northern...   
2  https://www.thejournal.ie/migration-uk-eu-afte...   
3  https://www.thejournal.ie/irish-exports-highes...   
4  https://www.thejournal.ie/uk-bill-visa-non-eu-...   

                                        article_text article_date_created  \
1  LAST UPDATE|16 Feb 2023\nBRITISH PRIME MINISTE...  8.01pm, 16 Feb 2023   
2  IT’S NOW OVER five years since the UK voted to...   9.00pm, 3 Oct 2021   
3  THE VALUE OF goods exported from Ireland rose ...  4.01pm, 15 Feb 2022   
4  TÁNAISTE LEO VARADKAR has said he will raise c...   3.42pm, 9 Dec 2021   

  article_date_updated  
0                  NaN  
1          16 Feb 2023  
2                  NaN  
3                  NaN  
4                  NaN  

💬 First 5 comments:
                                         article_url  \
0  https://www.thejournal.ie/

In [7]:
print(articles_df.head())
len(articles_df)

                                         article_url  \
0  https://www.thejournal.ie/government-seeking-e...   
1  https://www.thejournal.ie/rishi-sunak-northern...   
2  https://www.thejournal.ie/migration-uk-eu-afte...   
3  https://www.thejournal.ie/irish-exports-highes...   
4  https://www.thejournal.ie/uk-bill-visa-non-eu-...   

                                        article_text article_date_created  \
1  LAST UPDATE|16 Feb 2023\nBRITISH PRIME MINISTE...  8.01pm, 16 Feb 2023   
2  IT’S NOW OVER five years since the UK voted to...   9.00pm, 3 Oct 2021   
3  THE VALUE OF goods exported from Ireland rose ...  4.01pm, 15 Feb 2022   
4  TÁNAISTE LEO VARADKAR has said he will raise c...   3.42pm, 9 Dec 2021   

  article_date_updated  
0                  NaN  
1          16 Feb 2023  
2                  NaN  
3                  NaN  
4                  NaN  


400

In [8]:
print(comments_df.head())
len(comments_df)

                                         article_url  \
0  https://www.thejournal.ie/government-seeking-e...   
1  https://www.thejournal.ie/government-seeking-e...   
2  https://www.thejournal.ie/government-seeking-e...   
3  https://www.thejournal.ie/government-seeking-e...   
4  https://www.thejournal.ie/government-seeking-e...   

                                        comment_text           comment_date  
1  @Fachtna Roe: At the very least we should be c...  Feb 9th 2021, 7:16 PM  
2  I hate to admit it but for the first time ever...  Feb 9th 2021, 7:52 PM  
3  @Mr Kayfabe: That some people on the remain si...  Feb 9th 2021, 8:11 PM  
4  @Mr Kayfabe: the EU26 stuck with us throughout...  Feb 9th 2021, 9:04 PM  


8363

In [9]:
scrape_pages(11, 20)

Scraping article: https://www.thejournal.ie/brexit-extension-general-election-4869746-Oct2019/
Scraping article: https://www.thejournal.ie/uk-border-plans-700m-5147919-Jul2020/
Scraping article: https://www.thejournal.ie/brexit-flag-4988374-Jan2020/
Scraping article: https://www.thejournal.ie/uk-election-liveblog-2-4931112-Dec2019/
Scraping article: https://www.thejournal.ie/readme/brexit-history-5001062-Jul2020/
Scraping article: https://www.thejournal.ie/central-bank-irish-economy-5004155-Feb2020/
Scraping article: https://www.thejournal.ie/boris-johnson-deal-5222450-Oct2020/
Scraping article: https://www.thejournal.ie/whats-in-the-brexit-deal-5310613-Dec2020/
Scraping article: https://www.thejournal.ie/leo-varadkar-verona-murphy-dara-murphy-election-4940600-Dec2019/
Scraping article: https://www.thejournal.ie/uk-house-of-lords-internal-market-bill-5261610-Nov2020/
Scraping article: https://www.thejournal.ie/michel-barnier-david-frost-5157679-Jul2020/
Scraping article: https://www.th

In [19]:
driver = uc.Chrome(options=options, version_main=136)

NameError: name 'options' is not defined

In [3]:
scrape_pages(21, 30)


📄 Now scraping page 21...

📄 Now scraping page 22...

📄 Now scraping page 23...

📄 Now scraping page 24...

📄 Now scraping page 25...

📄 Now scraping page 26...

📄 Now scraping page 27...

📄 Now scraping page 28...

📄 Now scraping page 29...

📄 Now scraping page 30...
Scraping article: https://www.thejournal.ie/list-of-problems-that-could-affect-ireland-for-brexit-negotiations-4840629-Oct2019/
Scraping article: https://www.thejournal.ie/brexit-newspapers-front-pages-daily-mail-4856302-Oct2019/
Scraping article: https://www.thejournal.ie/poll-dail-recall-brexit-4788481-Aug2019/
Scraping article: https://www.thejournal.ie/readme/britains-unwritten-constitution-discussed-in-history-4823246-Sep2019/
Scraping article: https://www.thejournal.ie/boris-johnson-no-deal-brexit-4696571-Jun2019/
Scraping article: https://www.thejournal.ie/conservative-ministers-resign-brexit-4843539-Oct2019/
Scraping article: https://www.thejournal.ie/boris-johnson-brexit-alternative-4833794-Oct2019/
Scraping art

In [6]:
scrape_pages(31, 40)

Scraping article: https://www.thejournal.ie/theresa-may-meltdown-4540958-Mar2019/
Scraping article: https://www.thejournal.ie/brexit-border-coveney-4556726-Mar2019/
Scraping article: https://www.thejournal.ie/brexit-poll-6-4474331-Feb2019/
Scraping article: https://www.thejournal.ie/no-deal-brexit-4-4539160-Mar2019/
Scraping article: https://www.thejournal.ie/british-passport-european-union-4580013-Apr2019/
Scraping article: https://www.thejournal.ie/independent-group-2-4503269-Feb2019/
Scraping article: https://www.thejournal.ie/mary-lou-brexit-impasse-4453083-Jan2019/
Scraping article: https://www.thejournal.ie/meps-brexit-seats-ireland-4661575-May2019/
Scraping article: https://www.thejournal.ie/front-pages-brexit-vote-4538557-Mar2019/
Scraping article: https://www.thejournal.ie/nigel-dodds-stay-in-eu-4568351-Mar2019/
Scraping article: https://www.thejournal.ie/brexit-theresa-may-jeremy-corbyn-4574321-Apr2019/
Scraping article: https://www.thejournal.ie/article-50-brexit-date-extens

In [7]:
scrape_pages(41, 50)

Scraping article: https://www.thejournal.ie/may-brexit-customs-bill-4131316-Jul2018/
Scraping article: https://www.thejournal.ie/eu-unprecedented-brexit-deal-barnier-4209018-Aug2018/
Scraping article: https://www.thejournal.ie/brexit-horror-4404103-Dec2018/
Scraping article: https://www.thejournal.ie/outrage-gammon-brexit-voters-4013325-May2018/
Scraping article: https://www.thejournal.ie/enda-kenny-european-of-the-year-2-4058700-Jun2018/
Scraping article: https://www.thejournal.ie/phillip-lee-resignation-4066318-Jun2018/
Scraping article: https://www.thejournal.ie/eu-brexit-spanish-veto-4355844-Nov2018/
Scraping article: https://www.thejournal.ie/coveney-raab-brexit-deal-4315702-Oct2018/
Scraping article: https://www.thejournal.ie/tommy-robinson-march-4383144-Dec2018/
Scraping article: https://www.thejournal.ie/theresa-may-confidence-vote-4389702-Dec2018/
Scraping article: https://www.thejournal.ie/mary-lou-mcdonald-border-poll-4156212-Jul2018/
Scraping article: https://www.thejournal

In [13]:
scrape_pages(51, 55)

Scraping article: https://www.thejournal.ie/no-deal-reached-brexit-3731523-Dec2017/
Scraping article: https://www.thejournal.ie/ian-paisley-jr-telegraph-defamatory-3587636-Sep2017/
Scraping article: https://www.thejournal.ie/juncker-ireland-3901318-Mar2018/
Scraping article: https://www.thejournal.ie/post-brexit-border-unique-solution-3646765-Oct2017/
Scraping article: https://www.thejournal.ie/brexit-ireland-border-3718681-Nov2017/
Scraping article: https://www.thejournal.ie/brexit-poll-government-3894104-Mar2018/
Scraping article: https://www.thejournal.ie/british-politician-ireland-3733591-Dec2017/
Scraping article: https://www.thejournal.ie/brexit-customs-3503409-Jul2017/
Scraping article: https://www.thejournal.ie/theresa-may-post-brexit-relationship-3880887-Mar2018/
Scraping article: https://www.thejournal.ie/leo-varadkar-frances-fitzgerald-highs-and-lows-3769098-Dec2017/
Scraping article: https://www.thejournal.ie/power-games-brexit-3734426-Dec2017/
Scraping article: https://www

In [14]:
scrape_pages(56, 60)

Scraping article: https://www.thejournal.ie/united-ireland-border-poll-3136932-Dec2016/
Scraping article: https://www.thejournal.ie/theresa-may-christmas-3159115-Dec2016/
Scraping article: https://www.thejournal.ie/walsh-mushrooms-group-acquisition-tipperary-2-3207322-Jan2017/
Couldn't click comment section for https://www.thejournal.ie/walsh-mushrooms-group-acquisition-tipperary-2-3207322-Jan2017/: Message: no such element: Unable to locate element: {"method":"css selector","selector":".view-side-panel-comments-button"}
  (Session info: chrome=137.0.7151.56); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   undetected_chromedriver             0x0000000102a150c8 undetected_chromedriver + 6140104
1   undetected_chromedriver             0x0000000102a0c82a undetected_chromedriver + 6105130
2   undetected_chromedriver             0x00000001024a03b0 undetected_chromedriver + 41873

In [12]:
scrape_pages(61, 68)

Scraping article: https://www.thejournal.ie/london-dublin-ifsc-2879876-Jul2016/
Scraping article: https://www.thejournal.ie/business-warned-refund-irish-consumers-2853135-Jun2016/
Scraping article: https://www.thejournal.ie/brexit-border-questions-2829259-Jun2016/
Scraping article: https://www.thejournal.ie/news-fix-222-2952650-Aug2016/
Couldn't click comment section for https://www.thejournal.ie/news-fix-222-2952650-Aug2016/: Message: no such element: Unable to locate element: {"method":"css selector","selector":".view-side-panel-comments-button"}
  (Session info: chrome=137.0.7151.56); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   undetected_chromedriver             0x0000000100e4a0c8 undetected_chromedriver + 6140104
1   undetected_chromedriver             0x0000000100e4182a undetected_chromedriver + 6105130
2   undetected_chromedriver             0x00000001008d53b0 und