In [5]:
# biodiversity_terms = [
#     "koral"
# ]

# Below are some of the following Danish terms in the field of Biodiversity:
biodiversity_terms = [
    "biodiversitet","koral", "biosfære","ferskvand"
]

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import time
import pandas as pd

def selenium_scraper(biodiversity_terms):
    # Start the Selenium WebDriver
    driver = webdriver.Chrome()  # Ensure that `chromedriver` is on your PATH or specify its location.
    base_url = "https://www.dr.dk/soeg?query="  # Base URL for search
    articles = []  # List to store scraped articles

    # Loop through each search term in biodiversity_terms
    for term in biodiversity_terms:
        url = f"{base_url}{term}"  # Construct the search URL for each term
        print(f"Scraping URL: {url}")  # Print the URL being scraped

        driver.get(url)  # Open the URL in the browser
        time.sleep(5)  # Wait for the page to load completely

        # Attempt to click the "Show more" button to load more articles
        while True:

            # Scroll to the bottom to ensure the button is in view
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)  # Short delay to allow scrolling to complete

            # Check for the presence of the "See More" button
            buttons = driver.find_elements(By.XPATH, "//button[contains(@class, 'dre-button') and .//span[contains(text(), 'Vis flere')]]")

            if len(buttons) > 0:
                # If the button is found, click the first one
                see_more_button = buttons[0]

                try:
                    # Scroll the button into view and attempt a click
                    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", see_more_button)
                    time.sleep(1)  # Short delay to allow any page changes
                    see_more_button.click()

                except Exception as click_exception:
                    # Use JavaScript to click if the regular click fails
                    driver.execute_script("arguments[0].click();", see_more_button)



                # Wait for new articles to load
                time.sleep(3)  # Adjust the sleep time as needed
            else:
                print("No more 'See More' buttons found. All articles should be loaded.")
                break    
    
        
        # Parse the entire page HTML using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        

        # Extract all article links
        for article_tag in soup.find_all('a', class_='dre-teaser-title'):
            title_tag = article_tag.find('strong', class_='dre-title-text__prefix')  # Get the title within <strong> tags
            date_tag = article_tag.find_next('span', class_='dre-teaser-meta-label')  # Get the date

            # Ensure that both title and URL exist
            if title_tag and 'href' in article_tag.attrs and date_tag:
                link = article_tag['href']  # Get the article URL
                title = title_tag.get_text(strip=True)  # Get the article title text
#                 date = date_tag.get_text(strip=True)  # Get the article date text
                article_url = "https://www.dr.dk" + link  # Construct the full URL

                # Append the article details to the articles list
                articles.append({
                    'term': term,  # Store the search term
                    'title': title,  # Store the article title
                    'url': article_url  # Store the article URL
                })

        time.sleep(1)  # Be polite and do not overwhelm the service

    driver.quit()  # Close the browser after scraping
    return articles  # Return the list of articles




# Call the function with example terms and print the DataFrame for better visualization
articles = selenium_scraper(biodiversity_terms)
df = pd.DataFrame(articles)
print(df.head())


Scraping URL: https://www.dr.dk/soeg?query=biodiversitet
No more 'See More' buttons found. All articles should be loaded.
Scraping URL: https://www.dr.dk/soeg?query=koral
No more 'See More' buttons found. All articles should be loaded.
Scraping URL: https://www.dr.dk/soeg?query=biosfære
No more 'See More' buttons found. All articles should be loaded.
Scraping URL: https://www.dr.dk/soeg?query=ferskvand
No more 'See More' buttons found. All articles should be loaded.
            term                              title  \
0  biodiversitet                DRTV - Aftenshowet:   
1  biodiversitet  DRTV - Koralrevets hemmeligheder:   
2  biodiversitet                DRTV - Aftenshowet:   
3  biodiversitet  DRTV - Koralrevets hemmeligheder:   
4  biodiversitet     DRTV - Signe Molde på udebane:   

                                                 url  
0  https://www.dr.dk/drtv/episode/aftenshowet_-ek...  
1  https://www.dr.dk/drtv/episode/koralrevets-hem...  
2  https://www.dr.dk/drtv/episode

In [7]:
df.tail()


Unnamed: 0,term,title,url
422,ferskvand,Ny forskning:,https://www.dr.dk/nyheder/udland/ny-forskning-...
423,ferskvand,Ny forskning:,https://www.dr.dk/nyheder/viden/naturvidenskab...
424,ferskvand,Kæmpe opsang til verdens ledere har ikke virket:,https://www.dr.dk/nyheder/viden/klima/kaempe-o...
425,ferskvand,CO2 giver surt vand:,https://www.dr.dk/nyheder/viden/klima/co2-give...
426,ferskvand,Varme vinde bekræfter:,https://www.dr.dk/nyheder/viden/klima/varme-vi...


In [8]:
import re  # we import the regular expression variable for string manipulation or use the raw string import.


def extract_biodiversity_sentences(articles, biodiversity_terms):
    # Firstly, let create a blank list into which we can pick any sentences that contain biodiversity terms
    biodiversity_sentences = []

    # Review each article listed in provided list of articles
    for article in articles:
        # Create an instance of selenium webdriver to open full article content
        driver = webdriver.Chrome()  # Pear any chance, make sure that the `chromedriver` is in your PATH
        driver.get(article['url'])  # Open the URL of the article
        time.sleep(5)  # Allow the page to load to the extent that it stops momentarily.
        
    
        # For the current study, text content of the article shall be extracted.
        soup = BeautifulSoup(driver.page_source, 'html.parser')  # Parse the HTML content
        paragraphs = soup.find_all('p')  # Use query selection #1 to locate all <p> elements which generally comprises the main content of the page
        # Combine the data from the paragraphs into a single line
        article_text = " ".join([para.get_text() for para in paragraphs])
        driver.quit()  # The content is to be extracted next from the following link plus close the browser Close the browser # http://www.information.com London United Kingdom

        # Decide based on regex that separates article text into sentences
        sentences = re.split(r'(?<=[.!?]) +', article_text)
       
        
         # For each of the sentences, search for terms related to biodiversity
        for sentence in sentences:
            # Biodiversity lowercase If there is any of the bounded terms, the sentence is part of the biodiversity area.
            if any(term in sentence.lower() for term in biodiversity_terms):
                # # If a term is found, add more information in the list
                biodiversity_sentences.append({
                    'article_title': article['title'],  # Store the title of article
                    'article_url': article['url'],      # Store the URL of the article
                    'sentence': sentence                  # Store the matching sentence
                })

    return biodiversity_sentences  # Provide list of those sentences that contain terms related to biodiversity

# Call the function to perform exctractio
biodiversity_sentences = extract_biodiversity_sentences(articles, biodiversity_terms)



In [9]:
from transformers import pipeline

# Specify the sentiment analysis model explicitly
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Function to classify sentiment
def classify_sentiment(biodiversity_sentences):
    for sentence_data in biodiversity_sentences:
        result = classifier(sentence_data['sentence'])[0]
        label = result['label']
        
        # Start a new variable to store the numerical sentiment
        sentiment = 0
        
        if label == 'POSITIVE':
            sentiment = 1
        elif label == 'NEGATIVE':
            sentiment = -1
        else:
            sentiment = 0  # Optional: In case you want to handle neutral sentiment
        
        sentence_data['sentiment'] = sentiment  # Insert the sentiment value into the sentence data

    return biodiversity_sentences  # Return the enriched list of sentences with sentiment




  torch.utils._pytree._register_pytree_node(


In [10]:
#Apply the classify function
biodiversity_sentences1 = classify_sentiment(biodiversity_sentences)

In [11]:
from collections import defaultdict # Import defaultdict for easy to use a dictionary like data structure

# I will define a function that uses the average sentiment of biodiversity sentences to compute the totalitarian perception of articles.
def compute_article_sentiment(biodiversity_sentences):
    article_scores = defaultdict(list) # Initialize defaultdict to keep track of sentiments associated to a particular article

    # Untangle positive and negative words by the article URL
    for sentence_data in biodiversity_sentences:
        article_scores[sentence_data['article_url']].append(sentence_data['sentiment']) 

    # Compute average sentiment score for each article
    article_sentiments = [] # Initialize defaultdict to hold lists of sentiments for each article
    # # Iterate through each sentence data in  biodiversity sentences list
    for article_url, sentiments in article_scores.items():
        # Calculate average number of positive words for each article
        avg_sentiment = sum(sentiments) / len(sentiments) ## Storing the sentiment of each given sentence with the list corresponding to the URL of the article
        article_sentiments.append({
            'article_url': article_url, # Include the article URL and the article’s average sentiment as the list
            'average_sentiment': avg_sentiment # Remember the sample average sentiment scor
        })
  
        
    return article_sentiments
# apply a function to compute average sentiment scores for articles based on biodiversity sentences
article_sentiments = compute_article_sentiment(biodiversity_sentences1)


In [12]:
# Connect the articles with the sentiments of the articles.>
df_articles = pd.DataFrame(articles) # Build DataFrame from list of articles using dictionary-like objects
df_sentiments = pd.DataFrame(article_sentiments) 

## Connect the articles with the sentiments of the articles.
df_final = pd.merge(df_articles, df_sentiments, left_on='url', right_on='article_url') # Join the two DataFrames on article URL to match articles to their sentiment scores


In [13]:
df_final.head(10)

Unnamed: 0,term,title,url,article_url,average_sentiment
0,biodiversitet,FN:,https://www.dr.dk/nyheder/seneste/fn-invasive-...,https://www.dr.dk/nyheder/seneste/fn-invasive-...,-1.0
1,biodiversitet,Intro:,https://www.dr.dk/studie/samfundsfag/intro-hvo...,https://www.dr.dk/studie/samfundsfag/intro-hvo...,-1.0
2,biodiversitet,Alternativet trækker sig fra grønne forhandlin...,https://www.dr.dk/nyheder/politik/alternativet...,https://www.dr.dk/nyheder/politik/alternativet...,-1.0
3,biodiversitet,Truet græshoppe fundet på Lolland:,https://www.dr.dk/nyheder/seneste/truet-graesh...,https://www.dr.dk/nyheder/seneste/truet-graesh...,-1.0
4,biodiversitet,Politisk aftale på plads:,https://www.dr.dk/nyheder/seneste/politisk-aft...,https://www.dr.dk/nyheder/seneste/politisk-aft...,-1.0
5,biodiversitet,"Lego-familien køber igen, igen landbrugsjord:",https://www.dr.dk/nyheder/seneste/lego-familie...,https://www.dr.dk/nyheder/seneste/lego-familie...,-1.0
6,biodiversitet,"Forskerne troede, den var så godt som uddød:",https://www.dr.dk/nyheder/seneste/forskerne-tr...,https://www.dr.dk/nyheder/seneste/forskerne-tr...,-1.0
7,biodiversitet,Danmark står i en fuglekrise:,https://www.dr.dk/nyheder/indland/danmark-staa...,https://www.dr.dk/nyheder/indland/danmark-staa...,-1.0
8,biodiversitet,Trods stor modstand fra højrefløjen:,https://www.dr.dk/nyheder/seneste/trods-stor-m...,https://www.dr.dk/nyheder/seneste/trods-stor-m...,-1.0
9,biodiversitet,Signalkrebsen truer biodiversiteten:,https://www.dr.dk/nyheder/viden/signalkrebsen-...,https://www.dr.dk/nyheder/viden/signalkrebsen-...,-1.0


In [14]:
import warnings
warnings.filterwarnings('ignore')


In [15]:
data = df_final.copy()

import requests  # Import requests for using the HTTP protocol

## Function to extract date from a given URL
def get_date_from_url(url):
    try:
        # Adjacent to undertaking HTTP GET request on the article URL
        response = requests.get(url)
        if response.status_code == 200:  # # Check that the request has been responded to
            # # Extract the blob of text from the HTML page
            soup = BeautifulSoup(response.text, 'html.parser')
            # locate an HTML <time> tag with the particular class and identify the datetime attribute
            time_tag = soup.find('time', class_='dre-byline__date')
            if time_tag and 'datetime' in time_tag.attrs:
                # Get the datetime value and format it to date only (Month and Year)
                datetime_value = time_tag['datetime']
                # Output Conversion of datetime into format (Month Year)
                formatted_date = pd.to_datetime(datetime_value).strftime('%b %Y')
                return formatted_date
            else:
                return None
        else:
            return None
    except Exception as e:
        print(f"Error fetching date from URL {url}: {e}")
        return None

# Feed this function to navigate and extract the date for each article URL
data['date'] = data['article_url'].apply(get_date_from_url)
# created from a single column of data named `date’, hence we shall name these new columns as `Month’ and `Year”.
data['Month'] = pd.to_datetime(data['date'], format='%b %Y').dt.month_name()  # Get full month name
data['Year'] = pd.to_datetime(data['date'], format='%b %Y').dt.year           # Get year

# Analysing the above code clearly shows that this line of code will display the updated DataFrame.
data.head(10)






Unnamed: 0,term,title,url,article_url,average_sentiment,date,Month,Year
0,biodiversitet,FN:,https://www.dr.dk/nyheder/seneste/fn-invasive-...,https://www.dr.dk/nyheder/seneste/fn-invasive-...,-1.0,,,
1,biodiversitet,Intro:,https://www.dr.dk/studie/samfundsfag/intro-hvo...,https://www.dr.dk/studie/samfundsfag/intro-hvo...,-1.0,Nov 2020,November,2020.0
2,biodiversitet,Alternativet trækker sig fra grønne forhandlin...,https://www.dr.dk/nyheder/politik/alternativet...,https://www.dr.dk/nyheder/politik/alternativet...,-1.0,Sep 2024,September,2024.0
3,biodiversitet,Truet græshoppe fundet på Lolland:,https://www.dr.dk/nyheder/seneste/truet-graesh...,https://www.dr.dk/nyheder/seneste/truet-graesh...,-1.0,,,
4,biodiversitet,Politisk aftale på plads:,https://www.dr.dk/nyheder/seneste/politisk-aft...,https://www.dr.dk/nyheder/seneste/politisk-aft...,-1.0,,,
5,biodiversitet,"Lego-familien køber igen, igen landbrugsjord:",https://www.dr.dk/nyheder/seneste/lego-familie...,https://www.dr.dk/nyheder/seneste/lego-familie...,-1.0,,,
6,biodiversitet,"Forskerne troede, den var så godt som uddød:",https://www.dr.dk/nyheder/seneste/forskerne-tr...,https://www.dr.dk/nyheder/seneste/forskerne-tr...,-1.0,,,
7,biodiversitet,Danmark står i en fuglekrise:,https://www.dr.dk/nyheder/indland/danmark-staa...,https://www.dr.dk/nyheder/indland/danmark-staa...,-1.0,Sep 2024,September,2024.0
8,biodiversitet,Trods stor modstand fra højrefløjen:,https://www.dr.dk/nyheder/seneste/trods-stor-m...,https://www.dr.dk/nyheder/seneste/trods-stor-m...,-1.0,,,
9,biodiversitet,Signalkrebsen truer biodiversiteten:,https://www.dr.dk/nyheder/viden/signalkrebsen-...,https://www.dr.dk/nyheder/viden/signalkrebsen-...,-1.0,Nov 2023,November,2023.0


In [17]:
# Selecting the required columns in the csv
df_final_with_dates = data[['date', 'Month', 'Year','title', 'url', 'average_sentiment']]
df_final_with_dates.to_csv("Final_Output.csv", index=False)  # save df into csv file
print("Exported CSV with expiry dates successfully!")



Exported CSV with expiry dates successfully!
