# In [1]:
#%% 1. Loading packages
import time
import requests
import datetime
import jsonlines
import random
from pathlib import Path
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.exceptions import HTTPError, RequestException, ConnectionError, ReadTimeout


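# NOTE: running the import cell above failed with
# "ModuleNotFoundError: No module named 'jsonlines'".
# Install the third-party package first (e.g. `pip install jsonlines`).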

# In [7]:
# Define WriteMode for file handling
class WriteMode:
    """Named constants for the file-open mode strings used when writing output."""

    # Replace any existing file content.
    OVERWRITE = "w"
    # Keep existing content and add new lines at the end.
    APPEND = "a"

def write_jsonl(input_entries: list[dict], output_file: str, write_mode: str):
    """Write content to jsonl format, one entry per line.

    Args:
        input_entries: Records to write; each one is passed to the
            ``jsonlines`` writer's ``write`` method in order.
        output_file: Destination path. Its parent folder is created
            (including intermediate folders) when it does not exist yet.
        write_mode: Mode string handed to ``jsonlines.open``. Callers in
            this file pass ``WriteMode.APPEND`` or ``WriteMode.OVERWRITE``,
            which are plain strings.
    """
    # The previous implementation called os.makedirs, but ``os`` is never
    # imported in this file, so a missing folder ended in a NameError.
    # Path.mkdir needs no extra import, and exist_ok=True removes the
    # check-then-create race of the old ``exists()`` guard.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    with jsonlines.open(output_file, write_mode) as writer:
        for entry in input_entries:
            writer.write(entry)

#%% 2. Define Crawler class
class Crawler:
    """Crawler class for extracting data from the CABI Digital Library.

    An instance owns one ``requests`` session configured with automatic
    retries (see ``get_session``) and uses it for every download.
    """

    # Status codes retried by default when ``get_session`` is called without
    # an explicit ``status_forcelist``. Stored as a tuple so the default can
    # never be mutated and shared between calls.
    DEFAULT_STATUS_FORCELIST = (500, 502, 503, 504, 429)

    def __init__(self):
        self.session = self.get_session()
        # Not read or written by any method of this class; kept so code that
        # accesses ``crawler.data_buffer`` keeps working.
        self.data_buffer = []

    def get_session(
        self,
        total_retries: int = 3,
        backoff_factor: float = 0.1,
        status_forcelist: "list[int] | None" = None,
    ):
        """Generate a session object with retry settings.

        Args:
            total_retries: Value passed to ``Retry(total=...)``.
            backoff_factor: Value passed to ``Retry(backoff_factor=...)``.
            status_forcelist: HTTP status codes passed to
                ``Retry(status_forcelist=...)``. ``None`` (the default)
                selects ``DEFAULT_STATUS_FORCELIST``.

        Returns:
            A ``requests.Session`` with the retrying adapter mounted for both
            ``http://`` and ``https://`` URLs.
        """
        if status_forcelist is None:
            # Fresh list per call instead of a shared mutable default.
            status_forcelist = list(self.DEFAULT_STATUS_FORCELIST)

        retries = requests.packages.urllib3.util.retry.Retry(
            total=total_retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
        adapter = HTTPAdapter(max_retries=retries)
        session = requests.Session()
        for scheme in ('http://', 'https://'):
            session.mount(scheme, adapter)
        return session

    def get_url(self, url: str, timeout: int = 300):
        """Get the response from a URL.

        Args:
            url: Address to download.
            timeout: Timeout value forwarded to ``session.get``.

        Returns:
            The response object when the request succeeded and
            ``raise_for_status`` did not raise; ``None`` after a request
            error (the error is printed, not re-raised).
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
        except RequestException as err:
            # HTTPError, ConnectionError and ReadTimeout are all subclasses
            # of RequestException, so this single clause covers them.
            print(f'Error when connecting: {err}')
            return None
        return response

    @staticmethod
    def _text_or_default(node, default: str) -> str:
        """Return the stripped text of *node*, or *default* when *node* is None."""
        if node is None:
            return default
        return node.get_text().strip()

    @staticmethod
    def _link_href(article, label: str):
        """Return the ``href`` of the anchor whose string equals *label*, else None."""
        # ``string=`` is the current name of the argument formerly spelled
        # ``text=``. ``.get`` avoids a KeyError for an anchor without href.
        anchor = article.find('a', string=label)
        if anchor is None:
            return None
        return anchor.get('href')

    @classmethod
    def _parse_article(cls, article) -> dict:
        """Build the output record for one search-result element."""
        return {
            'title': cls._text_or_default(article.find('h2'), 'No title'),
            'authors': cls._text_or_default(
                article.find('div', class_='meta__authors'), 'No authors'
            ),
            'date': cls._text_or_default(
                article.find('div', class_='meta__date'), 'No date'
            ),
            'abstract_preview': cls._text_or_default(
                article.find('div', class_='abstract-preview'),
                'No abstract preview',
            ),
            # Adding more details if available
            'full_text_link': cls._link_href(article, 'FULL TEXT'),
            'pdf_link': cls._link_href(article, 'PDF/EPUB'),
        }

    def extract_data(self, url: str, output_file: str):
        """Extract data from the given URL and write to output.

        Downloads *url*, parses every ``div.search-result__body`` element
        into a record and appends the records to *output_file* as JSON
        lines. Nothing is written when the download fails.

        Args:
            url: Search-result page to download.
            output_file: Path of the jsonl file to append to.
        """
        response = self.get_url(url)
        if response is None:
            return
        soup = BeautifulSoup(response.text, 'html.parser')

        # Example of extracting specific elements - adjust as needed for
        # your use case and the actual HTML structure.
        articles = soup.find_all('div', class_='search-result__body')
        records = [self._parse_article(article) for article in articles]

        write_jsonl(records, output_file, WriteMode.APPEND)

# In [None]:
#%% 3. Running the crawler
if __name__ == '__main__':
    # One crawler instance, i.e. one retry-enabled session, for the run.
    crawler = Crawler()

    # Destination jsonl file and the search-result page to scrape.
    output_file = 'cabi_data.jsonl'
    base_url = "https://www-cabidigitallibrary-org.ezp.sub.su.se/action/doSearch?ConceptID=500060&startPage=0&sortBy=Earliest"

    # Download the page, parse it and append the records to the output file.
    crawler.extract_data(url=base_url, output_file=output_file)
