In [3]:
import pandas as pd

# Load the datasets
dataset = pd.read_json("../data/PLOS/PLOSArticles.json")

collectedPosts = pd.read_csv("../data/PLOS/PLOSArticlesAbbr.csv")

# Ensure the column names are consistent
article_titles_to_remove = collectedPosts['articleTitle']

# Filter rows in `dataset` where the `articleTitle` is not in `collectedPosts`
filtered_dataset = dataset[~dataset['articleTitle'].isin(article_titles_to_remove)]

# Save or inspect the result
print(len(filtered_dataset))

FileNotFoundError: [Errno 2] No such file or directory: '../data/PLOS/PLOSArticlesAbbr.csv'

In [4]:
import re

def extractAbbrev(element):
    try:
        # Regex pattern to find fully uppercase words in parentheses
        pattern = r'\(([A-Z]+)\)'

        # Find all matches with their start positions
        matches = [(match.group(1), match.start()) for match in re.finditer(pattern, element)]
        
        # Function to remove HTML tags from a string
        def remove_html_tags(text):
            return re.sub(r'<[^>]*>', '', text)

        # Extract words based on the new condition
        result = []
        for word, position in matches:
            
            # Find the preceding content up to the word's position
            preceding_text = element[:position].split()
            
            # Extract words starting with the same letter as the abbreviation
            extracted_words = []
            for w in reversed(preceding_text):
                extracted_words.insert(0,w)  # Insert at the beginning of the list
                # Stop extracting if the word doesn't contain the first letter
                if w[0].lower() not in word.lower():
                    break

            result.append((word, [remove_html_tags(item) for item in extracted_words]))  # Remove HTML tags

    except Exception as e:
        print(f"Error: {e}")
        return []
    
    return result
def preprocessData(resultsAnnotation):
    for index in range(len(resultsAnnotation)):
        try :
            word = resultsAnnotation[index][0]
            listOfWords = resultsAnnotation[index][1]
            firstLetter = word[0]
            # Ensure listOfWords has words starting with the same letter as the abbreviation
            while listOfWords and listOfWords[0][0].lower() != firstLetter.lower():
                resultsAnnotation[index][1].pop(0)
        except:
            continue
    return resultsAnnotation

In [None]:
import time
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import csv
import pandas as pd
import concurrent.futures

# Headers for making requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.137 Safari/537.36'
}

# Function to save data periodically by appending to CSV
def periodicSave(data):
    try:
        # Open file in append mode and write data
        with open("../data/PLOS/PLOSArticlesAbbr.csv", mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerows(data)  # Append the new data rows
        print("A Periodic Save is done.")
    except Exception as e:
        print(f"Error during periodic save: {e}")
# Function to get the surrounding sentence of the abbreviation
def get_surrounding_sentence(text, abbreviation):
    try:
        # Find the position of the abbreviation in the text
        match = re.search(re.escape(abbreviation), text)
        if not match:
            return None

        # Start and end positions of the abbreviation
        start, end = match.start(), match.end()

        # Find the closest sentence boundaries
        before = text[:start].rfind('. ')
        before = before if before != -1 else 0
        after = text[end:].find('. ')
        after = after if after != -1 else len(text)

        # Extract the sentence
        surrounding_sentence = text[before + 2 : end + after].strip()  # +2 to skip the period and space
        return surrounding_sentence
    except Exception as e:
        print(f"Error in extracting sentence: {e}")
        return None

# Function to collect article details from a given article URL
def collectArticleDetails(articleUrl):
    try:
        # Construct the full URL and initiate the request
        articleUrl = "https://journals.plos.org" + articleUrl
        print(f"Navigating to {articleUrl}")
        request = Request(articleUrl, headers=headers)

        # Open the URL and read the page content
        with urlopen(request) as response:
            page_source = response.read()

        # Parse the content with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')
        
        # Find the download menu and the XML link
        downloadMenu = soup.find("div", class_="dload-menu")
        liElement = downloadMenu.find("a", id="downloadXml")
        
        # Construct the full XML link
        xmlLink = "https://journals.plos.org" + liElement.get("href")
        
        # Request the XML content
        request = Request(xmlLink, headers=headers)
        with urlopen(request) as response:
            xml_content = response.read()

        return xml_content
    except Exception as e:
        print(f"Error while fetching article details: {e}")
        return "Content is not available"

# Updated processRow function
def processRow(index, row, dataset):
    try:
        # Collect the XML content for each article
        xml_content = collectArticleDetails(row["articleLink"])
        
        # Parse the XML content using BeautifulSoup
        soup_xml = BeautifulSoup(xml_content, 'xml')

        # Extract content between <body> tags
        textContent = soup_xml.find("body")
        combined_content = textContent.get_text().strip()

        # Extract abbreviations and expansions
        extracted_abbreviations = preprocessData(extractAbbrev(combined_content))
        
        # Process each abbreviation and retrieve its sentence
        results = []
        for element in extracted_abbreviations:
            if len(element[0]) > 1 and len(element[1]) > 1:
                sentence = get_surrounding_sentence(combined_content, element[0])
                if sentence:
                    results.append({
                        "abbreviation": element[0],
                        "expansion": element[1],
                        "sentence": sentence
                    })

        
        return results

    except Exception as e:
        print(f"Error processing row {index}: {e}")
        return None

# Main function remains unchanged
def processDataset(dataset):
    data_to_save = []
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        future_to_row = {executor.submit(processRow, index, row, dataset): row for index, row in dataset.iterrows()}
        
        for future in concurrent.futures.as_completed(future_to_row):
            result = future.result()
            if result:
                data_to_save.append(result)

            if len(data_to_save) >= 500:
                periodicSave(data_to_save)
                data_to_save = []  # Reset data after saving

    if data_to_save:
        periodicSave(data_to_save)

# Start processing the dataset
processDataset(dataset.iloc[100000:200000,:])


Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167440
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167435
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167425
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167414
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167413
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167407
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167401
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167388
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167353
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167327
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167325
Navigating to https://journals.p