In [2]:
import pandas as pd 
dataset=pd.read_json("../data/PLOS/PLOSArticles.json")

In [3]:
import re

def extractAbbrev(element):
    try:
        # Regex pattern to find fully uppercase words in parentheses
        pattern = r'\(([A-Z]+)\)'

        # Find all matches with their start positions
        matches = [(match.group(1), match.start()) for match in re.finditer(pattern, element)]
        
        # Function to remove HTML tags from a string
        def remove_html_tags(text):
            return re.sub(r'<[^>]*>', '', text)

        # Extract words based on the new condition
        result = []
        for word, position in matches:
            
            # Find the preceding content up to the word's position
            preceding_text = element[:position].split()
            
            # Extract words starting with the same letter as the abbreviation
            extracted_words = []
            for w in reversed(preceding_text):
                extracted_words.insert(0,w)  # Insert at the beginning of the list
                # Stop extracting if the word doesn't contain the first letter
                if w[0].lower() not in word.lower():
                    break

            result.append((word, [remove_html_tags(item) for item in extracted_words]))  # Remove HTML tags

    except Exception as e:
        print(f"Error: {e}")
        return []
    
    return result
def preprocessData(resultsAnnotation):
    for index in range(len(resultsAnnotation)):
        try :
            word = resultsAnnotation[index][0]
            listOfWords = resultsAnnotation[index][1]
            firstLetter = word[0]
            # Ensure listOfWords has words starting with the same letter as the abbreviation
            while listOfWords and listOfWords[0][0].lower() != firstLetter.lower():
                resultsAnnotation[index][1].pop(0)
        except:
            continue
    return resultsAnnotation

In [None]:
import time
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import csv
import pandas as pd
import concurrent.futures

# Headers for making requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.137 Safari/537.36'
}

# Function to save data periodically by appending to CSV
def periodicSave(data):
    try:
        # Open file in append mode and write data
        with open("../data/PLOS/PLOSArticlesDetails.csv", mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerows(data)  # Append the new data rows
        print("A Periodic Save is done.")
    except Exception as e:
        print(f"Error during periodic save: {e}")

# Function to collect article details from a given article URL
def collectArticleDetails(articleUrl):
    try:
        # Construct the full URL and initiate the request
        articleUrl = "https://journals.plos.org" + articleUrl
        print(f"Navigating to {articleUrl}")
        request = Request(articleUrl, headers=headers)

        # Open the URL and read the page content
        with urlopen(request) as response:
            page_source = response.read()

        # Parse the content with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')
        
        # Find the download menu and the XML link
        downloadMenu = soup.find("div", class_="dload-menu")
        liElement = downloadMenu.find("a", id="downloadXml")
        
        # Construct the full XML link
        xmlLink = "https://journals.plos.org" + liElement.get("href")
        
        # Request the XML content
        request = Request(xmlLink, headers=headers)
        with urlopen(request) as response:
            xml_content = response.read()

        return xml_content
    except Exception as e:
        print(f"Error while fetching article details: {e}")
        return "Content is not available"

# Function to process a single row asynchronously
def processRow(index, row, dataset):
    try:
        # Collect the XML content for each article
        xml_content = collectArticleDetails(row["articleLink"])
        
        # Parse the XML content using BeautifulSoup
        soup_xml = BeautifulSoup(xml_content, 'xml')

        # Extract content between <abstract> and </abstract> tags
        textContent = soup_xml.find("body")
        combined_content = textContent.get_text().strip()

        
        # Update the dataset with the XML content
        dataset.loc[index, "abbreviations"] =str(preprocessData(extractAbbrev(combined_content)))
        print(dataset.loc[index, "abbreviations"])
        return dataset.iloc[index].values.tolist()

    except Exception as e:
        print(f"Error processing row {index}: {e}")
        return None

# Main function to process dataset with threading
def processDataset(dataset):
    data_to_save = []
    
    # Use ThreadPoolExecutor for concurrent processing of dataset rows
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        future_to_row = {executor.submit(processRow, index, row, dataset): row for index, row in dataset.iterrows()}
        
        # As each future is completed, process the result and save it periodically
        for future in concurrent.futures.as_completed(future_to_row):
            result = future.result()
            if result:
                data_to_save.append(result)

            # Save periodically after processing 500 entries
            if len(data_to_save) >= 500:
                periodicSave(data_to_save)
                data_to_save = []  # Reset data after saving

    # Save any remaining data
    if data_to_save:
        periodicSave(data_to_save)


# Start processing the dataset
processDataset(dataset.iloc[:50000,:])


Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0314876
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0314873
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0314791
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0314783
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0314771
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0314764
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0314747
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0314744


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.loc[index, "abbreviations"] =str(preprocessData(extractAbbrev(combined_content)))


[('ECM', ['extracellular', 'matrix']), ('PBMC', ['peripheral', 'blood', 'mononuclear', 'cell']), ('IPF', ['idiopathic', 'pulmonary', 'fibrosis']), ('HP', ['hypersensitivity', 'pneumonitis']), ('GEO', ['Gene', 'Expression', 'Omnibus']), ('TMA', ['Tissue', 'microarray']), ('ISH', ['In', 'situ', 'hybridization']), ('FOV', ['field', 'of', 'view']), ('IPAF', []), ('PC', ['principal', 'component']), ('IPF', ['idiopathic', 'pulmonary', 'fibrosis']), ('HP', ['hypersensitivity', 'pneumonitis']), ('FDR', ['false', 'discovery', 'rate']), ('IPF', ['idiopathic', 'pulmonary', 'fibrosis']), ('HP', ['hypersensitivity', 'pneumonitis']), ('XLSX', []), ('CSV', []), ('XLSX', []), ('PCA', ['principal', 'component', 'analysis']), ('TIF', [])]
Navigating to https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0314737
[('IPCC', []), ('NMDS', ['non-metric', 'multidimensional', 'scaling']), ('A', ['as']), ('B', []), ('A', []), ('B', []), ('DOCX', [])]
Navigating to https://journals.plos.org/plosone