# Journal of Experimental Neurology Extraction Code

In [1]:
# Import necessary modules
import os
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import uniform
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import json
from math import ceil
from collections import defaultdict, Counter
from statistics import mean, median, stdev
import matplotlib.pyplot as plt

In [2]:
# Show the entries of the current working directory
os.listdir('.')

['final-JNeurotrauma.ipynb',
 'additional-section',
 '.DS_Store',
 'final-additional',
 'Extracted JNeurotrauma',
 'Extracted Jneurotrauma-new',
 'oa_file_list.txt',
 'pmid-Experiment-set.txt',
 'pmcid-Experiment-set.txt',
 'pmcid-JNeurotrau-set.txt',
 'pmid-JNeurotrau-set.txt',
 'Extracted Experimental Neurology',
 '.ipynb_checkpoints',
 'package',
 'final-Experimental-Neurology.ipynb',
 'extraction-github']

## PMID to PMCID Conversion

In [3]:
# Count the entries in pmid-Experiment-set.txt (one entry per line)
with open('pmid-Experiment-set.txt', 'r') as file:
    # Drop blank lines (e.g. a trailing empty line at the end of the file) so
    # they are neither counted nor handed on to later cells as empty IDs.
    pmid_lines = [line for line in file if line.strip()]

number_of_pmids = len(pmid_lines)

print(f"Number of PMIDs: {number_of_pmids}")


Number of PMIDs: 1060


In [4]:
# Count the entries in pmcid-Experiment-set.txt (one entry per line)
with open('pmcid-Experiment-set.txt', 'r') as file:
    # Drop blank lines (e.g. a trailing empty line at the end of the file) so
    # they are neither counted nor handed on to later cells as empty IDs.
    pmcid_lines = [line for line in file if line.strip()]

number_of_pmcids = len(pmcid_lines)

print(f"Number of PMCIDs: {number_of_pmcids}")


Number of PMCIDs: 810


In [5]:
# Reduce every non-blank line to the text before its first '-'. For the
# PMID-PMCID lines of the PMCID file this is the PMID, so both lists hold PMIDs.
pmid_list = [line.strip().split('-')[0] for line in pmid_lines if line.strip()]
pmcid_list = [line.strip().split('-')[0] for line in pmcid_lines if line.strip()]

# PMIDs that appear in pmid-Experiment-set.txt but not in pmcid-Experiment-set.txt
missing_pmids = set(pmid_list) - set(pmcid_list)
number_of_missing_pmids = len(missing_pmids)
number_difference = number_of_pmids - number_of_pmcids

print(f"Number of missing PMIDs: {number_of_missing_pmids}\n")
# Print in sorted order: a raw set prints in an arbitrary order that can differ
# between kernel sessions, which makes the stored output unstable.
print(f"Missing PMIDs:\n{sorted(missing_pmids)}")


Number of missing PMIDs: 250

Missing PMIDs:
{'25792481', '27063582', '25816736', '26428905', '25476493', '26079646', '24945601', '26687972', '24999027', '25891441', '25939697', '25708986', '23178582', '26091850', '26169930', '26791254', '26287750', '25483398', '25541322', '26387938', '25432068', '25246228', '25118620', '24858805', '26093037', '26188381', '26044197', '25863021', '24731947', '25842268', '25889458', '26851542', '25500111', '26698925', '25542980', '26836322', '25131640', '26071088', '25447938', '25592627', '26408049', '25263581', '25797576', '24918341', '25058045', '25681575', '26263843', '26746986', '26546833', '25433215', '26854932', '25863022', '24999028', '27060489', '27018320', '26953231', '26626971', '25987538', '26172316', '26775176', '26689323', '26597542', '25523813', '25862287', '26079647', '25446721', '23036599', '26416261', '25084518', '25981889', '26607913', '26439313', '25597650', '25447935', '26988764', '25432069', '26366525', '24731945', '26853136', '25814

#### 250 of the 1,060 PMIDs could not be converted to PMCIDs, so the rest of the notebook works with the remaining 810 PMCIDs (papers).

## Methods Section Extraction

In [6]:
# Function to fetch HTML content of a given PMCID
def fetch_pmc_html(pmcid, timeout=30):
    """
    Download the PMC article page for one PMCID.

    Args:
        pmcid (str): Identifier inserted verbatim into the article URL.
        timeout (float or tuple): Timeout in seconds handed to ``requests.get``.
            Without it a stalled connection would block the calling thread
            indefinitely.

    Returns:
        bytes or None: The raw response body when the server answers with
        HTTP 200. None for any other status code or when the request raises
        a ``requests`` exception (including a timeout); a message is printed
        in each of those cases.
    """
    url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/"
    # Browser-like request headers.
    # NOTE(review): 'br' is advertised below; requests can only decode Brotli
    # responses when a brotli package is installed - confirm for this environment.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Referer': 'https://www.ncbi.nlm.nih.gov/',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Pause 10-15 s after every completed request to throttle the crawl.
        sleep(uniform(10, 15))
        if response.status_code == 200:
            return response.content
        elif response.status_code == 404:
            print(f"PMCID {pmcid} not found (404). Skipping.")
            return None
        else:
            print(f"Failed to fetch HTML for PMCID: {pmcid} - Status Code: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        # Timeouts are a RequestException subclass, so they are reported here too.
        print(f"Error fetching HTML for PMCID: {pmcid} - Error: {e}")
        return None

# Function to extract the Methods section from HTML content
def extract_methods_from_html(html_content):
    """
    Pull the text of the Methods section out of an article page.

    For each candidate heading phrase, in order, the first h2/h3/h4 whose
    lower-cased text contains the phrase is located. Text is then collected
    from the run of consecutive <p>, <div> and <section> elements that starts
    at the first element following the heading. The first phrase that yields
    any text wins.

    Args:
        html_content (bytes or str): Markup of the article page.

    Returns:
        str: Collected text blocks joined by blank lines, or the sentinel
        "Methods section not found." when nothing was collected.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    potential_headings = ['methods', 'method', 'materials and methods', 'methodology']
    methods_sections = []

    for heading_text in potential_headings:
        methods_heading = soup.find(lambda tag: tag.name in ['h2', 'h3', 'h4'] and heading_text in tag.get_text(strip=True).lower())
        if methods_heading is None:
            continue

        # find_next() walks the document in parse order, so it can return a tag
        # nested inside the heading itself (e.g. <h2><span>Methods</span></h2>).
        # Starting from such a tag would end the loop below immediately, so
        # step past every descendant of the heading first.
        current_element = methods_heading.find_next()
        while current_element is not None and any(
            ancestor is methods_heading for ancestor in current_element.parents
        ):
            current_element = current_element.find_next()

        # Collect from consecutive sibling blocks; any other tag ends the run.
        while current_element is not None and current_element.name in ['p', 'div', 'section']:
            if current_element.name == 'p':
                methods_sections.append(current_element.get_text(strip=True))
            elif current_element.name == 'div' and ('sec' in current_element.get('class', []) or 'tsec' in current_element.get('class', [])):
                # Only <div>s carrying the 'sec' or 'tsec' class contribute
                # text; other <div>s are stepped over without ending the run.
                methods_sections.append(current_element.get_text(separator="\n", strip=True))
            elif current_element.name == 'section':
                methods_sections.append(current_element.get_text(separator="\n", strip=True))
            current_element = current_element.find_next_sibling()

        if methods_sections:
            break

    return "\n\n".join(methods_sections) if methods_sections else "Methods section not found."

# Fetch one article and extract its Methods text
def process_pmcid(pmcid):
    """
    Run the fetch-and-extract pipeline for a single PMCID.

    Returns:
        tuple: ``(record, None)`` when page content was obtained, where
        ``record`` is ``{"PMCID": ..., "Methods": ...}``; otherwise
        ``(None, pmcid)`` so the caller can log the ID as failed.
    """
    page = fetch_pmc_html(pmcid)
    if not page:
        return None, pmcid

    extracted = extract_methods_from_html(page)
    record = {"PMCID": pmcid, "Methods": extracted}
    return record, None

# Persist one chunk of results as JSON
def save_json(data, file_index, output_dir):
    """
    Write `data` to ``methods_<file_index>.json`` inside `output_dir`.

    The directory is created when it does not exist yet, the JSON is written
    with a two-space indent, and a one-line confirmation is printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    target_path = os.path.join(output_dir, "methods_{}.json".format(file_index))
    with open(target_path, 'w') as handle:
        json.dump(data, handle, indent=2)
    entry_count = len(data)
    print(f"Saved {entry_count} entries to {target_path}")


In [8]:
def main(input_file, file_number, chunk_size=100, output_dir='methods-Experiment-set-new', max_workers=10):
    """
    Process one chunk of PMCIDs from `input_file` and save the results.

    Args:
        input_file (str): Text file with one ``PMID-PMCID`` entry per line.
            Blank lines and lines without a PMCID field are ignored.
        file_number (int): 1-based chunk number (with the default chunk size,
            1 covers entries 0-99, 2 covers 100-199, etc.).
        chunk_size (int): The number of PMCIDs per output JSON file.
        output_dir (str): Directory that receives ``methods_<file_number>.json``
            and the cumulative ``failed_pmcids.txt``.
        max_workers (int): Number of threads fetching pages concurrently.

    Returns:
        None. Prints a message and returns early when the arguments select no
        PMCIDs (chunk number or size below 1, or a chunk past the end of the list).
    """
    if file_number < 1 or chunk_size < 1:
        # A chunk number below 1 would produce a negative start index and
        # silently slice from the end of the list.
        print("file_number and chunk_size must both be at least 1.")
        return

    # Read all PMCIDs (second '-'-separated field) from the input file,
    # skipping blank or malformed lines instead of raising IndexError.
    pmcids = []
    with open(input_file, 'r') as f:
        for line in f:
            fields = line.strip().split('-')
            if len(fields) > 1 and fields[1]:
                pmcids.append(fields[1])

    # Calculate the slice for this chunk
    start_index = (file_number - 1) * chunk_size
    if start_index >= len(pmcids):
        print("Start index exceeds the total number of PMCIDs.")
        return
    pmcids_to_process = pmcids[start_index:start_index + chunk_size]

    results = []
    failed_pmcids = []

    # Fetch and extract concurrently; executor.map yields results in input order.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        outcomes = executor.map(process_pmcid, pmcids_to_process)
        for result, failed in tqdm(outcomes, total=len(pmcids_to_process), desc=f"Processing File {file_number}"):
            if result:
                results.append(result)
            if failed:
                failed_pmcids.append(failed)

    # Save the results for the current file number
    save_json(results, file_number, output_dir)

    # Append failed PMCIDs to a shared log file (accumulates across runs)
    if failed_pmcids:
        # Do not rely on save_json having created the directory.
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, 'failed_pmcids.txt'), 'a') as f:
            f.writelines(f"{pmcid}\n" for pmcid in failed_pmcids)

    print(f"Processing of file number {file_number} completed. {len(results)} sections saved, {len(failed_pmcids)} failed.")

# Run the pipeline for chunk 1 only (the first 100 PMCIDs of the list);
# change the second argument to process a different chunk.
if __name__ == "__main__":
    main('pmcid-Experiment-set.txt', 1, 100)


## Analysis of retrieved methods section

In [10]:
# PMCID = second '-'-separated field of every line in the ID file
with open('pmcid-Experiment-set.txt', 'r') as file:
    pmcid_list = [row.strip().split('-')[1] for row in file]

json_pmcids = set()             # every PMCID that has an entry in some JSON chunk
missing_methods_pmcids = set()  # entries whose text is the "not found" sentinel

# Visit methods_1.json .. methods_11.json, skipping chunk files that do not exist
for chunk_number in range(1, 12):
    chunk_path = f'methods-Experiment-set-new/methods_{chunk_number}.json'
    if not os.path.exists(chunk_path):
        continue
    with open(chunk_path, 'r') as json_file:
        records = json.load(json_file)
    for record in records:
        json_pmcids.add(record['PMCID'])
        if record['Methods'] == "Methods section not found.":
            missing_methods_pmcids.add(record['PMCID'])

pmcid_set = set(pmcid_list)
# Listed PMCIDs with no JSON entry at all
missing_pmcids = pmcid_set - json_pmcids

# "Missing" = no JSON entry, or an entry without an extracted Methods section
all_missing = missing_pmcids | missing_methods_pmcids
print(f"Total Methods Sections extracted: {len(pmcid_set) - len(all_missing)}\n")
print(f"Total missing PMCIDs (including 'Methods section not found'): {len(all_missing)}\n")
print(f"Missing PMCIDs: {all_missing}")


Total Methods Sections extracted: 628

Total missing PMCIDs (including 'Methods section not found'): 182

Missing PMCIDs: {'PMC7484028', 'PMC10443427', 'PMC6544476', 'PMC5077668', 'PMC9707338', 'PMC5693633', 'PMC8012222', 'PMC5491308', 'PMC5010931', 'PMC8527806', 'PMC4724057', 'PMC5656543', 'PMC7856041', 'PMC4679498', 'PMC4500746', 'PMC10035810', 'PMC7869696', 'PMC4609591', 'PMC4194231', 'PMC4688066', 'PMC4598257', 'PMC7750306', 'PMC6894499', 'PMC4234697', 'PMC10872660', 'PMC7877222', 'PMC6530785', 'PMC4581889', 'PMC4686380', 'PMC8063173', 'PMC4586307', 'PMC10207636', 'PMC10163623', 'PMC7874968', 'PMC7237325', 'PMC9892278', 'PMC5400679', 'PMC5340082', 'PMC5878012', 'PMC6956249', 'PMC4691235', 'PMC7237336', 'PMC5056828', 'PMC9789191', 'PMC4631710', 'PMC4920549', 'PMC4466166', 'PMC9357217', 'PMC6156776', 'PMC11160556', 'PMC4569542', 'PMC6275553', 'PMC4689601', 'PMC6588420', 'PMC8820239', 'PMC9749756', 'PMC5010953', 'PMC4466144', 'PMC5097896', 'PMC4977201', 'PMC5994754', 'PMC5019113', 'PM

#### Out of 810 papers, a Methods section was extracted from only 628. For the remaining 182, the extraction technique found no Methods section (this count also includes any PMCIDs that have no entry in the JSON output).