In [1]:
import re

# PubMed

In [4]:
def pubmed_preprocessing(file_path):
    """Return lower-cased PubMed records that look like RT-related retractions.

    The export at ``file_path`` is split into records. A record starts on a
    line of the form ``"<N>. <text>"`` where ``N`` continues the running
    sequence 1, 2, 3, ...; numbered lines that do not continue the sequence
    are treated as ordinary record content.

    A record is kept when its lower-cased text contains at least one
    retraction marker and, after the "author information:" section has been
    cut out, at least one radiotherapy keyword. Identical record texts are
    returned only once, in file order.

    Args:
        file_path: Path to the UTF-8 encoded PubMed text export.

    Returns:
        A list of the selected records as stripped, lower-cased strings.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    # A record header is an integer, a dot, a space and some text.
    header_re = re.compile(r"^(\d+)\. (.+)")

    # Collect the line index of every header that carries the next expected number.
    record_starts = []
    expected_number = 1
    for line_index, raw_line in enumerate(raw_lines):
        header = header_re.match(raw_line.strip())
        if header and int(header.group(1)) == expected_number:
            record_starts.append(line_index)
            expected_number += 1

    # This list was developed through an iterative process for PubMed data,
    # incorporating a manual review to ensure comprehensive coverage of the available data.
    retraction_markers = (
        'retraction notice', 'retracted article',
        'retraction: ', 'retracted: ',
        '[retracted]', 'retraction note: ',
        ': retraction', '\n\nretraction',
    )

    # This list was developed during the ClinicalTrials analysis:
    # https://github.com/maksymfritsak/Clinical_Trials_Analysis_ClinicalTrials.Gov_Database
    # where over 4,000 clinical studies were analyzed.
    #
    # NOTE(review): records are lower-cased before matching, so every key that
    # contains an upper-case letter ('Radiochemo', 'IMRT', 'VMAT', ...) can
    # never match here. Left unchanged on purpose to keep the selection stable.
    keys = ['radiation', 'radiotherapy', 'radiosurgery', 'brachy', 'tomotherapy', 'radiodynamic',
            'hypofraction', 'Radiochemo', 'fractionated', 'fractionation', 'conv', 'stereotactic',
            'external beam', 'cyberknife', 'simultaneous integrated boost', 'simultaneous boost',
            'IMRT', 'VMAT', 'SBRT', 'SABR', 'BNCT', 'IMPT', 'SRS', 'PBI', 'PRDR', 'TBI', 'TLI',
            'TSEB', 'HDR', 'PBT', 'SIB', 'WBRT', 'CCRT', 'volume modulated arc therapy', 'pencil beam scanning',
            'particle therapy', 'proton', 'carbon', 'electron', 'photon', 'gray', 'radio', 'radia']

    # Each record runs up to the next record's header; the last one runs to end of file.
    record_ends = record_starts[1:] + [len(raw_lines)]

    selected = []
    already_selected = set()
    for first_line, end_line in zip(record_starts, record_ends):
        record = "".join(raw_lines[first_line:end_line]).strip().lower()

        if not any(marker in record for marker in retraction_markers):
            continue

        # Drop the "author information:" line and every following line that ends
        # with a newline; the record's final line has no newline and is kept.
        record = re.sub(r'(?m)^author information:.*\n(?:.*\n)*', '', record)

        if record in already_selected:
            continue
        if any(key in record for key in keys):
            already_selected.add(record)
            selected.append(record)

    return selected

In [6]:
file_path = "PubMed.txt"
paragraphs = pubmed_preprocessing(file_path)

# As part of preprocessing, the studies were manually screened. Records that use
# the word 'retracted' (or similar terms) in a sense unrelated to scientific
# retraction, or that are not RT-related, are excluded. Each entry below is the
# leading record number of one excluded study.
PUBMED_EXCLUDED_PREFIXES = (
    '90. ', '979. ', '1020. ', '1205. ',
    '36. ', '37. ', '39. ', '87. ',
    '116. ', '167. ', '194. ', '205. ',
    '233. ', '274. ', '286. ', '301. ',
    '363. ', '464. ', '467. ', '571. ',
    '646. ', '679. ', '700. ', '766. ',
    '777. ', '858. ', '891. ', '902. ',
    '906. ', '950. ', '970. ', '981. ',
    '983. ', '1001. ', '1017. ', '1041. ',
    '1200. ', '1220. ',
)

with open('PubMed_result.txt', 'w', encoding='utf-8') as file:
    for paragraph in paragraphs:
        # str.startswith accepts a tuple and is true if any prefix matches.
        if paragraph.startswith(PUBMED_EXCLUDED_PREFIXES):
            continue
        file.write('\n' + paragraph + '\n')

# Embase

In [8]:
def embase_preprocessing(file_path):
    """Return lower-cased Embase (Ovid) records that look like RT-related retractions.

    The export at ``file_path`` is split into records. A record starts on a
    line beginning with ``"<N>"`` where ``N`` continues the running sequence
    1, 2, 3, ...; other ``<N>`` lines are treated as ordinary record content.

    A record is kept when all of the following hold for its lower-cased text:

    * it has a "date of publication:" line whose year is 2017 or later,
    * it contains at least one retraction marker,
    * the span from "title" to the next "source" contains at least one
      radiotherapy keyword.

    Records without a "date of publication:" line or without a
    "title ... source" span are skipped instead of raising: their year cannot
    be checked and no keyword can match a missing title. Identical record
    texts are returned only once, in file order.

    Args:
        file_path: Path to the UTF-8 encoded Embase text export.

    Returns:
        A list of the selected records as stripped, lower-cased strings.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Regex pattern to match lines starting with <number>
    pattern = re.compile(r"^<(\d+)>")
    previous_number = 0  # Track the previous number to enforce the condition

    start_stop_indices = []
    for i, line in enumerate(lines):
        match = pattern.match(line.strip())
        # Only headers that continue the sequence open a new record.
        if match and int(match.group(1)) == previous_number + 1:
            start_stop_indices.append(i)
            previous_number += 1

    # This list was developed through an iterative process for Embase data,
    # incorporating a manual review to ensure comprehensive coverage of the available data.
    retraction_markers = (
        'retracted: ',
        'retraction: ',
        'retraction note: ',
        'retraction notice',
        'retraction to:',
        'retracted article:',
        'retraction of:',
        'retraction note to:',
        'notice of retraction and replacement',
    )

    # This list was developed during the ClinicalTrials analysis:
    # https://github.com/maksymfritsak/Clinical_Trials_Analysis_ClinicalTrials.Gov_Database
    # where over 4,000 clinical studies were analyzed.
    #
    # NOTE(review): records are lower-cased before matching, so every key that
    # contains an upper-case letter ('Radiochemo', 'IMRT', 'VMAT', ...) can
    # never match here. Left unchanged on purpose to keep the selection stable.
    keys = ['radiation', 'radiotherapy', 'radiosurgery', 'brachy', 'tomotherapy', 'radiodynamic',
            'hypofraction', 'Radiochemo', 'fractionated', 'fractionation', 'conv', 'stereotactic',
            'external beam', 'cyberknife', 'simultaneous integrated boost', 'simultaneous boost',
            'IMRT', 'VMAT', 'SBRT', 'SABR', 'BNCT', 'IMPT', 'SRS', 'PBI', 'PRDR', 'TBI', 'TLI',
            'TSEB', 'HDR', 'PBT', 'SIB', 'WBRT', 'CCRT', 'volume modulated arc therapy', 'pencil beam scanning',
            'particle therapy', 'proton', 'carbon', 'electron', 'photon', 'gray', 'radio', 'radia']

    # Each record runs up to the next record's header; the last one runs to end of file.
    record_ends = start_stop_indices[1:] + [len(lines)]

    paragraphs = []
    seen = set()  # O(1) duplicate check alongside the ordered result list
    for start, end in zip(start_stop_indices, record_ends):
        paragraph = "".join(lines[start:end]).strip().lower()

        match_date = re.search(r"date of publication:.*?(\n|$)", paragraph, re.IGNORECASE)
        if match_date is None:
            # No publication date line: the year filter cannot be applied.
            continue

        # The year is read as the four characters before the final character of
        # the date line (presumably a trailing period - confirm against the export).
        txt_date = match_date.group().strip()
        year_text = txt_date[-5:-1]
        # isdecimal() only accepts characters that int() can parse.
        if not (year_text.isdecimal() and int(year_text) >= 2017):
            continue

        if not any(marker in paragraph for marker in retraction_markers):
            continue

        match_title = re.search(r"title\s*(.*?)\s*source", paragraph, re.DOTALL | re.IGNORECASE)
        if match_title is None:
            # No "title ... source" span: there is no title to match keywords against.
            continue
        title = match_title.group().strip()

        if paragraph not in seen and any(key in title for key in keys):
            seen.add(paragraph)
            paragraphs.append(paragraph)

    return paragraphs

In [10]:
file_path = "Embase_Ovid.txt"
paragraphs = embase_preprocessing(file_path)

# As part of preprocessing, the studies were manually screened. Records that use
# the word 'retracted' (or similar terms) in a sense unrelated to scientific
# retraction, or that are not RT-related, are excluded. Each entry below is the
# leading "<N>" record marker of one excluded study.
EMBASE_EXCLUDED_PREFIXES = (
    '<27>', '<35>',
    '<387>', '<451>',
    '<534>', '<564>',
    '<769>',
)

with open('Embase_result.txt', 'w', encoding='utf-8') as file:
    for paragraph in paragraphs:
        # str.startswith accepts a tuple and is true if any prefix matches.
        if paragraph.startswith(EMBASE_EXCLUDED_PREFIXES):
            continue
        file.write('\n' + paragraph + '\n')