In [2]:
import csv
import os
import re
from pathlib import Path

from bs4 import BeautifulSoup
from unidecode import unidecode as unidecode_func

### Processing Functions

 1. Build the set of all author names: `extract_authors_set`
 2. Remove irrelevant information from an abstract's text: `remove_irrelevant_info`
 3. Process the abstracts file and write the output CSV: `process_abstracts`

In [3]:
def extract_authors_set(authors_path: Path) -> set:
    """Build the set of normalised author names found in the authors file.

    Each useful line is expected to look like ``<paper_id>||<author>,<author>,...``.
    Every author entry is normalised by:

    1. transliterating it to ASCII with ``unidecode``,
    2. deleting all digits,
    3. deleting single-capital initials followed by a dot (e.g. ``"A. "``),
    4. stripping surrounding whitespace.

    Entries that end up empty are discarded.

    Args:
        authors_path: Path to the authors text file (read as UTF-8).

    Returns:
        A set of cleaned author name strings. Names may contain several
        words (e.g. ``"John Smith"``).
    """
    all_authors = set()

    # Explicit encoding: without it the platform default is used and
    # non-ASCII names can be mis-decoded before unidecode sees them.
    with open(authors_path, "r", encoding="utf-8") as file:
        for line in file:
            fields = line.split('||')
            # Blank or malformed lines carry no author field; skip them
            # instead of crashing on the missing separator.
            if len(fields) < 2:
                continue

            for author in fields[1].strip().split(','):
                # Replace unicode characters with their ASCII equivalent
                author = unidecode_func(author)
                # Remove numerical values
                author = re.sub(r'\d+', '', author)
                # Remove capital letters followed by a dot (initials)
                author = re.sub(r'\b[A-Z]\. ?', '', author)
                # Remove leading and trailing whitespace
                author = author.strip()

                if author:
                    all_authors.add(author)

    return all_authors

In [4]:
def remove_irrelevant_info(text: str, all_authors: set) -> str:
    """Strip formulas, contact details, markup and author tokens from an abstract.

    The steps are applied in this order:

    1. LaTeX math: ``$$...$$``, ``$...$``, ``equation`` and ``align`` environments.
    2. E-mail addresses and anything starting with ``http``.
    3. HTML tags (via BeautifulSoup's ``get_text``).
    4. Bracketed markers such as ``(1)``, ``[2]``, ``(a)``, ``(iv)``.
    5. Single capital letters followed by a dot (initials).
    6. Parenthesised citations containing ``et al.``.
    7. Non-ASCII characters.
    8. Whitespace-delimited tokens that are exactly equal to an entry of
       ``all_authors``.

    Note that step 8 compares single tokens only, so multi-word entries in
    ``all_authors`` (e.g. ``"John Smith"``) and tokens with attached
    punctuation (e.g. ``"Smith,"``) are never removed.

    Args:
        text: Raw abstract text.
        all_authors: Set of author name strings to drop from the text.

    Returns:
        The cleaned text with runs of whitespace collapsed to single spaces.
    """
    ### Remove Math formulas ###
    # Display math ($$...$$) must go first: if the single-dollar pattern ran
    # first it would match the empty "$$" delimiters and leave the formula
    # body behind in the text.
    text = re.sub(r'\$\$[^$]*\$\$', '', text)
    # Remove inline math formulas between single dollar signs ($...$)
    text = re.sub(r'\$[^$]*\$', '', text)
    # Remove math formulas between \begin{equation} and \end{equation}
    text = re.sub(r'\\begin\{equation\}.*?\\end\{equation\}', '', text, flags=re.DOTALL)
    # Remove math formulas between \begin{align} and \end{align}
    text = re.sub(r'\\begin\{align\}.*?\\end\{align\}', '', text, flags=re.DOTALL)

    # Remove email addresses
    text = re.sub(r'\S+@\S+\.\S+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()

    # Remove numerical elements in () and []
    text = re.sub(r'(\(\d+\)|\[\d+\])', '', text)

    # Remove enumeration markers in () --> ex. (a),(b),(i)
    text = re.sub(r'\(([a-f]|[ivx]+)\)', '', text)

    # Remove authors' initials (ex. A., B., C.)
    text = re.sub(r'\b[A-Z]\.', '', text)

    # Remove text inside parentheses that has the expression et al.
    text = re.sub(r'\([^\(\)]*et al\.[^\(\)]*\)', '', text)

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    ##### Remove author names #####
    # Split on whitespace, drop tokens that exactly match an author entry,
    # then rejoin; this also collapses the gaps left by the removals above.
    tokens = [token for token in text.split() if token not in all_authors]
    text = ' '.join(tokens)

    return text.strip()

In [6]:
def process_abstracts(abstract_path: Path, all_authors: set, output_csv_path: Path) -> None:
    """Clean every abstract in the input file and write the result to a CSV.

    Each useful input line is expected to look like ``<paper_id>||<abstract>``.
    The abstract is passed through ``remove_irrelevant_info`` and written as a
    row of the output CSV under the header ``paper_id, abstract``. If a paper
    id occurs more than once, the last occurrence wins.

    Args:
        abstract_path: Path to the abstracts text file (read as UTF-8).
        all_authors: Set of author names forwarded to ``remove_irrelevant_info``.
        output_csv_path: Destination of the CSV file (written as UTF-8).

    Raises:
        ValueError: If the text before ``||`` on a line is not an integer.
    """
    # Read and clean the abstracts of the research papers
    cleaned_abstracts = {}
    # Explicit encoding so reading matches the UTF-8 used for the output
    # instead of depending on the platform default.
    with open(abstract_path, "r", encoding="utf-8") as f:
        for line in f:
            # Split on the first separator only, so an abstract that itself
            # contains '||' is kept whole.
            paper_id, separator, abstract = line.partition('||')
            if not separator:
                # Blank or malformed line: nothing to process.
                continue
            # Strip only the line terminator; slicing off the last character
            # would eat real text when the final line has no newline.
            abstract = abstract.rstrip('\n')
            cleaned_abstracts[int(paper_id)] = remove_irrelevant_info(abstract, all_authors)

    # Export data to csv
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['paper_id', 'abstract'])
        writer.writerows(cleaned_abstracts.items())

### Results

In [7]:
# Data directory. Set the DATA_CHALLENGE_DIR environment variable to point at
# your own copy of the data; when it is unset, the original machine-specific
# location below is used.
data_directory = Path(
    os.environ.get(
        "DATA_CHALLENGE_DIR",
        "E:/panag/Desktop/Ms Data Science/6 Quarter/Data Science Challenge/data_challenge_aueb_2023",
    )
)

# Authors data
authors_file = "authors.txt"
# Abstract data
abstract_file = "abstract.txt"

# Full paths to the two input files
authors_path  = data_directory / authors_file
abstract_path = data_directory / abstract_file

In [10]:
# File name of the CSV that receives the cleaned abstracts
output_csv_name = 'cleaned_abstracts.csv'

# Collect the normalised author names (despite the name, this is a set)
authors_lst = extract_authors_set(authors_path=authors_path)

# Clean every abstract and write the results to the CSV
process_abstracts(
    abstract_path=abstract_path,
    all_authors=authors_lst,
    output_csv_path=output_csv_name,
)